In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from math import sqrt

In [2]:
headers=["idnum","age","workerclass","interestincome", "traveltimetowork", "vehicleoccupancy", "meansoftransport", 
"marital", "schoolenrollment", "educationalattain", "sex", "workarrivaltime", "hoursworkperweek", "ancestry", "degreefield", "industryworkedin", "wages"]
headerstest=["idnum","age","workerclass","interestincome", "traveltimetowork", "vehicleoccupancy", "meansoftransport", 
"marital", "schoolenrollment", "educationalattain", "sex", "workarrivaltime", "hoursworkperweek", "ancestry", "degreefield", "industryworkedin"]
trainData = pd.read_csv("census_train.csv",header=None, names=headers)
testData = pd.read_csv("census_test.csv",header=None, names=headerstest)

In [3]:
trainingFeatures = trainData.iloc[:,:-1]
trainingLabels = trainData['wages']
print ("Training Data Shape: ",trainingFeatures.shape)
trainingFeatures.head(5)

Training Data Shape:  (1184, 16)


Unnamed: 0,idnum,age,workerclass,interestincome,traveltimetowork,vehicleoccupancy,meansoftransport,marital,schoolenrollment,educationalattain,sex,workarrivaltime,hoursworkperweek,ancestry,degreefield,industryworkedin
0,0,58,3,0,40,2,1,1,1,22,1,84,40,32,2300,7870
1,1,34,3,0,10,1,1,5,1,16,1,168,40,51,?,7860
2,2,57,2,0,15,8,1,5,1,16,1,100,30,51,?,8390
3,3,17,?,0,?,?,?,5,2,15,2,?,?,714,?,?
4,4,42,1,0,20,1,1,1,1,22,1,92,45,21,5001,6990


In [4]:
testFeatures=testData
print ("Testing Data Shape: ",testFeatures.shape)

Testing Data Shape:  (500, 16)


In [5]:
testFeatures.head(5)

Unnamed: 0,idnum,age,workerclass,interestincome,traveltimetowork,vehicleoccupancy,meansoftransport,marital,schoolenrollment,educationalattain,sex,workarrivaltime,hoursworkperweek,ancestry,degreefield,industryworkedin
0,1405,62,1,20,?,?,?,2,1,18,2,?,?,32,?,4970
1,1406,61,1,0,30,1,1,2,1,18,2,112,40,51,?,7070
2,1407,38,4,0,15,1,1,1,1,24,1,101,40,148,3700,7870
3,1408,19,?,0,?,?,?,5,3,15,2,?,?,929,?,?
4,1410,37,1,0,35,?,2,1,1,21,2,119,42,720,6109,7970


In [6]:
trainingFeatures = trainingFeatures.replace('?', 0)
testFeatures = testFeatures.replace('?', 0)

In [7]:
bins = [0,12,35,55,75,95]
trainingFeatures['age'] = np.searchsorted(bins, trainingFeatures['age'].values)
testFeatures['age'] = np.searchsorted(bins, testFeatures['age'].values)

In [8]:
bins1 = [0,15,19,22,24]
trainingFeatures['educationalattain'] = np.searchsorted(bins1, trainingFeatures['educationalattain'].values)
testFeatures['educationalattain'] = np.searchsorted(bins1, testFeatures['educationalattain'].values)

In [9]:
trainingFeatures['industryworkedin']=trainingFeatures['industryworkedin'].astype(int)
bins2 = [-1,0,290,490,690,770,3990,4590,5790,6390,6780,7190,7790,7890,8290,8470,8690,9290,9590,9870,9920]
trainingFeatures['industryworkedin'] = pd.cut(trainingFeatures['industryworkedin'], bins=bins2)

testFeatures['industryworkedin']=testFeatures['industryworkedin'].astype(int)
testFeatures['industryworkedin'] = pd.cut(testFeatures['industryworkedin'], bins=bins2)

In [10]:
print ("Training Data Shape: ",trainingFeatures.shape)
print ("Testing Data Shape: ",testFeatures.shape)

Training Data Shape:  (1184, 16)
Testing Data Shape:  (500, 16)


In [11]:
def cleanData(df,unnecessaryColumns):
    """ DATA CLEANING """
    # Remove duplicates if any from the data. 
    df.drop_duplicates(inplace=True)
    
    df.drop(labels=unnecessaryColumns, axis=1, inplace=True)

    return df

In [12]:
unnecessaryColumns = ['idnum','traveltimetowork', 'vehicleoccupancy', 'marital', 'workarrivaltime', 'ancestry']
trainingFeatures = cleanData(trainingFeatures, unnecessaryColumns)
testFeatures = cleanData(testFeatures, unnecessaryColumns)


In [13]:
def oneHotEncode(df, encodeColumns, prefix=None):
    encodedLabels = pd.get_dummies(df[encodeColumns], prefix=prefix)
    df = df.drop(encodeColumns, axis=1)
    return df.join(encodedLabels)

In [14]:
encodeColumns = ["workerclass","meansoftransport","schoolenrollment","educationalattain","sex","degreefield","industryworkedin"]
trainingFeatures = oneHotEncode(trainingFeatures, encodeColumns)
testFeatures = oneHotEncode(testFeatures, encodeColumns)

In [15]:
min_max_scaler = MinMaxScaler()
trainingFeatures = min_max_scaler.fit_transform(trainingFeatures)
trainingFeatures = pd.DataFrame(trainingFeatures)
pca = PCA()
pca.fit(trainingFeatures)
trainingFeatures = pd.DataFrame(pca.transform(trainingFeatures))

In [16]:
min_max_scaler = MinMaxScaler()
testFeatures = min_max_scaler.fit_transform(testFeatures)
testFeatures = pd.DataFrame(testFeatures)
pca = PCA()
pca.fit(testFeatures)
testFeatures = pd.DataFrame(pca.transform(testFeatures))


In [17]:
print ("Training Data Shape: ",trainingFeatures.shape)
print ("Testing Data Shape: ",testFeatures.shape)

Training Data Shape:  (1184, 143)
Testing Data Shape:  (500, 110)


In [18]:
feature_difference = set(trainingFeatures)-set(testFeatures)

In [19]:
feature_difference_df = pd.DataFrame(data=np.zeros((testFeatures.shape[0], len(feature_difference))),
                                     columns=list(feature_difference))

In [20]:
testFeatures = testFeatures.join(feature_difference_df)

In [21]:
print ("Training Data Shape: ",trainingFeatures.shape)
print ("Testing Data Shape: ",testFeatures.shape)

Training Data Shape:  (1184, 143)
Testing Data Shape:  (500, 143)


In [107]:
trainX, testX, trainy, testy = train_test_split(trainingFeatures, trainingLabels, test_size=0.25)
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(max_depth=8,n_estimators=65,random_state=2)

In [108]:
rf.fit(trainX, trainy)
predictionsTrain = rf.predict(trainX)
predictionsTest = rf.predict(testX)
sqrt(mean_squared_error(trainy, predictionsTrain)), sqrt(mean_squared_error(testy, predictionsTest))

(34532.263733360494, 39628.08915219467)

In [22]:
def linearFitAndPredict(trainX, trainy, testX, testy):
    regr = linear_model.LinearRegression()
    regr.fit(trainX, trainy)
    predictionsTrain = regr.predict(trainX)
    predictionsTest = regr.predict(testX)
    
    return sqrt(mean_squared_error(trainy, predictionsTrain)), sqrt(mean_squared_error(testy, predictionsTest))

def polyFitAndPredict(trainX, trainy, testX, testy, alpha, degree):
    model = make_pipeline(PolynomialFeatures(degree), linear_model.ElasticNet(alpha=alpha, l1_ratio=0.2))
    model.fit(trainX, trainy)
    predictionsTrain = model.predict(trainX)
    predictionsTest = model.predict(testX)

    
    print("Training Mean squared error for alpha="+str(alpha)+" : %.2f" % sqrt(mean_squared_error(trainy, predictionsTrain)))
    print("Test Mean squared error for alpha="+str(alpha)+" : %.2f "  % sqrt(mean_squared_error(testy, predictionsTest)))
    
    return sqrt(mean_squared_error(trainy, predictionsTrain)), sqrt(mean_squared_error(testy, predictionsTest)), model



In [23]:
kf = KFold(n_splits=5)
kf.get_n_splits(trainingFeatures)
predictionsTrainErr = []
predictionsTestErr = []
for train_index, test_index in kf.split(trainingFeatures):
    X_train, X_test = trainingFeatures.iloc[train_index], trainingFeatures.iloc[test_index]
    y_train, y_test = trainingLabels.iloc[train_index], trainingLabels.iloc[test_index]
    train_err, test_err = linearFitAndPredict(X_train, y_train, X_test, y_test)
    predictionsTrainErr.append(train_err)
    predictionsTestErr.append(test_err)
    
print ("Root Mean Training Error for LinReg after 5 folds: ", np.mean(predictionsTrainErr))
print ("Root Mean Test Error for LinReg after 5 folds: ", np.mean(predictionsTestErr))

Root Mean Training Error for LinReg after 5 folds:  56408.46233610921
Root Mean Test Error for LinReg after 5 folds:  91449.91932904028


In [111]:
predictionsTrainErr = []
predictionsTestErr = []
model = {}
for train_index, test_index in kf.split(trainingFeatures):
    X_train, X_test = trainingFeatures.iloc[train_index], trainingFeatures.iloc[test_index]
    y_train, y_test = trainingLabels.iloc[train_index], trainingLabels.iloc[test_index]
    train_err, test_err, model = polyFitAndPredict(trainX, trainy, testX, testy, 0.2, 2)
    predictionsTrainErr.append(train_err)
    predictionsTestErr.append(test_err)

print ("Root Mean Training Error for PolyReg after 5 folds: ", np.mean(predictionsTrainErr))
print ("Root Mean Test Error for PolyReg after 5 folds: ", np.mean(predictionsTestErr))

Training Mean squared error for alpha=0.2 : 63707.26
Test Mean squared error for alpha=0.2 : 30575.12 
Training Mean squared error for alpha=0.2 : 63707.26
Test Mean squared error for alpha=0.2 : 30575.12 
Training Mean squared error for alpha=0.2 : 63707.26
Test Mean squared error for alpha=0.2 : 30575.12 
Training Mean squared error for alpha=0.2 : 63707.26
Test Mean squared error for alpha=0.2 : 30575.12 
Training Mean squared error for alpha=0.2 : 63707.26
Test Mean squared error for alpha=0.2 : 30575.12 
Root Mean Training Error for PolyReg after 5 folds:  63707.2621743
Root Mean Test Error for PolyReg after 5 folds:  30575.1183424


In [124]:
testPredictions = model.predict(testFeatures)

In [127]:
toFile = "ID,WAGES\n"
indices = testData["idnum"].values
# print testPredictions
for i, prediction in enumerate(testPredictions):
    toFile += str(indices[i])+","+str(prediction)+"\n"
#     
with open("Output.csv", "w") as text_file:
    text_file.write(toFile)