In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas

class RemoveColumnsWithLowCorrelation(BaseEstimator,TransformerMixin):
    def __init__(self, label_column, correlation_cutoff = .05, important_correlations = []):
        self.correlation_cutoff = correlation_cutoff
        self.label_column = label_column
        self.important_correlations = important_correlations
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        if self.important_correlations == []:
            correlation_matrix = X.corr()
            correlation_matrix[self.label_column].sort_values(ascending=False)
            self.important_correlations = X.columns[abs(correlation_matrix[self.label_column]) > self.correlation_cutoff]
        return X[self.important_correlations].copy()
        
            
        

In [9]:
import pandas as pd

# data from https://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)

header_info = ["age",
              "class_of_worker",
              "industry code",
              "occupation code",
              "level of education",
              "wage per hour",
              "entrolled in education as of last week",
              "marital status",
              "major industry code",
              "major occupation code",
              "race",
              "hispanic origin",
              "sex",
              "member of a labor union",
              "reason for unemployment",
              "full or part time employment status",
              "captial gains",
              "capital losses",
              "dividends from stocks",
              "tax filler status",
              "region of previous residence",
              "state of previous residence",
              "detailed household and family status",
              "detailed household summary",
              "migration code",
              "migration code - change in region",
              "migration code - move within region",
              "live in this house one year ago",
              "migration - previous resident in sunbelt",
              "number of persons that worked for employer",
              "family members under 18",
              "country of birth father",
              "country of birth mother",
              "country of birth",
              "citizenship",
              "own business or self-employed",
              "fill included questionaire for veterans administration",
              "veterans benefits",
              "weeks worked in the year",
              "year of survey",
              "income less than or greater than 50,000",
              "number of years of education"]


my_census = pd.read_csv("./census-income.data", names=header_info)

test_data = pd.read_csv("./census-income.test", names=header_info)

#drop data if age < 18
#census = census.drop(census[census['age'] <= 18].index)

remove_low_corr = RemoveColumnsWithLowCorrelation(label_column="wage per hour")

my_important_census = remove_low_corr.transform(my_census)

NameError: name 'important_correlations' is not defined

AttributeError: 'RemoveColumnsWithLowCorrelation' object has no attribute 'head'

In [2]:
# we have our data... now let's look at it

#my_census.describe()

"""%matplotlib inline
import matplotlib.pyplot as plt
my_census.hist(bins=10, figsize=(20,15))
plt.show()"""



'%matplotlib inline\nimport matplotlib.pyplot as plt\nmy_census.hist(bins=10, figsize=(20,15))\nplt.show()'

In [3]:
#actually... just use sklearn's data splitting

import sklearn

from sklearn.model_selection import train_test_split

import pandas

my_census = pandas.get_dummies(my_census)
my_test = pandas.get_dummies(test_data)

train_set, test_set = my_census, my_test#train_test_split(my_census,test_size=.2,random_state=84)

label = "wage per hour"

test_label = test_set[label]
test = test_set.drop(label,axis=1)

test.shape

(99762, 409)

In [4]:
print("The train set is {} many examples. Whereas the test set contains {} examples.".format(len(train_set),len(test_set)))

#Note that we could do stratified sampling here if we knew which categories were most important

census = train_set.copy()

#drop anything with the word code
#bad_feature = [x for x in header_info if "code" in x]

#census = census.drop(axis=1, labels=bad_feature)
#census = census.drop(axis=1,labels=['number of persons that worked for employer','family members under 18','year of survey','fill included questionaire for veterans administration'])

#drop data if age < 18
#census = census.drop(census[census['age'] <= 18].index)



#test.head()

The train set is 199523 many examples. Whereas the test set contains 99762 examples.


In [5]:
correlation_matrix = census.corr()

correlation_matrix['wage per hour'].sort_values(ascending=False)

important_correlations = census.columns[abs(correlation_matrix['wage per hour']) > .05]



# for this first project, let's try to predict wage per hour!

label = "wage per hour"

census_label = census[label]
census = census.drop(label,axis=1)

columns_census = set(census.columns)
columns_test = set(test.columns)

not_shared = [x for x in columns_census if x not in columns_test]

census = census.drop(not_shared, axis=1)    

In [6]:
important_correlations

Index(['industry code', 'occupation code', 'wage per hour',
       'family members under 18',
       'fill included questionaire for veterans administration',
       'weeks worked in the year', 'year of survey',
       'class_of_worker_ Federal government',
       'class_of_worker_ Not in universe', 'class_of_worker_ Private',
       'level of education_ Children',
       'level of education_ High school graduate',
       'marital status_ Married-civilian spouse present',
       'marital status_ Never married',
       'major industry code_ Hospital services',
       'major industry code_ Manufacturing-durable goods',
       'major industry code_ Manufacturing-nondurable goods',
       'major industry code_ Not in universe or children',
       'major industry code_ Transportation',
       'major occupation code_ Adm support including clerical',
       'major occupation code_ Machine operators assmblrs & inspctrs',
       'major occupation code_ Not in universe',
       'major occupation

In [7]:
from sklearn.externals import joblib

#def check_if_model_exists_and_load_or_run_and_save(model_name):
    

In [8]:
import numpy as np

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(census,census_label)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
from sklearn.metrics import mean_squared_error
census_predictions = lin_reg.predict(test)

lin_mse = mean_squared_error(test_label, census_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

ziped = zip(lin_reg.coef_, census.columns)

zipped = sorted(ziped, reverse=True)
zipped

[(395.59903872285952, 'member of a labor union_ Yes'),
 (98.076145369549792,
  'detailed household and family status_ Child 18+ ever marr RP of subfamily'),
 (90.471108555179427,
  'detailed household and family status_ Child 18+ ever marr Not in a subfamily'),
 (89.348843413732951,
  'detailed household and family status_ Child 18+ never marr RP of subfamily'),
 (87.263178887582441,
  'detailed household and family status_ Child 18+ never marr Not in a subfamily'),
 (74.666643421450644, 'citizenship_ Panama'),
 (74.234903055077396,
  'detailed household and family status_ Child 18+ spouse of subfamily RP'),
 (72.928409382868196, 'state of previous residence_ Virginia'),
 (67.644429591084744, 'citizenship_ Hong Kong'),
 (66.614242296511819, 'member of a labor union_ No'),
 (66.563942233515704, 'major industry code_ Construction'),
 (59.486548856579766, 'major industry code_ Hospital services'),
 (57.900749102468112, 'major industry code_ Mining'),
 (56.547310262824801,
  'detailed hous

In [10]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(census,census_label)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [11]:
census_predictions2 = tree_reg.predict(test)
tree_mse = mean_squared_error(test_label, census_predictions2)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

278.42256537012105

In [12]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, census, census_label, scoring = "neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("scores: ", scores)
    print("mean: ", np.sqrt(-scores.mean()))
    print("std: " , scores.std())

    

In [13]:
display_scores(scores)

scores:  [-88765.5793615  -81103.34766702 -89376.72881271 -70504.93263833
 -80138.19882719 -67220.39384523 -85173.49428629 -75209.43118484
 -70853.29746391 -73875.43243785]
mean:  279.682111785
std:  7474.93652051


In [14]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(census, census_label)
forest_predict = forest_reg.predict(test)
forest_rmse = np.sqrt(mean_squared_error(test_label,forest_predict))
forest_rmse

206.26381148721063

In [15]:
#Now we will attempt a grid search

from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[10,20], 'max_features':[10,20]}]
              
forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(census, census_label)

my_pred = grid_search.predict(test)
grid_search_rmse = np.sqrt(mean_squared_error(my_pred,test_label))

grid_search_rmse

197.75989428559592

In [16]:
feature_importances = grid_search.best_estimator_.feature_importances_

#feature_importances

column_names = census.columns

important = sorted(zip(feature_importances,column_names),reverse=True)

num_important_columns = 150
my_important_column_list = []
for i in important[:num_important_columns]:
    my_important_column_list.append(i[1])

In [17]:
my_important_column_list

['member of a labor union_ Not in universe',
 'member of a labor union_ No',
 'member of a labor union_ Yes',
 'migration code',
 'age',
 'occupation code',
 'industry code',
 'family members under 18',
 'dividends from stocks',
 'year of survey',
 'level of education_ High school graduate',
 'fill included questionaire for veterans administration',
 'level of education_ Some college but no degree',
 'major industry code_ Hospital services',
 'major industry code_ Not in universe or children',
 'level of education_ Bachelors degree(BA AB BS)',
 'major occupation code_ Professional specialty',
 'sex_ Male',
 'captial gains',
 'sex_ Female',
 'major occupation code_ Precision production craft & repair',
 'class_of_worker_ Private',
 'number of years of education_ - 50000.',
 'detailed household summary_ Householder',
 'race_ White',
 'number of years of education_ 50000+.',
 'capital losses',
 'detailed household and family status_ Nonfamily householder',
 'detailed household and family 

## What does this mean?

Since being a member of a labor union *or* not being in a labor union *or* the status of being in a labor union unknown is the best predictor of wage -- I believe trying to filter the data to only include those greater than 18 years of age will produce more significant results.

Before we do that, let's try to use a neural network model on the data!

In [19]:
import tflearn
import tensorflow as tf


tf.logging.set_verbosity(tf.logging.INFO)

X, Y, testX, testY = census.as_matrix(), census_label.as_matrix(), test.as_matrix(), test_label.as_matrix()


Y= np.reshape(Y,(Y.shape[0],1))
testY = np.reshape(testY, (testY.shape[0],1))

print("The shape of X is: {}, while Y has the shape: {}".format(X.shape, Y.shape))

tf.reset_default_graph()




input_layer = tflearn.input_data(shape=[None,X.shape[1]])
dense1 = tflearn.fully_connected(incoming=input_layer, n_units=40, name="first_layer")
dropout1 = tflearn.dropout(dense1, .8,name="dropout1")
dense2 = tflearn.fully_connected(dropout1,40, name="layer2")
dropout2 = tflearn.dropout(dense2, .8, name="dropout2")
last = tflearn.fully_connected(dropout2, 1, activation="linear")

sgd = tflearn.SGD(learning_rate=.1, lr_decay=.96, decay_step=1000)
r2 = tflearn.metrics.R2()
net = tflearn.regression(last, optimizer=sgd, loss = 'mean_square',metric=r2)

model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir="/tmp/tensorboard")
model.fit(X, Y, n_epoch=5 ,run_id="dense_model" , validation_set = (testX,testY), show_metric=True, snapshot_epoch=True )

Training Step: 15589  | total loss: [1m[32m62153.63281[0m[0m | time: 25.087s
| SGD | epoch: 005 | loss: 62153.63281 - R2: nan -- iter: 199488/199523
Training Step: 15590  | total loss: [1m[32m60864.66016[0m[0m | time: 27.889s
| SGD | epoch: 005 | loss: 60864.66016 - R2: nan | val_loss: 87957.14388 - val_acc: inf -- iter: 199523/199523
--


In [20]:
np.sqrt(mean_squared_error(model.predict(testX),testY))

296.57569665460949