In [62]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas

class one_hot(BaseEstimator,TransformerMixin):
    def __init__(self):
        """does category to onehot encoding on dataframe"""
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return pandas.get_dummies(X)
    
class make_sure_columns_match(BaseEstimator,TransformerMixin):
    def __init__(self,X,testX):
        """Make sure that train and test set have the same columns"""
        self.X = X
        self.testX = testX
        self.overlap = set(X).intersection(testX)
    def fit(self,X,y=None):
        return fit
    def transform(self,X,y=None):
        return X[list(self.overlap)].copy()

    
        
class RemoveColumnsWithLowCorrelation(BaseEstimator,TransformerMixin):
    def __init__(self, label_column, correlation_cutoff = .05):
        self.correlation_cutoff = correlation_cutoff
        self.label_column = label_column
        self.important_correlations = []
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        if len(self.important_correlations) == 0:
            correlation_matrix = X.corr()
            correlation_matrix[self.label_column].sort_values(ascending=False)
            self.important_correlations = X.columns[abs(correlation_matrix[self.label_column]) > self.correlation_cutoff]
        return X[self.important_correlations].copy()
        
class filter_datapoints(BaseEstimator,TransformerMixin):
    def __init__(self, column, conditional_func):
        self.column = column
        self.conditional_func = conditional_func
    def fit(self, X, y =None):
        return self
    def transform(self,X,y=None):
        return X[X[self.column].apply(self.conditional_func)]
        
            
class split_label(BaseEstimator,TransformerMixin):
    def __init__(self, label):
        self.label = label
    def fit(self, X,y=None):
        return self
    def transform(self,X,y=None):
        return X[self.label], X.drop(self.label, axis=1)

In [63]:
import pandas as pd

# data from https://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)

header_info = ["age",
              "class_of_worker",
              "industry code",
              "occupation code",
              "level of education",
              "wage per hour",
              "entrolled in education as of last week",
              "marital status",
              "major industry code",
              "major occupation code",
              "race",
              "hispanic origin",
              "sex",
              "member of a labor union",
              "reason for unemployment",
              "full or part time employment status",
              "captial gains",
              "capital losses",
              "dividends from stocks",
              "tax filler status",
              "region of previous residence",
              "state of previous residence",
              "detailed household and family status",
              "detailed household summary",
              "migration code",
              "migration code - change in region",
              "migration code - move within region",
              "live in this house one year ago",
              "migration - previous resident in sunbelt",
              "number of persons that worked for employer",
              "family members under 18",
              "country of birth father",
              "country of birth mother",
              "country of birth",
              "citizenship",
              "own business or self-employed",
              "fill included questionaire for veterans administration",
              "veterans benefits",
              "weeks worked in the year",
              "year of survey",
              "income less than or greater than 50,000",
              "number of years of education"]


my_census = pd.read_csv("./census-income.data", names=header_info)

test_data = pd.read_csv("./census-income.test", names=header_info)

#drop data if age < 18
#census = census.drop(census[census['age'] <= 18].index)



In [64]:
#actually... just use sklearn's data splitting

import sklearn

from sklearn.model_selection import train_test_split

import pandas

my_census = pandas.get_dummies(my_census)
my_test = pandas.get_dummies(test_data)

filter_non_workers = lambda x : x == 0
filter_label = 'major occupation code_ Not in universe'
filt = filter_datapoints(column= filter_label, conditional_func=filter_non_workers)

my_census = filt.transform(my_census)
my_test = filt.transform(my_test)

filter_non_workers = lambda x : x == 0
filter_label = 'member of a labor union_ Not in universe'
filt = filter_datapoints(column= filter_label, conditional_func=filter_non_workers)

my_census = filt.transform(my_census)
my_test = filt.transform(my_test)


make_match = make_sure_columns_match(my_census,my_test)

my_census = make_match.transform(my_census)
my_test = make_match.transform(my_test)



In [65]:
my_census.shape


(19064, 410)

In [66]:
#this is the column that we want to predict
label = "wage per hour"

remove_low_corr = RemoveColumnsWithLowCorrelation(label_column=label)

my_low_corr_census = remove_low_corr.transform(my_census)
my_low_corr_test = remove_low_corr.transform(my_test)

split = split_label(label=label)

test_label, test = split.transform(my_low_corr_test)
census_label, census = split.transform(my_low_corr_census)

In [67]:
my_low_corr_census.shape

(19064, 29)

In [56]:
#print("The train set is {} many examples. Whereas the test set contains {} examples.".format(len(train_set),len(test_set)))

#Note that we could do stratified sampling here if we knew which categories were most important

#census = train_set.copy()

#drop anything with the word code
#bad_feature = [x for x in header_info if "code" in x]

#census = census.drop(axis=1, labels=bad_feature)
#census = census.drop(axis=1,labels=['number of persons that worked for employer','family members under 18','year of survey','fill included questionaire for veterans administration'])

#drop data if age < 18
#census = census.drop(census[census['age'] <= 18].index)



#test.head()

In [7]:
from sklearn.externals import joblib

#def check_if_model_exists_and_load_or_run_and_save(model_name):
    

In [68]:
import numpy as np

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(census,census_label)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [70]:
from sklearn.metrics import mean_squared_error
census_predictions = lin_reg.predict(test)

lin_mse = mean_squared_error(test_label, census_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

#ziped = zip(lin_reg.coef_, census.columns)

#zipped = sorted(ziped, reverse=True)
#zipped

636.62840974371932

In [None]:
# let's go back to the drawing board and filter out "outliers" ... datapoints where "major occupation code_ Not in universe" = 1

In [71]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(census,census_label)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [72]:
census_predictions2 = tree_reg.predict(test)
tree_mse = mean_squared_error(test_label, census_predictions2)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

810.00996590733746

In [73]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, census, census_label, scoring = "neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("scores: ", scores)
    print("mean: ", np.sqrt(-scores.mean()))
    print("std: " , scores.std())

    

In [74]:
display_scores(scores)

scores:  [-660015.02695504 -759717.73788194 -529704.19673523 -627727.77533221
 -619199.72919002 -612691.34915998 -665865.29007619 -722991.1005205
 -707960.77527305 -697010.20611485]
mean:  812.581268996
std:  62877.9224108


In [75]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(census, census_label)
forest_predict = forest_reg.predict(test)
forest_rmse = np.sqrt(mean_squared_error(test_label,forest_predict))
forest_rmse

699.0790859136921

In [76]:
#Now we will attempt a grid search

from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[10,20], 'max_features':[10,20]}]
              
forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(census, census_label)

my_pred = grid_search.predict(test)
grid_search_rmse = np.sqrt(mean_squared_error(my_pred,test_label))

grid_search_rmse

686.53389673966205

In [77]:
feature_importances = grid_search.best_estimator_.feature_importances_

#feature_importances

column_names = census.columns

important = sorted(zip(feature_importances,column_names),reverse=True)

num_important_columns = 150
my_important_column_list = []
for i in important[:num_important_columns]:
    my_important_column_list.append(i[1])

In [78]:
my_important_column_list

['industry code',
 'occupation code',
 'family members under 18',
 'level of education_ High school graduate',
 'level of education_ Bachelors degree(BA AB BS)',
 'member of a labor union_ No',
 'level of education_ Masters degree(MA MS MEng MEd MSW MBA)',
 'major industry code_ Hospital services',
 'number of years of education_ 50000+.',
 'member of a labor union_ Yes',
 'level of education_ Associates degree-occup /vocational',
 'sex_ Female',
 'sex_ Male',
 'number of years of education_ - 50000.',
 'major occupation code_ Precision production craft & repair',
 'major industry code_ Education',
 'class_of_worker_ Federal government',
 'major occupation code_ Professional specialty',
 'major occupation code_ Executive admin and managerial',
 'major industry code_ Manufacturing-durable goods',
 'major industry code_ Finance insurance and real estate',
 'major occupation code_ Sales',
 'major industry code_ Retail trade',
 'major occupation code_ Technicians and related support',
 'ma

## What does this mean?

Since being a member of a labor union *or* not being in a labor union *or* the status of being in a labor union unknown is the best predictor of wage -- I believe trying to filter the data to only include those greater than 18 years of age will produce more significant results.

Before we do that, let's try to use a neural network model on the data!

In [79]:
import tflearn
import tensorflow as tf


tf.logging.set_verbosity(tf.logging.INFO)

X, Y, testX, testY = census.as_matrix(), census_label.as_matrix(), test.as_matrix(), test_label.as_matrix()


Y= np.reshape(Y,(Y.shape[0],1))
testY = np.reshape(testY, (testY.shape[0],1))

print("The shape of X is: {}, while Y has the shape: {}".format(X.shape, Y.shape))

tf.reset_default_graph()




input_layer = tflearn.input_data(shape=[None,X.shape[1]])
dense1 = tflearn.fully_connected(incoming=input_layer, n_units=40, name="first_layer")
dropout1 = tflearn.dropout(dense1, .8,name="dropout1")
dense2 = tflearn.fully_connected(dropout1,40, name="layer2")
dropout2 = tflearn.dropout(dense2, .8, name="dropout2")
last = tflearn.fully_connected(dropout2, 1, activation="linear")

sgd = tflearn.SGD(learning_rate=.1, lr_decay=.96, decay_step=1000)
r2 = tflearn.metrics.R2()
net = tflearn.regression(last, optimizer=sgd, loss = 'mean_square',metric=r2)

model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir="/tmp/tensorboard")
model.fit(X, Y, n_epoch=5 ,run_id="dense_model" , validation_set = (testX,testY), show_metric=True, snapshot_epoch=True )

Training Step: 1489  | total loss: [1m[32m484909.21875[0m[0m | time: 2.138s
| SGD | epoch: 005 | loss: 484909.21875 - R2: 0.4811 -- iter: 19008/19064
Training Step: 1490  | total loss: [1m[32m468948.50000[0m[0m | time: 3.241s
| SGD | epoch: 005 | loss: 468948.50000 - R2: 0.5614 | val_loss: 449100.07721 - val_acc: 0.6541 -- iter: 19064/19064
--


In [80]:
np.sqrt(mean_squared_error(model.predict(testX),testY))

670.14929446703525