In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas

class one_hot(BaseEstimator,TransformerMixin):
    def __init__(self):
        """does category to onehot encoding on dataframe"""
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        return pandas.get_dummies(X)
    
class make_sure_columns_match(BaseEstimator,TransformerMixin):
    def __init__(self,X,testX):
        """Make sure that train and test set have the same columns"""
        self.X = X
        self.testX = testX
        self.overlap = set(X).intersection(testX)
    def fit(self,X,y=None):
        return fit
    def transform(self,X,y=None):
        return X[list(self.overlap)].copy()

class remove_outliers_in_these_columns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_remove_outliers_from, absoute_z_score_limit = 3 , return_outliers = False):
        """fit this on the training set and remove outliers from both train and testing set"""
        self.abs_z_score_lim = absoute_z_score_limit
        self.check_these_columns = columns_to_remove_outliers_from
        self.mean_and_std = []
    def fit(self,X,y=None):
        columns = X.columns.values.tolist()
        for col in self.check_these_columns:
            if col in columns:
                self.mean_and_std[col] = [X[col].mean(),X[col].std()]
            else:
                print("This column ( {} ), is not in the database.".format(col))
        return self
    def transform(self,X,y=None):
        outliers = []
        tempX = X.copy()
        columns = X.columns.values.tolist()
        for col in self.check_these_columns:
            if col in columns:
                mu,sigma = self.mean_and_std[col]
                outlier_indicies = (tempX[col] - mu)/sigma > self.abs_z_score_lim
                if return_outliers: 
                    outliers.append(tempX[outlier_indicies])
                tempX.drop(outlier_indicies, axis = 0,inplace = True)
        if return_outliers:
            return outliers, tempX
        else:
            return tempX
        
class RemoveColumnsWithLowCorrelation(BaseEstimator,TransformerMixin):
    def __init__(self, label_column, correlation_cutoff = .05):
        self.correlation_cutoff = correlation_cutoff
        self.label_column = label_column
        self.important_correlations = []
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        if len(self.important_correlations) == 0:
            correlation_matrix = X.corr()
            correlation_matrix[self.label_column].sort_values(ascending=False)
            self.important_correlations = X.columns[abs(correlation_matrix[self.label_column]) > self.correlation_cutoff]
        return X[self.important_correlations].copy()
        
class filter_datapoints(BaseEstimator,TransformerMixin):
    def __init__(self, column, conditional_func):
        self.column = column
        self.conditional_func = conditional_func
    def fit(self, X, y =None):
        return self
    def transform(self,X,y=None):
        return X[X[self.column].apply(self.conditional_func)]
        
            
class split_label(BaseEstimator,TransformerMixin):
    def __init__(self, label):
        self.label = label
    def fit(self, X,y=None):
        return self
    def transform(self,X,y=None):
        return X[self.label], X.drop(self.label, axis=1)

In [2]:
import pandas as pd

# data from https://archive.ics.uci.edu/ml/datasets/Census-Income+(KDD)

header_info = ["age",
              "class_of_worker",
              "industry code",
              "occupation code",
              "level of education",
              "wage per hour",
              "entrolled in education as of last week",
              "marital status",
              "major industry code",
              "major occupation code",
              "race",
              "hispanic origin",
              "sex",
              "member of a labor union",
              "reason for unemployment",
              "full or part time employment status",
              "captial gains",
              "capital losses",
              "dividends from stocks",
              "tax filler status",
              "region of previous residence",
              "state of previous residence",
              "detailed household and family status",
              "detailed household summary",
              "migration code",
              "migration code - change in region",
              "migration code - move within region",
              "live in this house one year ago",
              "migration - previous resident in sunbelt",
              "number of persons that worked for employer",
              "family members under 18",
              "country of birth father",
              "country of birth mother",
              "country of birth",
              "citizenship",
              "own business or self-employed",
              "fill included questionaire for veterans administration",
              "veterans benefits",
              "weeks worked in the year",
              "year of survey",
              "income less than or greater than 50,000",
              "number of years of education"]


my_census = pd.read_csv("./census-income.data", names=header_info)

test_data = pd.read_csv("./census-income.test", names=header_info)

#drop data if age < 18
#census = census.drop(census[census['age'] <= 18].index)



In [5]:
#actually... just use sklearn's data splitting

import sklearn

from sklearn.model_selection import train_test_split

import pandas

my_census = pandas.get_dummies(my_census)
my_test = pandas.get_dummies(test_data)

filter_non_workers = lambda x : x > 0 & x < 1500
filter_label = 'wage per hour'
filt = filter_datapoints(column= filter_label, conditional_func=filter_non_workers)

my_census = filt.transform(my_census)
my_test = filt.transform(my_test)

remove_outliers_from_these_columns = ["dividends from stocks", "capital losses", "wage per hour"]

"""

ADD OUTLIER REMOVER CODE HERE!!!!


"""


make_match = make_sure_columns_match(my_census,my_test)

my_census = make_match.transform(my_census)
my_test = make_match.transform(my_test)



In [19]:
#my_census.shape
import numpy as np


for Col in my_census.columns.values.tolist():
    if my_census[Col].std() > 1:
        list_of_outliers_for_col = my_census[((my_census[Col] - my_census[Col].mean())/ my_census[Col].std()).abs() > 3]
        if len(list_of_outliers_for_col) >= 10:
            print("some of the outliers in this column ( {} ) include: \n".format(Col))
            print(list_of_outliers_for_col[Col][:10])
            print("This column has a total of {} outliers".format(len(list_of_outliers_for_col)))
            print("\n\n")


some of the outliers in this column ( migration code ) include: 

1543    4856.38
3088    5204.51
3555    6525.17
4822    6548.73
6515    4877.97
7191    6133.56
7583    6844.27
7664    4966.00
7885    6254.23
8489    5295.64
Name: migration code, dtype: float64
This column has a total of 110 outliers



some of the outliers in this column ( dividends from stocks ) include: 

9190      5000
11971     8000
23026    10000
25262    40000
25913     4000
27057     4000
27816     6000
28311     5000
33869    10000
36966     4600
Name: dividends from stocks, dtype: int64
This column has a total of 78 outliers



some of the outliers in this column ( capital losses ) include: 

429     2205
1230    1602
2941    2001
4042    1887
4102    1672
4434    1887
4685    2206
5185    2129
5361    1887
5596    1602
Name: capital losses, dtype: int64
This column has a total of 249 outliers



some of the outliers in this column ( year of survey ) include: 

230     0
2235    2
2387    0
2430    0
2708   

In [None]:
#this is the column that we want to predict
label = "wage per hour"

remove_low_corr = RemoveColumnsWithLowCorrelation(label_column=label)

my_low_corr_census = remove_low_corr.transform(my_census)
my_low_corr_test = remove_low_corr.transform(my_test)

split = split_label(label=label)

test_label, test = split.transform(my_low_corr_test)
census_label, census = split.transform(my_low_corr_census)

In [None]:


my_low_corr_census.shape



In [None]:
#print("The train set is {} many examples. Whereas the test set contains {} examples.".format(len(train_set),len(test_set)))

#Note that we could do stratified sampling here if we knew which categories were most important

#census = train_set.copy()

#drop anything with the word code
#bad_feature = [x for x in header_info if "code" in x]

#census = census.drop(axis=1, labels=bad_feature)
#census = census.drop(axis=1,labels=['number of persons that worked for employer','family members under 18','year of survey','fill included questionaire for veterans administration'])

#drop data if age < 18
#census = census.drop(census[census['age'] <= 18].index)



#test.head()

In [None]:
from sklearn.externals import joblib

#def check_if_model_exists_and_load_or_run_and_save(model_name):
    

In [None]:
import numpy as np

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(census,census_label)



In [None]:
from sklearn.metrics import mean_squared_error
census_predictions = lin_reg.predict(test)

lin_mse = mean_squared_error(test_label, census_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

ziped = zip(lin_reg.coef_, census.columns)

zipped = sorted(ziped, reverse=True)
zipped



In [None]:
# let's go back to the drawing board and filter out "outliers" ... datapoints where "major occupation code_ Not in universe" = 1

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(census,census_label)

In [None]:
census_predictions2 = tree_reg.predict(test)
tree_mse = mean_squared_error(test_label, census_predictions2)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, census, census_label, scoring = "neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("scores: ", scores)
    print("mean: ", np.sqrt(-scores.mean()))
    print("std: " , scores.std())

    

In [None]:
display_scores(scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(census, census_label)
forest_predict = forest_reg.predict(test)
forest_rmse = np.sqrt(mean_squared_error(test_label,forest_predict))
forest_rmse

In [None]:
#Now we will attempt a grid search

from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators':[10,20], 'max_features':[10,20]}]
              
forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(census, census_label)

my_pred = grid_search.predict(test)
grid_search_rmse = np.sqrt(mean_squared_error(my_pred,test_label))

grid_search_rmse

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

#feature_importances

column_names = census.columns

important = sorted(zip(feature_importances,column_names),reverse=True)

num_important_columns = 150
my_important_column_list = []
for i in important[:num_important_columns]:
    my_important_column_list.append(i[1])

In [None]:
my_important_column_list

## What does this mean?

Since being a member of a labor union *or* not being in a labor union *or* the status of being in a labor union unknown is the best predictor of wage -- I believe trying to filter the data to only include those greater than 18 years of age will produce more significant results.

Before we do that, let's try to use a neural network model on the data!

In [None]:
import tflearn
import tensorflow as tf


tf.logging.set_verbosity(tf.logging.INFO)

X, Y, testX, testY = census.as_matrix(), census_label.as_matrix(), test.as_matrix(), test_label.as_matrix()


Y= np.reshape(Y,(Y.shape[0],1))
testY = np.reshape(testY, (testY.shape[0],1))

print("The shape of X is: {}, while Y has the shape: {}".format(X.shape, Y.shape))

tf.reset_default_graph()




input_layer = tflearn.input_data(shape=[None,X.shape[1]])
dense1 = tflearn.fully_connected(incoming=input_layer, n_units=40, name="first_layer")
dropout1 = tflearn.dropout(dense1, .8,name="dropout1")
dense2 = tflearn.fully_connected(dropout1,X.shape[1], name="layer2")
dropout2 = tflearn.dropout(dense2, .8, name="dropout2")
dense3 = tflearn.fully_connected(dropout2, X.shape[1], activation="relu")
dropout3 = tflearn.dropout(dense3, .8, name="dropout3")
dense4 = tflearn.fully_connected(dropout3, X.shape[1], activation="relu")
last = tflearn.fully_connected(dense4, 1, activation="linear")

sgd = tflearn.SGD(learning_rate=.1, lr_decay=.96, decay_step=1000)
r2 = tflearn.metrics.R2()
net = tflearn.regression(last, optimizer=sgd, loss = 'mean_square',metric=r2)

model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir="/tmp/tensorboard")
model.fit(X, Y, n_epoch=2 ,run_id="dense_model" , validation_set = (testX,testY), show_metric=True, snapshot_epoch=True )

In [None]:
np.sqrt(mean_squared_error(model.predict(testX),testY))

