In [15]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sb

In [2]:
class ModifiedLabelEncoder(LabelEncoder):
    """``LabelEncoder`` variant whose encoded output is reshaped to one column.

    Both encoding methods delegate to ``LabelEncoder`` and then reshape the
    result to ``(-1, 1)``. Any extra positional or keyword arguments are
    accepted and ignored; only ``y`` is forwarded to the parent class.
    """

    def fit_transform(self, y, *args, **kwargs):
        """Fit the encoder on ``y`` and return its codes with shape ``(-1, 1)``."""
        codes = super().fit_transform(y)
        return codes.reshape((-1, 1))

    def transform(self, y, *args, **kwargs):
        """Encode ``y`` with the already-fitted encoder, shaped ``(-1, 1)``."""
        codes = super().transform(y)
        return codes.reshape((-1, 1))

In [3]:
# IPython magic: select matplotlib's inline backend for this notebook.
%matplotlib inline

In [4]:
# Names assigned to the columns of the raw files (passed as `names=` when reading them).
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'class',
]

# Numeric feature columns.
columns_num = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# Categorical feature columns.
# 'education' is left out on purpose because 'education-num' is present.
columns_cat = [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country',
]

columns_corr_cat = ['marital-status', 'relationship', 'sex']

# Sanity check shown as the cell output:
# (#numeric, #categorical, #all columns, #numeric + #categorical).
(
    len(columns_num),
    len(columns_cat),
    len(column_names),
    len(columns_num) + len(columns_cat),
)

(6, 7, 15, 13)

In [5]:
def load_data(directory, file_name, names=None, header=None, skiprows=0):
    """Read ``file_name`` from ``directory`` into a DataFrame via ``pd.read_csv``.

    Parameters
    ----------
    directory : str
        Folder containing the file; joined with ``file_name`` using ``os.path.join``.
    file_name : str
        Name of the file inside ``directory``.
    names, header, skiprows
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file. ``skipinitialspace=True`` is always set, so spaces
        following a delimiter are not kept in the values.
    """
    csv_path = os.path.join(directory, file_name)
    read_options = {
        'names': names,
        'header': header,
        'skiprows': skiprows,
        'skipinitialspace': True,
    }
    return pd.read_csv(csv_path, **read_options)

In [6]:
def using_model(model, scoring, cv):
    """Optionally cross-validate ``model``, then fit it and print test-set metrics.

    Reads the notebook-level globals ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test``. Everything is reported through ``print``; nothing is returned.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place on
        the training data.
    scoring
        Forwarded to ``cross_val_score``. ``None`` skips cross-validation.
    cv
        Forwarded to ``cross_val_score``.
    """
    banner = "****************** {} ******************"
    print(banner.format(model))

    if scoring is not None:
        cv_scores = cross_val_score(model, X_train, Y_train, scoring=scoring, cv=cv)
        summary = "scoring={} cv={}".format(scoring, cv)
        print(summary, cv_scores.mean(), cv_scores.std())

    # Final fit on the full training set, evaluated on the held-out test set.
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)

    reports = (
        ("accuracy_score=", accuracy_score),
        ("precision_score=", precision_score),
        ("recall_score=", recall_score),
    )
    for label, metric in reports:
        print(label, metric(Y_test, Y_predict))

In [7]:
# Directory that holds 'adult.data' and 'adult.test'.
# Set the INCOME_DATA_DIR environment variable to point the notebook at another
# location; when it is unset the original path is used.
base_directory = os.environ.get('INCOME_DATA_DIR', '~/workspace/personal/datasets/income_predictions')

In [8]:
# Training file: no header row, no lines skipped, columns named from `column_names`.
adult_data_df = load_data(base_directory, 'adult.data', names=column_names, header=None, skiprows=0)
# Test file: same column names, but its first line is skipped
# (presumably a non-data line - confirm against the file).
adult_test_df = load_data(base_directory, 'adult.test', names=column_names, header=None, skiprows=1)

In [9]:
# Show only the columns of the training frame whose dtype is 'object'.
adult_data_df.select_dtypes(include='object')

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,class
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [10]:
class ColumnDropperTransformer(TransformerMixin, BaseEstimator):
    """Transformer that removes a fixed list of columns from its input.

    ``X`` is expected to offer a pandas-style ``drop`` method (e.g. a DataFrame).
    """

    def __init__(self, columns):
        # Column labels to remove when `transform` is called.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the result of dropping the configured columns from ``X``."""
        remaining = X.drop(columns=self.columns)
        return remaining


class ColumnUnknownValueTransformer(TransformerMixin, BaseEstimator):
    """Replace the ``'?'`` placeholder in selected columns with ``'unknown_<column>'``.

    Only values that are exactly ``'?'`` are replaced. For a column named
    ``'workclass'`` they become ``'unknown_workclass'``.
    """

    def __init__(self, columns):
        # Labels of the columns in which '?' should be replaced.
        self.columns = columns

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``'?'`` replaced in the configured columns.

        The input frame is left untouched. The previous implementation called
        ``X[c].replace(..., inplace=True)``, which modified the caller's frame
        on older pandas and has no effect on ``X`` under pandas Copy-on-Write,
        because ``X[c]`` is then a separate object.
        """
        replaced = X.copy()
        for c in self.columns:
            replaced[c] = replaced[c].replace('?', 'unknown_{}'.format(c))
        return replaced

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self



In [11]:
preprocess_pipeline = Pipeline([
        ('dropper', ColumnDropperTransformer(['education'])),
        ('unknown_value_replacer', ColumnUnknownValueTransformer(['workclass', 'occupation', 'native-country']))
    ])

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
ct = ColumnTransformer(transformers=[
    ("norm2", num_pipeline, columns_num),
    ('onehot', one_hot_encoder, columns_cat)
])
    
X_train = adult_data_df.drop(labels='class', axis=1)
Y_train = adult_data_df['class']

X_test = adult_test_df.drop(labels='class', axis=1)
Y_test = adult_test_df['class'].apply(lambda x: x.replace('.', ''))

label_encoder = LabelEncoder()
label_encoder.fit(Y_train)
X_train = ct.fit_transform(X_train)
Y_train = label_encoder.transform(Y_train)

X_test = ct.transform(X_test)
Y_test = label_encoder.transform(Y_test.apply(lambda x: x.replace('.', '')))

In [12]:
# Linear classifier trained with SGD, default hyper-parameters, 10-fold CV on accuracy.
using_model(model=SGDClassifier(), scoring="accuracy", cv=10)

****************** SGDClassifier() ******************
scoring=accuracy cv=10 0.8475477312520603 0.004499878516152031
accuracy_score= 0.8453411952582766
precision_score= 0.6734587251828631
recall_score= 0.6703068122724909


In [13]:
# Random forest with default hyper-parameters, 10-fold CV on accuracy.
using_model(model=RandomForestClassifier(), scoring="accuracy", cv=10)

****************** RandomForestClassifier() ******************
scoring=accuracy cv=10 0.8555022672769065 0.005605468598139749
accuracy_score= 0.8514219028315214
precision_score= 0.7235976183014728
recall_score= 0.6003640145605824


In [16]:
# Logistic regression with the newton-cg solver.
lg_newton_cg = LogisticRegression(solver='newton-cg', max_iter=500)
# The same estimator object is passed twice, so using_model re-fits it on each call;
# only the cross-validation scoring differs between the two reports.
using_model(model=lg_newton_cg, scoring="neg_mean_squared_error", cv=10)
using_model(model=lg_newton_cg, scoring="accuracy", cv=10)

****************** LogisticRegression(max_iter=500, solver='newton-cg') ******************
scoring=neg_mean_squared_error cv=10 -0.14809123083225018 0.004578315099411212
accuracy_score= 0.8530802776242246
precision_score= 0.7296272899557802
recall_score= 0.6006240249609984
****************** LogisticRegression(max_iter=500, solver='newton-cg') ******************
scoring=accuracy cv=10 0.8519087691677498 0.004578315099411194
accuracy_score= 0.8530802776242246
precision_score= 0.7296272899557802
recall_score= 0.6006240249609984


In [17]:
# Logistic regression with the default solver and a raised iteration cap.
lg_lgbfs = LogisticRegression(max_iter=1000)
# The same estimator object is evaluated under two cross-validation scorings.
using_model(model=lg_lgbfs, scoring="neg_mean_squared_error", cv=10)
using_model(model=lg_lgbfs, scoring="accuracy", cv=10)

****************** LogisticRegression(max_iter=1000) ******************
scoring=neg_mean_squared_error cv=10 -0.14812194336296272 0.004585036870064261
accuracy_score= 0.8530802776242246
precision_score= 0.7296272899557802
recall_score= 0.6006240249609984
****************** LogisticRegression(max_iter=1000) ******************
scoring=accuracy cv=10 0.8518780566370372 0.004585036870064243
accuracy_score= 0.8530802776242246
precision_score= 0.7296272899557802
recall_score= 0.6006240249609984


In [18]:
# Decision tree with default hyper-parameters; a fresh estimator is created for each report.
using_model(model=DecisionTreeClassifier(), scoring="neg_mean_squared_error", cv=10)
using_model(model=DecisionTreeClassifier(), scoring="accuracy", cv=10)

****************** DecisionTreeClassifier() ******************
scoring=neg_mean_squared_error cv=10 -0.18073746283755496 0.007470983026698642
accuracy_score= 0.8119894355383576
precision_score= 0.5987421383647799
recall_score= 0.6188247529901196
****************** DecisionTreeClassifier() ******************
scoring=accuracy cv=10 0.8182492405320161 0.010116677919342355
accuracy_score= 0.8125422271359253
precision_score= 0.5996485943775101
recall_score= 0.6211648465938637
