In [38]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sb

In [2]:
class ModifiedLabelEncoder(LabelEncoder):

    def fit_transform(self, y, *args, **kwargs):
        return super().fit_transform(y).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        return super().transform(y).reshape(-1, 1)

In [3]:
%matplotlib inline

In [4]:
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                'class']
columns_num = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
columns_cat = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] 
# dropped education as education-num is present

columns_corr_cat = ['marital-status', 'relationship', 'sex']
len(columns_num), len(columns_cat), len(column_names), len(columns_num) + len(columns_cat) 

(6, 7, 15, 13)

In [5]:
def load_data(directory, file_name, names=None, header=None, skiprows=0):
    return pd.read_csv(os.path.join(directory, file_name), names=names, header=header, skiprows=skiprows, skipinitialspace=True)

In [6]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [33]:
def scoring_using_cross_validation(m, x, y, s='accuracy'):
    print("************* Start of Cross Validation {} {} *************".format(s, m))
    scores_ = cross_val_score(m, x, y, scoring="accuracy", cv=10)
    rmse_scores_ = np.sqrt(scores_)
    display_scores(rmse_scores_)
    print("************* End of Cross Validation {} {}*************".format(s, m))


def print_scores(Y_test, Y_predictions):
    print("************* {} *************".format("start"))
    mse_ = mean_squared_error(Y_test, Y_predictions)
    print("mse ", mse_)
    acc_ = accuracy_score(Y_test, Y_predictions)
    print("acc ", acc_)
    precision_ = precision_score(Y_test, Y_predictions)
    print("precision_ ", precision_)
    recall_ = recall_score(Y_test, Y_predictions)
    print("recall_ ", recall_)
    f1_score_ = f1_score(Y_test, Y_predictions)
    print("f1_score_ ", f1_score_)
    print("****************************************")


In [8]:
base_directory = '~/workspace/personal/datasets/income_predictions'

In [9]:
adult_data_df = load_data(base_directory, 'adult.data', column_names, None, 0)
adult_test_df = load_data(base_directory, 'adult.test', names=column_names, skiprows=1)

In [10]:
class ColumnDropperTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns

    def transform(self, X, y=None):
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None):
        return self


class ColumnUnknownValueTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns

    def transform(self, X, y=None):
        for c in self.columns:
            X[c].replace(['?'], 'unknown_{}'.format(c), inplace=True)
        return X

    def fit(self, X, y=None):
        return self



In [11]:
preprocess_pipeline = Pipeline([
    ('unknown_value_replacer', ColumnUnknownValueTransformer(['workclass', 'occupation', 'native-country'])),
    ('dropper', ColumnDropperTransformer(['education']))
])

adult_data_df = preprocess_pipeline.fit_transform(adult_data_df)
adult_test_df = preprocess_pipeline.fit_transform(adult_test_df)

In [12]:
def get_preprocessor(num_cols, cat_cols):
    return ColumnTransformer([
        ('one-hot-encoder', OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ('standard_scaler', StandardScaler(), num_cols)])

In [40]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

split = ShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
train_index, test_index = list(split.split(adult_data_df[columns_num]))[0]

train_data_set = adult_data_df.loc[train_index]
test_data_set = adult_data_df.loc[test_index]


X1 = train_data_set[columns_num].join(train_data_set[columns_cat])
X2 = train_data_set[columns_num].join(train_data_set[columns_corr_cat])
# print(X1.shape, len(columns_num + columns_cat))
# print(X2.shape, len(columns_num + columns_corr_cat))



X_train = train_data_set.drop(columns=['class'], axis=1)
Y_train = train_data_set['class']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, random_state=42)

lb = LabelEncoder()
Y_train = lb.fit_transform(Y_train)
Y_test = lb.transform(Y_test)

# Correlation matrix between class and num columns

In [41]:
preprocessor = get_preprocessor(columns_num, columns_cat)
model = make_pipeline(preprocessor, SGDClassifier())

In [42]:
model.fit(X_train, Y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['age', 'fnlwgt',
                                                   'education-num',
                                                   'capital-gain',
                                                   'capital-loss',
                                             

In [43]:
y_pred = model.predict(X_test)

In [44]:
print_scores(y_pred, Y_test)

************* start *************
mse  0.14856674856674856
acc  0.8514332514332514
precision_  0.6348005502063274
recall_  0.7105465742879138
f1_score_  0.6705412277515438
****************************************


In [49]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

preprocessor = ColumnTransformer([
    ('categorical', ordinal_encoder, columns_cat)],
    remainder="passthrough")

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
print_scores(y_pred, Y_test)

************* start *************
mse  0.12596232596232596
acc  0.8740376740376741
precision_  0.6464924346629987
recall_  0.7866108786610879
f1_score_  0.7097017742544357
****************************************
