In [1]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import mltools as ml
import data_loader
import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

### Preprocess data (manually and via pipeline)

In [2]:
# load data (provided method)
train_data, valid_data = data_loader.load_train_data('Data/adult.data', valid_rate=0.1, is_df=True)
test_data = data_loader.load_test_data('Data/adult.test', is_df=True)

In [3]:
# drop unnecessary features (fnlwgt, education)
train_data = train_data.drop(columns = ['fnlwgt', 'education'])
valid_data = valid_data.drop(columns = ['fnlwgt', 'education'])
test_data = test_data.drop(columns = ['fnlwgt', 'education'])

In [4]:
# numerical processing pipeline
num_features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
num_transformer = Pipeline(steps = [('scaler', StandardScaler())]) # currently just the scaler (can add more later)

In [5]:
# categorical processing pipeline
cat_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                'native-country']
cat_transformer = Pipeline(steps = [('imputer', SimpleImputer(missing_values = ' ?',strategy = 'most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

In [6]:
# create preprocessor containing above pipelines
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features), 
                                               ('cat', cat_transformer, cat_features)])

In [26]:
# create various pipelines, each with invoking the preprocessor and a different classifier
LogR_pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression())])
#LinR_pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LinearRegression())])

In [27]:
# convert target values to binary categories
train_data = train_data.replace(to_replace = ' >50K', value = 1)
train_data = train_data.replace(to_replace = ' <=50K', value = 0)
valid_data = valid_data.replace(to_replace = ' >50K', value = 1)
valid_data = valid_data.replace(to_replace = ' <=50K', value = 0)
test_data = test_data.replace(to_replace = ' >50K', value = 1)
test_data = test_data.replace(to_replace = ' <=50K', value = 0)

In [35]:
train_data.shape

(29315, 13)

### Train classifier, run on test data:

In [28]:
# remove target values
X = train_data.drop(columns = 'income')
Y = train_data['income']

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [30]:
LogR_pipe.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['age', 'education-num', 'capital...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [31]:
print('Model Score: ' + str(LogR_pipe.score(X_test, Y_test)))

Model Score: 0.8560463926317585


### Model Selection using Grid Search

In [32]:
# use parameter grid to pass a range of hyperparameters to the pipeline - automatically saves best result
# can add other hyperparemeters/preprocessing options here
param_grid = {
    'classifier__C': [0.1, 1.0, 10], # inverse of regularization strength (different from penalty)
    'classifier__solver': ['lbfgs', 'saga', 'liblinear', 'newton-cg'] # logistic regression algorithm
}

LogR_GS = GridSearchCV(LogR_pipe, param_grid, cv=10, iid=False)
LogR_GS.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['age', 'education-num', 'capital...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'classifier__C': [0.1, 1.0, 10], 'classifier__solver': ['lbfgs', 'saga', 'liblinear', 'newton-cg']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
# print best results (Logistic Regression)
print(('Highest Logistic Regression Score: ' + str(LogR_GS.score(X_test,Y_test)) + ' with:'))
print(LogR_GS.best_params_)
print(LogR_GS.best_estimator_)

Highest Logistic Regression Score: 0.855193586900904 with:
{'classifier__C': 0.1, 'classifier__solver': 'newton-cg'}
Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['age', 'education-num', 'capital...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])
