In [1]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import mltools as ml
import data_loader
import warnings
warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

### Preprocess data (manually and via pipeline)

In [2]:
# load data (provided method)
train_data, valid_data = data_loader.load_train_data('Data/adult.data', valid_rate=0.1, is_df=True)
test_data = data_loader.load_test_data('Data/adult.test', is_df=True)

In [3]:
# drop unnecessary features (fnlwgt, education)
train_data = train_data.drop(columns = ['fnlwgt', 'education'])
valid_data = valid_data.drop(columns = ['fnlwgt', 'education'])
test_data = test_data.drop(columns = ['fnlwgt', 'education'])

In [4]:
# numerical processing pipeline
num_features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
num_transformer = Pipeline(steps = [('scaler', StandardScaler())]) # currently just the scaler (can add more later)

In [5]:
# categorical processing pipeline
cat_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                'native-country']
cat_transformer = Pipeline(steps = [('imputer', SimpleImputer(missing_values = ' ?',strategy = 'most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

In [6]:
# create preprocessor containing above pipelines
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features), 
                                               ('cat', cat_transformer, cat_features)])

In [7]:
# create a new pipeline containign the preprocessor and a classifier
full_pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression(solver='lbfgs'))])

In [8]:
# convert target values to binary categories
train_data = train_data.replace(to_replace = ' >50K', value = 1)
train_data = train_data.replace(to_replace = ' <=50K', value = 0)
valid_data = valid_data.replace(to_replace = ' >50K', value = 1)
valid_data = valid_data.replace(to_replace = ' <=50K', value = 0)
test_data = test_data.replace(to_replace = ' >50K', value = 1)
test_data = test_data.replace(to_replace = ' <=50K', value = 0)

### Train classifier, run on test data:

In [9]:
# remove target values
X = train_data.drop(columns = 'income')
Y = train_data['income']

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [11]:
full_pipe.fit(X_train,Y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['age', 'education-num', 'capital...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [12]:
print('Model Score: ' + str(full_pipe.score(X_test, Y_test)))

Model Score: 0.8574108818011257


### Model Selection using Grid Search

In [13]:
param_grid = {
    'classifier__C': [0.1, 1.0, 10, 100], # can add other hyperparemeters/preprocessing options here
}

GS = GridSearchCV(full_pipe, param_grid, cv=10, iid=False)
GS.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]), ['age', 'education-num', 'capital...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'classifier__C': [0.1, 1.0, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
print(('Highest Logistic Regression Score: ' + str(GS.score(X_test,Y_test)) + ' with:'))
print((GS.best_params_))

Highest Logistic Regression Score: 0.8574108818011257 with:
{'classifier__C': 1.0}
