In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Loading and Cleaning Data
colnames = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
            'marital-status', 'occupation', 'relationship', 'race', 'sex', 
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
df = pd.read_csv('adult.csv', names = colnames)
df = df.replace([' ?', ' <=50K.', ' >50K.', ' <=50K', ' >50K'], [np.NaN, 1, 0, 1, 0])
df = df.dropna()
df = df.replace([])

X = df.drop(['income'], axis=1)
y = df['income']
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,1
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,1
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,1
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,1


In [20]:
# Transform X values into One Hot Encoding for categorical variables and Standardizing for numerical variables
cat = list(X.select_dtypes(['object']).columns)
cont = list(X.select_dtypes(['int64']).columns)

cont_transform = Pipeline(steps=[('scaler', StandardScaler())])

cat_transform = Pipeline(steps=[('categories', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('cont', cont_transform, cont),
                                               ('cat', cat_transform, cat)])
X = pd.DataFrame(preprocessor.fit_transform(X))
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0.034201,-1.062295,1.128753,0.142888,-0.21878,-0.078120,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.866417,-1.007438,1.128753,-0.146733,-0.21878,-2.326738,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.041455,0.245284,-0.438122,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.093385,0.425853,-1.221559,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.798015,1.407393,1.128753,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,-0.419735,0.525154,1.128753,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45218,0.034201,0.243135,1.128753,-0.146733,-0.21878,-0.411249,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45219,-0.041455,1.753613,1.128753,-0.146733,-0.21878,0.754701,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45220,0.412481,-1.001947,1.128753,0.579985,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# LOGISTIC REGRESSION

In [16]:
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

parameters = [{'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1)],
               'classifier__solver': ['saga'],
               'classifier__penalty': ['l1'],
               'classifier__C': np.logspace(-8,4,13)},
              {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1)],
               'classifier__solver': ['sag', 'saga'],
               'classifier__penalty': ['none']},
              {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1)],
               'classifier__solver': ['sag', 'saga'],
               'classifier__penalty': ['l2'],
               'classifier__C': np.logspace(-8,4,13)}]

clf = GridSearchCV(pipe, parameters, cv=KFold(n_splits=5), 
                   scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False)

In [5]:
num_trials = 5
train_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    hyperparams = clf.fit(X_train, y_train)
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)
    
    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    accuracy_model = LogisticRegression(penalty = best_accuracy['classifier__penalty'],
                                        C = best_accuracy['classifier__C'],
                                        solver = best_accuracy['classifier__solver'],
                                        max_iter = 5000,
                                        n_jobs = -1)
    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    accuracy_model.fit(X_test, y_test)
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = LogisticRegression(penalty = best_precision['classifier__penalty'],
                                         C = best_precision['classifier__C'],
                                         solver = best_precision['classifier__solver'],
                                         max_iter = 5000,
                                         n_jobs = -1)
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    precision_model.fit(X_test, y_test)
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = LogisticRegression(penalty = best_roc_auc['classifier__penalty'],
                                   C = best_roc_auc['classifier__C'],
                                   solver = best_roc_auc['classifier__solver'],
                                   max_iter = 5000,
                                   n_jobs = -1)
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    roc_model.fit(X_test, y_test)
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = LogisticRegression(penalty = best_f1['classifier__penalty'],
                                  C = best_f1['classifier__C'],
                                  solver = best_f1['classifier__solver'],
                                  max_iter = 5000,
                                  n_jobs = -1)
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    f1_model.fit(X_test, y_test)
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    train_metrics = train_metrics.append({'Accuracy': acc_train_score, 'Precision': prec_train_score, 
                                          'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics = test_metrics.append({'Accuracy': acc_test_score, 'Precision': prec_test_score, 
                                          'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)

In [7]:
train_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.8552,0.8552,0.853,0.8552
1,0.8552,0.8552,0.8518,0.8518
2,0.8488,0.8482,0.842,0.8488
3,0.8628,0.8628,0.855,0.8628
4,0.8508,0.8508,0.8466,0.8508


In [9]:
test_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.848715,0.848715,0.848615,0.848715
1,0.848367,0.848367,0.848143,0.848143
2,0.850554,0.850579,0.850157,0.850554
3,0.849038,0.849063,0.848814,0.849038
4,0.849858,0.849784,0.849386,0.849858


# RANDOM FOREST CLASSIFIER

In [26]:
randomForest = RandomForestClassifier()

param_grid = {
    'n_estimators': [1024],
    'criterion': ['gini', 'entropy'],
    'max_features': [1,2,4,6,8,12,16,20]}

clf = GridSearchCV(estimator=randomForest, param_grid=param_grid, cv=KFold(n_splits=5), 
                   scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [1, 2, 4, 6, 8, 12, 16, 20],
                         'n_estimators': [1024]},
             refit=False, scoring=['accuracy', 'precision', 'roc_auc', 'f1'])


In [35]:
num_trials = 5
train_metrics_rf = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics_rf = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    hyperparams = clf.fit(X_train, y_train)
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)
    
    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    accuracy_model = RandomForestClassifier(n_estimators = best_accuracy['n_estimators'],
                                            criterion = best_accuracy['criterion'],
                                            max_features = best_accuracy['max_features'])
    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    accuracy_model.fit(X_test, y_test)
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = RandomForestClassifier(n_estimators = best_precision['n_estimators'],
                                             criterion = best_precision['criterion'],
                                             max_features = best_precision['max_features'])
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    precision_model.fit(X_test, y_test)
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = RandomForestClassifier(n_estimators = best_roc_auc['n_estimators'],
                                       criterion = best_roc_auc['criterion'],
                                       max_features = best_roc_auc['max_features'])
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    roc_model.fit(X_test, y_test)
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = RandomForestClassifier(n_estimators = best_f1['n_estimators'],
                                      criterion = best_f1['criterion'],
                                      max_features = best_f1['max_features'])
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    f1_model.fit(X_test, y_test)
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    train_metrics_rf = train_metrics_rf.append({'Accuracy': acc_train_score, 'Precision': prec_train_score,
                                                'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics_rf = test_metrics_rf.append({'Accuracy': acc_test_score, 'Precision': prec_test_score,
                                              'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)

KeyboardInterrupt: 

In [None]:
train_metrics_rf

In [None]:
test_metrics_rf

# DECISION TREE CLASSIFIER

In [None]:
decisionTree = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy']
    'splitter': 'best'}

clf = GridSearchCV(estimator=decisionTree, param_grid=param_grid, cv=KFold(n_splits=5), 
                   scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False)