In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler, OneHotEncoder
from sklearn.impute          import SimpleImputer
from sklearn.compose         import ColumnTransformer
from sklearn.metrics         import *
from sklearn.base            import BaseEstimator

from sklearn.linear_model    import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes     import BernoulliNB, GaussianNB
from sklearn.tree            import DecisionTreeClassifier
from sklearn.ensemble        import RandomForestClassifier


## Research Question:
Can we use a machine learning model to predict whether or not a person will earn $50,000.00 per year based on their demographics?

## Load the data

In [2]:
data = pd.read_csv('income_data.csv')

In [3]:
y = data[data.columns[-1]]
X = data[data.columns[:-1]]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=0)

### Fit initial model

In [5]:
# Find the categorical columns
categorical_columns = (X.dtypes == object)

# Setup preprocessing pipelines based on data types
con_pipe = Pipeline([('scaler', StandardScaler()),
                     ('imputer', SimpleImputer(strategy='median', add_indicator=True))])

cat_pipe = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore')),
                     ('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True))])

preprocessing = ColumnTransformer([('categorical', cat_pipe,  categorical_columns),
                                   ('continuous',  con_pipe, ~categorical_columns),
                                   ])

pipe = Pipeline([('preprocessing', preprocessing), 
                 ('clf', LogisticRegression(solver='liblinear'))])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_validation)
f1_weighted = f1_score(y_validation, y_pred, average = 'weighted')
f1 = f1_score(y_validation, y_pred)
bac = balanced_accuracy_score(y_validation, y_pred)

print(f'F1 score:          {f1:.4f}')
print(f'F1 weighted score: {f1_weighted:.4f}')
print(f'Balanced accuracy: {bac:.4f}')


F1 weighted score: 0.6808
Balanced accuracy: 0.6809
Area Under Curve:  0.6809


## Searching for best linear model

In [6]:
# Going to be used as a placeholder to test different algorithms
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass

In [7]:
search = [{'algo': [LogisticRegression()],
           'algo__penalty': ['l1','l2'],
           'algo__class_weight': ['balanced', None],
           'algo__solver': ['liblinear', 'newton-cg']},
          
          {'algo': [RidgeClassifier()],
           'algo__normalize': [True,False],
           'algo__max_iter': [10,100,1000]},
          
          {'algo': [BernoulliNB()],
           'algo__alpha': [1e-6, 1e-3, 0, 0.5, 1, 1e3, 1e6],
           'algo__fit_prior': [True, False]},
          
          {'algo': [GaussianNB()],
           'algo__var_smoothing': [1e-9, 1e-6, 1e-3, 1, 1e3, 1e6]}]

In [8]:
pipe = Pipeline([('preprocessing', preprocessing), 
                 ('algo', DummyEstimator())])

In [9]:
rand_algos = RandomizedSearchCV(estimator=pipe, 
                                param_distributions=search, 
                                n_iter=30,
                                cv=5, 
                                scoring='f1_weighted',
                                n_jobs=-1,
                                verbose=1)

best_model = rand_algos.fit(X_train, y_train)
results = rand_algos.cv_results_

best_model.best_estimator_.get_params()['algo'], best_model.best_score_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    9.5s finished


(LogisticRegression(penalty='l1', solver='liblinear'), 0.6816391661752492)

## Searching for best Trees models

In [10]:
trees = [{'algo': [DecisionTreeClassifier()],
          'algo__criterion': ['gini','entropy'],
          'algo__splitter': ['best','random'],
          'algo__max_depth': [None, 5, 10, 20],
          'algo__min_samples_leaf': [1,5,10]},
         
         {'algo': [RandomForestClassifier()],
          'algo__n_estimators': [50, 100, 150, 200],
          'algo__criterion': ['gini','entropy'],
          'algo__max_depth': [None, 5, 10, 25, 50],
          'algo__min_samples_leaf': [1,5,10],
          'algo__n_jobs': [-1],
          'algo__class_weight': [None, 'balanced'],
          'algo__oob_score': [True, False]}]

In [11]:
rand_trees = RandomizedSearchCV(estimator=pipe, 
                                param_distributions=trees, 
                                n_iter=50,
                                cv=5, 
                                scoring='f1_weighted',
                                n_jobs=-1,
                                verbose=1)

best_trees = rand_trees.fit(X_train, y_train)
results = rand_trees.cv_results_

best_trees.best_estimator_.get_params()['algo'], best_trees.best_score_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  2.6min finished


(RandomForestClassifier(criterion='entropy', min_samples_leaf=5, n_jobs=-1,
                        oob_score=True),
 0.6989961753617548)

## Test Validation on best model

In [12]:
y_pred = best_trees.predict(X_validation)

f1_weighted = f1_score(y_validation, y_pred, average = 'weighted')
f1 = f1_score(y_validation, y_pred)
bac = balanced_accuracy_score(y_validation, y_pred)
roc_auc = roc_auc_score(y_validation, y_pred)

# print(f'F1 score:          {f1:.4f}')
print(f'F1 weighted score: {f1_weighted:.4f}')
print(f'Balanced accuracy: {bac:.4f}')
print(f'Area Under Curve:  {roc_auc:.4f}')

F1 weighted score: 0.7013
Balanced accuracy: 0.7015
Area Under Curve:  0.7015


### Final Model

In [None]:
pipe = Pipeline([('preprocessing', preprocessing), 
                 ('rf', RandomForestClassifier(criterion='entropy',
                                               min_samples_leaf=5,
                                               n_estimators=150,
                                               n_jobs=-1,
                                               oob_score=True))])
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)

f1_weighted = f1_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred)
bac = balanced_accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'F1 score:          {f1:.4f}')
print(f'F1 weighted score: {f1_weighted:.4f}')
print(f'Balanced accuracy: {bac:.4f}')
print(f'Area Under Curve:  {roc_auc:.4f}')