In [148]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [239]:
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [86]:
df_train.head()

Unnamed: 0,Id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,16280,34,Private,204991,Some-college,10,Divorced,Exec-managerial,Own-child,White,Male,0,0,44,United-States,<=50K
1,16281,58,Local-gov,310085,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
2,16282,25,Private,146117,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,42,United-States,<=50K
3,16283,24,Private,138938,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,16284,57,Self-emp-inc,258883,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,5178,0,60,Hungary,>50K


The objective here is to choose four different classifiers and apply to the BaseAdult dataset. I chose the following:
- Ranfom Forest
- Neural Network (Perceptron)
- Logistic Regression
- SVM

# To be fair

To be a fair comparison, I will pre-process the data exactly like I did with kNN dataset. Hence, these first few cells will be a direct copy and paste from the last assignment.

In [240]:
#grouping all the countries that are different from US, Mexico, and '?' in a new category called 'Others'
def countries_unite(country):
    if country !='United-States' and country != 'Mexico' and country != '?':
        country = 'Others'
    return country

# grouping together some similar values in the marital.status feature
def group_marital(status):
    if status == 'Married-AF-spouse' or status == 'Married-civ-spouse':
        return 'Married'
    if status == 'Divorced' or status == 'Separated':
        return 'Divorced'
    else:
        return status

def pre_process(df, target=True):
    '''
    Pre-process our data according to what we did in the last assignment.
    '''
    #defining X_train and y_train
    if target:
        X = df[df.columns[:-1]]
        y = df[df.columns[-1]]
        #encode income
        y = y.replace({'<=50K':0, '>50K':1})
        #dropping columns that we don't need
        X = X.drop(['Id', 'fnlwgt', 'education'], axis=1)

        #creating the new variable capital.net
        X['capital.net'] = X['capital.gain'] - X['capital.loss']

        #dropping 'capital.gain' and 'capital.loss' features
        X = X.drop(['capital.gain', 'capital.loss'], axis=1)

        #grouping countries
        X['native.country'] = X['native.country'].apply(countries_unite)

        #grouping marital status
        X['marital.status'] = X['marital.status'].apply(group_marital)

        #encoding sex feature
        X['sex'].replace({'Male':1, 'Female':0}, inplace=True)
        
        return X, y
    else:
        X = df
        
        X = X.drop(['Id', 'fnlwgt', 'education'], axis=1)

        #creating the new variable capital.net
        X['capital.net'] = X['capital.gain'] - X['capital.loss']

        #dropping 'capital.gain' and 'capital.loss' features
        X = X.drop(['capital.gain', 'capital.loss'], axis=1)

        #grouping countries
        X['native.country'] = X['native.country'].apply(countries_unite)

        #grouping marital status
        X['marital.status'] = X['marital.status'].apply(group_marital)

        #encoding sex feature
        X['sex'].replace({'Male':1, 'Female':0}, inplace=True)
    
        return X

In [241]:
X_train, y_train = pre_process(df_train, True)

# Encoding and normalizing data

It is necessary to encode and normalize our data. 

In [242]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#defining our categorical features:
X_train_cat = [i for i in X_train.columns if X_train.dtypes[i]=='object']

#defining our numerical features:
X_train_num = [i for i in X_train.columns if X_train.dtypes[i]=='int64' or X_train.dtypes[i]=='float64']

label_encoder = LabelEncoder()

#looping in our categorical columns, we encode them:
for i in X_train_cat:
    X_train[i] = label_encoder.fit_transform(X_train[i].astype(str))

#now we normalize our data
scaler = StandardScaler()
X_train[X_train_cat] = scaler.fit_transform(X_train[X_train_cat]) 
X_train[X_train_num] = scaler.fit_transform(X_train[X_train_num])

In [134]:
X_train.head()

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,capital.net
0,-0.335886,0.090121,-0.031325,-1.338107,-0.608434,0.966924,0.393675,0.703087,0.288524,0.301208,-0.133663
1,1.423589,-1.28364,-1.586131,-0.497871,1.756334,-0.900177,0.393675,0.703087,-0.03543,0.301208,-0.133663
2,-0.995689,0.090121,-0.031325,1.182601,0.100997,-0.27781,0.393675,0.703087,0.126547,0.301208,-0.133663
3,-1.069001,0.090121,-0.031325,-1.338107,-1.317864,-0.27781,0.393675,-1.422298,-0.03543,0.301208,-0.133663
4,1.350277,0.777002,-0.420027,-0.497871,1.756334,-0.900177,0.393675,0.703087,1.58434,-1.584818,0.565218


# Models

## Imports

In [149]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate

## kNN (Revisited)

For starters (and for the sake of comparison), we shall tune a kNN classifier again.

In [186]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

The GridSearchCV function uses a dictionary of parameters and trains the dataset using these parameters. Then, the best set of parameters (in the case of kNN there is only one) can be chosen. The "best" set is chosen based on the accuracy of the model, measured through a K-fold cross validation.

In [196]:
grid_search = dict(n_neighbors = np.arange(15, 35,1, dtype=int))

knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn,
                       grid_search,
                       cv=5,
                       n_jobs=-1)
knn_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
       32, 33, 34])})

In [199]:
print('The best choice of parameters was:',knn_grid.best_params_)

The best choice of parameters was: {'n_neighbors': 26}


We can use this good estimation to train our model and estimate its statistics (accuracy, precision, recall, and f1-score) using cross validation again:

In [211]:
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro',
          'f1' : 'f1_macro'}

knn = KNeighborsClassifier(n_neighbors=26)

knn_scores = cross_validate(knn, X_train, y_train, scoring=scoring, cv=10, n_jobs=-1)

print('The scores of the kNN Classifier were:')
print()
print('Average accuracy in cv=10: {:.2f} \n' .format(knn_scores['test_acc'].mean()))
print('Average precision in cv=10: {:.2f} \n' .format(knn_scores['test_prec_macro'].mean()))
print('Average recall in cv=10: {:.2f} \n' .format(knn_scores['test_rec_macro'].mean()))
print('Average F1 in cv=10: {:.2f} \n' .format(knn_scores['test_f1'].mean()))

The scores of the kNN Classifier were:

Average accuracy in cv=10: 0.84 

Average precision in cv=10: 0.80 

Average recall in cv=10: 0.74 

Average F1 in cv=10: 0.76 



## Random Forest

In [101]:
from sklearn.ensemble import RandomForestClassifier

I will use a randomized search for the best hyperparameters of our trees (in fact, this technique will be used for all the subsequent classifiers). RandomizedSearchCV chooses a random set of parameters from a dictionary that we create and use these parameters to train and test (using cross-validation) the model and look for the best set of parameters. In this case, "best" is the one that results in CV biggest accuracy.

In [109]:
#number of trees in the forest
n_estimators = np.arange(100, 3000, 50, dtype=int)

#maximum number of splittings
max_depth = [5, 10, 20, 50, 75, 100, 150, 200]

#minimum number of samples to split a node
min_samples_split = [int(x) for x in np.linspace(start=2, stop=10, num=9)]

#splitting criterion:
criterion = ['gini', 'entropy']

#creating the dictionary of parameters that the randomized search will use
random_grid = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'criterion':criterion}

rf = RandomForestClassifier()

#now we set our RandomizedSearchCV function, with 40 iterations and 5-fold cv
rf_random = RandomizedSearchCV(estimator=rf,
                              param_distributions=random_grid,
                              n_iter=40,
                              cv=5,
                              verbose=2,
                              random_state=1,
                              n_jobs=-1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 17.2min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=40,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [5, 10, 20, 50, 75, 100,
                                                      150, 200],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10],
                                        'n_estimators': array([ 100,  150,  200,  250,  300,  350,  400,  450,  500,  550,  600,
        650,  700,  750,  800,  850,  900,  950, 1000, 1050, 1100, 1150,
       1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700,
       1750, 1800, 1850, 1900, 1950, 2000, 2050, 2100, 2150, 2200, 2250,
       2300, 2350, 2400, 2450, 2500, 2550, 2600, 2650, 2700, 2750, 2800,
       2850, 2900, 2950])},
                   random_state=1, verbose=2)

After the search, the best parameters and the best score found were:

In [203]:
print('Best set of parameters:',rf_random.best_params_)

Best set of parameters: {'n_estimators': 1500, 'min_samples_split': 9, 'max_depth': 20, 'criterion': 'gini'}


Now we can use these parameters to find the metrics of our classifier

In [248]:
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro',
          'f1' : 'f1_macro'}

random_forest = RandomForestClassifier(n_estimators = 1500, 
                                       min_samples_split=9,
                                       max_depth=20,
                                       criterion='gini')

rf_scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=10, n_jobs=-1)

print('The scores of the Random Forest Classifier were:')
print()
print('Average accuracy in cv=10: {:.2f} \n' .format(rf_scores['test_acc'].mean()))
print('Average precision in cv=10: {:.2f} \n' .format(rf_scores['test_prec_macro'].mean()))
print('Average recall in cv=10: {:.2f} \n' .format(rf_scores['test_rec_macro'].mean()))
print('Average F1 in cv=10: {:.2f} \n' .format(rf_scores['test_f1'].mean()))

The scores of the Random Forest Classifier were:

Average accuracy in cv=10: 0.87 

Average precision in cv=10: 0.83 

Average recall in cv=10: 0.79 

Average F1 in cv=10: 0.81 



## MLP (Neural Network)

In [151]:
from sklearn.neural_network import MLPClassifier

I will use the adam method, choosing from a tanh and a ReLU activation function

In [207]:
hidden_layer_sizes = [[25,25],[10,25],[50,]]

activation = ['tanh', 'relu']

random_grid = {'hidden_layer_sizes':hidden_layer_sizes,
              'activation':activation}

MLP = MLPClassifier()

MLP_random = RandomizedSearchCV(estimator=MLP,
                              param_distributions=random_grid,
                              n_iter=30,
                              cv=5,
                              verbose=2,
                              random_state=1,
                              n_jobs=-1)
MLP_random.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.1min finished


RandomizedSearchCV(cv=5, estimator=MLPClassifier(), n_iter=30, n_jobs=-1,
                   param_distributions={'activation': ['tanh', 'relu'],
                                        'hidden_layer_sizes': [[25, 25],
                                                               [10, 25],
                                                               [50]]},
                   random_state=1, verbose=2)

The best set of parameters found was:

In [208]:
print('Best set of MLP parameters:',MLP_random.best_params_)

Best set of MLP parameters: {'hidden_layer_sizes': [50], 'activation': 'tanh'}


Now we can use cross validation to estimate the statistics of our classifier:

In [213]:
MLP = MLPClassifier(hidden_layer_sizes=(50,), activation='tanh')

MLP_scores = cross_validate(MLP, X_train, y_train, scoring=scoring, cv=10, n_jobs=-1)

print('The scores of the MLP Classifier were:')
print()
print('Average accuracy in cv=10: {:.2f} \n' .format(MLP_scores['test_acc'].mean()))
print('Average precision in cv=10: {:.2f} \n' .format(MLP_scores['test_prec_macro'].mean()))
print('Average recall in cv=10: {:.2f} \n' .format(MLP_scores['test_rec_macro'].mean()))
print('Average F1 in cv=10: {:.2f} \n' .format(MLP_scores['test_f1'].mean()))

The scores of the MLP Classifier were:

Average accuracy in cv=10: 0.85 

Average precision in cv=10: 0.81 

Average recall in cv=10: 0.77 

Average F1 in cv=10: 0.78 



## Logistic Regression

In [183]:
from sklearn.linear_model import LogisticRegression

In [169]:
grid = dict(C=np.arange(0.1, 1.5, 0.1))
log_reg = LogisticRegression()
log_reg_grid = GridSearchCV(log_reg,
                           grid,
                           cv=5,
                           n_jobs=-1)
log_reg_grid.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4])})

In [170]:
print('Grid search best parameters was:', log_reg_grid.best_params_)

Grid search best parameters was: {'C': 0.4}


Implementing the logistic regression with this choice of parameter:

In [258]:
LogReg = LogisticRegression(C=0.4)

log_scores = cross_validate(LogReg, X_train, y_train, scoring=scoring, cv=10, n_jobs=-1)

print('The scores of the LogisticRegression Classifier were:')
print()
print('Average accuracy in cv=10: {:.2f} \n' .format(log_scores['test_acc'].mean()))
print('Average precision in cv=10: {:.2f} \n' .format(log_scores['test_prec_macro'].mean()))
print('Average recall in cv=10: {:.2f} \n' .format(log_scores['test_rec_macro'].mean()))
print('Average F1 in cv=10: {:.2f} \n' .format(log_scores['test_f1'].mean()))

The scores of the LogisticRegression Classifier were:

Average accuracy in cv=10: 0.82 

Average precision in cv=10: 0.77 

Average recall in cv=10: 0.69 

Average F1 in cv=10: 0.72 



## SVM

In [172]:
from sklearn.svm import SVC

In [178]:
C = np.arange(0.5, 2, 0.3)
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
break_ties = [True, False]
degree = [int(i) for i in range(1,4)]

random_grid = {'C': C,
              'kernel': kernel,
              'break_ties': break_ties,
              'degree': degree}

SVMClassifier = SVC()

SVM_random = RandomizedSearchCV(estimator=SVMClassifier,
                              param_distributions=random_grid,
                              n_iter=25,
                              cv=5,
                              verbose=2,
                              random_state=1,
                              n_jobs=-1)
SVM_random.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  5.4min finished


RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=25, n_jobs=-1,
                   param_distributions={'C': array([0.5, 0.8, 1.1, 1.4, 1.7]),
                                        'break_ties': [True, False],
                                        'degree': [1, 2, 3],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   random_state=1, verbose=2)

In [215]:
print('Best set of parameters is:',SVM_random.best_params_)

Best set of parameters is: {'kernel': 'rbf', 'degree': 1, 'break_ties': True, 'C': 1.7000000000000002}


In [217]:
SVMClassifier = SVC(C=1.7, kernel='rbf', break_ties=True, degree=1)
SVM_scores = cross_validate(SVMClassifier, X_train, y_train, cv=10, scoring = scoring, n_jobs=-1)

print('The scores of the SVM Classifier were:')
print()
print('Average accuracy in cv=10: {:.2f} \n' .format(SVM_scores['test_acc'].mean()))
print('Average precision in cv=10: {:.2f} \n' .format(SVM_scores['test_prec_macro'].mean()))
print('Average recall in cv=10: {:.2f} \n' .format(SVM_scores['test_rec_macro'].mean()))
print('Average F1 in cv=10: {:.2f} \n' .format(SVM_scores['test_f1'].mean()))

The scores of the SVM Classifier were:

Average accuracy in cv=10: 0.85 

Average precision in cv=10: 0.82 

Average recall in cv=10: 0.74 

Average F1 in cv=10: 0.76 



# Comparing the models

Bellow I gathered all the statistics of each model to compare them.

In [230]:
classifiers = ['kNN', 'Random Forest', 'MLP', 'Logistic Regression', 'SVM']
accuracies = [knn_scores['test_acc'].mean(), 
             rf_scores['test_acc'].mean(),
             MLP_scores['test_acc'].mean(),
             log_scores['test_acc'].mean(),
             SVM_scores['test_acc'].mean()]

precisions = [knn_scores['test_prec_macro'].mean(), 
             rf_scores['test_prec_macro'].mean(),
             MLP_scores['test_prec_macro'].mean(),
             log_scores['test_prec_macro'].mean(),
             SVM_scores['test_prec_macro'].mean()]

recalls = [knn_scores['test_rec_macro'].mean(), 
             rf_scores['test_rec_macro'].mean(),
             MLP_scores['test_rec_macro'].mean(),
             log_scores['test_rec_macro'].mean(),
             SVM_scores['test_rec_macro'].mean()]

f1s = [knn_scores['test_f1'].mean(), 
             rf_scores['test_f1'].mean(),
             MLP_scores['test_f1'].mean(),
             log_scores['test_f1'].mean(),
             SVM_scores['test_f1'].mean()]

df_models = pd.DataFrame({'Accuracy':accuracies, 
              'Precision':precisions,
             'Recall':recalls,
             'F1-Score':f1s},
            index=classifiers)

df_models

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
kNN,0.840848,0.796086,0.739874,0.760798
Random Forest,0.866892,0.831383,0.787549,0.805623
MLP,0.852948,0.810314,0.767219,0.784503
Logistic Regression,0.821069,0.771576,0.694628,0.718216
SVM,0.846959,0.815951,0.73541,0.762372


By accuracy performance, the selected model will be the Random Forest.

# Prediction of the test dataset

Now, with our best performance model chosen, we can fit it with our train set, and use it to predict the target labels of our test dataset.

First, we pre-process and normalize the data in the test dataset.

In [245]:
X_test = pre_process(df_test, target=False)

#defining our categorical features:
X_test_cat = [i for i in X_test.columns if X_test.dtypes[i]=='object']

#defining our numerical features:
X_test_num = [i for i in X_test.columns if X_test.dtypes[i]=='int64' or X_test.dtypes[i]=='float64']

label_encoder = LabelEncoder()

#looping in our categorical columns, we encode them:
for i in X_test_cat:
    X_test[i] = label_encoder.fit_transform(X_test[i].astype(str))

#now we normalize our data
scaler = StandardScaler()
X_test[X_test_cat] = scaler.fit_transform(X_test[X_test_cat]) 
X_test[X_test_num] = scaler.fit_transform(X_test[X_test_num])

Finally, we fit and predict the target labels, creating a dataframe containing the final results.

In [298]:
final_result = pd.DataFrame(columns=['Id', 'income'])

final_result['Id'] = df_test['Id']
final_result['income'] = random_forest.fit(X_train, y_train).predict(X_test)

final_result['income'].replace({0: '<=50K', 1:'>50K'}, inplace=True)

final_result




Unnamed: 0,Id,income
0,0,<=50K
1,1,>50K
2,2,<=50K
3,3,<=50K
4,4,>50K
...,...,...
16275,16275,<=50K
16276,16276,<=50K
16277,16277,<=50K
16278,16278,<=50K


In [300]:
final_result.to_csv('predictions.csv', index=False)

