# Training ML Algorithms

In [1]:
import json  # for saving preprossing details
import joblib  # for saving the algorithm and proprocessing details

import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

## Loading Data
This dataset was provided by [pplonski](https://github.com/pplonski) in a [GitHub repository](https://github.com/pplonski/datasets-for-start/tree/master/adult).

In [29]:
df = pd.read_csv('https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv', skipinitialspace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


Select the features to be trained on $X$, and the prediction target $y$.

In [3]:
X = df.drop('income', axis=1)
y = df['income']

X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


Split the data into a set for training and a set for testing.

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)
X_train.shape

(22792, 14)

## Data Pre-Processing

The training algorithm we will use is **Random Forest** from `sklearn` which cannot handle missing values or categorical data. First we will fill missing values with the mode (most common value) in each feature.

### Fill Missing Values

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 29700 to 27439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             22792 non-null  int64 
 1   workclass       21526 non-null  object
 2   fnlwgt          22792 non-null  int64 
 3   education       22792 non-null  object
 4   education-num   22792 non-null  int64 
 5   marital-status  22792 non-null  object
 6   occupation      21522 non-null  object
 7   relationship    22792 non-null  object
 8   race            22792 non-null  object
 9   sex             22792 non-null  object
 10  capital-gain    22792 non-null  int64 
 11  capital-loss    22792 non-null  int64 
 12  hours-per-week  22792 non-null  int64 
 13  native-country  22390 non-null  object
dtypes: int64(6), object(8)
memory usage: 2.6+ MB


In [6]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 1055 to 1439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9199 non-null   object
 2   fnlwgt          9769 non-null   int64 
 3   education       9769 non-null   object
 4   education-num   9769 non-null   int64 
 5   marital-status  9769 non-null   object
 6   occupation      9196 non-null   object
 7   relationship    9769 non-null   object
 8   race            9769 non-null   object
 9   sex             9769 non-null   object
 10  capital-gain    9769 non-null   int64 
 11  capital-loss    9769 non-null   int64 
 12  hours-per-week  9769 non-null   int64 
 13  native-country  9588 non-null   object
dtypes: int64(6), object(8)
memory usage: 1.1+ MB


In [7]:
def get_train_test_modes(X_train, X_test):
    train_mode = dict(X_train.mode().iloc[0])
    test_mode = dict(X_test.mode().iloc[0])
    
    return train_mode, test_mode

In [8]:
train_mode, test_mode = get_train_test_modes(X_train, X_test)
train_mode

{'age': 31.0,
 'workclass': 'Private',
 'fnlwgt': 121124,
 'education': 'HS-grad',
 'education-num': 9.0,
 'marital-status': 'Married-civ-spouse',
 'occupation': 'Prof-specialty',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'capital-gain': 0.0,
 'capital-loss': 0.0,
 'hours-per-week': 40.0,
 'native-country': 'United-States'}

In [9]:
test_mode

{'age': 35.0,
 'workclass': 'Private',
 'fnlwgt': 203488,
 'education': 'HS-grad',
 'education-num': 9.0,
 'marital-status': 'Married-civ-spouse',
 'occupation': 'Prof-specialty',
 'relationship': 'Husband',
 'race': 'White',
 'sex': 'Male',
 'capital-gain': 0.0,
 'capital-loss': 0.0,
 'hours-per-week': 40.0,
 'native-country': 'United-States'}

In [10]:
X_train = X_train.fillna(train_mode)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 29700 to 27439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             22792 non-null  int64 
 1   workclass       22792 non-null  object
 2   fnlwgt          22792 non-null  int64 
 3   education       22792 non-null  object
 4   education-num   22792 non-null  int64 
 5   marital-status  22792 non-null  object
 6   occupation      22792 non-null  object
 7   relationship    22792 non-null  object
 8   race            22792 non-null  object
 9   sex             22792 non-null  object
 10  capital-gain    22792 non-null  int64 
 11  capital-loss    22792 non-null  int64 
 12  hours-per-week  22792 non-null  int64 
 13  native-country  22792 non-null  object
dtypes: int64(6), object(8)
memory usage: 2.6+ MB


In [11]:
X_test = X_test.fillna(test_mode)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 1055 to 1439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9769 non-null   int64 
 1   workclass       9769 non-null   object
 2   fnlwgt          9769 non-null   int64 
 3   education       9769 non-null   object
 4   education-num   9769 non-null   int64 
 5   marital-status  9769 non-null   object
 6   occupation      9769 non-null   object
 7   relationship    9769 non-null   object
 8   race            9769 non-null   object
 9   sex             9769 non-null   object
 10  capital-gain    9769 non-null   int64 
 11  capital-loss    9769 non-null   int64 
 12  hours-per-week  9769 non-null   int64 
 13  native-country  9769 non-null   object
dtypes: int64(6), object(8)
memory usage: 1.1+ MB


### Encode Categorical Data
The original tutorial uses the `LabelEncoder` from `sklearn`, but the docs say it should not be used on input. Instead, we will use the `OrdinalEncoder` according to [this tutorial](https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/) which maps each unique label to an integer. The tutorial provides the following advice and `prepare_inputs` function:
>The best practice when encoding variables is to fit the encoding on the training dataset, then apply it to the train and test datasets.

In [12]:
def prepare_inputs(X_train, X_test, X=None):
    """
    Maps categorical data to integers using an OrdinalEncoder trained on the X_train set.
    
    If the unique values in X_test do not match those of X_train, then an error will be thrown.
    In that case, providing the original data set X will allow the encoder to train on all values.
    """
    oe = OrdinalEncoder()
    if X is not None:
        oe.fit(X)
    else:
        oe.fit(X_train)

    X_train = oe.transform(X_train)
    X_test = oe.transform(X_test)
    
    return X_train, X_test, oe

In [13]:
categories = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country'
]

X_train_encoded, X_test_encoded, encoder = prepare_inputs(X_train[categories], X_test[categories], X[categories])
X_train_encoded = pd.DataFrame(data=X_train_encoded, columns=categories)
X_test_encoded = pd.DataFrame(data=X_test_encoded, columns=categories)

encoder.feature_names_in_

array(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'], dtype=object)

In [14]:
for cat in categories:
    X_train[cat] = X_train_encoded[cat].values
    X_test[cat] = X_test_encoded[cat].values

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 29700 to 27439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             22792 non-null  int64  
 1   workclass       22792 non-null  float64
 2   fnlwgt          22792 non-null  int64  
 3   education       22792 non-null  float64
 4   education-num   22792 non-null  int64  
 5   marital-status  22792 non-null  float64
 6   occupation      22792 non-null  float64
 7   relationship    22792 non-null  float64
 8   race            22792 non-null  float64
 9   sex             22792 non-null  float64
 10  capital-gain    22792 non-null  int64  
 11  capital-loss    22792 non-null  int64  
 12  hours-per-week  22792 non-null  int64  
 13  native-country  22792 non-null  float64
dtypes: float64(8), int64(6)
memory usage: 2.6 MB


In [15]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 1055 to 1439
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             9769 non-null   int64  
 1   workclass       9769 non-null   float64
 2   fnlwgt          9769 non-null   int64  
 3   education       9769 non-null   float64
 4   education-num   9769 non-null   int64  
 5   marital-status  9769 non-null   float64
 6   occupation      9769 non-null   float64
 7   relationship    9769 non-null   float64
 8   race            9769 non-null   float64
 9   sex             9769 non-null   float64
 10  capital-gain    9769 non-null   int64  
 11  capital-loss    9769 non-null   int64  
 12  hours-per-week  9769 non-null   int64  
 13  native-country  9769 non-null   float64
dtypes: float64(8), int64(6)
memory usage: 1.1 MB


## Algorithm Training
The next step is to train two ML algorithms: **Random Forest** and **Extra Trees**.

In [16]:
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train, y_train)
print("Random forest model ready")

Random forest model ready


In [17]:
et = ExtraTreesClassifier(n_estimators=100)
et = et.fit(X_train, y_train)
print("Extra trees model ready")

Extra trees model ready


In [18]:
y_pred_rf = rf.predict(X_test)
y_pred_et = et.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix

cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_et = confusion_matrix(y_test, y_pred_et)

In [20]:
cm_rf

array([[6913,  556],
       [ 895, 1405]])

In [21]:
cm_et

array([[6822,  647],
       [ 930, 1370]])

The **Random Forest** classifier seems to perform better with this dataset.

The final step is to save our preprocessing objects (e.g., mode values and encoders) and ML models. We use [joblib](https://joblib.readthedocs.io/) which is a package specially designed to handle efficient computing and storage of large data.

In [33]:
joblib.dump(train_mode, './train_mode.joblib', compress=True)
joblib.dump(test_mode, './test_mode.joblib', compress=True)
joblib.dump(encoder, './encoder.joblib', compress=True)
joblib.dump(rf, './random_forest.joblib', compress=True)
joblib.dump(et, './extra_trees.joblib', compress=True)

['./extra_trees.joblib']