In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# One hot encoding
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
# Integer Encoding / Label Encoding
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
# Count or frequency encoding
from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder
# Ordered Integer Encoding
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
# Encoding using "Weight of evidence"
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder

In [2]:
# Load the Titanic dataset, reading only the columns used in this demo.
cols = [
    'pclass', 'age', 'sibsp', 'parch', 'fare',  # numeric columns
    'sex', 'cabin', 'embarked',                 # categorical columns encoded below
    'survived',                                 # target
]

data = pd.read_csv('titanic.csv', usecols=cols)

# Preview the first rows.
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,cabin,embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [3]:
# Drop the rows where 'fare' or 'embarked' is missing.
data = data.dropna(subset=['fare', 'embarked'])

In [4]:
# Reduce each cabin to its first character.
# The cast to str happens first, so a missing cabin (NaN) becomes the text
# 'nan' and ends up in its own category 'n'.
data['cabin'] = data['cabin'].astype(str).str.get(0)

In [5]:
# Cabin 'T' has too few observations to be useful, so remove those rows.
data = data.loc[data['cabin'] != 'T']

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='survived'),  # predictors: every column except the target
    data['survived'],               # target
    test_size=0.3,
    random_state=0,
)

In [7]:
# Cabin categories left after the clean-up above.
pd.unique(data['cabin'])

array(['n', 'C', 'E', 'G', 'D', 'A', 'B', 'F'], dtype=object)

In [8]:
# Let's replace null values in numerical variables by the mean


def impute_na(df, variable, value):
    """Fill the missing values of one column of ``df`` with ``value``.

    The filled column is assigned back onto ``df``, so the caller's
    DataFrame is updated and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is imputed.
    variable : str
        Name of the column whose NaN entries are replaced.
    value : scalar
        Replacement used for every missing entry.
    """
    # Assign the result back instead of calling
    # ``df[variable].fillna(value, inplace=True)``: that form is chained
    # in-place assignment, which recent pandas deprecates and which no
    # longer modifies ``df`` once Copy-on-Write is enabled.
    df[variable] = df[variable].fillna(value)


# Learn the fill value (the MEAN age) from the train set only, and compute it
# once before either frame is modified. Train and test are then imputed with
# exactly the same value, whatever the order of the two calls below.
age_mean = X_train['age'].mean()

impute_na(X_test, 'age', age_mean)
impute_na(X_train, 'age', age_mean)

In [9]:
# Count the missing values left in each train column after imputation.
X_train.isna().sum()

pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
cabin       0
embarked    0
dtype: int64

In [10]:
# One-hot encoding with the Feature Engine library
ohe_enc = OneHotCategoricalEncoder(
    variables=['sex', 'cabin', 'embarked'],  # the variables to encode
    top_categories=None,
    drop_last=True,  # return k-1 dummies; set to False to return all k
)

# Fit on the train set only, then apply the fitted encoder to both partitions.
ohe_enc.fit(X_train)
X_train_OHE, X_test_OHE = (
    ohe_enc.transform(part) for part in (X_train, X_test)
)

In [11]:
# Integer (label) encoding with the Feature Engine library.
# The 'arbitrary' method is fitted on the predictors alone (no target passed).
ordinal_enc = OrdinalCategoricalEncoder(
    variables=['sex', 'cabin', 'embarked'],
    encoding_method='arbitrary',
)
ordinal_enc.fit(X_train)

X_train_le, X_test_le = (
    ordinal_enc.transform(part) for part in (X_train, X_test)
)

In [12]:
# Count encoding with the Feature Engine library
count_enc = CountFrequencyCategoricalEncoder(
    variables=['sex', 'cabin', 'embarked'],
    encoding_method='count',  # use encoding_method='frequency' for frequency encoding
)
count_enc.fit(X_train)

X_train_count_enc, X_test_count_enc = (
    count_enc.transform(part) for part in (X_train, X_test)
)

In [14]:
# Ordered integer encoding with the Feature Engine library.
# NOTE: encoding_method must be 'ordered'; otherwise the encoder assigns the
# numbers arbitrarily. This method is fitted with the target as well.
ordinal_enc = OrdinalCategoricalEncoder(
    variables=['sex', 'cabin', 'embarked'],
    encoding_method='ordered',
)
ordinal_enc.fit(X_train, y_train)

X_train_ordinal_enc, X_test_ordinal_enc = (
    ordinal_enc.transform(part) for part in (X_train, X_test)
)

In [15]:
# Weight-of-evidence encoding with the Feature Engine library; the encoder is
# fitted with the target.
woe_enc = WoERatioCategoricalEncoder(
    variables=['cabin', 'sex', 'embarked'],
    encoding_method='woe',
)
woe_enc.fit(X_train, y_train)

X_train_woe, X_test_woe = (
    woe_enc.transform(part) for part in (X_train, X_test)
)

# Random Forest Performance

In [17]:
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the train set and print its ROC-AUC.

    A ``RandomForestClassifier`` (50 trees, depth 3, fixed seed) is trained
    on ``X_train``/``y_train``. The ROC-AUC is then printed for the train
    partition and for the test partition, in that order. Scores are computed
    from the second column of ``predict_proba``. Nothing is returned.
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=39, max_depth=3)
    forest.fit(X_train, y_train)

    partitions = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for title, features, target in partitions:
        print(title)
        proba = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(target, proba)))

In [18]:
# Random forest on the one-hot encoded features
run_randomForests(X_train_OHE, X_test_OHE, y_train, y_test)

Train set
Random Forests roc-auc: 0.8599348001482805
Test set
Random Forests roc-auc: 0.8656563217002776


In [19]:
# Random forest on the arbitrary integer (label) encoded features
run_randomForests(X_train_le, X_test_le, y_train, y_test)

Train set
Random Forests roc-auc: 0.8775485727992325
Test set
Random Forests roc-auc: 0.8745924405265064


In [20]:
# Random forest on the count encoded features
run_randomForests(X_train_count_enc, X_test_count_enc, y_train, y_test)

Train set
Random Forests roc-auc: 0.8760494123290956
Test set
Random Forests roc-auc: 0.8819587006400194


In [21]:
# Random forest on the ordered integer encoded features
run_randomForests(X_train_ordinal_enc, X_test_ordinal_enc, y_train, y_test)

Train set
Random Forests roc-auc: 0.8770960989118821
Test set
Random Forests roc-auc: 0.8806605482429659


In [22]:
# Random forest on the weight-of-evidence encoded features
run_randomForests(X_train_woe, X_test_woe, y_train, y_test)

Train set
Random Forests roc-auc: 0.8770960989118821
Test set
Random Forests roc-auc: 0.8806605482429659


# Logistic Regression Performance

In [16]:
def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the train set and print its ROC-AUC.

    A ``LogisticRegression`` (``C=0.01``, fixed seed) is trained on
    ``X_train``/``y_train``. The ROC-AUC is then printed for the train
    partition and for the test partition, in that order. Scores are computed
    from the second column of ``predict_proba``. Nothing is returned.
    """
    model = LogisticRegression(random_state=44, C=0.01)
    model.fit(X_train, y_train)

    partitions = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for title, features, target in partitions:
        print(title)
        proba = model.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(target, proba)))

In [23]:
# Logistic regression on the one-hot encoded features
run_logistic(X_train_OHE, X_test_OHE, y_train, y_test)



Train set
Logistic Regression roc-auc: 0.8254813667982294
Test set
Logistic Regression roc-auc: 0.8454896751600047


In [24]:
# Logistic regression on the arbitrary integer (label) encoded features
run_logistic(X_train_le, X_test_le, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8184707472906082
Test set
Logistic Regression roc-auc: 0.8409008573843739




In [25]:
# Logistic regression on the count encoded features
run_logistic(X_train_count_enc, X_test_count_enc, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7678045748926057
Test set
Logistic Regression roc-auc: 0.8108320251177394




In [27]:
# Logistic regression on the ordered integer encoded features
run_logistic(X_train_ordinal_enc, X_test_ordinal_enc, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8169988442835648
Test set
Logistic Regression roc-auc: 0.8359497645211931




In [28]:
# Logistic regression on the weight-of-evidence encoded features
run_logistic(X_train_woe, X_test_woe, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8446270088750301
Test set
Logistic Regression roc-auc: 0.8642072213500785


