<a href="https://colab.research.google.com/github/plthiyagu/Personnel/blob/master/Feature_Engineering_Variable_Magnititude.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
% matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [5]:
# Read the Titanic data straight from GitHub, keeping only the target
# (Survived) and the three numeric predictors used in this notebook.
data_url = 'https://raw.githubusercontent.com/plthiyagu/Personnel/master/Dataset/titanic.csv'
df = pd.read_csv(
    data_url,
    usecols=['Pclass', 'Age', 'Fare', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [7]:
# Summary statistics per column; the very different min/max values show that
# the predictors live on different scales.
df.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,891.0,891.0,714.0,891.0
mean,0.383838,2.308642,29.699118,32.204208
std,0.486592,0.836071,14.526497,49.693429
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,20.125,7.9104
50%,0.0,3.0,28.0,14.4542
75%,1.0,3.0,38.0,31.0
max,1.0,3.0,80.0,512.3292


In [8]:
# Value range (maximum minus minimum) of each predictor.
for col in ('Pclass', 'Age', 'Fare'):
    column = df[col]
    print(col, '_range: ', column.max() - column.min())

Pclass _range:  2
Age _range:  79.58
Fare _range:  512.3292


In [10]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
# Missing predictor values are replaced with 0 before splitting.
predictors = df[['Pclass', 'Age', 'Fare']].fillna(0)
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

(X_train.shape, X_test.shape)

((623, 3), (268, 3))

In [11]:
# Rescale every feature to the [0, 1] interval.
# The scaler learns min/max from the training split only and the same
# transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Column-wise summary of the scaled training features.
column_summaries = (
    ('Mean: ', X_train_scaled.mean),
    ('Standard Deviation: ', X_train_scaled.std),
    ('Minimum value: ', X_train_scaled.min),
    ('Maximum value: ', X_train_scaled.max),
)
for label, reducer in column_summaries:
    print(label, reducer(axis=0))

Mean:  [0.64365971 0.30131421 0.06335433]
Standard Deviation:  [0.41999093 0.21983527 0.09411705]
Minimum value:  [0. 0. 0.]
Maximum value:  [1. 1. 1.]


In [13]:
# Logistic regression trained on the unscaled features.
# C is set very high so that regularization is effectively switched off.
logit = LogisticRegression(random_state=44, C=1000)
logit.fit(X_train, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_label)
    pred = logit.predict_proba(X_split)
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
Logistic Regression roc-auc: 0.7134823539619531
Test set
Logistic Regression roc-auc: 0.7080952380952381


In [14]:
# Coefficients of the most recently fitted `logit` (the model trained on the
# unscaled features when cells are run in order); columns follow the feature
# order Pclass, Age, Fare.
logit.coef_

array([[-0.92585764, -0.01822689,  0.00233577]])

In [15]:
# Logistic regression trained on the min-max scaled features.
# C is set very high so that regularization is effectively switched off.
logit = LogisticRegression(random_state=44, C=1000)
logit.fit(X_train_scaled, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_label)
    pred = logit.predict_proba(X_split)
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
Logistic Regression roc-auc: 0.7134931997136721
Test set
Logistic Regression roc-auc: 0.7080952380952381


In [16]:
# Coefficients of the most recently fitted `logit` (the model trained on the
# scaled features when cells are run in order); columns follow the feature
# order Pclass, Age, Fare.
logit.coef_

array([[-1.85170244, -1.45782986,  1.19540159]])

In [17]:
# Support vector classifier trained on the unscaled features.
# probability=True is required for predict_proba to be available.
SVM_model = SVC(random_state=44, probability=True)
SVM_model.fit(X_train, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_label)
    pred = SVM_model.predict_proba(X_split)
    print('SVM roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
SVM roc-auc: 0.6641558751437062
Test set
SVM roc-auc: 0.6845833333333333


In [18]:
# Support vector classifier trained on the min-max scaled features.
# probability=True is required for predict_proba to be available.
SVM_model = SVC(random_state=44, probability=True)
SVM_model.fit(X_train_scaled, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_label)
    pred = SVM_model.predict_proba(X_split)
    print('SVM roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
SVM roc-auc: 0.7007819786989435
Test set
SVM roc-auc: 0.6742559523809524


In [19]:
# Multi-layer perceptron (SGD solver) trained on the unscaled features.
NN_model = MLPClassifier(random_state=44, solver='sgd')
NN_model.fit(X_train, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_label)
    pred = NN_model.predict_proba(X_split)
    print('Neural Network roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
Neural Network roc-auc: 0.6012288236697686
Test set
Neural Network roc-auc: 0.565


In [20]:

# Multi-layer perceptron (SGD solver) trained on the min-max scaled features.
NN_model = MLPClassifier(random_state=44, solver='sgd')
NN_model.fit(X_train_scaled, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_label)
    pred = NN_model.predict_proba(X_split)
    print('Neural Network roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
Neural Network roc-auc: 0.7165300101950066
Test set
Neural Network roc-auc: 0.7124404761904761




In [21]:
# 3-nearest-neighbours classifier trained on the unscaled features.
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_label)
    pred = KNN.predict_proba(X_split)
    print('KNN roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
KNN roc-auc: 0.8694225721784778
Test set
KNN roc-auc: 0.6253571428571428


In [22]:
# 3-nearest-neighbours classifier trained on the min-max scaled features.
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train_scaled, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_label)
    pred = KNN.predict_proba(X_split)
    print('KNN roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
KNN roc-auc: 0.8880555736318084
Test set
KNN roc-auc: 0.7017559523809525


In [23]:
# Random forest (700 trees) trained on the unscaled features.
rf = RandomForestClassifier(n_estimators=700, random_state=39)
rf.fit(X_train, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_label)
    pred = rf.predict_proba(X_split)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
Random Forests roc-auc: 0.9914589705212469
Test set
Random Forests roc-auc: 0.760327380952381


In [24]:
# Random forest (700 trees) trained on the min-max scaled features.
rf = RandomForestClassifier(n_estimators=700, random_state=39)
rf.fit(X_train_scaled, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_label)
    pred = rf.predict_proba(X_split)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
Random Forests roc-auc: 0.991491507776404
Test set
Random Forests roc-auc: 0.7610119047619048


In [25]:
# AdaBoost (200 estimators) trained on the unscaled features.
ada = AdaBoostClassifier(n_estimators=200, random_state=44)
ada.fit(X_train, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_label)
    pred = ada.predict_proba(X_split)
    print('AdaBoost roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
AdaBoost roc-auc: 0.8477364916162339
Test set
AdaBoost roc-auc: 0.7733630952380953


In [26]:
# AdaBoost (200 estimators) trained on the min-max scaled features.
ada = AdaBoostClassifier(n_estimators=200, random_state=44)
ada.fit(X_train_scaled, y_train)

# Report ROC-AUC on the training split first, then on the test split.
for split_label, X_split, y_split in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_label)
    pred = ada.predict_proba(X_split)
    print('AdaBoost roc-auc: {}'.format(roc_auc_score(y_split, pred[:, 1])))

Train set
AdaBoost roc-auc: 0.8477364916162339
Test set
AdaBoost roc-auc: 0.7733630952380953
