## Classification
## Example: Predict survival on Titanic

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Working with data

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [26]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
full_data=[train, test]

#### Pclass

In [28]:
# There is no missing value on this feature and already a numerical value.
# let's check it's impact on our train set.

In [4]:
train.groupby('Pclass').size()

Pclass
1    216
2    184
3    491
dtype: int64

In [5]:
train[['Pclass', 'Survived']].groupby('Pclass').Survived.mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

#### Sex

In [185]:
train.groupby('Sex').size()

Sex
0    314
1    577
dtype: int64

In [39]:
train[['Sex', 'Survived']].groupby('Sex', as_index = False).mean()

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


#### SibSp and Parch

In [165]:
# With the number of siblings/spouse and the number of children/parents 
# we can create new feature called Family Size

In [4]:
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [41]:
print(full_data[0].columns)
print(full_data[1].columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'FamilySize'],
      dtype='object')


In [43]:
train[['FamilySize', 'Survived']].groupby('FamilySize', as_index=False).mean()

Unnamed: 0,FamilySize,Survived
0,1,0.303538
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


#### Embarked

In [44]:
train[train['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,1
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,1


In [45]:
train[(train['Fare']>79) & (train['Fare']<81) & (train['Pclass']==1)].groupby('Embarked').size()

Embarked
C    4
S    3
dtype: int64

In [5]:
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('C')

In [47]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.558824
1,Q,0.38961
2,S,0.336957


#### Fare

In [6]:
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

In [7]:
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

In [10]:
train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()

Unnamed: 0,CategoricalFare,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


####  Age

In [180]:
# We have plenty of missing values in this feature. 
# Generate random numbers between (mean - std) and (mean + std). 
# Then we categorize age into 5 range.

In [8]:
np.random.seed(0)
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)  

train['CategoricalAge'] = pd.cut(train['Age'], 5)

print (train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

  CategoricalAge  Survived
0  (-0.08, 16.0]  0.504274
1   (16.0, 32.0]  0.355705
2   (32.0, 48.0]  0.376518
3   (48.0, 64.0]  0.434783
4   (64.0, 80.0]  0.090909


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Data preprocessing

In [105]:
full_data[0]['Sex'].head()

0      male
1    female
2    female
3    female
4      male
Name: Sex, dtype: object

In [9]:
  for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} )

In [107]:
full_data[0]['Sex'].head()

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

In [10]:
for dataset in full_data:
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'C': 0, 'Q': 1, 'S': 2} ).astype(int)

In [11]:
for dataset in full_data:   
    # Mapping FamilySize
    dataset.loc[ dataset['FamilySize'] == 1, 'FamilySize'] = 0
    dataset.loc[(dataset['FamilySize'] > 1) & (dataset['FamilySize'] <= 5), 'FamilySize'] = 1
    dataset.loc[dataset['FamilySize'] > 4, 'FamilySize'] = 2 

In [27]:
factors = train[['FamilySize','Embarked']]

In [29]:
from sklearn import preprocessing
# enc = preprocessing.OneHotEncoder(categorical_features = ['Embarked', 'FamilySize'])
enc = preprocessing.OneHotEncoder(dtype = 'int32')
enc.fit(factors)

OneHotEncoder(categorical_features='all', dtype='int32',
       handle_unknown='error', n_values='auto', sparse=True)

In [34]:
tfactors = enc.transform(factors).toarray()
tfactors[:5,]

array([[0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1]], dtype=int32)

dtype('float64')

In [35]:
train_fam_emb = pd.DataFrame(tfactors, columns = ('Fam_small', 'Fam_med', 'Fam_large', 'Emb_C', 'Emb_Q', 'Emb_S'))

In [36]:
train_fam_emb.head()

Unnamed: 0,Fam_small,Fam_med,Fam_large,Emb_C,Emb_Q,Emb_S
0,0,1,0,0,0,1
1,0,1,0,1,0,0
2,1,0,0,0,0,1
3,0,1,0,0,0,1
4,1,0,0,0,0,1


### Feature Selection

In [37]:
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'CategoricalAge', 'CategoricalFare', 'FamilySize', 'Embarked']
train = train.drop(drop_elements, axis = 1)


In [45]:
train = pd.concat([train,train_fam_emb], axis=1)

In [46]:
train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Fam_small', 'Fam_med', 'Fam_large', 'Emb_C', 'Emb_Q', 'Emb_S'],
      dtype='object')

In [47]:
trainv = train.values

In [48]:
trainv.shape

(891, 13)

In [49]:
type(trainv)

numpy.ndarray

In [50]:
X = trainv[0:, 1:]
y = trainv[0:, 0]

## Modeling

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

In [52]:
# Split to train and test
# 75% and 25% by default
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=2)
print(Xtrain.shape, Xtest.shape)

(668, 12) (223, 12)


In [53]:
np.unique(ytrain, return_counts = True)

(array([ 0.,  1.]), array([418, 250], dtype=int64))

In [54]:
np.unique(ytest, return_counts = True)

(array([ 0.,  1.]), array([131,  92], dtype=int64))

In [None]:
# http://scikit-learn.org

## LogisticRegression

In [None]:
# http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [55]:
# Model
model_lr = LogisticRegression(random_state = 1)
print(model_lr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [None]:
# C - Inverse of regularization strength; must be a positive float.
# Smaller values specify stronger regularization.

### Fit the model

In [56]:
model_lr.fit(Xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Model fit parameters

In [57]:
model_lr.coef_

array([[-0.81547336, -2.46131207, -0.03043557, -0.40000027, -0.00869862,
         0.00325333,  0.71487504,  1.25496152,  0.20860909,  0.85581994,
         0.72494029,  0.59768542]])

In [59]:
model_lr.coef_.reshape(12,)

array([-0.81547336, -2.46131207, -0.03043557, -0.40000027, -0.00869862,
        0.00325333,  0.71487504,  1.25496152,  0.20860909,  0.85581994,
        0.72494029,  0.59768542])

In [61]:
params = pd.Series(model_lr.coef_.reshape(12,), index=train.columns[1:])
params

Pclass      -0.815473
Sex         -2.461312
Age         -0.030436
SibSp       -0.400000
Parch       -0.008699
Fare         0.003253
Fam_small    0.714875
Fam_med      1.254962
Fam_large    0.208609
Emb_C        0.855820
Emb_Q        0.724940
Emb_S        0.597685
dtype: float64

In [62]:
model_lr.intercept_

array([ 2.17844565])

### Model validation

In [63]:
# Predict on train

ypred_train = model_lr.predict(Xtrain)
ypred_train_proba = model_lr.predict_proba(Xtrain)

In [64]:
# Predict on test

ypred = model_lr.predict(Xtest)
print(ypred[:10])

ypred_proba = model_lr.predict_proba(Xtest)
print(ypred_proba[:5,:])

# ypred_proba[:,0] - probability for class zero (not survived), 
# ypred_proba[:,1] - probability for class one - survived

[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
[[ 0.72682068  0.27317932]
 [ 0.88387862  0.11612138]
 [ 0.18789277  0.81210723]
 [ 0.88944875  0.11055125]
 [ 0.82788342  0.17211658]]


#### Metrics: accuracy, confusion matrix, classification report, AUC
#### http://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics

In [None]:
# We can check our classification accuracy by comparing 
# the true values of the test set to the predictions:

In [65]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

0.81586826347305386

In [66]:
# Accuracy on test
accuracy_score(ytest, ypred)

0.7847533632286996

In [67]:
# Score for classification models is accuracy
model_lr.score(Xtest, ytest)

0.7847533632286996

In [None]:
# Accuracy doesn't tell us where we've gone wrong: 
# one nice way to do this is to use the confusion matrix

In [68]:
print(confusion_matrix(ytest, ypred))

[[115  16]
 [ 32  60]]


In [69]:
target_names = ['not survived', 'survived']
print(classification_report(ytest, ypred, target_names=target_names))

              precision    recall  f1-score   support

not survived       0.78      0.88      0.83       131
    survived       0.79      0.65      0.71        92

 avg / total       0.79      0.78      0.78       223



In [70]:
# AUC
# y_scores -  probability estimates of the positive class

print("AUC on traint =", roc_auc_score(ytrain, ypred_train_proba[:, 1]))
print("AUC on test =", roc_auc_score(ytest, ypred_proba[:, 1]))

AUC on traint = 0.863751196172
AUC on test = 0.838864918686


### K-fold Cross-Validation

In [72]:
from sklearn.model_selection import cross_val_score

In [73]:
scores = cross_val_score(model_lr, X, y, cv=5)
scores

array([ 0.78212291,  0.79888268,  0.76966292,  0.79213483,  0.81920904])

In [74]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.79 (+/- 0.03)


### Hyperparameters Grid Search

In [75]:
from sklearn.grid_search import GridSearchCV

param_grid = {'C': [.001, .01, 1, 10],
              'penalty': ['l1', 'l2']}
lr = LogisticRegression(random_state=1)
grid = GridSearchCV(lr, param_grid, cv=5)



In [76]:
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 1, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [77]:
# Mean cross-validated score of the best_estimator
grid.best_score_

0.792368125701459

In [78]:
grid.best_params_

{'C': 1, 'penalty': 'l2'}

In [79]:
model = grid.best_estimator_

In [80]:
print(model)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [81]:
cross_val_score(model, X, y, cv=5).mean()

0.79240247578983802

### Save / load a model

In [82]:
from sklearn.externals import joblib
joblib.dump(model, 'model.pkl') 

['model.pkl']

In [83]:
model1 = joblib.load('model.pkl') 

In [84]:
print(model1)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [85]:
ypred = model1.predict(Xtest)
ypred[:10]

array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

## GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier