In [2]:
#Lets start with importing all the libraries that we used yesterday
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

  from numpy.core.umath_tests import inner1d


In [3]:
# Lets create a function that will perform all the basic pre-processing.
def preprocessing(dataset,bins=False,simplifyCategories=False,removeOne=False,return_test_train=True):
    dataset['Age']=dataset.Age.fillna(dataset.groupby("Sex")["Age"].transform("mean"))
    dataset['Sex']=np.where(dataset.Sex=='male',1,0)
    features=['Pclass','Sex','Fare','SibSp','Parch', 'Age']
    if bins==True:  
        bins = [0, 16, 24, 30, 40, 50, 100]
        labels = ["0-16","16-24","24-30","30-40","40-50","50+"]
        training['Age'] = pd.cut(training['Age'], bins=bins, labels=labels)
        dataset['SibSp'] = np.where(dataset.SibSp > 2, 3,dataset.SibSp)
        dataset['Parch'] = np.where(dataset.Parch > 2, 3,dataset.Parch)
        features=['Pclass','Sex','Fare','SibSp','Parch', 'Age']
    
    if simplifyCategories==True:
        #Subseting Cabin to get only the first letter of the Cabin as the type of the cabin
        dataset['CabinType']=dataset.Cabin.str[:1]
        features=['Pclass','Sex','Fare','SibSp','Parch', 'Age','Embarked','CabinType']
        
        
    y = dataset.Survived
    X = dataset[features]
    X=pd.get_dummies(X)
    if removeOne==True:
        X=X.drop(['Embarked_S','Age_50+','CabinType_T'],axis=1)
    if return_test_train==True:
        train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
        return train_X, val_X, train_y, val_y
    if return_test_train==False:
        return X

In [4]:
# Also lets create another function that provides us accuracy score for a given model
def get_scores(model,val_X,val_y):
    val_predictions=model.predict(val_X)
    print("accuracy score is %.4g" %accuracy_score(val_y,val_predictions))

In [5]:
# Lets read the data and process it
training=pd.read_csv("Data/train.csv")
train_X, val_X, train_y, val_y=preprocessing(training,bins=True,simplifyCategories=True,removeOne=True)

In [6]:
train_X.head()

Unnamed: 0,Pclass,Sex,Fare,SibSp,Parch,Age_0-16,Age_16-24,Age_24-30,Age_30-40,Age_40-50,Embarked_C,Embarked_Q,CabinType_A,CabinType_B,CabinType_C,CabinType_D,CabinType_E,CabinType_F,CabinType_G
105,3,1,7.8958,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
68,3,0,7.925,3,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
253,3,1,16.1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
320,3,1,7.25,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
706,2,0,13.5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [7]:
Decision_Tree_model = DecisionTreeClassifier()
Decision_Tree_model.fit(train_X, train_y)
get_scores(Decision_Tree_model,val_X,val_y)

accuracy score is 0.7937


In [8]:
rf = RandomForestClassifier(random_state=27)
# Fit on training data
rf.fit(train_X, train_y)
get_scores(rf,val_X,val_y)

accuracy score is 0.8161


In [9]:
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 27,
 'verbose': 0,
 'warm_start': False}


In [10]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [14]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, 
                               verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.3min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [15]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': False}

In [12]:
rf_random = RandomForestClassifier(n_estimators=1800,min_samples_split=2,min_samples_leaf=4,max_features='auto',max_depth=50,bootstrap=False)

# Fit the random search model
rf_random.fit(train_X, train_y)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 27)
base_model.fit(train_X, train_y)
base_accuracy = get_scores(base_model, val_X,val_y)

accuracy score is 0.8161


In [16]:
random_accuracy = get_scores(rf_random,  val_X,val_y)

accuracy score is 0.843


In [33]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [30, 40, 50, 60],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [2, 3, 4,5],
    'n_estimators': [1780,1790, 1800,1810, 1820]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(train_X, train_y)

Fitting 3 folds for each of 480 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 965.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [30, 40, 50, 60], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [2, 3, 4, 5], 'n_estimators': [1780, 1790, 1800, 1810, 1820]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [35]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 50,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 1800}

In [17]:
best_grid = RandomForestClassifier(n_estimators=1800,min_samples_split=3,min_samples_leaf=3,max_features='auto',max_depth=50,bootstrap=True)

# Fit the random search model
best_grid.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=1800, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
grid_accuracy = get_scores(best_grid,  val_X,val_y)

accuracy score is 0.8386


In [19]:
test=pd.read_csv("Data/test.csv")
test['Age']=test.Age.fillna(test.groupby("Sex")["Age"].transform("mean"))
test['Sex']=np.where(test.Sex=='male',1,0)
features=['Pclass','Sex','Fare','SibSp','Parch', 'Age']

bins = [0, 16, 24, 30, 40, 50, 100]
labels = ["0-16","16-24","24-30","30-40","40-50","50+"]
test['Age'] = pd.cut(test['Age'], bins=bins, labels=labels)
test['SibSp'] = np.where(test.SibSp > 2, 3,test.SibSp)
test['Parch'] = np.where(test.Parch > 2, 3,test.Parch)

test['CabinType']=test.Cabin.str[:1]
features=['Pclass','Sex','Fare','SibSp','Parch', 'Age','Embarked','CabinType']
test=test[features]
test=pd.get_dummies(test)
test_processed=test.drop(['Embarked_S','Age_50+'],axis=1)
test_processed=test_processed.fillna(0)

In [21]:
test_predictions = rf_random.predict(test_processed)
dataset = pd.DataFrame({'Survived': test_predictions[:,]})
test=pd.read_csv("Data/test.csv")
test[['PassengerId']].join(dataset).to_csv("results_random.csv",index=False)

In [22]:
test_predictions = best_grid.predict(test_processed)
dataset = pd.DataFrame({'Survived': test_predictions[:,]})
test=pd.read_csv("Data/test.csv")
test[['PassengerId']].join(dataset).to_csv("results_grid.csv",index=False)