In [98]:
### Test tweeking model parameters: n_estimator, max_depth, max_leaf_nodes

In [69]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Model Hyper-Parameter Tuning:
#### Test changing multiple model parameters in the RandomForestClassifier. And find the best hyper-paramters set by cross validation grid search.
#### GridSearchCV Explained in here : https://stackoverflow.com/questions/19335165/what-is-the-difference-between-cross-validation-and-grid-search

Recall: Benchmark Model parameter
 - bootstrap=True, 
 - class_weight=None, 
 - criterion='gini',
 - max_depth=None, 
 - max_features='auto', 
 - max_leaf_nodes=None,
 - min_impurity_decrease=0.0, 
 - min_impurity_split=None,
 - min_samples_leaf=1, 
 - min_samples_split=12,
 - min_weight_fraction_leaf=0.0, 
 - n_estimators=80, 
 - n_jobs=None,
 - oob_score=True, random_state=22, verbose=0, warm_start=False)

In [106]:
param_grid = {"max_depth": [None,2,6],
              "max_features": [1, 5, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True],
              "oob_score": [False, True],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}

In [107]:
RFC5=RandomForestClassifier()

In [108]:
from sklearn import model_selection
kfold = model_selection.StratifiedKFold(n_splits=5)

In [70]:
# first load training data

In [109]:
tdata = pd.read_csv('cleaned_train.csv',index_col='PassengerId')
tdata['familySize']=tdata['SibSp']+tdata['Parch']+1
tdata=tdata.drop(columns=['Survived'],axis =1)

In [110]:
y= pd.read_csv('train.csv',usecols=['Survived'],index_col=None)

In [111]:
gsRFC = model_selection.GridSearchCV(RFC5,param_grid = param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)

In [112]:
gsRFC.fit(tdata,y)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   27.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 1620 out of 1620 | elapsed:  3.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'max_depth': [None, 2, 6], 'max_features': [1, 5, 10], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [True], 'oob_score': [False, True], 'n_estimators': [100, 300], 'criterion': ['gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [113]:
#. check on the best estimator
gsRFC_best = gsRFC.best_estimator_
print(gsRFC_best)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [114]:
print("best estimator accuracy score:",gsRFC.best_score_*100,"%")

best estimator accuracy score: 84.6240179573513 %


In [115]:
# use this model to check on the confusion matrix
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tdata,y,test_size=0.3,random_state=22)
y_pred = gsRFC_best.predict(X_test)

In [116]:
print("Classification Accuracy is :",metrics.accuracy_score(y_test, y_pred)*100,"%")
print("Confusion Matrix is:\n",metrics.confusion_matrix(y_test, y_pred))
print("Classification Error is:\n",100-metrics.accuracy_score(y_test, y_pred)*100,"%")

Classification Accuracy is : 91.04477611940298 %
Confusion Matrix is:
 [[157   7]
 [ 17  87]]
Classification Error is:
 8.955223880597018 %


In [118]:
# use the same model parameters to do a five fold cv:
RFC6=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',max_features=10, max_leaf_nodes=None,min_impurity_decrease=0.0, min_impurity_split=None,min_samples_leaf=3, min_samples_split=2,min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,oob_score=False, random_state=None, verbose=0,warm_start=False)
RFC6.fit(X_train,y_train)
scores6= cross_val_score(RFC6,tdata,y,cv=5)
print("Average accuracy score from cv:",np.mean(scores6)*100,"%")
print("Classification Accuracy score variance from cv:",np.var(scores6))

  This is separate from the ipykernel package so we can avoid doing imports until
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Average accuracy score from cv: 84.29144116204522 %
Classification Accuracy score variance from cv: 0.0005786526960218155


### Update testing summary

| Model Type | Parameters | Classification Accuracy(%) | Confusion Matrix | CV model variance |
| --- | --- | --- | --- | --- |
| Pure Guess | Guess based on Majority(Deceased) | 61.62 | [[549,0][342,0]]| None |
| Random Forest | Default cleaned data, gini, min_samples_split=12, min_samples_leaf=1, 80 estimators | cv average 83.95 | [[147,17][33,71]]| 8.6*10^-4 |
| Random Forest | Combined FamilySize, RankC | cv average 84.18 | [[143,21][33,71]]| 5.5*10^-4 |
| Random Forest | Combined FamilySize, RankC, frequencyEcode 'Embarked'| cv average 83.62 | [[144,20][32,79]]| 5.52*10^-4 |
| Random Forest | Data features remain, but remapped 'Name' into less categories | cv average 83.05 | [[147,17][33,71]] | 6.2*10^-4
| Random Forest | feature tuned, model hyper-parameter best choice | grid search 84.62 | [[157,7][17,87]] | 5.8*10^-4 |

### Use the final best parameter model to predict the result on test data.

In [123]:
RFC6=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',max_features=10, max_leaf_nodes=None,min_impurity_decrease=0.0, min_impurity_split=None,min_samples_leaf=3, min_samples_split=2,min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,oob_score=False, random_state=None, verbose=0,warm_start=False)
RFC6.fit(tdata,y)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [119]:
#load test data
testd = pd.read_csv('cleaned_test.csv',index_col='PassengerId')
testd['familySize']=testd['SibSp']+testd['Parch']+1

In [124]:
test_pred = RFC6.predict(testd)

In [125]:
fea_eg_res = pd.read_csv('cleaned_test.csv',usecols=['PassengerId'])
fea_eg_res['Survived']=test_pred

In [126]:
fea_eg_res.to_csv('prediction4.csv',index=None)

#### Above I used the hyper-parameter given by GridsearchCV, Random Forest model to do a prediction. Submit to Kaggle, The result yields score 0.77990. Did not gain improvement.The highest ranking gets to #1218 (10.7%) ??? Don't know why.