In [1]:
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts anything, does nothing."""
    return None


# Every later ``warnings.warn(...)`` lookup now resolves to the no-op above,
# which silences warnings raised through that attribute for the whole session.
warnings.warn = warn

In [2]:
import pandas as pd
import numpy as np
import imp
import matplotlib.pyplot as plt
import xgboost
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns
%matplotlib inline
from HelperClass.DataProcessing import *

In [3]:
# --- Input files ------------------------------------------------------------
trainDataFile = 'Data/train.csv'
testDataFile = 'Data/test.csv'
sep = ','  # field delimiter handed to DataProcessing together with the path

# Column name -> dtype string, handed to DataProcessing for both input files.
dataTypes = dict(
    PassengerId='int64',
    Survived='int64',
    Pclass='int64',
    Name='object',
    Sex='object',
    Age='float64',
    SibSp='int64',
    Parch='int64',
    Ticket='object',
    Fare='float64',
    Cabin='object',
    Embarked='object',
)

# --- Modelling settings -----------------------------------------------------
target = 'Survived'   # label column
randomSeed = 83213    # shared by the data split and both model searches
testRatio = 0.20      # passed to RandomSplitTrainTestData
numCores = 10         # n_jobs for the randomized searches

In [4]:
# Training data: read the file, then keep only rows whose Embarked is present.
dataProc = DataProcessing(trainDataFile, dataTypes, sep)
dataProc.ReadFile()
dataProc.AllData = dataProc.AllData[dataProc.AllData.Embarked.notnull()]

# Submission data: same loading and the same Embarked filter.
# NOTE(review): any row removed here is also absent from the submission file
# written at the end of the notebook - confirm that is acceptable.
submitDataProc = DataProcessing(testDataFile, dataTypes, sep)
submitDataProc.ReadFile()
submitDataProc.AllData = submitDataProc.AllData[submitDataProc.AllData.Embarked.notnull()]

Findings
1. Age makes a difference: F_onewayResult(statistic=4.271194933815904, pvalue=0.03912465401348333)
2. Fare makes a difference: F_onewayResult(statistic=63.03076422804448, pvalue=6.120189341921873e-15)
3. PassengerId does not make a difference: F_onewayResult(statistic=0.022284812266068058, pvalue=0.8813657768798144)
4. Pclass makes a difference: F_onewayResult(statistic=115.03127218827665, pvalue=2.5370473879805644e-25)
5. SibSp does not make a difference: F_onewayResult(statistic=1.110572204113227, pvalue=0.29224392869817906)
6. Parch makes a difference: F_onewayResult(statistic=5.963463836603541, pvalue=0.0147992453747224)

In [5]:
def PopulateFeatures(data):
    """Fill missing cabins and derive the single-letter ``CabinType`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Cabin`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with missing ``Cabin`` values set to ``'NA'``
        and a ``CabinType`` column holding the first character of ``Cabin``
        (``'N'`` for rows that had no cabin).
    """
    # Assign the filled column back explicitly. ``data.Cabin.fillna(...,
    # inplace=True)`` acts on an intermediate Series and is not guaranteed to
    # update ``data`` (it is a no-op under pandas Copy-on-Write).
    data['Cabin'] = data['Cabin'].fillna('NA')
    data['CabinType'] = data['Cabin'].str.get(0)
    return data

In [6]:
# Derive the cabin features on the training and the submission frame alike.
for _proc in (dataProc, submitDataProc):
    _proc.AllData = PopulateFeatures(_proc.AllData)

In [7]:
# Two-level cabin indicator: 2 when CabinType is B, D or E, otherwise 1
# (rows without a cabin carry CabinType 'N' and therefore get 1).
for _proc in (dataProc, submitDataProc):
    _inBDE = _proc.AllData['CabinType'].isin(['B', 'D', 'E'])
    _proc.AllData['CabinType_B_D_E'] = np.where(_inBDE, 2, 1)

In [8]:
# Mark missing ages with the sentinel -999 so that they land in the lowest
# age bin, (-10000, 0], when the ages are binned.
# The filled column is assigned back explicitly: ``AllData.Age.fillna(...,
# inplace=True)`` acts on an intermediate Series and is not guaranteed to
# update the frame (it is a no-op under pandas Copy-on-Write).
dataProc.AllData['Age'] = dataProc.AllData['Age'].fillna(-999)
submitDataProc.AllData['Age'] = submitDataProc.AllData['Age'].fillna(-999)

In [9]:
# Age band edges; pd.cut labels each age with its interval, e.g. '(20, 60]'.
# The (-10000, 0] band holds the -999 sentinel used for missing ages.
bins = [-10000, 0, 20, 60, 1000]
for _proc in (dataProc, submitDataProc):
    _ageBand = pd.cut(_proc.AllData['Age'], bins=bins)
    _proc.AllData['AgeGroup'] = _ageBand.astype(str)

In [10]:
# Combine sex and age band into one label such as 'female_(20, 60]'.
for _proc in (dataProc, submitDataProc):
    _frame = _proc.AllData
    _frame['GenderAgeGroup'] = _frame['Sex'] + '_' + _frame['AgeGroup']

In [11]:
# Survival rate and passenger count per GenderAgeGroup, best-surviving first.
(
    dataProc.AllData
    .groupby(['GenderAgeGroup'])
    .agg({'Survived': ['mean', 'count']})
    .sort_values(by=('Survived', 'mean'), ascending=False)
)

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
GenderAgeGroup,Unnamed: 1_level_2,Unnamed: 2_level_2
"female_(60, 1000]",1.0,2
"female_(20, 60]",0.777778,180
"female_(0, 20]",0.688312,77
"female_(-10000, 0]",0.679245,53
"male_(0, 20]",0.284314,102
"male_(20, 60]",0.186747,332
"male_(-10000, 0]",0.129032,124
"male_(60, 1000]",0.105263,19


In [12]:
# Ordinal code per GenderAgeGroup label, in the nested {column: {old: new}}
# form that DataFrame.replace accepts. Labels are listed from code 1 up to
# code 8, which follows the survival-rate ranking in the table above
# (lowest rate first).
genderAgeGroups = {
    'GenderAgeGroup': {
        label: code
        for code, label in enumerate(
            [
                'male_(60, 1000]',
                'male_(-10000, 0]',
                'male_(20, 60]',
                'male_(0, 20]',
                'female_(-10000, 0]',
                'female_(0, 20]',
                'female_(20, 60]',
                'female_(60, 1000]',
            ],
            start=1,
        )
    }
}

In [13]:
# Swap each GenderAgeGroup label for its ordinal code and make the column
# integer-typed explicitly. ``replace`` alone leaves the dtype to pandas'
# silent object -> int downcast, which pandas 2.2 deprecates; without the
# cast the model inputs could end up with an object-dtype column.
# The cast also fails fast if a label has no code in the mapping.
# Re-running the cell is harmless: existing codes match no mapping key.
_groupCodes = genderAgeGroups['GenderAgeGroup']
for _proc in (dataProc, submitDataProc):
    _proc.AllData['GenderAgeGroup'] = (
        _proc.AllData['GenderAgeGroup'].replace(_groupCodes).astype('int64')
    )

In [14]:
# Numeric sex code: 1 for 'male', 2 for every other value.
# This runs after GenderAgeGroup has been built from the original strings.
for _proc in (dataProc, submitDataProc):
    _isMale = _proc.AllData['Sex'] == 'male'
    _proc.AllData['Sex'] = np.where(_isMale, 1, 2)

In [15]:
# Embarked codes in the nested {column: {old: new}} form used by
# DataFrame.replace: ports S and Q share code 1, port C gets code 2.
embarkedEncoding = {'Embarked': dict(S=1, Q=1, C=2)}

In [16]:
# Swap each Embarked letter for its code and make the column integer-typed
# explicitly. ``replace`` alone leaves the dtype to pandas' silent
# object -> int downcast, which pandas 2.2 deprecates.
# Rows with a missing Embarked were removed right after loading, so the cast
# only fails if a port letter has no code in the mapping.
# Re-running the cell is harmless: existing codes match no mapping key.
_portCodes = embarkedEncoding['Embarked']
for _proc in (dataProc, submitDataProc):
    _proc.AllData['Embarked'] = (
        _proc.AllData['Embarked'].replace(_portCodes).astype('int64')
    )

In [17]:
# Model input columns (all numeric once the encoding cells above have run).
numCols = [
    'Fare',
    'Pclass',
    'Parch',
    'CabinType_B_D_E',
    'GenderAgeGroup',
    'Embarked',
    'Sex',
]

ageFilledInData

dataProc.AllData['Ticket_Num_Ind'] = dataProc.AllData['Ticket'].str.isnumeric()

In [18]:
# Register the model input columns on both processors.
for _proc in (dataProc, submitDataProc):
    _proc.PopulateFeatureColumns(numCols)

In [19]:
# Training side: let DataProcessing derive features and label from `target`.
# NOTE(review): what exactly PopulateFeatureAndLabel sets is defined in
# HelperClass.DataProcessing and is not visible here - confirm.
dataProc.PopulateFeatureAndLabel(target)
# Submission side: only the feature matrix is set, taken directly from the
# `numCols` columns of the loaded frame.
submitDataProc.X = submitDataProc.AllData[numCols]

In [20]:
# Split the training data, passing the hold-out ratio and the random seed.
# Later cells read dataProc.X_train / y_train / X_test / y_test, presumably
# populated by this call.
# NOTE(review): the meaning of the third argument (False) is defined by
# DataProcessing.RandomSplitTrainTestData and is not visible here - confirm.
dataProc.RandomSplitTrainTestData(testRatio, randomSeed, False)

In [21]:
# Fill missing fares in the submission features with the mean fare of
# third-class passengers that have no cabin (CabinType 'N'), computed on the
# submission data itself.
_thirdClassNoCabin = (submitDataProc.AllData.Pclass == 3) & (submitDataProc.AllData.CabinType == 'N')
averageFareThridClass = submitDataProc.AllData.loc[_thirdClassNoCabin, 'Fare'].mean()

# Rebind X to a frame with the filled column. ``X.Fare.fillna(...,
# inplace=True)`` acts on an intermediate Series taken from a column slice and
# is not guaranteed to update X (it is a no-op under pandas Copy-on-Write),
# which would leave NaN fares in the data passed to ``predict``.
submitDataProc.X = submitDataProc.X.assign(
    Fare=submitDataProc.X['Fare'].fillna(averageFareThridClass)
)

In [22]:
# Fixed column order used whenever a feature frame is handed to the ensemble
# (fit and predict both reindex to this list).
sortedCols = [
    'CabinType_B_D_E',
    'Embarked',
    'Fare',
    'GenderAgeGroup',
    'Parch',
    'Pclass',
    'Sex',
]

## Random forest model

In [23]:
# Empty placeholder transformer: the scaler / one-hot steps are disabled and
# no pipeline in this notebook currently includes it.
preProc = ColumnTransformer(transformers=[])

# Random forest wrapped in a single-step pipeline; the pre-processing and
# SelectKBest steps that used to precede it are switched off.
rf = RandomForestClassifier(random_state=randomSeed)
rfpipeline = Pipeline(steps=[('rf', rf)])

# Search space: 17 x 4 x 3 = 204 combinations. That is fewer than n_iter
# below, so the randomized search ends up trying every combination
# (the fit log reports 204 candidates).
params = dict(
    rf__n_estimators=range(3, 20),
    rf__max_depth=range(1, 5),
    rf__max_features=['sqrt', 'log2', None],
)
rfcv = RandomizedSearchCV(
    estimator=rfpipeline,
    param_distributions=params,
    n_iter=1000,
    scoring='accuracy',
    cv=10,
    verbose=1,
    n_jobs=numCores,
    random_state=randomSeed,
)

## XGB model

In [24]:
# XGBoost search space (sampled, not enumerated: 500 draws per search).
# colsample_bytree is currently not part of the search.
param_grid = dict(
    xgb__max_depth=[3, 4, 5, 6],
    xgb__learning_rate=[0.05, 0.1, 0.2],
    xgb__n_estimators=range(5, 21),
    xgb__subsample=[0.8, 0.9, 0.95],
    xgb__reg_alpha=[0.05, 0.1, 0.2, 0.4],
    xgb__reg_lambda=[0.05, 0.1, 0.2, 0.4],
    xgb__gamma=[0.0001, 0.001, 0.01, 0.1],
)

# Classifier in a single-step pipeline (no pre-processing step), tuned by
# 10-fold cross-validated accuracy.
xgb = xgboost.XGBClassifier(seed=randomSeed)
xgbpipeline = Pipeline(steps=[('xgb', xgb)])
xgbcv = RandomizedSearchCV(
    estimator=xgbpipeline,
    param_distributions=param_grid,
    n_iter=500,
    cv=10,
    scoring='accuracy',
    verbose=1,
    n_jobs=numCores,
    random_state=randomSeed,
)

## Model stacking

In [25]:
from mlens.ensemble import BlendEnsemble
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Base layer: the two hyper-parameter searches (random forest and XGBoost)
# act as the base learners of a blend ensemble built with default settings.
estimators = [rfcv, xgbcv]
ensemble = BlendEnsemble()
# NOTE(review): proba=True presumably makes the base learners pass class
# probabilities rather than hard labels to the meta learner - confirm
# against the mlens documentation.
ensemble.add(estimators, proba=True)   # Specify 'proba' here
# Meta learner: a logistic regression with default settings. As the last
# expression of the cell, this call's return value is what gets displayed.
ensemble.add_meta(LogisticRegression())

[MLENS] backend: threading


BlendEnsemble(array_check=None, backend=None,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=None, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=BlendIndex(X=None, raise_on_exception=...rer=None)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=None, sample_size=20, scorer=None, shuffle=False,
       test_size=0.5, verbose=False)

In [26]:
# Fit the ensemble on the training split, with the feature columns put into
# the fixed `sortedCols` order. Because the base learners are randomized
# searches, this call runs the hyper-parameter searches as well (see the fit
# log printed below the cell).
ensemble.fit(dataProc.X_train.reindex(columns=sortedCols), dataProc.y_train)

Fitting 10 folds for each of 204 candidates, totalling 2040 fitsFitting 10 folds for each of 204 candidates, totalling 2040 fits

Fitting 10 folds for each of 500 candidates, totalling 5000 fits
Fitting 10 folds for each of 500 candidates, totalling 5000 fits


[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=10)]: Done 2040 out of 2040 | elapsed:  2.2min finished
[Parallel(n_jobs=10)]: Done 5000 out of 5000 | elapsed:  2.7min finished
[Parallel(n_jobs=10)]: Done 2040 out of 2040 | elapsed:  2.7min finished
[Parallel(n_jobs=10)]: Done 5000 out of 5000 | elapsed:  3.0min finished


BlendEnsemble(array_check=None, backend=None,
       layers=[Layer(backend='threading', dtype=<class 'numpy.float32'>, n_jobs=-1,
   name='layer-1', propagate_features=None, raise_on_exception=True,
   random_state=None, shuffle=False,
   stack=[Group(backend='threading', dtype=<class 'numpy.float32'>,
   indexer=BlendIndex(X=None, raise_on_exception=...rer=None)],
   n_jobs=-1, name='group-1', raise_on_exception=True, transformers=[])],
   verbose=0)],
       model_selection=False, n_jobs=None, raise_on_exception=True,
       random_state=None, sample_size=20, scorer=None, shuffle=False,
       test_size=0.5, verbose=False)

In [27]:
# Score the fitted ensemble on the held-out split, using the same column
# order as at fit time.
_holdoutFeatures = dataProc.X_test.reindex(columns=sortedCols)
y_test_stacking_pred = ensemble.predict(_holdoutFeatures)
_holdoutAccuracy = accuracy_score(dataProc.y_test, y_test_stacking_pred)
print('Stacking Random Forest and XGB %.3f ' % _holdoutAccuracy)

Stacking Random Forest and XGB 0.809 


In [31]:
import os

# Predict the label for every submission row, using the same column order as
# at fit time, and store it as an integer column named by `target`.
submitDataProc.AllData[target] = ensemble.predict(submitDataProc.X.reindex(columns=sortedCols)).astype('int')

# Create the output directory first: to_csv does not create missing parent
# directories, so on a fresh checkout the write would otherwise fail.
os.makedirs('Output', exist_ok=True)
submitDataProc.AllData[['PassengerId', 'Survived']].to_csv('Output/submit.txt', index=False)