In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,ShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from re import sub
from sklearn import metrics

import random
import json
from datetime import datetime
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [82]:
import os
os.chdir(r'E:/Kaggle Competition/titanic')

In [83]:
train_df = pd.read_csv('train.csv')
score_df = pd.read_csv('test.csv')

In [84]:
y = train_df['Survived']


In [85]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [86]:
def impute_mean(impute_df,agg_col,group_col_list):
    
    agg_obj = impute_df.groupby(group_col_list)[agg_col].transform('mean')
    imputed = impute_df[agg_col].fillna(agg_obj)
    
    
    return(imputed)
def impute_mode(series):
    return(series.fillna(series.mode()))    

In [87]:
def name_title_creator(name):
    title = name.apply(lambda x : x.split('.')[0].split(' ')[-1] )
    title = title.replace('Lady','Mrs').replace('Mlle','Mrs')
    title = title.replace('Miss','Ms').replace('Mme','Ms')
    return(title)

In [88]:
def rel_age_creator(train_df,score_df):
    
    train_df = train_df.drop(['Survived'], axis = 1)
    complete_df = pd.concat([train_df,score_df],axis = 0)
    complete_df['Title'] = name_title_creator(complete_df['Name'])
    
    complete_df['Age'] = impute_mean(complete_df,'Age',['Title'])
    complete_df['Surname'] = complete_df['Name'].apply(lambda x : x.split(',')[0].strip())
    rel_age_df= complete_df[(complete_df['SibSp']>0) & (complete_df['Parch']>0)][['Ticket','Age','Surname']]
    rel_age_df = rel_age_df.rename(columns = {'Age':'Relative_Age'})
    return(rel_age_df)

In [114]:
def data_preparation(df,rel_age_df):
    df['Surname'] = df['Name'].apply(lambda x : x.split(',')[0].strip())
    df['CabinGroup'] = df['Cabin'].apply(lambda x: str(x)[0].upper())
    df['Title'] = name_title_creator(df['Name'])
    if 'Survived' in df.columns:
        df = df.drop(['Survived'],axis = 1)
    
    df['Embarked'] = df.groupby(['Title'], sort=False)['Embarked'].apply(lambda x: x.fillna(x.mode()[0]))
    
    df['Fare'] = impute_mean(df,'Fare',['Pclass','Embarked'])
    df['Age'] = impute_mean(df,'Age',['Title'])
    df['TicketGroup'] = df['Ticket'].apply(lambda x : str(x[:1]))
    
    df = df.drop(['Name','Cabin'],axis = 1)
    
    
    
    df = pd.merge(df,rel_age_df,on = ['Ticket','Surname'],how = 'left')
    
    df['Relative_Age'] = np.where(df['Relative_Age'].isna(),df['Age'],df['Relative_Age'])
    df = df.groupby([col for col in df.columns if col!='Relative_Age'] ).agg({'Relative_Age' : [np.mean,np.max,np.min]}).reset_index(level = None,drop = False)
    
    df.columns = ['_'.join(col).strip('_') for col in df.columns.values]
    df['Relative_Age_amin'] = np.minimum(df['Relative_Age_amin'],df['Age'])
    df['Relative_Age_amax']= np.maximum(df['Relative_Age_amax'],df['Age'])
    
    print(df.columns)
    df.sort_values(['PassengerId'],inplace = True)
    df = df.drop(['PassengerId','Surname','Ticket'],axis = 1)
    df['Pclass'] = df['Pclass'].apply(lambda x : str(x))
    df['SibSp'] = df['SibSp'].apply(lambda x : str(x))
    df['Parch'] = df['Parch'].apply(lambda x : str(x))
    
    col_list = df.select_dtypes(['object']).columns
    for cols in col_list:
        df = pd.concat([df,pd.get_dummies(df[cols],prefix = cols)],axis = 1 )
        df = df.drop([cols],axis = 1)
    is_NaN = df.isnull()
    row_has_NaN = is_NaN.any(axis=1)
    rows_with_NaN = df[row_has_NaN]
    print(rows_with_NaN)
    print(len(df.index))
    return(df)

In [115]:
rel_age_df = rel_age_creator(train_df,score_df)

In [116]:
prepped_df = data_preparation(train_df,rel_age_df)

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked', 'Surname', 'CabinGroup', 'Title', 'TicketGroup',
       'Relative_Age_mean', 'Relative_Age_amax', 'Relative_Age_amin'],
      dtype='object')
Empty DataFrame
Columns: [Age, Fare, Relative_Age_mean, Relative_Age_amax, Relative_Age_amin, Pclass_1, Pclass_2, Pclass_3, Sex_female, Sex_male, SibSp_0, SibSp_1, SibSp_2, SibSp_3, SibSp_4, SibSp_5, SibSp_8, Parch_0, Parch_1, Parch_2, Parch_3, Parch_4, Parch_5, Parch_6, Embarked_C, Embarked_Q, Embarked_S, CabinGroup_A, CabinGroup_B, CabinGroup_C, CabinGroup_D, CabinGroup_E, CabinGroup_F, CabinGroup_G, CabinGroup_N, CabinGroup_T, Title_Capt, Title_Col, Title_Countess, Title_Don, Title_Dr, Title_Jonkheer, Title_Major, Title_Master, Title_Mr, Title_Mrs, Title_Ms, Title_Rev, Title_Sir, TicketGroup_1, TicketGroup_2, TicketGroup_3, TicketGroup_4, TicketGroup_5, TicketGroup_6, TicketGroup_7, TicketGroup_8, TicketGroup_9, TicketGroup_A, TicketGroup_C, Ti

In [117]:
x = prepped_df.copy()

In [118]:
def model_training(x_train,y_train,model_name,params):
    if model_name=='rf':
        model = RandomForestClassifier().set_params(**params)
    elif model_name == 'xgb':
        model = XGBClassifier(use_label_encoder = False).set_params(**params)
    elif model_name == 'svm':
        model = SVC().set_params(**params)
    return(model.fit(x_train,y_train))

In [119]:
def itergrid(model_name):
    if model_name == 'rf':
        return({'n_estimators' : [100,300,500,700,1000],
                'max_depth' : [5,7,9,11],
                'criterion' : ['gini','entropy'],
                'max_features' : ['auto','sqrt','log2']})
    elif model_name =='xgb':
        return({'eta' : np.linspace(0,1,5),
                'gamma' : np.linspace(0,10,1) ,
                'max_depth' : [5,7,9,11],
                'sub_sample' : [0.1,0.2,0.5,0.8],
                'process_type' : ['default']})
    elif model_name == 'svm':
        return({'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear','poly','sigmoid']})
    
    

In [141]:
def auc_calculator(actuals,prediction):
    print('-'*50)
    auc = metrics.roc_auc_score(actuals,prediction)
    print(auc)
    print('-'*50)
    return(auc)

In [142]:
def result_creator (conf_matrix,model_name,params,threshold):
# save confusion matrix and slice into four pieces
    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    print('True Positives:', TP)
    print('True Negatives:', TN)
    print('False Positives:', FP)
    print('False Negatives:', FN)
    
    
    # calculate accuracy
    conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))
    
    # calculate mis-classification
    conf_misclassification = 1- conf_accuracy
    
    # calculate the sensitivity
    conf_sensitivity = (TP / float(TP + FN))
    # calculate the specificity
    conf_specificity = (TN / float(TN + FP))
    
    # calculate precision
    conf_precision = (TN / float(TN + FP))
    # calculate f_1 score
    conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))
    print('-'*50)
    print(threshold)
    print(f'Accuracy: {round(conf_accuracy,2)}') 
    print(f'Mis-Classification: {round(conf_misclassification,2)}') 
    print(f'Sensitivity: {round(conf_sensitivity,2)}') 
    print(f'Specificity: {round(conf_specificity,2)}') 
    print(f'Precision: {round(conf_precision,2)}')
    print(f'f_1 Score: {round(conf_f1,2)}')
    return(pd.DataFrame({'Model_Name': model_name,'Params' : str(json.dumps(params)),'Accuracy': round(conf_accuracy,2),'F1_Score':round(conf_f1,2),'AUC' : round(auc,3),'threshold' : threshold},index = [0]))
    

In [143]:
cv_split = ShuffleSplit(1,test_size = 0.2,random_state = 0)
all_results = []
all_models = ['rf']
threshold_list = [0.3,0.4,0.5]
for model_name in all_models:

    for _ in range(100):
        current_params = {}
        grid = itergrid(model_name)
        for key in grid.keys():
            current_params[key] = random.choice(grid[key])
        for train_index,test_index in cv_split.split(x):
            x_train,x_test = x.loc[train_index],x.loc[test_index]
            y_train,y_test = y.loc[train_index],y.loc[test_index]
            
            model = model_training(x_train,y_train,model_name,current_params)
            
            
            prob_predict= model.predict_proba(x_test)[:, 1]
            auc = auc_calculator(y_test,prob_predict)
            for thresh in threshold_list:
                cm_threshold = metrics.confusion_matrix(y_test,(prob_predict>=thresh).astype(bool))
                
                all_results.append(result_creator(cm_threshold,model_name,current_params,thresh))
        
# x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

--------------------------------------------------
0.8959156785243743
--------------------------------------------------
True Positives: 59
True Negatives: 88
False Positives: 22
False Negatives: 10
--------------------------------------------------
0.3
Accuracy: 0.82
Mis-Classification: 0.18
Sensitivity: 0.86
Specificity: 0.8
Precision: 0.8
f_1 Score: 0.83
True Positives: 53
True Negatives: 94
False Positives: 16
False Negatives: 16
--------------------------------------------------
0.4
Accuracy: 0.82
Mis-Classification: 0.18
Sensitivity: 0.77
Specificity: 0.85
Precision: 0.85
f_1 Score: 0.81
True Positives: 50
True Negatives: 101
False Positives: 9
False Negatives: 19
--------------------------------------------------
0.5
Accuracy: 0.84
Mis-Classification: 0.16
Sensitivity: 0.72
Specificity: 0.92
Precision: 0.92
f_1 Score: 0.81
--------------------------------------------------
0.9110671936758893
--------------------------------------------------
True Positives: 61
True Negatives: 86

In [144]:
all_results_df = pd.concat(all_results,axis = 0)

In [145]:
grouped_results_df = all_results_df.groupby(['Model_Name','Params','threshold'])['F1_Score'].mean().reset_index(drop = False)


In [146]:
print(max(grouped_results_df.F1_Score))

0.84


In [147]:
best_model = dict(grouped_results_df[grouped_results_df['F1_Score']==max(grouped_results_df['F1_Score'])].reset_index(drop = True))
best_model_name = best_model['Model_Name'][0]
best_params = json.loads(best_model['Params'][0])
best_threshold = best_model['threshold'][0]

In [148]:
trained_model = model_training(x,y,best_model_name,best_params)


In [149]:
test_df = pd.read_csv('test.csv')

In [150]:
test_transformed_df = data_preparation(test_df,rel_age_df)

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked', 'Surname', 'CabinGroup', 'Title', 'TicketGroup',
       'Relative_Age_mean', 'Relative_Age_amax', 'Relative_Age_amin'],
      dtype='object')
Empty DataFrame
Columns: [Age, Fare, Relative_Age_mean, Relative_Age_amax, Relative_Age_amin, Pclass_1, Pclass_2, Pclass_3, Sex_female, Sex_male, SibSp_0, SibSp_1, SibSp_2, SibSp_3, SibSp_4, SibSp_5, SibSp_8, Parch_0, Parch_1, Parch_2, Parch_3, Parch_4, Parch_5, Parch_6, Parch_9, Embarked_C, Embarked_Q, Embarked_S, CabinGroup_A, CabinGroup_B, CabinGroup_C, CabinGroup_D, CabinGroup_E, CabinGroup_F, CabinGroup_G, CabinGroup_N, Title_Col, Title_Dona, Title_Dr, Title_Master, Title_Mr, Title_Mrs, Title_Ms, Title_Rev, TicketGroup_1, TicketGroup_2, TicketGroup_3, TicketGroup_4, TicketGroup_6, TicketGroup_7, TicketGroup_9, TicketGroup_A, TicketGroup_C, TicketGroup_F, TicketGroup_L, TicketGroup_P, TicketGroup_S, TicketGroup_W]
Index: []
418


In [151]:
# Get missing columns in the training test
missing_cols = set( x.columns ) - set( test_transformed_df.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    test_transformed_df[c] = 0
test_transformed_df = test_transformed_df[x.columns]

In [159]:
predictions = (trained_model.predict_proba(test_transformed_df)[:,1]>=best_threshold).astype(int)

In [160]:
len(test_df)

418

In [161]:
len(predictions)

418

In [162]:
date = str(datetime.today())[:10].replace('-','_')
pd.DataFrame({'PassengerId':list(test_df.PassengerId),'Survived' : list(predictions)}).to_csv(f'{date}_Predictions.csv',index = False)

In [163]:
train_predictions = trained_model.predict(x)
train_df['Prediction'] = train_predictions
train_df.to_csv(f'{date}_TrainedPrediction.csv')