# Kaggle Titanic Machine Learning
- source of competition: https://www.kaggle.com/c/titanic
- Data Dictionary: https://www.kaggle.com/c/titanic/data
- useful link for saving to GitHub: https://www.kaggle.com/questions-and-answers/72234

In [None]:
# Importing libraries
%matplotlib inline
import numpy as np 
import pandas as pd 
import pandas_profiling

# Setting Random Seed For Reproducibility
# Seed both Python's `random` module and NumPy's global generator:
# `random.seed` alone does not affect code that draws from `np.random`,
# which estimators created without an explicit `random_state` fall back to.
import random
random.seed(123)
np.random.seed(123)

# Displaying Max rows / columns so wide or long frames are not truncated
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

# Listing Files available under the Kaggle input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the competition files: the labelled training data and the unlabelled
# test file, which is used for the final evaluation/submission only.
train_csv_path = '/kaggle/input/titanic/train.csv'
test_csv_path = '/kaggle/input/titanic/test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the raw training data
df_train.head()

In [None]:
# Summary of the raw training frame (columns, dtypes, non-null counts)
df_train.info()

# Data Wrangling/Cleaning

In [None]:
# Split off validation and hold-out test sets before any transformation is
# fitted, so statistics are learned from the training portion only
# (avoids data leakage).
from sklearn.model_selection import train_test_split

y = df_train['Survived']
X = df_train.drop(columns=['Survived'])

# Split used during model selection (15% test, then 15% of the rest as validation):
# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.15, random_state = 3)
# X_train, X_val, y_train, y_val   = train_test_split(X_train, y_train, test_size = 0.15, random_state = 3)

# The model has already been selected, so rerun with nearly all of the data:
# hold out 1% as test, then 1% of the remainder as validation.
split_options = dict(test_size=0.01, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [None]:
# Report the shape of each feature split as a quick sanity check
for split_features in (X_train, X_test, X_val):
    print(split_features.shape)

# Exploratory Data Analysis

General thoughts based on the profile below
- PassengerId - dropped because it is only a row identifier
- Missing values: Age, Cabin, Fare, Embarked
- Correlations in Fare-Class-Age

In [None]:
# Re-attach the target to the training features so EDA and data wrangling
# can operate on a single frame
df_train_split = pd.concat((X_train, y_train), axis='columns')
df_train_split.head(2)

In [None]:
# Build a pandas-profiling report (EDA plots/stats) for the training split

report_title = "EDA Profile Train Data Report"
profile = pandas_profiling.ProfileReport(df_train_split, title=report_title)

In [None]:
# Display the profiling report inside the notebook as widgets
profile.to_widgets()

# Data Wrangling and Feature Engineering
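- Transformations are fitted on the training split only; the fitted encoders, imputation values and scaler are then reused on the val/test splits and the final submission test set (to be replaced by a pipeline later)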

In [None]:
# Missing Values Handling: Embarked
embarked_counts = df_train_split['Embarked'].value_counts()
print(embarked_counts)

# Fill missing Embarked values with the most frequent port in the training
# split. value_counts() sorts by descending frequency, so the first index
# entry is the most common value (expected to be 'S' among S, C, Q).
most_common_port = embarked_counts.index[0]
df_train_split['Embarked'] = df_train_split['Embarked'].fillna(most_common_port)

In [None]:
# PassengerId is only an identifier, so remove it from the training frame
df_train_split = df_train_split.drop(columns=['PassengerId'])

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the low-cardinality columns together. The encoder is fitted
# on the training split only and reused for the validation/test data, so
# handle_unknown='ignore' makes categories that were never seen in training
# encode as an all-zero group instead of raising during transform().
ohe = OneHotEncoder(categories='auto', handle_unknown='ignore')
feature_array = ohe.fit_transform(df_train_split[['Parch','Pclass','Sex','SibSp','Embarked']]).toarray()
#feature_labels = ohe.categories_

In [None]:
# Wrap the encoded matrix in a DataFrame labelled with the column names
# generated by the fitted encoder
encoded_column_names = ohe.get_feature_names()
features = pd.DataFrame(data=feature_array, columns=encoded_column_names)
print(features.shape)
features.head()

In [None]:
# The raw categorical columns are superseded by their one-hot encoding
encoded_source_columns = ['Parch','Pclass','Sex','SibSp','Embarked']
df_train_split = df_train_split.drop(columns=encoded_source_columns)


In [None]:
# Reset both indexes so the rows line up by position when joined side by side
remaining_columns = df_train_split.reset_index(drop=True)
encoded_columns = features.reset_index(drop=True)
df_train_split = pd.concat([remaining_columns, encoded_columns], axis=1)

In [None]:
# Impute missing Age values with the median Age of the training split;
# median_age_train is reused later for the validation and submission data
median_age_train = df_train_split['Age'].median()
df_train_split['Age'] = df_train_split['Age'].fillna(median_age_train)
df_train_split['Age'].isna().sum()

### Cabin Missing Values

In [None]:
#INPROGRESS
# Cabin: mark missing values with the placeholder 'Z', then keep only the
# first letter of each cabin code (the numbers are not used)
df_train_split['Cabin'] = df_train_split['Cabin'].fillna('Z')
df_train_split['Cabin_augment'] = df_train_split['Cabin'].str[0]
df_train_split['Cabin_augment'].value_counts()

In [None]:
# Mean Fare per cabin letter, rounded to two decimals
df_train_split.groupby('Cabin_augment')[['Fare']].mean().round(2)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the cabin letter. The encoder is fitted on the training split
# and reused on validation/test data, so handle_unknown='ignore' lets a cabin
# letter absent from training encode as all zeros instead of raising.
ohe_Cabin_augment = OneHotEncoder(categories='auto', handle_unknown='ignore')
feature_array_Cabin_augment = ohe_Cabin_augment.fit_transform(df_train_split[['Cabin_augment']]).toarray()
#feature_labels = ohe.categories_
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())


In [None]:
# Swap the raw cabin columns for their one-hot encoded counterparts
without_cabin = df_train_split.drop(columns=['Cabin', 'Cabin_augment'])
df_train_split = pd.concat([without_cabin, features_Cabin_augment], axis=1)

In [None]:
# Name and Ticket are not used as model features
df_train_split = df_train_split.drop(columns=['Name', 'Ticket'])

In [None]:
# Preview the training frame after encoding and column removal
df_train_split.head()

In [None]:
# Sanity check: total number of missing values left in the training frame
# (0 means every missing value has been handled)
total_missing = df_train_split.isna().sum().sum()
print(total_missing)
df_train_split.shape

In [None]:
# Separate the target (kept as a one-column DataFrame) from the features
df_train_split_y = df_train_split[['Survived']]
df_train_split_X = df_train_split.drop(columns=['Survived'])

### Imbalance Correction via Oversampling (ADASYN, a SMOTE variant)

In [None]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
import collections

# Class counts of the training target before oversampling
class_counts_before = collections.Counter(np.squeeze(df_train_split_y))
print('Prior', class_counts_before)

In [None]:
# Oversampling operation (ADASYN) on the training split only

smote_instance = ADASYN(random_state=0)
# fit_resample is the supported sampler API; fit_sample was a deprecated
# alias that newer imbalanced-learn releases no longer provide.
X_train_resampled, y_train_resampled = smote_instance.fit_resample(df_train_split_X, df_train_split_y)

# Class counts of the target after oversampling
print('Post', collections.Counter(np.squeeze(y_train_resampled)))

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the resampled training features, then scale them;
# the fitted `sc` is reused for the validation and submission data
sc = StandardScaler()
sc.fit(X_train_resampled)
X_train_resampled = sc.transform(X_train_resampled)
X_train_resampled.shape

In [None]:
# Performing the same data wrangling steps on the validation data, reusing
# the encoders and the median Age fitted on the training split

# Re-attach the target so features and labels are processed together
df_val_split = pd.concat((X_val, y_val), axis=1)
df_val_split['Embarked'] = df_val_split['Embarked'].fillna('S')
df_val_split = df_val_split.drop(columns=['PassengerId'])

# One-hot encode the categorical columns with the already-fitted encoder
categorical_columns = ['Parch','Pclass','Sex','SibSp','Embarked']
feature_array = ohe.transform(df_val_split[categorical_columns]).toarray()
features = pd.DataFrame(feature_array, columns=ohe.get_feature_names())
df_val_split = df_val_split.drop(columns=categorical_columns)
df_val_split = pd.concat(
    [df_val_split.reset_index(drop=True), features.reset_index(drop=True)],
    axis=1,
)

# Missing ages take the training median
df_val_split['Age'] = df_val_split['Age'].fillna(median_age_train)

# Cabin: placeholder 'Z' for missing values, then keep the first letter only
df_val_split['Cabin'] = df_val_split['Cabin'].fillna('Z')
df_val_split['Cabin_augment'] = df_val_split['Cabin'].str[0]

feature_array_Cabin_augment = ohe_Cabin_augment.transform(df_val_split[['Cabin_augment']]).toarray()
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())

df_val_split = df_val_split.drop(columns=['Cabin', 'Cabin_augment'])
df_val_split = pd.concat([df_val_split, features_Cabin_augment], axis=1)

df_val_split = df_val_split.drop(columns=['Name', 'Ticket'])

print(df_val_split.shape)
df_val_split.head()

In [None]:
# Separate the validation target (one-column DataFrame) from the features
df_val_split_y = df_val_split[['Survived']]
df_val_split_X = df_val_split.drop(columns=['Survived'])

In [None]:
# Scale the validation features with the scaler fitted on the training data
# (transform only, no refit)
df_val_split_X = sc.transform (df_val_split_X)
df_val_split_X.shape

# Model Development

In [None]:
# Baseline Model
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Validation data monitored for early stopping during fitting
eval_set = [(df_val_split_X,df_val_split_y.values.ravel())]

model_xgb = xgb.XGBClassifier(learning_rate = 0.01)
model_xgb.fit(X_train_resampled, y_train_resampled.values.ravel(), early_stopping_rounds=10, eval_metric="error", eval_set= eval_set,verbose = 0)

# accuracy_score expects (y_true, y_pred); pass the true labels first
print("Training Accuracy:", accuracy_score(y_train_resampled.values.ravel(), model_xgb.predict(X_train_resampled)))
print("Validation Accuracy:", accuracy_score(df_val_split_y.values.ravel(), model_xgb.predict(df_val_split_X)))

In [None]:
# Adding Parameter Tuning
from sklearn.model_selection import GridSearchCV

# Validation data monitored for early stopping in every fit of the search
eval_set = [(df_val_split_X,df_val_split_y.values.ravel())]

# Exhaustive grid over the XGBoost hyperparameters listed below
param_grid = {
    "learning_rate": [0.1,0.05],
    'max_depth': [2,3,4,5,6],
    'min_child_weight': [1, 2,4,6,8,10],
    'subsample': [0.5, 0.7, 0.9],
    'n_estimators': [5, 30, 100, 250, 500],
}

grid_clf = GridSearchCV(xgb.XGBClassifier() , param_grid, scoring='accuracy', cv=None)
grid_clf.fit(X_train_resampled, y_train_resampled.values.ravel() , early_stopping_rounds=10 , eval_metric="error", eval_set= eval_set,verbose = False)

best_parameters = grid_clf.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))

# accuracy_score expects (y_true, y_pred); pass the true labels first
print("Training Accuracy:", accuracy_score(y_train_resampled.values.ravel(), grid_clf.predict(X_train_resampled)))
print("Validation Accuracy:", accuracy_score(df_val_split_y.values.ravel(), grid_clf.predict(df_val_split_X)))


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the resampled, scaled training data
gnb = GaussianNB()

gnb.fit(X_train_resampled, y_train_resampled.values.ravel())

# accuracy_score expects (y_true, y_pred); pass the true labels first
print("Training Accuracy:", accuracy_score(y_train_resampled.values.ravel(), gnb.predict(X_train_resampled)))
print("Validation Accuracy:", accuracy_score(df_val_split_y.values.ravel(), gnb.predict(df_val_split_X)))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the resampled, scaled training data
clf_log = LogisticRegression(random_state=0, max_iter = 1000).fit(X_train_resampled, y_train_resampled.values.ravel())

# accuracy_score expects (y_true, y_pred); pass the true labels first
print("Training Accuracy:", accuracy_score(y_train_resampled.values.ravel(), clf_log.predict(X_train_resampled)))
print("Validation Accuracy:", accuracy_score(df_val_split_y.values.ravel(), clf_log.predict(df_val_split_X)))

# Model Stacking

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# Base learners whose predictions feed the final logistic-regression estimator.
# The decision tree gets a fixed random_state like its siblings so repeated
# runs build the same stack.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('cart',DecisionTreeClassifier(random_state=42)),
    ('svr', make_pipeline(LinearSVC(random_state=42))),
    ('svc', SVC(gamma='auto'))]

clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

clf_stack = clf.fit(X_train_resampled, y_train_resampled.values.ravel())

# Flatten the targets and predict once per data set
y_train_true = y_train_resampled.values.ravel()
y_val_true = df_val_split_y.values.ravel()
y_val_pred = clf_stack.predict(df_val_split_X)

# sklearn metrics take (y_true, y_pred) in that order
print("Training Accuracy:", accuracy_score(y_train_true, clf_stack.predict(X_train_resampled)))
print("Validation Accuracy:", accuracy_score(y_val_true, y_val_pred))

# Argument order matters here: with (y_true, y_pred) the rows are the true
# classes and the columns are the predicted classes
print("Confusion Matrix:\n",confusion_matrix(y_val_true, y_val_pred))

In [None]:
#gridsearch (next steps)

# estimators = [
#     ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
#     ('knn', KNeighborsClassifier()),
#     ('cart',DecisionTreeClassifier()),
#     ('svr', make_pipeline(LinearSVC(random_state=42)))]

# sclf = StackingClassifier(estimators= estimators , final_estimator= LogisticRegression()) # =DecisionTreeClassifier())

# # params = {'rf__n_estimators': [5,10,20],
# #           'rf__max_features': [5,10,20],
# #           'rf__max_depth': [1,3,5,7],
# #           'rf__min_samples_leaf': [10,25,50],
# #           'knn__n_neighbors': [3,5,7],
# #           'knn__algorithm':['ball_tree','kd_tree']}

# params = {'rf__n_estimators': [5,10,20],
#           'rf__max_features': [5,10],
#           'rf__max_depth': [3,5,7],
#           'knn__n_neighbors': [3,5],
#           'knn__algorithm':['ball_tree','kd_tree']}

# grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5)
# grid.fit(X_train_resampled, y_train_resampled.values.ravel())


# print("Training Accuracy:", accuracy_score(grid.predict(X_train_resampled),y_train_resampled.values.ravel()))
# print("Validation Accuracy:", accuracy_score(grid.predict(df_val_split_X),df_val_split_y.values.ravel()))

# print("Confusion Matrix:\n",confusion_matrix(grid.predict(df_val_split_X),df_val_split_y.values.ravel()))

# Predicting the test submission data

In [None]:
# Transforming the submission test data like the training data, reusing the
# encoders, median Age and scaler fitted on training (make a pipeline later...)

df_test = pd.read_csv('/kaggle/input/titanic/test.csv') # for final evaluation/submission only
df_test_ids = df_test[['PassengerId']] # for creating the csv

df_test['Embarked'] = df_test['Embarked'].fillna('S')
df_test = df_test.drop(columns=['PassengerId'])

# Fare has a missing value in the test file; fill it with the mean Fare.
# Assign the result back rather than using a chained in-place fillna on a
# column selection, which can end up modifying a temporary copy.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())

# Parch values the fitted encoder has never seen (e.g. 9, which occurs only in
# the test file) are mapped to 0, the most common value, so the one-hot
# transform below does not fail on them.
known_parch_values = ohe.categories_[0]  # 'Parch' is the first column the encoder was fitted on
df_test.loc[~df_test['Parch'].isin(known_parch_values), 'Parch'] = 0

feature_array = ohe.transform(df_test[['Parch','Pclass','Sex','SibSp','Embarked']]).toarray()
features = pd.DataFrame(feature_array, columns=ohe.get_feature_names())
df_test = df_test.drop(['Parch','Pclass','Sex','SibSp','Embarked'], axis = 1)
df_test = pd.concat([df_test.reset_index(drop=True),features.reset_index(drop=True)], axis = 1)

# Missing ages take the training median
df_test['Age'] = df_test['Age'].fillna(median_age_train)

# Cabin: placeholder 'Z' for missing values, then keep the first letter only
df_test['Cabin'] = df_test['Cabin'].fillna('Z')
df_test['Cabin_augment'] = df_test['Cabin'].str[0]

feature_array_Cabin_augment = ohe_Cabin_augment.transform(df_test[['Cabin_augment']]).toarray()
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())

df_test = df_test.drop(columns=['Cabin', 'Cabin_augment'])
df_test = pd.concat([df_test,features_Cabin_augment], axis = 1)

df_test = df_test.drop(columns=['Name', 'Ticket'])

# Scale with the scaler fitted on the training data (returns a NumPy array)
df_test = sc.transform(df_test)

print(df_test.shape)

In [None]:
# predicting using the clf_stack
# (the fitted stacking classifier, applied to the preprocessed submission data)

predictions = clf_stack.predict(df_test)

In [None]:
# Dynamic Filename Creation

from datetime import datetime
from pytz import timezone

# Embed the current EST timestamp in the submission file name
est_now = datetime.now(timezone('EST'))
timestr = est_now.strftime("%Y%m%d_%H%M%S")
file_name = ''.join(['rad_submission_', timestr, '.csv'])
file_name

In [None]:
# Creating the submission CSV: passenger ids side by side with the predictions

submission_values = np.column_stack((df_test_ids, predictions))
submission_columns = ['PassengerId','Survived']
df_submit = pd.DataFrame(data=submission_values, columns=submission_columns)
df_submit.to_csv(file_name, index=False)

# Potential Next Steps / Changes to Consider / Resources Referenced
- Potentially use K-Fold Cross validation due to small size 
- Feature engineering (Class x sex), (Class x Parch)
- Add more model types, more hyperparameters
- Add model stacking
- https://alexforrest.github.io/you-might-be-leaking-data-even-if-you-cross-validate.html
- https://machinelearningmastery.com/data-preparation-without-data-leakage/
- http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/