In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
# visualization libraries
import seaborn as sns
sns.set(rc={'figure.figsize':(12,8)})
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objects as go
import plotly.express as px
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
# import function
from sklearn.linear_model import LogisticRegression
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report,roc_auc_score
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV,RepeatedStratifiedKFold
from yellowbrick.features import FeatureImportances
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
#Import svm model
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import optuna
from xgboost import XGBClassifier
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Objective

### The goal is to predict which passengers survived the Titanic shipwreck.

# Data

## Load the needed dataset

In [None]:
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
train_df.sample(10)

In [None]:
train_df.info()

In [None]:
train_df.shape

In [None]:
# Any null features
train_df.isnull().sum() # Age, Cabin and Embarked fields have null values

In [None]:
# dropping the Id column
train_df = train_df.drop('PassengerId',axis=1)

In [None]:
train_df.head()

In [None]:
train_df.describe().T

In [None]:
train_df.Survived.value_counts()

# Exploratory Data Analysis

## Univariate Analysis, Bivariate Analysis

#### Pclass

In [None]:
train_df.Pclass.value_counts()

In [None]:
train_df.Pclass.value_counts().plot.barh()

In [None]:
# Passenger-class counts, split side by side by survival outcome
sns.histplot(data=train_df, x="Pclass", hue="Survived", multiple="dodge", shrink=.8)

In [None]:
pd.crosstab(train_df["Survived"],train_df["Pclass"])

In [None]:
# Bar chart of passenger counts per class, one bar per survival outcome.
# The crosstab index (x axis) is Pclass and the bar height is a passenger count,
# so the axis labels describe those quantities; the survival outcome is in the legend.
pd.crosstab(train_df["Pclass"],train_df["Survived"]).plot(kind="bar", figsize=(10,6),  color=["salmon", "lightblue"]);
plt.title("Survived vs Passenger Class")
plt.xlabel("Passenger Class")
plt.ylabel("Number of passengers")
plt.legend(["No Survival", "Survival"])
plt.xticks(rotation=0);

#### Sex feature

In [None]:
train_df.Sex.value_counts()

In [None]:
train_df.Sex.value_counts().plot.barh()

In [None]:
# Counts per sex, split side by side by survival outcome
sns.histplot(data=train_df, x="Sex", hue="Survived", multiple="dodge", shrink=.8)

In [None]:
pd.crosstab(train_df["Survived"],train_df["Sex"])

#### Age

In [None]:
train_df.Age.isnull().sum()

In [None]:
train_df.Age.value_counts()

In [None]:
# Age distribution in 5-year bins with a KDE overlay
sns.histplot(data=train_df, x="Age", binwidth=5, kde=True)

In [None]:
# Age distribution per survival outcome.
# x/y are passed by keyword: the first positional parameter of violinplot is `data`,
# so positional column names clash with data= on seaborn >= 0.12.
sns.violinplot(data=train_df, x="Survived", y="Age", palette=["lightblue", "lightpink"]);

In [None]:
sns.displot(data=train_df, x='Age', hue='Survived', kind='kde', fill=True)

#### SibSp

In [None]:
train_df.SibSp.value_counts()

In [None]:
train_df.SibSp.value_counts().plot.barh()

In [None]:
pd.crosstab(train_df["Survived"],train_df["SibSp"])

#### Parch

In [None]:
train_df.Parch.value_counts()

In [None]:
train_df.Parch.value_counts().plot.barh()

In [None]:
pd.crosstab(train_df["Survived"],train_df["Parch"])

#### Ticket

In [None]:
train_df.Ticket.value_counts()

In [None]:
train_df = train_df.drop('Ticket',axis=1)

#### Fare

In [None]:
# Fare distribution in bins of width 10 with a KDE overlay
sns.histplot(data=train_df, x="Fare", binwidth=10, kde=True)

In [None]:
# log1p(x) == log(1 + x): the +1 accommodates zero fares, because log(0) is not defined
LogFare = np.log1p(train_df.Fare)

In [None]:
# Histogram of LogFare
LogFare.plot(kind='hist', color='c', bins=20);

In [None]:
# Fare distribution per survival outcome.
# x/y are passed by keyword: the first positional parameter of violinplot is `data`,
# so positional column names clash with data= on seaborn >= 0.12.
sns.violinplot(data=train_df, x="Survived", y="Fare", palette=["lightblue", "lightpink"]);

In [None]:
# box-whisker plot
train_df.Fare.plot(kind='box')

#### Cabin

In [None]:
train_df.Cabin.value_counts()

In [None]:
train_df.Cabin.unique()

#### Embarked

In [None]:
train_df.Embarked.value_counts()

In [None]:
train_df.Embarked.value_counts().plot.barh()

# Feature Selection

In [None]:
# Function to extract the title from the name 
def GetTitle(name):
    """Return the lower-cased title embedded in a passenger name.

    The title is the text between the first comma and the following period,
    e.g. "Braund, Mr. Owen Harris" -> "mr".

    Raises:
        IndexError: if `name` contains no comma.
    """
    after_surname = name.split(',')[1]
    raw_title = after_surname.split('.')[0]
    return raw_title.strip().lower()

In [None]:
# use map function to apply the function on each Name value row i
train_df.Name.map(lambda x : GetTitle(x)) 

In [None]:
train_df.Name.map(lambda x : GetTitle(x)).unique()

In [None]:
train_df.Name.map(lambda x : GetTitle(x)).unique()

In [None]:
# Function to extract the title from the name 
# Lower-cased raw title -> consolidated title group.
# Built once at definition time instead of on every call.
_TITLE_GROUPS = {
    'mr': 'Mr',
    'mrs': 'Mrs',
    'miss': 'Miss',
    'master': 'Master',
    'don': 'Sir',
    'rev': 'Sir',
    'dr': 'Officer',
    'mme': 'Mrs',
    'ms': 'Mrs',
    'major': 'Officer',
    'lady': 'Lady',
    'sir': 'Sir',
    'mlle': 'Miss',
    'col': 'Officer',
    'capt': 'Officer',
    'the countess': 'Lady',
    'jonkheer': 'Sir',
    'dona': 'Lady',
}


def GetTitle(name, default='Other'):
    """Return the consolidated title group for a passenger name.

    The raw title is the text between the first comma and the following
    period (e.g. "Braund, Mr. Owen Harris" -> "mr"); it is then mapped to one
    of the groups in `_TITLE_GROUPS`.

    Args:
        name: passenger name string containing a comma before the title.
        default: group returned for a title that is not in the mapping, so an
            unseen title in new data does not abort the whole `.map()` call
            with a KeyError.

    Returns:
        The title group as a string.

    Raises:
        IndexError: if `name` contains no comma.
    """
    first_name_with_title = name.split(',')[1]
    title = first_name_with_title.split('.')[0].strip().lower()
    return _TITLE_GROUPS.get(title, default)

In [None]:
# Title feature: apply GetTitle to every passenger name
train_df['Title'] = train_df['Name'].map(GetTitle)

In [None]:
# binning
pd.qcut(train_df.Fare, 4)

In [None]:
pd.qcut(train_df.Fare, 4, labels=['very_low','low','high','very_high']) # discretization

In [None]:
# create fare bin feature
train_df['Fare_Bin'] = pd.qcut(train_df.Fare, 4, labels=['very_low','low','high','very_high'])

In [None]:
# AgeState: 'Adult' when Age >= 18, otherwise 'Child'.
# NOTE: a missing Age compares False here, so those rows are labelled 'Child' as well.
train_df['AgeState'] = np.where(train_df['Age'].ge(18), 'Adult', 'Child')

In [None]:
# AgeState Counts
train_df['AgeState'].value_counts()

In [None]:
train_df.groupby(['Pclass']).Fare.median()

In [None]:
train_df.groupby(['Pclass']).Age.median()

In [None]:
# Median Fare and Age per passenger class.
# Several columns are selected with a list (double brackets); selecting them with a
# bare tuple is no longer supported by pandas groupby.
train_df.groupby(['Pclass'])[['Fare','Age']].median()

In [None]:
train_df.groupby(['Pclass']).agg({'Fare' : 'mean', 'Age' : 'median'})

In [None]:
# pivot table
train_df.pivot_table(index='Sex',columns = 'Pclass',values='Age', aggfunc='mean')

In [None]:
# Family : Adding Parents with Siblings
train_df['FamilySize'] = train_df.Parch + train_df.SibSp + 1 # 1 for self

In [None]:
# explore the family feature
train_df['FamilySize'].plot(kind='hist', color='c');

In [None]:
train_df.dtypes

In [None]:
train_df.plot.scatter(x='Age', y='Fare', color='c', title='scatter plot : Age vs Fare');

In [None]:
train_df.pivot_table(index='Sex',columns = 'Pclass',values='Age', aggfunc='mean')

In [None]:
train_df.dtypes

In [None]:
train_df = pd.get_dummies(train_df,columns=['Sex', 'Pclass','Title', 'Fare_Bin', 'Embarked','AgeState'])

In [None]:
train_df.info()

In [None]:
# drop columns
train_df.drop(['Cabin','Name','Parch','SibSp'], axis=1, inplace=True)

In [None]:
# KNN imputation (performed in a later cell) is distance-based, so every column is
# first rescaled to the [0, 1] range. The imputer itself is created where it is used.
scaler = MinMaxScaler()
train_df = pd.DataFrame(scaler.fit_transform(train_df), columns = train_df.columns)
train_df.head()

In [None]:
train_df.isnull().sum()

In [None]:
# Fill missing values from the 5 nearest rows, measured on the predictor columns only.
# 'Survived' is left out of the distance computation: it is the prediction target and
# is not available for the competition test set.
feature_cols = train_df.columns.drop('Survived')
imputer = KNNImputer(n_neighbors=5)
train_df[feature_cols] = imputer.fit_transform(train_df[feature_cols])

In [None]:
train_df.isnull().sum()

# Model training and evaluation

In [None]:
X = train_df.drop('Survived',axis=1)
y = train_df['Survived']

In [None]:
X.head()

In [None]:
y

In [None]:
X.shape,y.shape

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y, test_size=0.2, random_state=0)


In [None]:
 X_train.shape, y_train.shape,X_test.shape, y_test.shape

In [None]:
# average survival in train and test
print ('mean survival in train : {0:.3f}'.format(np.mean(y_train)))
print ('mean survival in test : {0:.3f}'.format(np.mean(y_test)))

# Logistic Regression

In [None]:
# create model
model = LogisticRegression(random_state=0)

In [None]:
# train model
model.fit(X_train,y_train)

In [None]:
# evaluate model
print ('score for logistic regression - version 1 : {0:.2f}'.format(model.score(X_test, y_test)))

In [None]:
y_pred = model.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
precision_score(y_test, y_pred)

In [None]:
recall_score(y_test, y_pred)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
# model coefficients
model.coef_

In [None]:
print(X_train.columns)

#### Shows the features ranked according to the explained variance each feature contributes to the model. In this case the features are plotted against their relative importance, that is the percent importance of the most important feature

In [None]:
# Draw the feature-importance chart on the large axes created here; without ax=ax the
# visualizer draws on its own default axes and this figure stays empty.
fig, ax = plt.subplots(figsize=(16, 14))
visualization = FeatureImportances(model, ax=ax)
visualization.fit(X, y)
# show() is the current name of the deprecated poof()
visualization.show()

# Stratified Crossvalidation

In [None]:
# 10-fold stratified cross-validation of each listed estimator class, scored by ROC AUC.
# The loop variable is not called `model`, so the fitted `model` from the cells above
# is not replaced by a class object.
for estimator_cls in [LogisticRegression]:
    estimator = estimator_cls()
    # random_state only has an effect (and is only accepted) together with shuffle=True
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    auc_scores = cross_val_score(estimator, X, y, scoring="roc_auc", cv=skf)
    # the scorer is roc_auc, so report it under that name
    print("ROC AUC = ", auc_scores.mean())

In [None]:
# Loads test data set
test = pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
test.head()

In [None]:
test.drop(["PassengerId"], axis=1, inplace=True)

In [None]:
test.drop(["Ticket"], axis=1, inplace=True)

In [None]:
test.isnull().sum()

In [None]:
# Title group derived from the passenger's name
test['Title'] = test['Name'].map(GetTitle)

# Fare quartile label.
# NOTE(review): the quartile edges are computed from the test fares themselves rather
# than reused from the training data — confirm this is intended.
test['Fare_Bin'] = pd.qcut(test['Fare'], q=4, labels=['very_low','low','high','very_high'])

# 'Adult' when Age >= 18, otherwise 'Child' (a missing Age compares False -> 'Child')
test['AgeState'] = np.where(test['Age'].ge(18), 'Adult', 'Child')

# Number of passengers per AgeState
test['AgeState'].value_counts()



In [None]:
# Family : Adding Parents with Siblings
test['FamilySize'] = test.Parch + test.SibSp + 1 # 1 for self



In [None]:
# One-hot encode the categorical features, then discard the columns that are not modelled
categorical_cols = ['Sex', 'Pclass','Title', 'Fare_Bin', 'Embarked','AgeState']
test = pd.get_dummies(test, columns=categorical_cols)
test = test.drop(columns=['Cabin','Name','Parch','SibSp'])

In [None]:

# Scale the competition test set with the statistics learned on the training data.
# Re-fitting the scaler here would map the test values onto a different [0, 1] range
# than the one the models were trained on.
# `scaler` was fitted on every training column (including 'Survived'), so the
# per-feature parameters are looked up by column name for the columns present in `test`.
# MinMaxScaler computes X_scaled = X * scale_ + min_.
fitted_columns = list(getattr(scaler, "feature_names_in_", train_df.columns))
feature_positions = [fitted_columns.index(col) for col in test.columns]
test = pd.DataFrame(
    test.to_numpy(dtype=float) * scaler.scale_[feature_positions] + scaler.min_[feature_positions],
    columns=test.columns,
)
test.head()


In [None]:
# Fill the gaps in the test set from its nearest neighbours among the training features
# (X), instead of re-fitting the imputer on the test rows themselves.
test_imputer = KNNImputer(n_neighbors=5).fit(X)
test = pd.DataFrame(test_imputer.transform(test[X.columns]), columns=X.columns)

test.isnull().sum()


In [None]:
submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [None]:
submission.head()

In [None]:
submission.info()

In [None]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state= 40)
val_acc = []
test_predictions = []
submission_predictions = []
model = LogisticRegression(random_state=0)

In [None]:
# Fit the logistic regression once per stratified fold of the training split and collect,
# for every fold: the validation accuracy, the predictions on the hold-out set (X_test)
# and the predictions on the competition test set.
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    print('Fold', fold )

    model.fit(x_train_fold, y_train_fold)

    # accuracy on the rows the model was just fitted on
    print("score : ",model.score(x_train_fold, y_train_fold))

    # accuracy on the held-out rows of this fold; stored so val_acc is actually filled
    y_pred = model.predict(x_test_fold)
    fold_val_acc = accuracy_score(y_test_fold, y_pred)
    val_acc.append(fold_val_acc)
    print("Validation score : ", fold_val_acc)

    preds = model.predict(X_test)
    test_predictions.append(preds)

    submission_preds = model.predict(test)
    submission_predictions.append(submission_preds)
    

In [None]:
y_pred = np.mean(np.column_stack(test_predictions), axis=1)

In [None]:
# y_pred holds the mean of the per-fold 0/1 votes. Threshold it at 0.5 (majority vote);
# a plain integer cast truncates e.g. 0.9 to 0, so only unanimous votes would predict 1.
y_pred = (y_pred > 0.5).astype('int32')
y_test = y_test.astype('int32')

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
# Average the per-fold 0/1 votes and keep the majority class. Casting the mean straight
# to int would truncate every non-unanimous vote to 0.
submission_preds = np.mean(np.column_stack(submission_predictions), axis=1)
submission_preds = (submission_preds > 0.5).astype('int32')

In [None]:
submit_df =  pd.DataFrame({'PassengerId': submission['PassengerId'],
                          'Survived': submission_preds})

In [None]:
submit_df.head(10)

In [None]:
submit_df.to_csv('submission.csv',index=False)

# Naive Bayes

In [None]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
test_preds = classifier.predict(test)
test_preds = test_preds.astype('int32')

In [None]:
submission["Survived"] = test_preds

In [None]:
submission

In [None]:
submission.to_csv('submission.csv',index=False) # 0.76794

# KNN

In [None]:
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)



In [None]:
knc.score(X_test, y_test)


In [None]:
y_pred = knc.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
test_preds = knc.predict(test)
test_preds = test_preds.astype('int32')

In [None]:
submission["Survived"] = test_preds

In [None]:
submission

In [None]:
submission.to_csv('submission.csv',index=False) # 0.76794

# Linear Discriminant Analysis

In [None]:
# LDA
lda = LDA()
lda.fit(X_train,y_train)

In [None]:
y_pred=lda.predict(X_test)

In [None]:
lda.score(X_test, y_test)

In [None]:
print(confusion_matrix(y_test,y_pred))

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
test_preds = lda.predict(test)
test_preds = test_preds.astype('int32')

In [None]:
submission["Survived"] = test_preds

In [None]:
submission

In [None]:
submission.to_csv('submission.csv',index=False) # 0.77511

#### Tuning LDA Hyperparameters solver and shrinkage with sklearn GridSearchCV

#### An important hyperparameter is the solver, which defaults to ‘svd‘ but can also be set to other values for solvers that support the shrinkage capability.

#### Reading ... https://machinelearningmastery.com/linear-discriminant-analysis-with-python/

In [None]:
# Grid-search the LDA solver, scored by accuracy under 3x repeated stratified 10-fold CV
model = LDA()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = {'solver': ['svd', 'lsqr', 'eigen']}
search = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy', cv=cv, n_jobs=-1)
results = search.fit(X, y)
# best mean CV accuracy and the solver that achieved it
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# Grid-search the shrinkage of an 'lsqr' LDA over 0.00, 0.01, ..., 0.99,
# scored by accuracy under 3x repeated stratified 10-fold CV
model = LDA(solver='lsqr')
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid = {'shrinkage': np.arange(0, 1, 0.01)}
search = GridSearchCV(estimator=model, param_grid=grid, scoring='accuracy', cv=cv, n_jobs=-1)
results = search.fit(X, y)
# best mean CV accuracy and the shrinkage that achieved it
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

# Support Vector Machine 

#### The linear kernel

In [None]:
svm_model = svm.SVC(kernel='linear', class_weight='balanced') 
svm_model.fit(X_train, y_train)

In [None]:
y_pred=svm_model.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
test_preds = svm_model.predict(test)
test_preds = test_preds.astype('int32')

In [None]:
submission["Survived"] = test_preds
submission

In [None]:
submission.to_csv('submission.csv',index=False) # 0.77033

#### Polynomial Kernel can distinguish curved or nonlinear input space.
#### SVM classifier using a third-degree polynomial kernel. The hyperparameter coef0 controls how much the model is influenced by high-degree polynomials versus low-degree polynomials.

In [None]:
# SVM Classifier model
svm_model = svm.SVC(kernel="poly", degree=3, coef0=1, C=5,class_weight='balanced')
svm_model.fit(X_train, y_train)

In [None]:
y_pred=svm_model.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
test_preds = svm_model.predict(test)
test_preds = test_preds.astype('int32')

In [None]:
submission["Survived"] = test_preds
submission

In [None]:
submission.to_csv('submission.csv',index=False) # 0.73444

In [None]:
# SVM Classifier model with RBF kernel for non linear separability
svm_model = svm.SVC(kernel="rbf",class_weight='balanced')
svm_model.fit(X_train, y_train)

In [None]:
y_pred=svm_model.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
test_preds = svm_model.predict(test)
test_preds = test_preds.astype('int32')

In [None]:
submission["Survived"] = test_preds
submission

In [None]:
submission.to_csv('submission.csv',index=False) #0.75598

# Hyperparameter tuning with RandomSearch

#### We can use a random search cross-validation to explore combinations of parameters. 


a) Kernel: The main function of the kernel is to transform the given dataset input data into the required form. There are various types of functions such as linear, polynomial, and radial basis function (RBF). Polynomial and RBF are useful for non-linear hyperplane. Polynomial and RBF kernels compute the separation line in the higher dimension. 

b) Regularization: C is the penalty parameter on the misclassification (error) term. It tells the SVM optimization how much error is bearable, and so controls the trade-off between a wide margin and misclassified training points. A smaller value of C tolerates more errors and creates a larger-margin hyperplane; a larger value of C penalizes errors more heavily and creates a smaller-margin hyperplane.

c) Gamma: A lower value of Gamma will loosely fit the training dataset, whereas a higher value of gamma will exactly fit the training dataset, which causes over-fitting

d) Degree: the degree of the polynomial kernel function (‘poly’); the default value is 3.

In [None]:

params = { 'C':[0.1,1,10,100,1000],'kernel':['rbf','poly','sigmoid','linear'],'degree':[1,2,3,4,5,6],'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}


In [None]:
svm_model = svm.SVC()

In [None]:
# 10 randomly sampled parameter combinations, each scored with 9-fold CV.
# The seed is fixed so the sampled combinations — and therefore best_params_ — are
# the same on every run.
random_search = RandomizedSearchCV(svm_model, params, n_iter=10, cv=9, random_state=0)

In [None]:
random_search.fit(X_train,y_train)

In [None]:
random_search.best_params_

In [None]:

random_search.best_score_

#### Training the model with the selected parameters

In [None]:
svm_model = svm.SVC(kernel="poly",gamma=1,degree=1,C=10,class_weight='balanced')
svm_model.fit(X_train, y_train)

In [None]:
y_pred=svm_model.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
# Predict the competition test set with the SVM fitted above and write the submission file
test_preds = svm_model.predict(test).astype('int32')
submission["Survived"] = test_preds
submission.to_csv('submission.csv', index=False)  # 0.77033

# RandomForest

In [None]:
# with the default hyperparameters setting
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)


In [None]:
y_pred = rf_model.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

# Hyperparameter tuning with Optuna

#### the objective function

In [None]:
# the objective function takes the hyperparameter space as input
def objective(trial):
    """Optuna objective: mean 5-fold CV score of a random forest on the training split.

    Args:
        trial: Optuna trial used to sample n_estimators (100-1000), criterion
            ('gini' or 'entropy'), max_depth (1-4) and min_samples_split (0.01-1).

    Returns:
        Mean of the five `cross_val_score` results on X_train / y_train.
    """
    forest_params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "criterion": trial.suggest_categorical("criterion", ['gini', 'entropy']),
        "max_depth": trial.suggest_int("max_depth", 1, 4),
        "min_samples_split": trial.suggest_float("min_samples_split", 0.01, 1),
    }
    fold_scores = cross_val_score(RandomForestClassifier(**forest_params), X_train, y_train, cv=5)
    return fold_scores.mean()
    

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

In [None]:
study.best_params

In [None]:
study.best_value

In [None]:
study.trials_dataframe()

In [None]:
# training the model with the hyperparameter values
model = RandomForestClassifier(
            n_estimators=214,
            criterion='gini',
            max_depth=4,
            min_samples_split=0.31977425965640455,
        )
    
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

#### Feature importances

In [None]:
importances = model.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]

In [None]:
plt.figure(figsize=(10,8), dpi=80)
plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), importances[sorted_indices], align='center')
plt.xticks(range(X_train.shape[1]), X_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Predict the competition test set with the random forest fitted above and write the submission file
test_preds = model.predict(test).astype('int32')
submission["Survived"] = test_preds
submission.to_csv('submission.csv', index=False)  #0.77

# XGBoost

In [None]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 40)

In [None]:
# Performs cross validation on XGB Classifier

model = XGBClassifier(n_estimators=500,objective='binary:logistic', eval_metric='auc',tree_method='gpu_hist')
model_score = cross_val_score(model, X, y, scoring='roc_auc', cv=skf.split(X, y), n_jobs=-1, verbose=10)

In [None]:
print(model_score.mean())


In [None]:
del model_score, model

## Tuning with Hyperopt

In [None]:
# The Hyperopt objective below scores every candidate on ONE train/validation pair.
# The previous loop rebuilt the DMatrix pair for every fold and kept only the last
# one; the last fold is selected directly here, so dtrain / dval / y_val end up
# identical without the discarded work.
train_index, test_index = list(skf.split(X, y))[-1]
y_val = y.iloc[test_index]
dtrain = xgb.DMatrix(data=X.iloc[train_index], label=y.iloc[train_index])
dval = xgb.DMatrix(data=X.iloc[test_index], label=y.iloc[test_index])

In [None]:
hyperparameter_space = { 
                        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
                        'max_depth': hp.quniform("max_depth", 2, 6, 1),
                        'min_child_weight' : hp.quniform('min_child_weight', 1, 8, 1),
                        'reg_alpha' : hp.uniform('reg_alpha', 1e-8, 100),
                        'reg_lambda' : hp.uniform('reg_lambda', 1e-8, 100),
                        'gamma': hp.uniform ('gamma', 0.0, 1.0),
                        'subsample': hp.uniform("subsample", 0.1, 1.0),
                        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0)
                       }

In [None]:
def optimize_hyppara(hyperparameter_space):
    """Hyperopt objective: train one XGBoost booster and score it by ROC AUC.

    Reads the module-level `dtrain`, `dval` and `y_val` prepared in an earlier cell.

    Args:
        hyperparameter_space: dict of sampled hyperparameter values; must contain
            "max_depth". The dict is copied, so the caller's object is not modified.

    Returns:
        dict with "loss" set to the negative ROC AUC on the validation fold (Hyperopt
        minimises the loss) and "status" set to STATUS_OK.
    """
    params = dict(hyperparameter_space)
    # Converts parameter value to int as required by XGBoost
    params["max_depth"] = int(params["max_depth"])
    params["objective"] = "binary:logistic"
    params["eval_metric"] = "auc"
    params["tree_method"] = "gpu_hist"

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        early_stopping_rounds=50, verbose_eval=False)

    predictions = booster.predict(dval)
    roc_auc = roc_auc_score(y_val, predictions)

    return {"loss": -roc_auc, "status": STATUS_OK}

In [None]:
# Starts hyperparameters tuning
trials = Trials()
best_model_params = fmin(fn=optimize_hyppara,space=hyperparameter_space, max_evals=50,algo=tpe.suggest,trials=trials)

In [None]:
best_model_params

In [None]:
del dtrain, dval,y_val

In [None]:
# New 80/20 hold-out split (seed 123, no stratify argument). This rebinds
# X_train / X_test / y_train / y_test for every cell that follows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_test, label=y_test)

# Hard-coded booster configuration (max_depth already given as the int XGBoost requires)
params = {
    'colsample_bytree': 0.9912835539334705,
    'gamma': 0.9947467688258089,
    'learning_rate': 0.17756388494635836,
    'max_depth': 6,
    'min_child_weight': 4.0,
    'reg_alpha': 1.0806687020657577,
    'reg_lambda': 56.99895595690155,
    'subsample': 0.7218949376758498,
    'objective': "binary:logistic",
    'eval_metric': "auc",
    'tree_method': "gpu_hist",
}

# Up to 2000 rounds, stopping once the eval AUC has not improved for 50 rounds
model = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=50,
    verbose_eval=200,
)

In [None]:
# Complete the Hyperopt result with the fixed training settings;
# max_depth comes back as a float and XGBoost needs an int
best_model_params.update({
    "max_depth": int(best_model_params["max_depth"]),
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "gpu_hist",
})

In [None]:
dtest = xgb.DMatrix(data=test)
predictions = model.predict(dtest)

In [None]:
predictions  = predictions > 0.5  
predictions = predictions.astype(int)  

In [None]:
submission["Survived"] = predictions
submission

In [None]:
submission.to_csv("./submission.csv", index=False) # 0.78468

In [None]:
del model, dtest, predictions

In [None]:
# Gets the model trained over cross validation and predictions 
# against each iteration is stored

# One booster per CV fold; each booster's predictions on the competition test set
# are collected in test_predictions.
test_predictions = []
dtest = xgb.DMatrix(data=test)

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print("fold", fold)

    dtrain = xgb.DMatrix(data=X.iloc[train_index], label=y.iloc[train_index])
    dval = xgb.DMatrix(data=X.iloc[val_index], label=y.iloc[val_index])

    model = xgb.train(
        best_model_params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        early_stopping_rounds=50,
        verbose_eval=200,
    )

    test_predictions.append(model.predict(dtest))

    # drop the per-fold objects before the next iteration
    del model, dval, dtrain

In [None]:
del dtest

In [None]:
# Average the XGBoost fold predictions collected in test_predictions, then threshold at 0.5.
# (submission_predictions holds the logistic-regression fold outputs from the earlier
# section and must not be used here.)
submission_preds = np.mean(np.column_stack(test_predictions), axis=1)
submission_preds  = submission_preds > 0.5
submission_preds = submission_preds.astype(int)

In [None]:
submission["Survived"] = submission_preds
submission

In [None]:
submission.to_csv("./submission.csv", index=False) # 0.78468

# Classification with Keras Sequential API

In [None]:
X_train.shape # 24 features hence we start with the first dense model having 24 neurons 

In [None]:
# Two hidden ReLU layers (24 and 12 units), each followed by 50% dropout,
# and a single sigmoid output unit
keras_model = Sequential([
    Dense(units=24, activation='relu'),
    Dropout(0.5),
    Dense(units=12, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
# Binary cross-entropy loss for the binary classification problem
keras_model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)

In [None]:
keras_model.fit(x=X_train, 
          y=y_train, 
          epochs=500,
          validation_data=(X_test, y_test), verbose=1,callbacks=[early_stop]
          )

In [None]:
model_loss = pd.DataFrame(keras_model.history.history)

In [None]:
model_loss.plot()

In [None]:
predictions = keras_model.predict(X_test)

In [None]:
predictions  = predictions > 0.5  
predictions = predictions.astype(int)  

In [None]:
print(classification_report(y_test,predictions))

In [None]:
test_preds = keras_model.predict(test)

In [None]:
test_preds  = test_preds > 0.5  
test_preds = test_preds.astype(int)  

In [None]:
submission["Survived"] = test_preds
submission

In [None]:
submission.to_csv("./submission.csv", index=False) # 0.77751

# Ensembling and Stacking

In [None]:
# logistic regression, random forest and xgboost
logreg = LogisticRegression()
rf = RandomForestClassifier()
xgbc = XGBClassifier()

In [None]:
# fit all models on X_train
logreg.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgbc.fit(X_train, y_train)

In [None]:
# predicting all the models on X_test
# taking the probability for class 1
pred_logreg = logreg.predict_proba(X_test)[:, 1]
pred_rf = rf.predict_proba(X_test)[:, 1]
pred_xgbc = xgbc.predict_proba(X_test)[:, 1]


In [None]:
# creating an average of all the predictions
avg_pred = (pred_logreg + pred_rf + pred_xgbc) / 3

In [None]:
# storing all the predictions in an array
test_preds = np.column_stack((
 pred_logreg,
 pred_rf,
 pred_xgbc,
 avg_pred
))

In [None]:
# calculating and storing individual AUC values
# ROC AUC of each prediction column: logistic regression, random forest, XGBoost, average
auc_test = [
    roc_auc_score(y_test, test_preds[:, col])
    for col in range(test_preds.shape[1])
]
print(f"LR AUC = {auc_test[0]}")
print(f"RF AUC = {auc_test[1]}")
print(f"XGB AUC = {auc_test[2]}")
print(f"Average Pred AUC = {auc_test[3]}")

In [None]:
# predict all models on test
# take probability for class 1
pred_logreg = logreg.predict_proba(test)[:, 1]
pred_rf = rf.predict_proba(test)[:, 1]
pred_xgbc = xgbc.predict_proba(test)[:, 1]

In [None]:
# create an average of all predictions
# that is the simplest ensemble
test_preds = (pred_logreg + pred_rf + pred_xgbc) / 3

In [None]:
test_preds

In [None]:
test_preds  = test_preds > 0.5  
test_preds = test_preds.astype(int)  

In [None]:
test_preds

In [None]:
submission["Survived"] = test_preds
submission

In [None]:
submission.to_csv("./submission.csv", index=False) # 0.76315