In [None]:
## Importing the relevant libraries

In [None]:
# For Faster run time of sklearn classes and modules
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import numpy as np 
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, 
                              GradientBoostingClassifier, VotingClassifier, 
                              AdaBoostClassifier)
import scipy.stats as sst
import math
import tensorflow as tf

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The flag is a real boolean (the original passed the string 'True') and is applied
# to every visible GPU, not only the first one.
dev = tf.config.list_physical_devices('GPU')
for gpu in dev:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
!nvidia-smi

In [None]:
# Load the train and test CSV files from the Kaggle input directory.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
# NOTE(review): `combine` is not referenced again in the visible notebook.
combine = [train, test]
# Display summary statistics of the training frame.
train.describe()

In [None]:
train.head()

#### The below function has been directly taken from [this](https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling) amazing notebook. Do check it out!

In [None]:
def detect_outliers(df,n,features):
    """
    Return the index labels of rows that are outliers in more than ``n`` columns.

    A value is an outlier for a column when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile (Tukey's method).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the observations.
    n : int
        A row is returned only if it is an outlier in strictly more than
        ``n`` of the inspected columns.
    features : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    hits = Counter()

    for col in features:
        # NaN-aware quartiles: plain np.percentile returns NaN as soon as the
        # column has a missing value, which silently disables the check for
        # that column.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        step = 1.5 * (q3 - q1)

        # Comparisons against NaN are False, so missing values are never flagged.
        is_outlier = (df[col] < q1 - step) | (df[col] > q3 + step)
        hits.update(df.index[is_outlier])

    # keep the observations flagged in more than n columns
    return [label for label, count in hits.items() if count > n]
# Index labels of training rows that are outliers in more than 2 of the listed columns.
out = detect_outliers(train, 2, ['Fare', 'SibSp', 'Parch', 'Age'])

In [None]:
# Remove the flagged rows. Re-assigning (instead of inplace=True) together with
# errors='ignore' keeps the cell safe to re-run: labels that are already gone
# no longer raise a KeyError.
train = train.drop(index = out, errors = 'ignore')

In [None]:
train.info()

In [None]:
# Stack train and test into one frame so feature engineering is applied to both.
# Test rows get a NaN 'Survived' placeholder; the index is renumbered from 0.
test1 = test.assign(Survived = np.nan)
data = pd.concat([train, test1], ignore_index = True)
len(data)

In [None]:
train.describe()

In [None]:
data.info()

### Checking the death frequency of the classes of different attributes

In [None]:
# One dodged histogram per attribute, split by survival outcome.
# A loop over explicit axes replaces six copy-pasted subplot/histplot pairs.
hist_cols = ['Sex', 'Pclass', 'Embarked', 'Age', 'SibSp', 'Parch']
fig, axes = plt.subplots(2, 3, figsize = (20, 10))
for ax, name in zip(axes.ravel(), hist_cols):
    sns.histplot(data = train, x = name, hue = 'Survived', multiple = 'dodge', ax = ax)
    ax.set_title(f'{name} by survival')
fig.tight_layout();

# Data Preprocessing

### Adding Title attribute to the data

#### People with some titles were more likely to live than others

In [None]:
# The title is the word that directly precedes a '.' in the name (e.g. "Mr.").
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand = False)
pd.crosstab(data['Title'], data['Sex'])

In [None]:
# Uncommon titles are merged into a single 'Rare' bucket.
col_rep = ['Capt', 'Col', 'Countess', 'Don', 'Dona', 'Dr', 'Jonkheer', 'Lady',
           'Major', 'Rev', 'Sir']

# Assign the result back instead of calling replace(..., inplace=True) on
# data['Title']: in-place mutation of a column selection is chained assignment
# and does not update `data` under pandas copy-on-write.
data['Title'] = (
    data['Title']
    .replace(col_rep, 'Rare')
    .replace(['Mlle', 'Ms'], 'Miss')
    .replace(['Mme'], 'Mrs')
)
pd.crosstab(data['Title'], data['Sex'])

In [None]:
# Select the column before aggregating: mean() over the whole frame fails on
# the text columns (Name, Ticket, ...) in pandas >= 2.
data.groupby('Title')['Survived'].mean()

### Encoding the Sex attribute

#### Encoding the Sex attribute with 0 for Males, 1 for Females and 2 for Master, i.e, male children because they had higher chance of survival compared to adult males

In [None]:
# Encode Sex as 0 (male) / 1 (female).
data.loc[data['Sex'] == 'male', 'Sex'] = 0
data.loc[data['Sex'] == 'female', 'Sex'] = 1
# Runs last on purpose: rows titled 'Master' get code 2, overriding the code set above.
data.loc[data['Title'] == 'Master', 'Sex'] = 2

### Replacing the null values in Age attribute with median values of the respective passenger classes

#### Different Passenger Classes had different median ages, so I account for that below. Also, I consider the Sex of a passenger as well as there were different median ages for different sexes as well.

In [None]:
# Fill each missing Age with the median Age of the passengers that share the
# same (Sex, Pclass) combination. The group medians are computed once by the
# grouped transform instead of being recomputed for every missing row; adding
# a group's own median to the group does not change that median, so the result
# matches the row-by-row fill.
group_median_age = data.groupby(['Sex', 'Pclass'])['Age'].transform('median')
data['Age'] = data['Age'].fillna(group_median_age)
data.info()

### Breaking the Age attribute into 5 different classes

In [None]:
# Bucket Age into five ordinal groups: <=16, (16, 32], (32, 48], (48, 60], >60.
data['Age_group'] = 0
data.loc[(data['Age'] > 16) & (data['Age'] <= 32), 'Age_group'] = 1
data.loc[(data['Age'] > 32) & (data['Age'] <= 48), 'Age_group'] = 2
data.loc[(data['Age'] > 48) & (data['Age'] <= 60), 'Age_group'] = 3
data.loc[(data['Age'] > 60), 'Age_group'] = 4
# Select 'Survived' before averaging; mean() over every column fails on the
# text columns in pandas >= 2.
data.groupby('Age_group')['Survived'].mean()

In [None]:
# Series.value_counts replaces the deprecated top-level pd.value_counts.
data['Age_group'].value_counts()

### Adding unknown value to the null values of the Cabin Attribute

#### Cabin can be considered a part of the Titanic. Like a Deck. Different parts of Titanic had different survival rates

In [None]:
# Keep only the first character of the cabin code; missing cabins stay NaN here.
f = data['Cabin'].str.slice(0, 1)
# 'U' marks an unknown cabin.
data['Cabin'] = f.fillna('U')

In [None]:
# Series.value_counts replaces the deprecated top-level pd.value_counts.
data['Cabin'].value_counts()

In [None]:
# Fold cabin letter 'T' into 'A'.
data['Cabin'] = data['Cabin'].replace('T', 'A')
# The first len(train) rows of `data` are the training rows (train was concatenated first).
train_part = data[:len(train)]
sns.histplot(data = train_part, x = 'Cabin', hue = 'Survived', multiple = 'stack');

In [None]:
# Group cabin letters into coarser deck bands; anything else (including the
# unknown marker 'U') falls back to 'U'.
# The original `data['Cabin'] == np.nan` test was always False (NaN never
# compares equal to anything), so that assignment was dead code; the fillna
# below covers the unknown case explicitly.
deck_of_cabin = {'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
                 'D': 'DE', 'E': 'DE',
                 'F': 'FG', 'G': 'FG'}
data['Deck'] = data['Cabin'].map(deck_of_cabin).fillna('U')

In [None]:
# numeric_only=True: averaging the text columns raises in pandas >= 2.
data.groupby('Deck').mean(numeric_only = True)

### Adding Relatives attribute

#### Adding the number of Siblings/Spouses and the number of Parents/Children gives us the number of relatives that a passenger has

In [None]:
data['Relatives'] = data['SibSp'] + data['Parch'] + 1 # I add 1 to account for the passenger themself

In [None]:
# Bin the head-count into four size classes:
#   1 -> 0, 2-4 -> 1, 5-6 -> 2, 7 and above -> 3.
# Intervals are right-closed, i.e. (0, 1], (1, 4], (4, 6], (6, inf).
family_bins = pd.cut(data['Relatives'], bins = [0, 1, 4, 6, np.inf], labels = [0, 1, 2, 3])
data['Family_Size'] = family_bins.astype(float)

In [None]:
# numeric_only=True: averaging the text columns raises in pandas >= 2.
data.groupby('Family_Size').mean(numeric_only = True)

### Ticket Frequency

#### This is number of people with the same ticket. Families and Friends bought tickets together and they got the same ticket number and id. This counts how many people held the same ticket

In [None]:
data['Ticket_Frequency'] = data.groupby('Ticket')['Ticket'].transform('count')

In [None]:
# Bin the ticket frequency into three classes: 1 -> 0, 2-4 -> 1, above 4 -> 2.
# Intervals are right-closed, i.e. (0, 1], (1, 4], (4, inf).
tf_bins = pd.cut(data['Ticket_Frequency'], bins = [0, 1, 4, np.inf], labels = [0, 1, 2])
data['TF'] = tf_bins.astype(float)

### Adding Surname Attribute

#### This adds the last name of all the people aboard the Titanic. This will help us know how many members there are in a family

In [None]:
data['Surname'] = data['Name'].map(lambda i: i.split(',')[0])

### Calculating the survival rate of families and people travelling in group

#### If someone travelled in a group or a family then they have a higher chance of survival, if people from that group/family also survived

In [None]:
# The author's estimate of the average survival rate of everyone aboard.
# It is the fallback for passengers who have no group peers with a known outcome.
DEFAULT_SURVIVAL_RATE = 0.385


def _peer_survival_rate(frame, key, default):
    """Mean 'Survived' of the *other* rows that share the same `key` value.

    For every row, the row itself is excluded and rows whose outcome is
    unknown (NaN) are ignored. When no peer with a known outcome exists
    (singleton group, or all peers unknown) `default` is returned.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'Survived' and the column named by `key`.
    key : str
        Grouping column (e.g. 'Surname' or 'Ticket').
    default : float
        Value used when no peer outcome is available.

    Returns
    -------
    pandas.Series
        One rate per row, aligned with `frame.index`.
    """
    known = frame['Survived'].notna().astype(int)
    survived = frame['Survived'].fillna(0)
    # Group totals minus the row's own contribution = totals over the peers.
    peer_known = known.groupby(frame[key]).transform('sum') - known
    peer_survived = survived.groupby(frame[key]).transform('sum') - survived
    # Mask zero peer counts so the division yields NaN, then fall back to default.
    rate = peer_survived / peer_known.where(peer_known > 0)
    return rate.fillna(default)


# Survival rate of the other passengers with the same surname (family).
data['Family_Survival'] = _peer_survival_rate(data, 'Surname', DEFAULT_SURVIVAL_RATE)

# Same idea for passengers travelling on the same ticket.
data['Ticket_Survival'] = _peer_survival_rate(data, 'Ticket', DEFAULT_SURVIVAL_RATE)

# Weighted average of the two rates.
# NOTE(review): the family weight is the binned 'Family_Size' code (0-3) while the
# ticket weight is the raw 'Ticket_Frequency' count — confirm this asymmetry is intended.
num = (data['Family_Size'] * data['Family_Survival']) + (data['Ticket_Frequency'] * data['Ticket_Survival'])
den = data['Family_Size'] + data['Ticket_Frequency']
data['Survival'] = num / den

In [None]:
plt.figure(figsize = (8, 6))
sns.histplot(data['Survival'], bins = 25);

#### Here I divide the Survival rate/probability into four classes

In [None]:
# Bucket the combined survival rate into four classes:
# <=0.35 -> 0, (0.35, 0.5] -> 1, (0.5, 0.8] -> 2, (0.8, 1] -> 3.
data.loc[(data['Survival'] <= 0.35), 'SP'] = 0
data.loc[(data['Survival'] > 0.35) & (data['Survival'] <= 0.5), 'SP'] = 1
data.loc[(data['Survival'] > 0.5) & (data['Survival'] <= 0.8), 'SP'] = 2
data.loc[(data['Survival'] > 0.8) & (data['Survival'] <= 1), 'SP'] = 3
# numeric_only=True: averaging the text columns raises in pandas >= 2.
data.groupby('SP').mean(numeric_only = True)

#### I tried dividing the group in such a way that all the classes had a decent number of data points. I experimented to find those classes above..

In [None]:
# Series.value_counts replaces the deprecated top-level pd.value_counts.
data['SP'].value_counts()

### Replacing null values in Embarked attribute (filled with 'C' after inspecting Fare by port and passenger class)

In [None]:
data[data['Embarked'].isnull()]

In [None]:
# Fare distribution per embarkation port, split by passenger class.
plt.figure(figsize = (10, 8))
sns.boxplot(x = 'Embarked', y = 'Fare', hue = 'Pclass', data = data);
# Missing ports are filled with 'C'.
data['Embarked'] = data['Embarked'].fillna('C')

### Imputing missing values in Fare

In [None]:
data[data['Fare'].isnull()]

#### Here I calculate the median value of Fare for the people for Passenger class 3 because the passenger with the missing value of Fare belonged to Pclass 3

In [None]:
np.nanmedian(data[data['Pclass']==3]['Fare'])

In [None]:
# Fill a missing Fare with the median Fare of the passenger's own class.
# The original hard-coded the Pclass 1 median although the passenger with the
# missing fare is in Pclass 3; looking the class up per row removes that
# mismatch and also works if other classes ever have missing fares.
class_median_fare = data.groupby('Pclass')['Fare'].transform('median')
data['Fare'] = data['Fare'].fillna(class_median_fare)

### Transforming Fare attribute with log transformation

#### Since there are multiple people in a family, there are multiple tickets that are bought for the journey. The fare price is the total amount paid for the entire group/family. I calculate the amount spent per person in buying the tickets by dividing the fare price by the total number of people in a particular group

In [None]:

data['Fare'] = data['Fare'] / data['Ticket_Frequency']

In [None]:
sns.histplot(data['Fare'], kde = True);

#### As we can see, the above distribution is very skewed and this can create a significant bias in our models

#### So, in order to mitigate that problem, we transform the Fare data with log transformation

In [None]:
# Natural-log transform of the per-person fare. Zero (or otherwise
# non-positive) fares map to 0 because log is undefined there.
# The unused index variable `d` was removed.
data['Fare'] = data['Fare'].map(lambda fare: np.log(fare) if fare > 0 else 0)

#### The below data looks much better than above but it can still create a lot of unnecessary bias

In [None]:
sns.histplot(data['Fare'], kde = True);

In [None]:
sns.histplot(data = data, x = 'Fare', bins = 10);

### Dividing the Fare into different categories based on the price of the ticket

#### I divide the Fare attribute to mitigate the bias problem and make sure that I include enough data points in every class. The only problem in this is that we lose a bit of information which it makes up for by reducing the bias

In [None]:
# Bucket the log-fare into three price classes: [0, 2.1] -> 0, (2.1, 3] -> 1, >3 -> 2.
data.loc[(data['Fare'] >= 0) & (data['Fare'] <= 2.1), 'Cost'] = 0
data.loc[(data['Fare'] > 2.1) & (data['Fare'] <= 3), 'Cost'] = 1
data.loc[(data['Fare'] > 3), 'Cost'] = 2
# Select 'Survived' before averaging; mean() over every column fails on the
# text columns in pandas >= 2.
data.groupby('Cost')['Survived'].mean()

### Below are the survival probability of the classes of the different attributes

#### Looking at this graph helps us know that which attribute has the highest influence on the survivability of a passenger

In [None]:
# Mean survival (bar height) per class of each engineered attribute.
# Explicit axes replace the manual subplot counter.
col = ['Family_Size', 'Pclass', 'Embarked', 'Sex', 'Cost', 'Age_group', 'Deck', 'SP']
fig, axes = plt.subplots(2, 4, figsize = (20, 8))
for ax, name in zip(axes.ravel(), col):
    sns.barplot(data = data, x = name, y = 'Survived', ax = ax)
fig.tight_layout();

In [None]:
# Series.value_counts replaces the deprecated top-level pd.value_counts.
data['Cost'].value_counts()

### Scaling Numerical attributes

In [None]:
# Standardise the numeric columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on train and test rows together — confirm
# that is intended.
att = ['Fare', 'Parch', 'SibSp', 'Relatives', 'Age', 'Survival']
ss = StandardScaler()
ss.fit(data[att])
data[att] = ss.transform(data[att])

### Encoding categorical attributes

In [None]:
# Replace the string categories of each listed column with integer codes.
# The same encoder object is re-fitted for every column.
cat = ['Embarked', 'Deck']
le = LabelEncoder()
for col in cat:
    le.fit(data[col])
    data[col] = le.transform(data[col])

#### Cramér's V for correlation of categorical attributes

##### Cramér's V is used to calculate the correlation between 2 categorical attributes. We cannot use Pearson's R for this task as that can only be used for numerical attributes

In [None]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical series.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length. Pairs with a missing value
        are dropped by ``pd.crosstab``.

    Returns
    -------
    float
        The bias-corrected statistic computed from the chi-squared statistic
        of the contingency table. Returns 0.0 when the statistic is undefined:
        fewer than two observed categories on either side, or fewer than two
        observations. The uncorrected formula would divide by zero there.
    """
    table = pd.crosstab(x, y)
    r, k = table.shape
    n = table.to_numpy().sum()

    # Degenerate table: no association can be measured.
    if r < 2 or k < 2 or n < 2:
        return 0.0

    chi2 = sst.chi2_contingency(table)[0]
    phi2 = chi2 / n

    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denom = min(kcorr - 1, rcorr - 1)
    if denom <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denom))

### Plotting the correlation matrix of the categorical attributes

In [None]:
att = ['Pclass', 'Embarked', 'Family_Size', 'Age_group', 'SP', 'Sex', 'TF', 'Cost', 'Deck', 'Survived']

# Positional slice of the training rows. `.loc[:len(train)]` is label based and
# inclusive, so it also picked up the first test row.
train_rows = data.iloc[:len(train)]

# Pairwise Cramér's V, built in one go as a float frame. The original called
# `astype(np.float)` (an alias removed from NumPy) and discarded the result.
cor = pd.DataFrame(
    [[cramers_v(train_rows[i], train_rows[j]) for j in att] for i in att],
    index = att, columns = att, dtype = float,
)

In [None]:
plt.figure(figsize = (10, 8))
sns.heatmap(cor, annot = True);

#### [Pclass, SP, Sex, Cost, Deck] looks like promising attributes

#### The reason I didn't include TF in the above list, even though it had a high enough correlation with Survived, is that it is highly correlated with the SP attribute. We should avoid including redundant attributes in our data, as they can create bias which may reduce the performance of our models



#### Also, there is a very high correlation between Cost and Pclass and you can choose to not include Cost attribute in your final dataset but including it yielded a little better accuracy on submission so I chose to include it in mine.

### Pearson's Correlation for Numerical Attributes

In [None]:
colls = ['Fare', 'Relatives', 'Survival', 'SibSp', 'Parch', 'Survived']

# Positional slice of the training rows. `.loc[:len(train)]` is label based and
# inclusive, so it also included the first test row in the matrix.
dat = data.iloc[:len(train)]
corr_mat_num = dat[colls].corr()

# Overwrite the 'Survived' row/column with the point-biserial coefficient
# (binary target versus numeric attribute); one computation fills both cells.
for name in colls[:-1]:
    r_pb = sst.pointbiserialr(dat['Survived'], dat[name])[0]
    corr_mat_num.loc['Survived', name] = r_pb
    corr_mat_num.loc[name, 'Survived'] = r_pb

plt.figure(figsize = (10, 7))
sns.heatmap(corr_mat_num, annot = True);

#### There isn't any significant correlation of the given numerical attributes with the Survived attribute except for Fare and Survival, but they have already been accounted for by including Cost and SP attributes. So, I won't be including any numerical attribute in my dataset

### Selecting only those attributes with a high correlation coefficient

In [None]:
# Final column set: the target plus the attributes that worked best in practice.
# 'Survival' (continuous) is kept in place of its binned version 'SP' because
# it gave better results.
coll = ['Survived', 'Pclass', 'Sex', 'Survival', 'Cost']
data = data.loc[:, coll]

### One-Hot Encoding non-ordinal categories

#### O-H Encoding 'Sex' because it is non-ordinal in nature

In [None]:
col = ['Sex']
data = pd.get_dummies(data, columns = col, drop_first = True)

In [None]:
data.info()

#### Separating the training and testing data

In [None]:
# The first len(train) rows of `data` are the training rows, the rest is test.
n_train_rows = len(train)
train = data.iloc[:n_train_rows]
test = data.iloc[n_train_rows:]

#### Shuffling the training set

In [None]:
# Shuffle with a fixed seed so the positional validation split taken later is
# the same on every run.
train = train.sample(frac = 1, random_state = 42)

In [None]:
X_train, y_train, X_test = train.drop(['Survived'], axis = 1), train['Survived'], test.drop(['Survived'], axis = 1)
X_train.shape, y_train.shape, X_test.shape

In [None]:
X_train.info()

In [None]:
X_test.info()

In [None]:
# Hold out everything after the first 800 (already shuffled) training rows.
# The original sliced the validation rows out of X_train but kept them in
# X_train as well, so every model was scored on rows it had been fitted on.
# Both parts are derived from `train`, which keeps the cell safe to re-run.
# NOTE: the models below are therefore fitted on 800 rows instead of all of them.
N_FIT_ROWS = 800
features, target = train.drop(['Survived'], axis = 1), train['Survived']
X_train, y_train = features[:N_FIT_ROWS], target[:N_FIT_ROWS]
X_valid, y_valid = features[N_FIT_ROWS:], target[N_FIT_ROWS:]
X_valid.shape, y_valid.shape

# Importance of the features we have selected

In [None]:
# Quick ExtraTrees fit whose feature importances are inspected in the next cell.
et = ExtraTreesClassifier(n_estimators = 39)
et.fit(X_train, y_train)
# Displays (accuracy on X_valid, accuracy on X_train).
et.score(X_valid, y_valid), et.score(X_train, y_train)

In [None]:
pd.Series(et.feature_importances_, 
             index = X_train.columns)

#### As we can see, the Survival attribute has a high importance which confirms the thinking that people survived in groups. If significant number of people from someone's group died then it is highly likely that they died as well

# Modeling

# SVM

In [None]:
# Hyperparameter grid for an RBF-kernel SVM (regularisation C and kernel width gamma).
param = [
    {
        'kernel': ['rbf'], 'C': [0.1, 0.3, 1, 2, 3, 4], 
        'gamma': [0.3, 1, 3, 10, 12, 15, 25, 28]
    }, 
]

# probability = True so the fitted model can take part in the soft-voting ensemble later.
svc = SVC(probability = True)
# 5-fold cross-validated grid search using all available cores.
gs_svc = GridSearchCV(svc, param, cv = 5, n_jobs = -1, verbose = 1)
gs_svc.fit(X_train, y_train)
svc_best = gs_svc.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_svc.best_estimator_, gs_svc.score(X_valid, y_valid), gs_svc.score(X_train, y_train)

# XGBoost

In [None]:
# Hyperparameter grid for XGBoost: tree depth, number of trees and learning rate.
param_grid={
    'max_depth': range(2, 10, 2),
    'n_estimators': range(26, 46, 2),
    'learning_rate': [0.2, 0.1, 0.03, 0.01]
}

xg = XGBClassifier(eval_metric='logloss', n_jobs = -1, use_label_encoder = False)
# 5-fold cross-validated grid search using all available cores.
gs_xg = GridSearchCV(xg, param_grid, cv = 5, n_jobs = -1, verbose = 1)
gs_xg.fit(X_train, y_train)

xg_best = gs_xg.best_estimator_
# Displays (best parameters, score on X_valid, score on X_train).
gs_xg.best_params_, gs_xg.score(X_valid, y_valid), gs_xg.score(X_train, y_train)

# Random Forest

In [None]:
# Hyperparameter grid for the random forest: forest size, tree depth and leaf count.
param = [
    {'n_estimators': [100, 200, 300, 400, 450, 500], 
     'max_depth': [3, 4, 6, 8, 10, 12], 
     'max_leaf_nodes': [15, 20, 25]}, 
]

rf = RandomForestClassifier()
# 5-fold cross-validated grid search using all available cores.
gs_rf = GridSearchCV(rf, param, cv = 5, n_jobs = -1, verbose = 1)
gs_rf.fit(X_train, y_train)

rf_best = gs_rf.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_rf.best_estimator_, gs_rf.score(X_valid, y_valid), gs_rf.score(X_train, y_train)

In [None]:
pd.Series(rf_best.feature_importances_, index = X_train.columns)

# Adaptive Boosting

In [None]:
# Only the number of boosting rounds is tuned for AdaBoost.
param = [
    {'n_estimators': [50, 100, 150, 200, 300, 400]}
]
ada = AdaBoostClassifier()
# 5-fold cross-validated grid search using all available cores.
gs_ada = GridSearchCV(ada, param, cv = 5, n_jobs = -1, verbose = 1)
gs_ada.fit(X_train, y_train)

ada_best = gs_ada.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_ada.best_estimator_, gs_ada.score(X_valid, y_valid), gs_ada.score(X_train, y_train)

# Extra Trees Classifier

In [None]:
# Hyperparameter grid for ExtraTrees: ensemble size, tree depth and leaf count.
param = [
    {'n_estimators': range(8, 28, 4), 
     'max_depth': range(24, 48, 4),
     'max_leaf_nodes': range(20, 48, 4),
    }
]

# NOTE: rebinds `et`, replacing the fitted 39-tree model created earlier.
et = ExtraTreesClassifier()
# 5-fold cross-validated grid search using all available cores.
gs_et = GridSearchCV(et, param, cv = 5, n_jobs = -1, verbose = 1)
gs_et.fit(X_train, y_train)

et_best = gs_et.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_et.best_estimator_, gs_et.score(X_valid, y_valid), gs_et.score(X_train, y_train)

In [None]:
# GridSearchCV (refit=True by default) has already fitted `et_best` on the full
# training data. Refitting it here without a fixed seed would replace the
# selected model with a differently randomised one, so the importances are
# read from the model as selected.
pd.Series(et_best.feature_importances_, index = X_train.columns)

# KNN

In [None]:
# Only the neighbourhood size (2 to 13) is tuned for k-nearest neighbours.
param = [
    {'n_neighbors': range(2, 14, 1)}
]

knn = KNeighborsClassifier()
# 5-fold cross-validated grid search using all available cores.
gs_knn = GridSearchCV(knn, param, cv = 5, n_jobs = -1)
gs_knn.fit(X_train, y_train)

knn_best = gs_knn.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_knn.best_estimator_, gs_knn.score(X_valid, y_valid), gs_knn.score(X_train, y_train)

# Decision Tree

In [None]:
# Hyperparameter grid for a single decision tree: split criterion, depth and leaf count.
param = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(4, 20, 2),
        'max_leaf_nodes': range(4, 28, 2),
    }
]

dt = DecisionTreeClassifier()
# 5-fold cross-validated grid search using all available cores.
gs_dt = GridSearchCV(dt, param, cv = 5, n_jobs = -1, verbose = 1)
gs_dt.fit(X_train, y_train)

dt_best = gs_dt.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_dt.best_estimator_, gs_dt.score(X_valid, y_valid), gs_dt.score(X_train, y_train)

# Gradient Boosting 

In [None]:
# Hyperparameter grid for gradient boosting: boosting rounds, tree depth and leaf count.
param = [
    {'n_estimators': range(12, 36, 4), 
     'max_depth': range(8, 24, 4),
     'max_leaf_nodes': range(8, 28, 4),
    }
]

gb = GradientBoostingClassifier()
# Search over `gb`. The original passed `et` (the ExtraTrees estimator), so the
# model reported and submitted as "Gradient Boosting" was in fact ExtraTrees.
gs_gb = GridSearchCV(gb, param, cv = 5, n_jobs = -1, verbose = 1)
gs_gb.fit(X_train, y_train)


gb_best = gs_gb.best_estimator_
# Displays (best estimator, score on X_valid, score on X_train).
gs_gb.best_estimator_, gs_gb.score(X_valid, y_valid), gs_gb.score(X_train, y_train)

# Voting Classifier

#### Here I use the best models found by the hyperparameter search of all the models above for voting

In [None]:
# Two extra, untuned members for the ensemble.
vc_lr = LogisticRegression(solver = 'sag')
vc_mlp = MLPClassifier()

# Soft voting over the tuned models found above plus the two untuned ones.
vc = VotingClassifier(estimators = [('rf', rf_best), ('svc', svc_best), ('lr', vc_lr), 
                                    ('mlp', vc_mlp), ('xgc', xg_best), ('knn', knn_best),
                                    ('ada', ada_best), ('ET', et_best), ('dt', dt_best),
                                    ('gb', gb_best)], 
                                    voting = 'soft', n_jobs = -1, verbose = 1)
vc.fit(X_train, y_train)
# Displays (score on X_valid, score on X_train).
vc.score(X_valid, y_valid), vc.score(X_train, y_train)

# Comparing Models

In [None]:
m = [gs_svc, gs_xg, gs_rf, gs_et, gs_ada, gs_dt, gs_gb, gs_knn, vc]
n = ['SVC', 'XGBoost', 'Random Forest', 'Extra Trees', 'Adaboost', 'Decision Tree', 'Gradient Boosting', 'K-Nearest Neighbors', 'Voting Classifier']

# Rank the models on the validation rows. Ranking by training accuracy (as the
# original did) rewards whichever model memorises the training set best.
p = {name: model.score(X_valid, y_valid) for name, model in zip(n, m)}

# Best model first.
p = dict(sorted(p.items(), key = lambda x: x[1], reverse = True))
plt.figure(figsize = (8, 6))
ax = sns.barplot(x = list(p.values()), y = list(p.keys()), orient = 'h')
# Label the axis after plotting so the plotting call cannot overwrite it.
ax.set_xlabel('Validation accuracy');

### Predictions!!

##### After a lot of experimentation, Gradient Boosting gave me the best results in practice, so I will be using GB for making the predictions. Also, I had to re-submit multiple times to reach the best accuracy. You'll get different accuracies every time you submit because the training set is shuffled differently and that affects the way the models are trained

In [None]:
# Use the sample submission as a template and overwrite its 'Survived' column.
# NOTE(review): assumes gender_submission.csv lists passengers in the same order
# as test.csv — confirm.
sub = pd.read_csv('../input/titanic/gender_submission.csv')
predictions = gs_gb.predict(X_test).astype(np.uint8)
sub = sub.assign(Survived = predictions)
sub.to_csv('submission.csv', index = False)