In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the Kaggle Titanic CSVs; the paths are the input mount of a Kaggle kernel.
raw_data = pd.read_csv('/kaggle/input/titanic/train.csv')
raw_test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Preview the first rows of the training frame.
raw_data.head()

###  Column Definitions copied from Kaggle


| Variable | Definition | Key |
| :- | -: | :-: |
|survival|Survival|0 = No, 1 = Yes|
|pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd|
|sex|Sex|  |	
|Age|Age in years	||
|sibsp|# of siblings / spouses aboard the Titanic	||
|parch|# of parents / children aboard the Titanic	||
|ticket|Ticket number	||
|fare|Passenger fare	||
|cabin|	Cabin number	||
|embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampton|

In [None]:
# Work on a new frame so `raw_data` stays untouched. PassengerId is an
# identifier, not a feature, so it becomes the index (set_index returns a new
# DataFrame and drops the column by default).
train_data = raw_data.set_index('PassengerId')
print('Shape : ', train_data.shape)
# info() prints its report itself and returns None, so it is not wrapped in print().
train_data.info()

In [None]:
# Same preparation for the Kaggle test set: a new frame indexed by
# PassengerId, leaving `raw_test` untouched.
test_data = raw_test.set_index('PassengerId')
print('Shape : ', test_data.shape)
# info() prints its report itself and returns None, so it is not wrapped in print().
test_data.info()

- ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare'] => these columns have data for all records
- Embarked is missing for around 2 records
- Cabin is available for only about 25% of the records. Cabin may correlate with survival, but with this much missing data it is of very little use here
- About 20% of the Age values are missing -> we need to fill in these values


#### Age Adjustments 

reference : https://www.kaggle.com/allohvk/titanic-missing-age-imputation-tutorial-advanced

In [None]:
# Independent copies for the age-imputation work, so `train_data` / `test_data`
# keep their original (partly missing) Age values.
age_test_data = test_data.copy()
age_train_data = train_data.copy()

In [None]:
# we'll list out salutations out of  Names
def Create_salutation(df):
    """Add a ``salutation`` column derived from ``df['Name']``, in place.

    Names are expected in the form ``"Surname, Title. Given names"``. The
    salutation is the text between the first comma and the first following
    period, with every space removed (so ``" the Countess"`` becomes
    ``"theCountess"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is modified in place.

    Returns
    -------
    None
    """
    # Column 1 of the comma split is the part that starts with the title.
    after_surname = df['Name'].str.split(',', expand=True)[1]
    # Only the text before the first period is needed, so split once.
    title = after_surname.str.split('.', n=1, expand=True)[0]
    # Literal (non-regex) removal of spaces.
    df['salutation'] = title.str.replace(' ', '', regex=False)
    
# Adds the `salutation` column to both working frames in place
# (the function assigns to the frame it is given and returns nothing).
Create_salutation(age_test_data)
Create_salutation(age_train_data)

In [None]:
# Which salutations occur among the rows whose Age is missing? These are the
# groups the imputation has to cover.
print('Train Data : ', age_train_data.loc[age_train_data['Age'].isnull(), 'salutation'].unique())
print('Test Data : ', age_test_data.loc[age_test_data['Age'].isnull(), 'salutation'].unique())

In [None]:
# Mean age and number of recorded ages per (salutation, Pclass) group,
# restricted to the salutations selected below.
train_titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Dr']
has_train_title = age_train_data['salutation'].isin(train_titles)
train_sample = age_train_data[has_train_title]
train_age_summary = (
    train_sample
    .groupby(['salutation', 'Pclass'])['Age']
    .agg(['mean', 'count'])
)
train_age_summary

In [None]:
# Group means as a flat list, in the row order of `train_age_summary`.
list(train_age_summary['mean'])

In [None]:
# Non-null counts before imputation (shows how many Age values are missing).
age_train_data.info()

In [None]:
# Impute missing ages with the mean recorded age of passengers that share the
# same salutation and passenger class.
#
# The means are computed from the frame itself instead of being pasted in as a
# positional list, so nothing depends on the row order of a groupby result and
# the cell stays correct if the data changes.
salutations = ['Dr','Master','Miss','Mr','Mrs']
pclasses = [1,2,3]

# Per-row mean age of the row's (salutation, Pclass) group; missing ages are
# ignored by the mean. A group with no recorded age at all yields NaN, so its
# rows are left unfilled rather than given an invented value.
group_mean_age = (
    age_train_data
    .groupby(['salutation', 'Pclass'])['Age']
    .transform('mean')
)

needs_age = (
    age_train_data['Age'].isnull()
    & age_train_data['salutation'].isin(salutations)
    & age_train_data['Pclass'].isin(pclasses)
)
age_train_data.loc[needs_age, 'Age'] = group_mean_age[needs_age]

In [None]:
# Non-null counts after imputation, to check how many Age values remain missing.
age_train_data.info()

In [None]:
# Salutations of the test-set rows whose Age is missing.
age_test_data[age_test_data['Age'].isnull()]['salutation'].unique()

In [None]:
# Mean age and number of recorded ages per (salutation, Pclass) group in the
# test set, restricted to the salutations selected below.
test_titles = ['Mr', 'Mrs', 'Miss', 'Ms', 'Master']
has_test_title = age_test_data['salutation'].isin(test_titles)
test_sample = age_test_data[has_test_title]
test_age_summary = (
    test_sample
    .groupby(['salutation', 'Pclass'])['Age']
    .agg(['mean', 'count'])
)
test_age_summary

In [None]:
# Group means as a flat list, in the row order of `test_age_summary`
# (groupby sorts its keys, so this is not the order of the `isin` list above).
list(test_age_summary['mean'])

In [None]:
# Impute missing test-set ages with the mean recorded age of the passenger's
# (salutation, Pclass) group.
#
# Bug fixed: the previous version paired a hard-coded list of means (in
# groupby order: Master, Miss, Mr, Mrs, Ms) with the salutation order
# ['Mr', 'Mrs', 'Miss', 'Ms', 'Master'], so e.g. "Mr" rows received the
# "Master" means (9.5 / 5.0 / 7.45 years). Computing the means per row removes
# the positional pairing altogether.
salutations_test = ['Master', 'Miss', 'Mr', 'Mrs', 'Ms']
pclasses = [1,2,3]

# Stand-in for groups that have no recorded age at all. The original list used
# this value for every 'Ms' class, so it is kept as the fallback.
FALLBACK_AGE = 29.875

test_group_mean_age = (
    age_test_data
    .groupby(['salutation', 'Pclass'])['Age']
    .transform('mean')   # missing ages are ignored; all-missing groups give NaN
    .fillna(FALLBACK_AGE)
)

test_needs_age = (
    age_test_data['Age'].isnull()
    & age_test_data['salutation'].isin(salutations_test)
    & age_test_data['Pclass'].isin(pclasses)
)
age_test_data.loc[test_needs_age, 'Age'] = test_group_mean_age[test_needs_age]
        

In [None]:
# Non-null counts after imputation, to check how many Age values remain missing.
age_test_data.info()

### Check Point

In [None]:
# Checkpoint: take real copies (not aliases) of the age-imputed frames, so the
# later feature steps cannot modify `age_train_data` / `age_test_data`.
pre_train_data = age_train_data.copy()
pre_test_data = age_test_data.copy()

#### Drop unnecessary columns

We can drop the columns below, which are unlikely to be helpful:

- Name 
- Ticket 
- Fare -> strongly tied to Pclass: individual ticket fares vary, but they are all treated as just 3 classes
- Cabin -> 75% of the data is missing
- Embarked -> assumed not to influence survival
- salutation -> same information as Name


In [None]:
# Column names currently in the training frame, before any are dropped.
pre_train_data.columns.values

In [None]:
# Remove the columns that are not used as model features; the same list is
# applied to both frames so they keep identical feature columns.
drop_list = ['Name', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'salutation']
pre_train_data = pre_train_data.drop(columns=drop_list)
pre_test_data = pre_test_data.drop(columns=drop_list)

Encode Sex as a number: male -> 0, female -> 1

In [None]:
# Numeric encoding of Sex, shared by both frames. Values that are not keys of
# the mapping become NaN, so this cell must run exactly once per frame.
sex_codes = {'male':0, 'female':1}
for frame in (pre_train_data, pre_test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)

#### Exploratory Data Analysis 

In [None]:
# Preview the prepared test features (first rows only, not the whole frame).
pre_test_data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of every numeric column.
pre_train_data.describe()

We have 891 out of 2435 data points for training

- 38% of the people survived the event
- 35% of the passengers were female
- the youngest person aboard was about 5 months old (0.42 yr), and the oldest person was 80 years old

In [None]:
# One histogram panel per column on a 2x3 grid, each split into passengers who
# died and passengers who survived.
features = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
died = pre_train_data[pre_train_data['Survived']==0]
survived = pre_train_data[pre_train_data['Survived']==1]

fig, axes = plt.subplots(2, 3, figsize=(20,20))
for ax, column in zip(axes.flat, features):
    ax.hist([died[column], survived[column]], bins=8, label=["Dead", "Survive"])
    ax.legend()
    ax.set_xlabel(column)
    ax.set_ylabel('Count')

plt.show()

- In Pclass, people in the higher classes (1 and 2) had a better chance of survival than those in class 3
- The majority of males died, but the majority of females survived. (In rescue boats, initially only children and ladies were allowed, as a common practice)
- In Age, only kids had a better chance of survival, for the same reason as above
- The 20-30 age group had a worse survival rate than all the others
- In SibSp, passengers with 1 or 2 siblings/spouses aboard had a better chance of surviving; if siblings are aboard, it is more likely that these are kids than adult siblings traveling together
- There is a similar trend in Parch: passengers with 1, 2 or 3 parents or children aboard had a much higher chance of surviving

In [None]:
# Pairwise scatter plots (and per-column distributions) of all columns in the frame.
sns.pairplot(pre_train_data)

#### Scale and Transform data

In [None]:
# Target vector first, then the feature matrix (every column except the target).
Y = pre_train_data['Survived']
X = pre_train_data.drop(columns='Survived')

In [None]:
# Hold out 20% of the labelled data for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.2, random_state = 42 )
# Lower-case names start as aliases of the unscaled splits; they are rebound
# to the scaled arrays in the scaling cell.
x_train, x_test = X_train, X_test

In [None]:
# Standardise the features. The scaler is fitted on the training split only:
# fitting on all of `X` would leak the hold-out rows' mean/variance into
# preprocessing and make the hold-out scores optimistic.
scaler = StandardScaler()
scaler.fit(X_train)
# Transform from the unscaled splits (X_train / X_test) so that re-running this
# cell gives the same result instead of scaling already-scaled data.
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

### Logistic Regression Model 

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyper-parameters.
logisticRegressionModel = LogisticRegression()

In [None]:
# Fit on the scaled training split, then show the training accuracy (2 decimals).
logisticRegressionModel.fit(x_train, y_train)
round(logisticRegressionModel.score(x_train,y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_logReg = logisticRegressionModel.predict(x_test)
round(accuracy_score(y_test, y_pred_logReg),2)

### Support Vector Classifier

In [None]:
# LinearSVC is imported here but first used in the linear-SVC section below.
from sklearn.svm import SVC, LinearSVC
# SVC with default hyper-parameters; fit and show the training accuracy.
svc = SVC()
svc.fit(x_train, y_train)
round(svc.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_svc = svc.predict(x_test)
round(accuracy_score(y_test, y_pred_svc),2)

In [None]:
# Score the Kaggle test set with the fitted SVC, using the scaler that was
# applied to the training features.
testData = scaler.transform(pre_test_data)
y_pred_svc_final = svc.predict(testData)

# Submission frame: drop every feature column so only the PassengerId index
# remains, attach the predicted label, and write it out with the index.
submissionDF = pre_test_data.drop(columns=['Pclass','Sex','Age','SibSp','Parch'])
submissionDF['Survived'] = y_pred_svc_final
submissionDF.to_csv('submission.csv')

### Linear Support Vector Classifier

In [None]:
# Linear SVC with default hyper-parameters. The seed fixes the solver's
# internal data shuffling so the reported accuracy is reproducible.
linear_svc = LinearSVC(random_state=42)
linear_svc.fit(x_train,y_train)
round(linear_svc.score(x_train,y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_linear_svc = linear_svc.predict(x_test)
round(accuracy_score(y_test, y_pred_linear_svc),2)

###  Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed fixes the bootstrap
# samples and feature sub-sampling; without it the score changes on every run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
round(rfc.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_rfc = rfc.predict(x_test)
round(accuracy_score(y_test, y_pred_rfc),2)

###  KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default hyper-parameters; fit and show the
# training accuracy.
KNN = KNeighborsClassifier()
KNN.fit(x_train, y_train)
round(KNN.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_knn = KNN.predict(x_test)
round(accuracy_score(y_test, y_pred_knn),2)

###  Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit and show the training accuracy.
GNB = GaussianNB()
GNB.fit(x_train, y_train)
round(GNB.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_gnb = GNB.predict(x_test)
round(accuracy_score(y_test, y_pred_gnb),2)

###  Perceptron

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron with default hyper-parameters. Training shuffles the data between
# epochs, so the seed is fixed to make the reported accuracy reproducible.
perc = Perceptron(random_state=42)
perc.fit(x_train, y_train)
round(perc.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_perc = perc.predict(x_test)
round(accuracy_score(y_test, y_pred_perc),2)

### Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent (default
# hyper-parameters). The data is shuffled during training, so the seed is
# fixed to make the reported accuracy reproducible.
SGD = SGDClassifier(random_state=42)
SGD.fit(x_train, y_train)
round(SGD.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_sgd = SGD.predict(x_test)
round(accuracy_score(y_test, y_pred_sgd),2)

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. Features are permuted at each
# split, which can change the tree when splits tie; the seed makes the fitted
# tree and its score reproducible.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(x_train, y_train)
round(DTC.score(x_train, y_train),2)

In [None]:
# Accuracy on the 20% hold-out split.
y_pred_dtc = DTC.predict(x_test)
round(accuracy_score(y_test, y_pred_dtc),2)

The best accuracy I could achieve was 78%. 
We'll try different data cleaning and data manipulation steps to see if we can improve it any further.

### Tensorflow

In [None]:
# Validation data: hold out 20% of the (scaled) training split. The seed fixes
# the split, so validation accuracy and the early-stopping point are
# comparable between runs.
x_train_tf, x_val_tf, y_train_tf, y_val_tf = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [None]:
# Early-stopping condition: watch validation accuracy (higher is better), stop
# after 10 epochs without improvement, and restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [None]:
# Convert the label Series to NumPy arrays, then to one-hot arrays of depth 2.
# One-hot encoding turns labels [0,1,0] into [[1,0],[0,1],[1,0]]
# (the 1 sits at the index given by the label).

y_train_np = y_train_tf.to_numpy()
y_val_np = y_val_tf.to_numpy()

y_train_onhot = tf.one_hot(y_train_np, 2)
y_val_onhot = tf.one_hot(y_val_np, 2)

In [None]:
# Small fully connected network: features -> 32 -> 16 -> 2 class probabilities.
# Training runs for at most `epochs` epochs; the early-stopping callback ends
# it sooner and restores the best weights.
epochs = 100
# Take the input width from the data instead of hard-coding 5.
n_features = x_train_tf.shape[1]

model = keras.Sequential()
# A Dense layer without an activation is purely linear, so the first hidden
# layer gets a ReLU like the second one.
model.add(keras.layers.Dense(32, activation='relu', input_shape=[n_features]))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dropout(0.5))
# The two classes are mutually exclusive and the labels are one-hot, so the
# output is a softmax trained with categorical cross-entropy. With this loss
# the 'accuracy' metric (and the monitored val_accuracy) compares the argmax
# of prediction and label, matching how predictions are decoded afterwards.
model.add(keras.layers.Dense(2, activation='softmax'))

model.compile(
      optimizer=keras.optimizers.Adam(learning_rate=1e-3),
      loss=keras.losses.CategoricalCrossentropy(),
      metrics=['accuracy'])
history = model.fit(
    x_train_tf, 
    y_train_onhot, 
    validation_data = (x_val_tf, y_val_onhot),
    callbacks = [early_stopping],
    epochs = epochs)

In [None]:
# Training accuracy vs. validation accuracy per epoch, read from the History
# object returned by model.fit.
plt.plot(history.history['accuracy'],label = 'accuracy')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.show()

In [None]:
# Prediction and hold-out accuracy. The network outputs one score per class,
# so the predicted label is the index of the larger score. Going through a
# one-hot array and back, as before, yields the same labels.
y_pred_tf = model.predict(x_test)
y_pred_tf_final = np.argmax(y_pred_tf, axis=1)

round(accuracy_score(y_test, y_pred_tf_final),2)

In [None]:
# Score the Kaggle test set with the network.
submissionDF = pre_test_data.copy()
# Drop every feature column so only the PassengerId index remains.
submissionDF = submissionDF.drop(['Pclass','Sex','Age','SibSp','Parch'],axis=1)
testData = scaler.transform(pre_test_data)

tf_df = submissionDF

y_test_tf = model.predict(testData)
# Predicted label = index of the larger class score, as a plain NumPy array so
# it can be assigned to a DataFrame column directly.
y_test_tf_final = np.argmax(y_test_tf, axis=1)

# Optional: uncomment to write the network's predictions. This overwrites the
# 'submission.csv' produced by the SVC cell.
# tf_df['Survived'] = y_test_tf_final
# tf_df.to_csv('submission.csv')

**XGB Classifier**

In [None]:
# 66% accuracy
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier with default hyper-parameters.
# NOTE: this rebinds `model`, which held the Keras network until now; cells
# that use `model` after this point get the XGBoost classifier.
model = XGBClassifier()
model.fit(x_train, y_train)

In [None]:
# Predictions for the 20% hold-out split.
y_pred = model.predict(x_test)

In [None]:
# Hold-out accuracy, rounded to 2 decimals.
round(accuracy_score(y_test, y_pred),2)

In [None]:
# Predictions for the Kaggle test set (`testData` is the scaled test frame
# built in an earlier cell).
y_pred_final = model.predict(testData)
# Optional: uncomment to write these predictions to 'submission.csv'.
# xgb_df = submissionDF
# xgb_df['Survived'] = y_pred_final
# xgb_df.to_csv('submission.csv')