# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
# Loading the data in pandas dataframe
gender_submission = pd.read_csv("../input/titanic/gender_submission.csv")
test = pd.read_csv("../input/titanic/test.csv")
train = pd.read_csv("../input/titanic/train.csv")
print(train.info())
print(train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence

# Dropping unwanted columns

In [2]:
train['Title'] = train.Name.str.extract('([A-Za-z]+)\.', expand=False)
#assign a value for missing titles
train['Title'] = train['Title'].fillna('NoTitle')
#Unify titles
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

test['Title'] = test.Name.str.extract('([A-Za-z]+)\.', expand=False)
#assign a value for missing titles
test['Title'] = test['Title'].fillna('NoTitle')
#Unify titles
test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

X_train = train.drop(columns = ['Survived','Ticket','Name','Cabin'])
X_test = test.drop(columns = ['Ticket','Name','Cabin'])
X_train=X_train.set_index('PassengerId')
X_test=X_test.set_index('PassengerId')
y_train = train.Survived
y_test = gender_submission.Survived
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
Title       891 non-null object
dtypes: float64(2), int64(3), object(3)
memory usage: 62.6+ KB


# Filling the missing values

In [3]:
print(X_train.info())
# fill missing values
X_train.Age=X_train.Age.fillna(X_train.Age.median())
#create bands for age
X_train.loc[ X_train['Age'] <= 16, 'Age'] = 0
X_train.loc[ (X_train['Age'] > 16) & (X_train['Age'] <= 32), 'Age'] = 1
X_train.loc[ (X_train['Age'] > 32) & (X_train['Age'] <= 48), 'Age'] = 2
X_train.loc[ (X_train['Age'] > 48) & (X_train['Age'] <= 64), 'Age'] = 3
X_train.loc[ X_train['Age'] > 64, 'Age'] = 4
X_train['Age'] = X_train['Age'].astype(int)

X_test.Age=X_test.Age.fillna(X_test.Age.median())
X_test.loc[ X_test['Age'] <= 16, 'Age'] = 0
X_test.loc[ (X_test['Age'] > 16) & (X_test['Age'] <= 32), 'Age'] = 1
X_test.loc[ (X_test['Age'] > 32) & (X_test['Age'] <= 48), 'Age'] = 2
X_test.loc[ (X_test['Age'] > 48) & (X_test['Age'] <= 64), 'Age'] = 3
X_test.loc[ X_test['Age'] > 64, 'Age'] = 4
X_test['Age'] = X_test['Age'].astype(int)

X_train.Fare=X_train.Fare.fillna(X_train.Age.median())
#create bands for fare
X_train.loc[ X_train['Fare'] <= 7.91, 'Fare'] = 0
X_train.loc[ (X_train['Fare'] > 7.91) & (X_train['Fare'] <= 14.454), 'Fare'] = 1
X_train.loc[ (X_train['Fare'] > 14.454) & (X_train['Fare'] <= 31), 'Fare'] = 2
X_train.loc[ X_train['Fare'] > 31, 'Fare'] = 3
X_train.Fare = X_train.Fare.astype(int)

X_test.Fare=X_test.Fare.fillna(X_test.Age.median())
#create bands for fare
X_test.loc[ X_test['Fare'] <= 7.91, 'Fare'] = 0
X_test.loc[ (X_test['Fare'] > 7.91) & (X_test['Fare'] <= 14.454), 'Fare'] = 1
X_test.loc[ (X_test['Fare'] > 14.454) & (X_test['Fare'] <= 31), 'Fare'] = 2
X_test.loc[ X_test['Fare'] > 31, 'Fare'] = 3
X_test['Fare'] = X_test['Fare'].astype(int)
X_test.Fare = X_test.Fare.astype(int)
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
Title       891 non-null object
dtypes: float64(2), int64(3), object(3)
memory usage: 62.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null int64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null int64
Embarked    889 non-null object
Title       891 non-null object
dtypes: int64(5), object(3)
memory usage: 62.6+ KB
None


# Filter categorical columns using mask and turn it into a list

In [4]:
categorical_feature_mask = X_train.dtypes==object
categorical_cols = X_train.columns[categorical_feature_mask].tolist()

# Generate LastName column from Name by using the separator ','

In [5]:
train['LastName'] = train.Name.apply(lambda x:x.split(sep=',')[0])
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Title          891 non-null object
LastName       891 non-null object
dtypes: float64(2), int64(5), object(7)
memory usage: 97.6+ KB


# Import labelencoder to encode categorical values

In [6]:
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()
X_train.Sex = le.fit_transform(X_train.Sex)
X_test.Sex = le.fit_transform(X_test.Sex)
X_train.Embarked = X_train.Embarked.fillna(X_train['Embarked'].value_counts().idxmax())
X_train.Embarked = le.fit_transform(X_train.Embarked)
X_test.Embarked = X_test.Embarked.fillna(X_test['Embarked'].value_counts().idxmax())
X_test.Embarked = le.fit_transform(X_test.Embarked)
X_train.Title = X_train.Title.fillna(X_train['Title'].value_counts().idxmax())
X_train.Title = le.fit_transform(X_train.Title)
X_test.Title = X_test.Title.fillna(X_test['Title'].value_counts().idxmax())
X_test.Title = le.fit_transform(X_test.Title)

# Encoding the categorical values with one hot encoder

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0,2,5,6,7])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
)
X_ohe_train = np.array(ct.fit_transform(X_train), dtype=np.float)
X_ohe_test = np.array(ct.fit_transform(X_test), dtype=np.float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


# Feature scaling needs to be done for continuous columns

In [8]:
from sklearn.preprocessing import MinMaxScaler
min_max=MinMaxScaler(feature_range=(0, 1))
# To scale data 
X_train_minmax = X_ohe_train
X_test_minmax = X_ohe_test
X_train_minmax[:,14:17]=min_max.fit_transform(X_train_minmax[:,14:17])
X_test_minmax[:,14:17]=min_max.fit_transform(X_test_minmax[:,14:17])

In [9]:
#percentage of people survived
print("percentage of people survived :",round(y_train.sum()/y_train.count(),2)*100)
groupby_sex_total = train.groupby('Sex').sum().Survived
groupby_sex_survived = train.groupby('Sex').count().Survived
#percentage of people survived grouped by sex
print("percentage of people survived grouped by sex : \n",str(round(groupby_sex_total/groupby_sex_survived*100,2)))

percentage of people survived : 38.0
percentage of people survived grouped by sex : 
 Sex
female    74.20
male      18.89
Name: Survived, dtype: float64


# Linear Regression

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
lr = LinearRegression()
#X_train = MinMaxScaler().fit_transform(X_train)
lr.fit(X_train_minmax,y_train)
y_pred = lr.predict(X_test_minmax)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",lr.score(X_test_minmax, y_test))

Accuracy: 0.6331357059622169


# Random Forest Classifier

In [11]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train_minmax,y_train)

y_pred=clf.predict(X_test_minmax)

# Find accuracy of each model

In [12]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score
# Model Accuracy, how often is the classifier correct?
Accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:",Accuracy)
roc_score = roc_auc_score(y_test, y_pred)
print("ROC_AUC_SCORE : ",roc_score)
CM = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n",CM)
report = classification_report(y_test, y_pred)
print("Classification Report :\n",report)

Accuracy: 0.8301435406698564
ROC_AUC_SCORE :  0.8214285714285715
Confusion Matrix : 
 [[227  39]
 [ 32 120]]
Classification Report :
               precision    recall  f1-score   support

           0       0.88      0.85      0.86       266
           1       0.75      0.79      0.77       152

    accuracy                           0.83       418
   macro avg       0.82      0.82      0.82       418
weighted avg       0.83      0.83      0.83       418



In [13]:
# Neural Network using tensorflow

In [14]:
#Dependencies
import keras
from keras.models import Sequential
from keras.layers import Dense
# Neural network
model = Sequential()
model.add(Dense(20, activation='relu',input_shape=(X_train_minmax.shape[1],)))
model.add(Dense(20, activation='softmax'))
model.add(Dense(16, activation='softmax'))
model.add(Dense(2, activation='softmax'))
model.add(Dense(1, activation='sigmoid'))

#compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['acc'])

from keras.callbacks import EarlyStopping
#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)

#train model
model.fit(X_train_minmax, y_train, validation_split=0.3, epochs=50, callbacks=[early_stopping_monitor])

Using TensorFlow backend.


Train on 623 samples, validate on 268 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7fda5c200dd8>

In [15]:
y_pred = (model.predict(X_test_minmax)>=0.5).astype('int64')
accuracy = model.evaluate(X_test_minmax,y_test,verbose = 0)[1]
print("Accuracy score of the neural network is :",accuracy)

Accuracy score of the neural network is : 0.9090909361839294


In [16]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score
# Model Accuracy, how often is the classifier correct?
Accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:",Accuracy)
roc_score = roc_auc_score(y_test, y_pred)
print("ROC_AUC_SCORE : ",roc_score)
CM = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n",CM)
report = classification_report(y_test, y_pred,output_dict=True)
print("Classification Report :\n",report)

Accuracy: 0.9090909090909091
ROC_AUC_SCORE :  0.8961466165413535
Confusion Matrix : 
 [[251  15]
 [ 23 129]]
Classification Report :
 {'0': {'precision': 0.916058394160584, 'recall': 0.943609022556391, 'f1-score': 0.9296296296296296, 'support': 266}, '1': {'precision': 0.8958333333333334, 'recall': 0.8486842105263158, 'f1-score': 0.8716216216216217, 'support': 152}, 'accuracy': 0.9090909090909091, 'macro avg': {'precision': 0.9059458637469586, 'recall': 0.8961466165413534, 'f1-score': 0.9006256256256256, 'support': 418}, 'weighted avg': {'precision': 0.9087038265870384, 'recall': 0.9090909090909091, 'f1-score': 0.9085358085358086, 'support': 418}}


# Generating the final results and saving it in a CSV file

In [17]:
final_results = pd.DataFrame(y_pred, columns = ['Survived'])
final_results['PassengerId'] = gender_submission.PassengerId
final_results=final_results.set_index('PassengerId')
final_results.to_csv('FinalResults-'+str(round(accuracy,2))+str(CM)'.csv')

SyntaxError: invalid syntax (<ipython-input-17-639c3ec36170>, line 4)