In [1]:
# This technical part was coded in python and pyspark to predict the survival of people from the titanic disaster 

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [3]:
titanic_Dataset_train = pd.read_csv("/dbfs/FileStore/tables/train_titanic_dataset.csv") # Training dataset
test = pd.read_csv("/dbfs/FileStore/tables/test_titanic_dataset.csv") # Testing Data Set

In [4]:
titanic_Dataset_train.columns

In [5]:
titanic_Dataset_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# By using the info() function each attributes total values and datatypes are identified. 
# Additionally the use of informations functions null values in attributes age, cabin and embarked are identified.
titanic_Dataset_train.info()

In [7]:
titanic_Dataset_train.shape

In [8]:
titanic_Dataset_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
missing_percent=titanic_Dataset_train.isnull().sum()/891*100
missing_count=titanic_Dataset_train.isnull().sum()
df_null_train= pd.DataFrame(data={'missing_percent_train': missing_percent, 'missing_count_train': missing_count},
                            index=test.columns) 
df_null_train.sort_values(by='missing_percent_train', ascending=False)

Unnamed: 0,missing_percent_train,missing_count_train
Cabin,77.104377,687
Age,19.86532,177
Embarked,0.224467,2
PassengerId,0.0,0
Pclass,0.0,0
Name,0.0,0
Sex,0.0,0
SibSp,0.0,0
Parch,0.0,0
Ticket,0.0,0


In [10]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
test.shape

In [12]:
# By using the info() function each attributes total values and datatypes are identified. 
# Additionally the use of informations functions null values in attributes age, cabin and embarked are identified.
test.info()

In [13]:
#In test dataset the number of null values are identified using isnull().
test.isnull().sum()

**Data cleaning**

In [15]:
#Missing value for training dataset Embarked and Age column are replaced with the mean value of age and embarked value of 'C'
titanic_Dataset_train['Embarked'].fillna(titanic_Dataset_train['Embarked'].dropna().mode(), inplace=True)
titanic_Dataset_train['Embarked'].fillna(titanic_Dataset_train['Embarked'].value_counts()[0], inplace=True)
titanic_Dataset_train['Age'].fillna(titanic_Dataset_train['Age'].dropna().mean(), inplace=True)

In [16]:
#Missing value for testing datast Age and Fare are replace by their respective average value of the column.
test['Age'].fillna(test['Age'].dropna().mean(), inplace=True)
test['Fare'].fillna(test['Fare'].dropna().mean(), inplace=True)

In [17]:
# Again checking the missing null values in training dataset.
titanic_Dataset_train.isnull().sum()

In [18]:
# Again checking the missing null values in training dataset.
test.isnull().sum()

In [19]:
titanic_Dataset_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


> **Adding New Features**

In [21]:
#New columns such as TotalFamily and FamilyBucket are created by using the concat of SibSp and Parch columns 
df = titanic_Dataset_train.copy()
df_test = test.copy()

df['TotalFamily'] = df['SibSp'] + df['Parch']
df['FamilyBucket'] = 'FamilyBucket'
df.loc[df['TotalFamily'] == 0, 'FamilyBucket'] = 'Single'
df.loc[(df['TotalFamily']>=1) & (df['TotalFamily']<=3), 'FamilyBucket'] = 'SmallFamily'
df.loc[df['TotalFamily']>3, 'FamilyBucket'] = 'LargeFamily'

df_test['TotalFamily'] = df_test['SibSp'] + df_test['Parch']
df_test['FamilyBucket'] = 'FamilyBucket'
df_test.loc[df_test['TotalFamily'] == 0, 'FamilyBucket'] = 'Single'
df_test.loc[(df_test['TotalFamily']>=1) & (df_test['TotalFamily']<=3), 'FamilyBucket'] = 'SmallFamily'
df_test.loc[df_test['TotalFamily']>3, 'FamilyBucket'] = 'LargeFamily'


#New Column called agegroup was created based on the age column.
df['AgeGroup'] = 'agegroup'
df.loc[df['Age']<=1, 'AgeGroup'] = 'Infant'
df.loc[(df['Age']>1) & (df['Age']<=5), 'AgeGroup'] = 'Child'
df.loc[(df['Age']>5) & (df['Age']<=10), 'AgeGroup'] = 'YoungChild'
df.loc[(df['Age']>10) & (df['Age']<=50), 'AgeGroup'] = 'Adult'
df.loc[df['Age']>50, 'AgeGroup'] = 'SeniorCitizen'

df_test['AgeGroup'] = 'agegroup'
df_test.loc[df_test['Age']<=1, 'AgeGroup'] = 'Infant'
df_test.loc[(df_test['Age']>1) & (df_test['Age']<=5), 'AgeGroup'] = 'Child'
df_test.loc[(df_test['Age']>5) & (df_test['Age']<=10), 'AgeGroup'] = 'YoungChild'
df_test.loc[(df_test['Age']>10) & (df_test['Age']<=50), 'AgeGroup'] = 'Adult'
df_test.loc[df_test['Age']>50, 'AgeGroup'] = 'SeniorCitizen'


> ****Converting the objects into Numeric meterics****

In [23]:
#Converting the string column such as AgeGroup, FamilyBucket, Embarked and Sex into numeric metrics.
DAgeGroup = pd.get_dummies(df['AgeGroup'], prefix='AgeGroup')
df = pd.concat([df, DAgeGroup], axis=1)
DFamilyBucket = pd.get_dummies(df['FamilyBucket'], prefix='FamilyBucket')
df = pd.concat([df, DFamilyBucket], axis=1)
DEmbarked = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, DEmbarked], axis=1)
DSex = pd.get_dummies(df['Sex'], prefix = 'Sex')
df = pd.concat([df, DSex], axis=1)

DAgeGroup_test = pd.get_dummies(df_test['AgeGroup'], prefix='AgeGroup')
df_test = pd.concat([df_test, DAgeGroup_test], axis=1)
DFamilyBucket_test = pd.get_dummies(df_test['FamilyBucket'], prefix='FamilyBucket')
df_test = pd.concat([df_test, DFamilyBucket_test], axis=1)
DEmbarked_test = pd.get_dummies(df_test['Embarked'], prefix='Embarked')
df_test = pd.concat([df_test, DEmbarked_test], axis=1)
DSex_test = pd.get_dummies(df_test['Sex'], prefix = 'Sex')
df_test = pd.concat([df_test, DSex_test], axis=1)

In [24]:
#To Avoid the poor machine leanrning model perfromance unwanted columns such as Name, Ticket and cabin are removed from the dataframe.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'], axis=1)
df_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin'], axis=1)

In [25]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,TotalFamily,FamilyBucket,AgeGroup,AgeGroup_Adult,AgeGroup_Child,AgeGroup_Infant,AgeGroup_SeniorCitizen,AgeGroup_YoungChild,FamilyBucket_LargeFamily,FamilyBucket_Single,FamilyBucket_SmallFamily,Embarked_644,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,3,male,22.0,1,0,7.25,S,1,SmallFamily,Adult,1,0,0,0,0,0,0,1,0,0,0,1,0,1
1,2,1,1,female,38.0,1,0,71.2833,C,1,SmallFamily,Adult,1,0,0,0,0,0,0,1,0,1,0,0,1,0
2,3,1,3,female,26.0,0,0,7.925,S,0,Single,Adult,1,0,0,0,0,0,1,0,0,0,0,1,1,0
3,4,1,1,female,35.0,1,0,53.1,S,1,SmallFamily,Adult,1,0,0,0,0,0,0,1,0,0,0,1,1,0
4,5,0,3,male,35.0,0,0,8.05,S,0,Single,Adult,1,0,0,0,0,0,1,0,0,0,0,1,0,1


In [26]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,TotalFamily,FamilyBucket,AgeGroup,AgeGroup_Adult,AgeGroup_Child,AgeGroup_Infant,AgeGroup_SeniorCitizen,AgeGroup_YoungChild,FamilyBucket_LargeFamily,FamilyBucket_Single,FamilyBucket_SmallFamily,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,892,3,male,34.5,0,0,7.8292,Q,0,Single,Adult,1,0,0,0,0,0,1,0,0,1,0,0,1
1,893,3,female,47.0,1,0,7.0,S,1,SmallFamily,Adult,1,0,0,0,0,0,0,1,0,0,1,1,0
2,894,2,male,62.0,0,0,9.6875,Q,0,Single,SeniorCitizen,0,0,0,1,0,0,1,0,0,1,0,0,1
3,895,3,male,27.0,0,0,8.6625,S,0,Single,Adult,1,0,0,0,0,0,1,0,0,0,1,0,1
4,896,3,female,22.0,1,1,12.2875,S,2,SmallFamily,Adult,1,0,0,0,0,0,0,1,0,0,1,1,0


In [27]:
# correlation between the dependent and independent variables
corr=titanic_Dataset_train.corr()
plt.figure(figsize=(6, 6))

sns.heatmap(corr, vmax=.8, linewidths=0.01,
            square=True,annot=True,cmap='YlGnBu',linecolor="white")
plt.title('Correlation between features');
display()

In [28]:
titanic_Dataset_train.corr()["Survived"]

> **Selecting Independent and Dependent variables for model**

In [30]:
#Selecting dependent and independent variables
#X = df.drop(columns=['PassengerId', 'Survived', 'FamilyBucket', 'AgeGroup', 'Embarked', 'Sex'], axis=1)
X = df[['Fare', 'Sex_female', 'Sex_male', 'FamilyBucket_SmallFamily', 'Pclass', 'Age', 'Embarked_C', 'FamilyBucket_Single', 'AgeGroup_Infant', 'FamilyBucket_LargeFamily']]
y = df['Survived']

In [31]:
#Splitting Training dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

> **Using different models for training the dataset and checking accuracy**

In [33]:
#Logistic Regression

lr = LogisticRegression(penalty='l2', C=1.0, max_iter=100)
acc_lr_cv=cross_val_score(estimator=lr,X=X_train,y=y_train,cv=10)
print("Logistic Regression average score using K-fold cross validation:",np.mean(acc_lr_cv))

lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = metrics.accuracy_score(y_pred_lr, y_test)
print('Accuracy', metrics.accuracy_score(y_pred_lr, y_test))
print('Classification report: ', classification_report(y_test, y_pred_lr))
print('Confusion matrix: ', confusion_matrix(y_test, y_pred_lr))

In [34]:
#Choosing best parameters of Logistic egression using Grid search
grid = {'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]}

CV_lr = GridSearchCV(estimator=LogisticRegression(), param_grid=grid, cv= 5)
CV_lr.fit(X_train, y_train)
print("tuned hyperparameters :",CV_lr.best_params_)
print("tuned parameter accuracy (best score):",CV_lr.best_score_)

In [35]:
#KNN classifier

knc = KNeighborsClassifier(n_neighbors=5)
acc_knc_cv=cross_val_score(estimator=knc,X=X_train,y=y_train,cv=10)  #K=10
print("KNN average score using K-fold cross validation: :",np.mean(acc_knc_cv))

knc.fit(X_train, y_train)
y_pred_knc = knc.predict(X_test)
acc_knc = metrics.accuracy_score(y_pred_knc, y_test)
print('Accuracy: ', metrics.accuracy_score(y_pred_knc, y_test))
print('Classification report: ', classification_report(y_test, y_pred_knc))
print('Confusion matrix: ', confusion_matrix(y_test, y_pred_knc))

In [36]:
#Choosing best parameters of KNN using Grid search
grid ={"n_neighbors":np.arange(1,50)}
CV_knc=GridSearchCV(KNeighborsClassifier(),grid,cv=10)#K=10 
CV_knc.fit(X_train,y_train)
print("tuned hyperparameter K:",CV_knc.best_params_)
print("tuned parameter accuracy (best score):",CV_knc.best_score_)

In [37]:
#Comparing Accuracy of each model
models = pd.DataFrame({'Model' : ['LogisticRegression', 'KNN'], 
                      'Score' : [acc_lr, acc_knc]})
models.sort_values(by='Score', ascending=False)
fig, ax=plt.subplots(figsize=(7,3))
sns.barplot(x='Model', y='Score', data=models, palette='rainbow')
ax.set_xlabel('Classifiers')
ax.set_ylabel('Accuracy Score')
ax.set_title('Classifiers Vs Accuracy score')
ax.set_ylim([0.6, 0.9])
plt.show()
display()

> ****Data Visualization****

In [39]:
#Finding the number of male and female passengers are survived or dead in disaster based on the sex and survived features.
fig, ax = plt.subplots(1, 2, figsize = (8, 4))
titanic_Dataset_train["Sex"].value_counts().plot.bar(color = "pink", ax = ax[0])
ax[0].set_title("Total of male and female passenger in ship")
ax[0].set_ylabel("Total Passengers")
sns.countplot("Sex", hue = "Survived", data = titanic_Dataset_train, ax = ax[1], palette = 'winter')
ax[1].set_title("Survived vs Dead based on gender")
plt.show()
display()

In [40]:
#Finding the number of passengers by class wise was survived or dead based on the Pclass and survived features.
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
titanic_Dataset_train["Pclass"].value_counts().plot.bar(color = "green", ax = ax[0])
ax[0].set_title("Number Of Passengers By Pclass")
ax[0].set_ylabel("Population")
sns.countplot("Pclass", hue = "Survived", data = titanic_Dataset_train, ax = ax[1], palette='rainbow')
ax[1].set_title("Pclass: Survived vs Dead")
plt.show()
display()

In [41]:
#Finding the number of poeple died or survived or died based on the age range 0-85
fig, ax = plt.subplots(1, 2, figsize = (10, 5))
titanic_Dataset_train[titanic_Dataset_train["Survived"] == 0]["Age"].plot.hist(ax = ax[0], bins = 20, edgecolor = "black", color = "orange")
ax[0].set_title("Unsurvived")
domain_1 = list(range(0, 85, 5))
ax[0].set_xticks(domain_1)
titanic_Dataset_train[titanic_Dataset_train["Survived"] == 1]["Age"].plot.hist(ax = ax[1], bins = 20, edgecolor = "black", color = "red")
ax[1].set_title("Survived")
domain_2 = list(range(0, 85, 5))
ax[1].set_xticks(domain_2)
plt.show()
display()