In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

### 1. Import Libraries

In [None]:
import pandas as pd  #i/o operations and data analysis
import numpy as np   # linear algebra
import matplotlib.pyplot as plt  # viz
import seaborn as sns  #viz
#Machine Learning
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

# Use the ggplot theme for every chart drawn below.
plt.style.use('ggplot')

# Silence all warnings so that cell output stays readable.
import warnings
warnings.simplefilter('ignore')

### 2. Read Data

In [None]:
# Location of the passenger list inside the Kaggle dataset mount.
csv_path = '../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv'
df = pd.read_csv(csv_path)
df.head()

### 3. Data Wrangling

In [None]:
# Count the missing values in each column.
df.isnull().sum()

### 4. Exploratory Data Analysis

### 4.1. Univariate Analysis

### 4.1.a Sex

In [None]:
# Bar chart of head count per sex, followed by the exact counts.
# The column is passed as `x=` because recent seaborn releases accept only
# `data` positionally; the positional form clashes with `data=df`.
sns.countplot(x='Sex', data=df)
plt.show()
df.Sex.value_counts()

Observation:
* The numbers of male and female passengers are almost equal.

### 4.1.b Age

In [None]:
print('Mean age : {}'.format(round(df.Age.mean(),2)))
plt.figure(figsize=(9,4))
# Left panel: box plot of age. Age is bound to `x=` explicitly so the box is
# drawn along the x axis regardless of how the installed seaborn version
# interprets a lone positional argument.
plt.subplot(1,2,1)
sns.boxplot(x=df.Age)
# Right panel: kernel density estimate of age.
plt.subplot(1,2,2)
sns.kdeplot(df.Age)
plt.tight_layout()
plt.show()

Observation:
* Most of the passengers are young in the age group 30-60

### 4.1.c Category

In [None]:
# Tally each category once and reuse the result for both printed counts.
category_counts = df.Category.value_counts()
print("No: Passengers : {}".format(category_counts['P']))
print("No: Crew Members : {}".format(category_counts['C']))
# Pass the column as `x=`; a positional Series is treated as `data` by
# recent seaborn releases.
sns.countplot(x=df.Category)
plt.show()

Observation:
* Around 20% of the people on board are crew members and the rest are passengers.

### 4.1.d. Survived

In [None]:
# Head count, number of survivors, and the survival share in percent.
# `n` is reused by the country chart further down.
n = df.PassengerId.count()
survivor_total = df.Survived.sum()
survival_pct = round(df.Survived.mean()*100,2)
print('Total Passengers : {}'.format(n))
print('Total Survivors : {}'.format(survivor_total))
print(f"Survival Percentage : {survival_pct}%")

In [None]:
# Pie chart of survivors against the dead; slice order matches `labels`.
outcome_counts = df.Survived.value_counts()
x = [outcome_counts[1], outcome_counts[0]]
labels = ['Survived', 'Dead']
explode = [0.05, 0.05]
plt.title("Survived Vs Succumbed")
plt.pie(x=x, labels=labels, explode=explode, autopct='%1.2f%%')
plt.show()

Observation:
* Around 1 out of 7 people on board MS Estonia survived.

### 4.1.e. Country

In [None]:
# Share of people on board per country, as a percentage of the head count `n`
# computed in the survival-totals cell.
c = df.Country.value_counts().sort_values(ascending=False)
country_pct = c.values*100/n
plt.figure(figsize=(8,4))
sns.barplot(x=country_pct, y=c.index, orient='h')
plt.show()

Observation : 
* More than 90% of the passengers on board were from Sweden or Estonia
* Since passengers from other countries constitute less than 10%, we will make a new feature 'Ctry' with values Sweden, Estonia and Others.

In [None]:
# Keep Estonia and Sweden as they are; every other country becomes 'Others'.
major_countries = ['Estonia', 'Sweden']
df['Ctry'] = df.Country.where(df.Country.isin(major_countries), 'Others')
df.head(5)

In [None]:
# Keep only the columns used from here on; PassengerId, Country, Firstname
# and Lastname are not required for further analysis.
kept_columns = ['Sex', 'Age', 'Category', 'Survived', 'Ctry']
df = df[kept_columns]
df.head()

### 4.2. Bivariate Analysis

### 4.2.a. Age Vs Survival

In [None]:
# Compare the age distribution of survivors with that of the dead.
survivor_age = df[df.Survived==1].Age
dead_age = df[df.Survived==0].Age
print(f'Mean age of Survivors : {round(survivor_age.mean(),2)}')
print(f'Mean age of Death : {round(dead_age.mean(),2)}')
plt.title("Age distribution Survived/Died")
sns.kdeplot(data=survivor_age, label='Survived')
sns.kdeplot(data=dead_age, label='Not Survived')
# Without an explicit legend the two curves cannot be told apart; recent
# seaborn releases do not draw one automatically for `label=`.
plt.legend()
plt.show()

Observation:
* Younger passengers are more likely to survive

### 4.2.b Sex Vs Survival

In [None]:
# Survivor / non-survivor counts split by sex, then the survival rate of each
# sex in percent. `x=` is passed by keyword because recent seaborn releases
# accept only `data` positionally.
sns.countplot(x='Survived', hue='Sex', data=df)
plt.show()
print("Survival Rate (Male) : {}%".format(round(df.Survived[df.Sex=='M'].mean()*100,2)))
print("Survival Rate (Female) : {}%".format(round(df.Survived[df.Sex=='F'].mean()*100,2)))

Observation:
* A male passenger is 4 times more likely to survive than a female passenger.

### 4.2.c. Category Vs Survival

In [None]:
# Survivor / non-survivor counts split by category (C = crew, P = passenger),
# then the survival rate of each category in percent. `x=` is passed by
# keyword because recent seaborn releases accept only `data` positionally.
sns.countplot(x='Survived', hue='Category', data=df)
plt.show()
print("Survival Rate for C : {}%".format(round(df.Survived[df.Category=='C'].mean()*100,2)))
print("Survival Rate for P  : {}%".format(round(df.Survived[df.Category=='P'].mean()*100,2)))

Observation: 
* Survival rate is more for crew compared to that for passengers.

### 4.2.d. Age Vs Sex

In [None]:
# Overlay the age distributions of female and male passengers.
plt.figure(figsize=(8,4))
# The chart shows both sexes, so the title names the comparison rather than
# a single group.
plt.title('Age distribution Male/Female')
sns.kdeplot(data=df[df.Sex=='F'].Age, label='Female')
sns.kdeplot(data=df[df.Sex=='M'].Age, label='Male')
# An explicit legend is needed to tell the two curves apart.
plt.legend()
plt.show()
print("Average age of Male Passenger : {}".format(round(df[df.Sex=='M'].Age.mean(),2)))
print("Average age of Female Passenger : {}".format(round(df[df.Sex=='F'].Age.mean(),2)))

Observation:
* There is no drastic variation in age profile of male and female passengers

### 4.2.e. Age Vs Category

In [None]:
# Box plot of age for each category, followed by the mean age of each group.
plt.figure(figsize=(8,4))
plt.title('Category Vs Age')
sns.boxplot(x='Age', y='Category', data=df)
plt.show()
crew_mean_age = round(df[df.Category=='C'].Age.mean(),2)
passenger_mean_age = round(df[df.Category=='P'].Age.mean(),2)
print("Average age of MS Estonia Crew : {}".format(crew_mean_age))
print("Average age of Passengers : {}".format(passenger_mean_age))

Observation:
* Average age of crew is much lower than the average age of Passengers.
* Distribution of crew age is very narrow, most of the crew members are 20-40 years old.

### 4.2.f. Sex Vs Category

In [None]:
# One pie per category showing the male/female split within that category.
labels = ['Male', 'Female']
explode = [0.05, 0.05]
plt.figure(figsize=(8,4))
for position, (category, title) in enumerate([('P', 'P Category'), ('C', "C Category")], start=1):
    plt.subplot(1, 2, position)
    plt.title(title)
    sex_counts = df[df.Category==category].Sex.value_counts()
    # Slice order must match `labels`: males first, then females.
    x = [sex_counts['M'], sex_counts['F']]
    plt.pie(x=x, labels=labels, autopct='%1.2f%%', explode=explode)

plt.tight_layout()
plt.show()

Observation:
* MS Estonia had more female crew members than males.
* However in passenger category, males were more. 

### 4.2.g. Survival Vs Country

In [None]:
# Mean of `Survived` per country group, then the same rates printed in percent.
sns.barplot(data=df, x='Ctry', y='Survived')
plt.show()
print("Survival Rate :")
for template, country in (("Sweden : {}", 'Sweden'),
                          ("Estonia : {}", 'Estonia'),
                          ("Others : {}", 'Others')):
    rate_pct = round(df.Survived[df.Ctry==country].mean()*100,2)
    print(template.format(rate_pct))

Observation:
* SURVIVAL OF THE ESTONIEST!
* Estonians are twice as likely to survive as Swedes.
* Prima facie, the odds of survival seem stacked against Swedish nationals; however, further investigation is required to establish the relationship.

### 4.2.h. Category Vs Country

In [None]:
# One pie per category showing how its members split across country groups.
plt.figure(figsize=(8,4))
labels = df.Ctry.unique()
for position, (category, title) in enumerate([('P', 'Passengers'), ('C', "Crew")], start=1):
    plt.subplot(1, 2, position)
    plt.title(title)
    country_counts = df[df.Category==category].Ctry.value_counts()
    # Look the counts up in `labels` order so slices and labels line up.
    x = [country_counts[country] for country in labels]
    plt.pie(x=x, labels=labels, autopct='%1.2f%%')

plt.tight_layout()
plt.show()

Observation: 
* Around 90% of the crew members are Estonians whereas Swedes constitute only 7.7%
* Swedes constitute two-thirds in Passenger category. 

### 4.2.i. Age Vs Country

In [None]:
# Overlay the age distribution of each country group.
plt.title("Age Vs Nationality")
sns.kdeplot(data=df.Age[df.Ctry=='Sweden'], label='Sweden')
sns.kdeplot(data=df.Age[df.Ctry=='Estonia'], label='Estonia')
sns.kdeplot(data=df.Age[df.Ctry=='Others'], label='Others')
# An explicit legend is needed to tell the three curves apart.
plt.legend()
plt.show()

print("Mean Age")
print("Sweden : {}".format(round(df.Age[df.Ctry=='Sweden'].mean(),2)))
print("Estonia : {}".format(round(df.Age[df.Ctry=='Estonia'].mean(),2)))
print("Others : {}".format(round(df.Age[df.Ctry=='Others'].mean(),2)))

Observation:
* Swedish passengers were considerably older than others on an average.

### 4.2.j. Sex Vs Country

In [None]:
# One pie per country group showing its male/female split.
plt.figure(figsize=(8,4))
labels = ['Male', 'Female']
explode = [0.05, 0.05]
plt.suptitle("GENDER VS NATIONALITY")

for position, country in enumerate(['Sweden', 'Estonia', 'Others'], start=1):
    plt.subplot(1, 3, position)
    plt.title(country)
    sex_counts = df[df.Ctry==country].Sex.value_counts()
    # Slice order must match `labels`: males first, then females.
    x = [sex_counts['M'], sex_counts['F']]
    plt.pie(x=x, labels=labels, explode=explode, autopct='%1.2f%%')

plt.tight_layout()
plt.show()

Observation:
* Proportion of female passengers is more for Sweden.
* <b>From 4.2.g it seems that Swedish nationality lowers the survival rate. However, from 4.2.h, 4.2.i and 4.2.j it is clear that Swedes were older, had a higher proportion of females and were the majority in the passenger category, all of which are risk factors.</b>

### 4.3 Relations Between 3 Variables

### 4.3.a. Age Vs Sex Vs Survived

In [None]:
# Age spread for each sex, split by survival outcome.
sns.boxplot(x='Age', y='Sex', hue='Survived', data=df)
plt.show()

### 4.3.b. Age Vs Category Vs Survival

In [None]:
# Age distribution of survivors vs. non-survivors, one panel per category.
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plt.title('P Category')
sns.kdeplot(data=df[(df.Category=='P') & (df.Survived==1)].Age, label='Survived')
sns.kdeplot(data=df[(df.Category=='P') & (df.Survived==0)].Age, label='Not Survived')
# Each panel needs its own legend; otherwise the curves are indistinguishable.
plt.legend()
plt.subplot(1,2,2)
plt.title("C Category")
sns.kdeplot(data=df[(df.Category=='C') & (df.Survived==1)].Age,label='Survived')
sns.kdeplot(data=df[(df.Category=='C') & (df.Survived==0)].Age,label='Not Survived')
plt.legend()
plt.tight_layout()
plt.show()

### 4.3.c. Age Vs Country Vs Survival

In [None]:
# Age spread for each country group, split by survival outcome.
sns.boxplot(x='Age', y='Ctry', hue='Survived', data=df)

### 4.3.d. Age Vs Sex Vs Category

In [None]:
# Age spread for each sex, split by category.
sns.boxplot(x='Age', y='Sex', hue='Category', data=df)

### 4.3.e. Age Vs Country Vs Category

In [None]:
# Age spread for each country group, split by category.
sns.boxplot(x='Age', y='Ctry', hue='Category', data=df)

### 5. Correlation

In [None]:
# Pairwise correlation of the numeric columns only. Sex, Category and Ctry
# hold strings, and DataFrame.corr() raises on non-numeric columns in
# pandas 2.x, so they are filtered out explicitly first.
df.select_dtypes(include='number').corr()

### 5.1. Chi Square Test

TO BE DONE

### 6. Preprocessing for ML

### 6.1 Split Features and Target

In [None]:
# Target is the Survived column; every other column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

### 6.2. Scaling Numeric Features

In [None]:
# Rescale Age, the only numeric feature, with min-max scaling.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling range — consider fitting on the
# training rows only.
scalar = MinMaxScaler()
age_column = X[['Age']].to_numpy()
X['Age'] = scalar.fit_transform(age_column).ravel()
X.head()

### 6.3. Encoding Categorical Features

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
# so the remaining indicator columns are not redundant.
X = pd.get_dummies(data=X, drop_first=True)
X.head()

### 6.4. Divide the dataset into train, test and validation sets

In [None]:
# Hold out 20% of the data for testing, then 20% of the remainder for
# validation; both splits use the same fixed seed.
split_options = dict(test_size=0.2, random_state=42)
X_train_a, X_test, y_train_a, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train_a, y_train_a, **split_options)
for split_label, features in (('Train Shape :', X_train),
                              ('Val Shape :', X_val),
                              ('Test Shape :', X_test)):
    print(split_label, features.shape)

### 7 Fitting Model and Evaluation

### 7.1. Logistic Regression

In [None]:
# Fit a default logistic regression and report its score (in percent) on the
# train, validation and test splits.
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
for template, features, target in (('Score in Train : {}', X_train, y_train),
                                   ('Score in Validation : {}', X_val, y_val),
                                   ('Score in Test : {}', X_test, y_test)):
    print(template.format(round(model_lr.score(features, target)*100, 4)))
# Predictions are kept for the stacking step.
yval_lr = model_lr.predict(X_val)
ytest_lr = model_lr.predict(X_test)

### 7.1.a. Coefficients for each feature

In [None]:
# Bar chart of the logistic-regression coefficient of each feature.
# x and y are passed by keyword: recent seaborn releases reject two
# positional arguments.
sns.barplot(x=X_train.columns, y=model_lr.coef_[0])
plt.xticks(rotation=60)
plt.show()

### 7.2. Support Vector Machine

In [None]:
# Fit a support vector classifier with a fixed seed and report its score
# (in percent) on the train, validation and test splits.
model_svc = SVC(random_state=42)
model_svc.fit(X_train, y_train)
for template, features, target in (('Score in Train : {}', X_train, y_train),
                                   ('Score in Validation : {}', X_val, y_val),
                                   ('Score in Test : {}', X_test, y_test)):
    print(template.format(round(model_svc.score(features, target)*100, 4)))
# Predictions are kept for the stacking step.
yval_svc = model_svc.predict(X_val)
ytest_svc = model_svc.predict(X_test)

### 7.3. Decision Tree

In [None]:
# Fit an entropy-based decision tree and report its score (in percent) on
# each split. The seed is fixed, matching the split and the SVC, so scores
# and feature importances are the same on every run.
model_dt=DecisionTreeClassifier(criterion='entropy', random_state=42)
model_dt.fit(X_train,y_train)
print('Score in Train : {}'.format(round(model_dt.score(X_train,y_train)*100,4)))
print('Score in Validation : {}'.format(round(model_dt.score(X_val,y_val)*100,4)))
print('Score in Test : {}'.format(round(model_dt.score(X_test,y_test)*100,4)))
yval_dt=model_dt.predict(X_val)
ytest_dt=model_dt.predict(X_test)

### 7.3.a. Feature importance

In [None]:
# Bar chart of the decision tree's importance for each feature.
# x and y are passed by keyword: recent seaborn releases reject two
# positional arguments.
sns.barplot(x=X_train.columns, y=model_dt.feature_importances_)
plt.xticks(rotation=60)
plt.show()

### 7.4. Random Forest 

In [None]:
# Fit a 100-tree random forest and report its score (in percent) on each
# split. The seed is fixed, matching the split and the SVC, so scores and
# feature importances are the same on every run.
model_rf=RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train,y_train)
print('Score in Train : {}'.format(round(model_rf.score(X_train,y_train)*100,4)))
print('Score in Validation : {}'.format(round(model_rf.score(X_val,y_val)*100,4)))
print('Score in Test : {}'.format(round(model_rf.score(X_test,y_test)*100,4)))
yval_rf=model_rf.predict(X_val)
ytest_rf=model_rf.predict(X_test)

### 7.4.a. Feature Importance

In [None]:
# Bar chart of the random forest's importance for each feature.
# x and y are passed by keyword: recent seaborn releases reject two
# positional arguments.
sns.barplot(x=X_train.columns, y=model_rf.feature_importances_)
plt.xticks(rotation=60)
plt.show()

### 7.5. Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes classifier and report its score (in percent) on
# the train, validation and test splits.
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
for template, features, target in (('Score in Train : {}', X_train, y_train),
                                   ('Score in Validation : {}', X_val, y_val),
                                   ('Score in Test : {}', X_test, y_test)):
    print(template.format(round(model_nb.score(features, target)*100, 4)))
yval_nb = model_nb.predict(X_val)
ytest_nb = model_nb.predict(X_test)

### 7.6. XG Boost

In [None]:
# Fit a shallow gradient-boosted model and report its score (in percent) on
# each split.
# n_jobs=-1 uses the cores that are available instead of a hard-coded 100
# threads.
# NOTE(review): learning_rate=0.00005 over 600 rounds is a very small total
# step — confirm the model is doing more than predicting the majority class.
model_xgb=XGBClassifier(learning_rate=0.00005, n_estimators=600,n_jobs=-1, max_depth=2)
model_xgb.fit(X_train,y_train)
print('Score in Train : {}'.format(round(model_xgb.score(X_train,y_train)*100,4)))
print('Score in Validation : {}'.format(round(model_xgb.score(X_val,y_val)*100,4)))
print('Score in Test : {}'.format(round(model_xgb.score(X_test,y_test)*100,4)))
# Predictions are kept for the stacking step.
yval_xgb=model_xgb.predict(X_val)
ytest_xgb=model_xgb.predict(X_test)

### 7.6.a. Feature Importance

In [None]:
# Bar chart of the XGBoost model's importance for each feature.
# x and y are passed by keyword: recent seaborn releases reject two
# positional arguments.
sns.barplot(x=X_train.columns, y=model_xgb.feature_importances_)
plt.xticks(rotation=60)
plt.show()

### 7.7. Model Comparison

In [None]:
# Score every fitted model on the validation and test splits (in percent).
# `fitted_models` is listed in the same order as the short names in `models`.
models = ["LR","SVM","DTC","RFC","NB", "XGB"]
fitted_models = [model_lr, model_svc, model_dt, model_rf, model_nb, model_xgb]
scores_val = [round(m.score(X_val,y_val)*100,4) for m in fitted_models]
scores_test = [round(m.score(X_test,y_test)*100,4) for m in fitted_models]

df_scores = pd.DataFrame({'Model':models,'Score_val':scores_val, 'Score_test':scores_test})
df_scores = df_scores.sort_values(by=['Score_val','Score_test'], ascending=False)

# Overlay validation (blue) and test (red) bars on the same axes.
plt.title('Model Score Comparison')
for column, colour, legend_label, opacity in (('Score_val', 'blue', 'Validation', 0.8),
                                              ('Score_test', 'red', 'Test', 0.5)):
    sns.barplot(data=df_scores, x='Model', y=column, color=colour,
                label=legend_label, alpha=opacity)
plt.legend()
plt.ylim(70,90)
plt.show()

OBSERVATION:
* SVM, XGB and Logistic Regression have the highest accuracy in validation dataset.
* SVM and XGB have the highest accuracy in test data.
* Most important feature for survival is 'Sex' followed by 'Age' in XGB Classifier.
* Random Forest and Decision Tree models give more importance to 'Age' and 'Sex' followed by 'Category' and 'Country'.
* Logistic regression also provides more weightage to 'Age' followed by 'Sex'
* From all the models, it is clear that Nationality is not a determining factor in survival prediction.

### 8. Stacking Models

In [None]:
# Stack the LR, SVC and XGB predictions column-wise; these columns are the
# features of the meta model.
base_val_preds = (yval_lr, yval_svc, yval_xgb)
base_test_preds = (ytest_lr, ytest_svc, ytest_xgb)
yval_stacked = np.column_stack(base_val_preds)
ytest_stacked = np.column_stack(base_test_preds)

# The meta model is fitted on the validation predictions, so the validation
# score printed below is measured on the same rows it was trained on.
meta_model = LogisticRegression()
meta_model.fit(yval_stacked, y_val)
val_score = round(meta_model.score(yval_stacked,y_val)*100,4)
test_score = round(meta_model.score(ytest_stacked,y_test)*100,4)
print('Score in Validation : ', val_score)
print('Score in Test : ', test_score)

### 8.1 Meta Model vs Individual Models

In [None]:
# Test-set score table for the individual models plus the stacked meta model.
# Fresh lists are built instead of appending to `models` / `scores_test`, so
# re-running this cell does not add a duplicate 'Meta Model' row.
meta_score = round(meta_model.score(ytest_stacked,y_test)*100,4)
df_scores = pd.DataFrame({'Model': models + ['Meta Model'],
                          'Score_test': scores_test + [meta_score]})

In [None]:
# Rank the models by test score, best first, and chart them.
df_scores = df_scores.sort_values(by='Score_test', ascending=False)
plt.title("Model Performance")
sns.barplot(data=df_scores, x='Model', y='Score_test')
plt.xlabel("Model")
plt.ylabel("Accuracy Score")
plt.ylim(70,90)
plt.show()

OBSERVATION:
* The stacked meta model is as good as Logistic Regression, SVC and XGB model.

# Please Upvote the Notebook if you find it useful