# Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle
%matplotlib inline

# Titanic Survival Prediction App
**This project is deployed as a Streamlit app. Explore and play around with the app to see the value that model deployment adds.**

**[Titanic Survival Prediction App link](https://sudhanshu2198-titanic-survival-prediction-a-introduction-vigidy.streamlit.app/)**

**[Github Link](https://github.com/sudhanshu2198/Titanic-Survival-Prediction-App)**

# Data Summary

In [None]:
# Read the Kaggle Titanic training set and preview its first rows.
data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
data.head()

In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

In [None]:
# Descriptive statistics of the numeric columns, transposed so each column is a row.
data.describe().T

**Name, Ticket and PassengerId are not useful features, and Cabin contains a large number of missing values, so it is not helpful either.**

# Data Cleaning

In [None]:
# Drop the columns judged uninformative (Name, Ticket, PassengerId) and the
# mostly-missing Cabin column.
data = data.drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin'])

# Give the terse Kaggle column names more readable ones.
change = {"Pclass": "Class",
          "SibSp": "No_of_siblings",
          "Parch": "No_of_parents"}
data = data.rename(columns=change)

# Replace coded values with human-readable labels.
data["Survived"] = data["Survived"].map({0: "Not Survived", 1: "Survived"})
# NOTE(review): in the Kaggle data dictionary Pclass 1 is the *upper* class and
# 3 the lower, so these labels look inverted. They are kept as-is because the
# submission cell and the persisted one-hot encoder use the same labels; any
# relabelling has to be done everywhere at once -- confirm before changing.
data["Class"] = data["Class"].map({1: "Lower", 2: "Middle", 3: "Upper"})
data["Embarked"] = data["Embarked"].map({"C": "Cherbourg",
                                         "Q": "Queenstown",
                                         "S": "Southampton"})

# Cap the family counts at 4 so the rare larger values share one bucket.
data["No_of_siblings"] = data["No_of_siblings"].clip(upper=4)
data["No_of_parents"] = data["No_of_parents"].clip(upper=4)

# Impute missing values. Assign the result back instead of calling
# `data[col].fillna(..., inplace=True)`: that form operates on a temporary
# under pandas Copy-on-Write and leaves `data` unchanged, and
# `fillna(method=...)` is deprecated in favour of `.ffill()`.
data["Fare"] = data["Fare"].fillna(data["Fare"].median())
data["Age"] = data["Age"].fillna(data["Age"].median())
data["Embarked"] = data["Embarked"].ffill()

data.head()

# Data Visualization

In [None]:
# Share of passengers in each ticket class.
df = data["Class"].value_counts()
# The title now names the plotted column (it previously said "Survived").
fig = px.pie(values=df.values, names=df.index, title='Class Distribution')
fig.show()

In [None]:
# Share of passengers of each sex.
df = data["Sex"].value_counts()
# The title now names the plotted column (it previously said "Survived").
fig = px.pie(values=df.values, names=df.index, title='Sex Distribution')
fig.show()

In [None]:
# Share of passengers boarding at each port.
df = data["Embarked"].value_counts()
# The title now names the plotted column (it previously said "Survived").
fig = px.pie(values=df.values, names=df.index, title='Embarked Distribution')
fig.show()

In [None]:
# Distribution of passenger ages.
fig = px.histogram(data_frame=data, x="Age")
fig.show()

In [None]:
# Distribution of ticket fares.
fig = px.histogram(data_frame=data, x="Fare")
fig.show()

In [None]:
# Distribution of the (capped) No_of_siblings column.
fig = px.histogram(data_frame=data, x="No_of_siblings")
fig.show()

In [None]:
# Distribution of the (capped) No_of_parents column.
fig = px.histogram(data_frame=data, x="No_of_parents")
fig.show()

In [None]:
# Age against No_of_siblings, coloured by survival outcome.
fig = px.scatter(
    data_frame=data,
    x="Age",
    y="No_of_siblings",
    color="Survived",
)
fig.show()

In [None]:
# Passenger counts per ticket class, split by survival outcome.
# `.size()` counts rows directly instead of counting an unrelated proxy column.
df = data.groupby(["Class", "Survived"]).size().reset_index(name="Passengers")
# Passing the frame plus column names gives meaningful axis and legend labels
# (bare Series produce the generic "x" / "y" / "color").
fig = px.bar(df, x="Class", y="Passengers", color="Survived",
             title="Survival by Class")
fig.show()

In [None]:
# Passenger counts per sex, split by survival outcome.
# `.size()` counts rows directly instead of counting an unrelated proxy column.
df = data.groupby(["Sex", "Survived"]).size().reset_index(name="Passengers")
# Passing the frame plus column names gives meaningful axis and legend labels
# (bare Series produce the generic "x" / "y" / "color").
fig = px.bar(df, x="Sex", y="Passengers", color="Survived",
             title="Survival by Sex")
fig.show()

In [None]:
# Passenger counts per port of embarkation, split by survival outcome.
# `.size()` counts rows directly instead of counting an unrelated proxy column.
df = data.groupby(["Embarked", "Survived"]).size().reset_index(name="Passengers")
# Passing the frame plus column names gives meaningful axis and legend labels
# (bare Series produce the generic "x" / "y" / "color").
fig = px.bar(df, x="Embarked", y="Passengers", color="Survived",
             title="Survival by Port of Embarkation")
fig.show()

In [None]:
# Age spread for each sex, split by survival outcome.
fig = px.box(data_frame=data, x="Sex", y="Age", color="Survived")
fig.show()

In [None]:
# Fare spread for each sex, split by survival outcome.
fig = px.box(data_frame=data, x="Sex", y="Fare", color="Survived")
fig.show()

In [None]:
# Age spread for each ticket class, split by survival outcome.
fig = px.box(data_frame=data, x="Class", y="Age", color="Survived")
fig.show()

In [None]:
# Passenger counts nested by port of embarkation, ticket class and survival.
# Group only by the levels shown in the treemap and store the row count in a
# column named for what it holds (it was previously called "Fare", which made
# the hover text mislabel counts as fares).
df = (data.groupby(["Embarked", "Class", "Survived"])
          .size()
          .reset_index(name="Passengers"))
fig = px.treemap(df, path=['Embarked', 'Class', 'Survived'], values='Passengers',
                 title='Passengers by Port, Class and Survival')
fig.show()

# Data Preprocessing

In [None]:
# Separate the target column from the predictors.
y = data["Survived"]
X = data.drop(columns=["Survived"])

# Turn the string survival labels into integer codes.
lencoder = LabelEncoder()
y = lencoder.fit_transform(y)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit the one-hot encoder on the training categoricals and encode them.
ohencoder = OneHotEncoder()
ohencoder.fit(X_train[['Class', 'Sex', 'Embarked']])
cat_df = ohencoder.transform(X_train[['Class', 'Sex', 'Embarked']]).toarray()

# Numeric columns are passed through unchanged, placed before the one-hot columns.
num_df = X_train[['Age', 'No_of_siblings', 'No_of_parents', 'Fare']].to_numpy()

# NOTE: X_train is rebound from a DataFrame to a plain array here, so this cell
# cannot be re-run without first re-running the train/test split.
X_train = np.hstack((num_df, cat_df))

In [None]:
# Baseline: 5-fold stratified cross-validation of an untuned random forest.
cv = StratifiedKFold(n_splits=5)
scores = cross_val_score(
    RandomForestClassifier(random_state=42),
    X_train,
    y_train,
    scoring="accuracy",
    cv=cv,
)

for template, value in (
    ("RandomForest model accuracy is: {}\n", scores.mean()),
    ("RandomForest model accuracy standard deviation is: {}", scores.std()),
):
    print(template.format(value))

# Optimization

In [None]:
# Hyper-parameter grid explored for the random forest.
params = dict(
    n_estimators=list(range(80, 150, 10)),
    max_depth=list(range(4, 11)),
    min_samples_split=list(range(2, 6)),
)

# Exhaustive search over the grid, scored by accuracy on the same CV folds
# as the baseline and run on all available cores.
search_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    params,
    scoring="accuracy",
    n_jobs=-1,
    cv=cv,
)
search_grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated accuracy found by the grid search.
search_grid.best_score_

In [None]:
# Hyper-parameter combination that achieved the best score.
search_grid.best_params_

# Prediction

In [None]:
# Use the best estimator found by the grid search as the final model.
model=search_grid.best_estimator_

In [None]:
# Encode the hold-out set with the encoder fitted on the training data
# (transform only -- no refitting) and predict with the tuned model.
cat_df = ohencoder.transform(X_test[['Class', 'Sex', 'Embarked']]).toarray()
num_df = X_test[['Age', 'No_of_siblings', 'No_of_parents', 'Fare']].values

# Store the encoded matrix under its own name: overwriting X_test with an
# array made this cell fail on re-run, because an ndarray cannot be indexed
# by column names.
X_test_encoded = np.concatenate((num_df, cat_df), axis=1)
y_pred = model.predict(X_test_encoded)

In [None]:
# Per-class precision, recall and F1 of the tuned model on the hold-out set.
print(f'Classification_report: \n\n {classification_report(y_test,y_pred)}')

In [None]:
# Confusion matrix of the hold-out predictions, labelled with the class names.
# The label now reads 'Not Survived', matching the target labels used when the
# data was cleaned (it previously read 'Non-Survived').
val = ['Not Survived', 'Survived']

# Use a dedicated name: rebinding `data` here replaced the cleaned dataset and
# broke any earlier cell that was re-run afterwards.
cm_df = pd.DataFrame(confusion_matrix(y_test, y_pred), columns=val, index=val)

# Draw on an explicit Axes so the labels are attached to this figure.
cm_fig, cm_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm_df, annot=True, cbar=False, cmap='Blues', fmt='g', ax=cm_ax)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label');

# Submission

In [None]:
# Load the Kaggle test set and apply the same cleaning steps as the training data.
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Keep the passenger ids for the submission file before dropping the column.
p_id = test['PassengerId']
test = test.drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin'])

change = {"Pclass": "Class",
          "SibSp": "No_of_siblings",
          "Parch": "No_of_parents"}
test = test.rename(columns=change)

# These labels must match the ones the one-hot encoder was fitted on.
test["Class"] = test["Class"].map({1: "Lower", 2: "Middle", 3: "Upper"})
test["Embarked"] = test["Embarked"].map({"C": "Cherbourg",
                                         "Q": "Queenstown",
                                         "S": "Southampton"})

# Cap the family counts at 4, as done for the training data.
test["No_of_siblings"] = test["No_of_siblings"].clip(upper=4)
test["No_of_parents"] = test["No_of_parents"].clip(upper=4)

# Impute missing values. Assign the result back instead of calling
# `test[col].fillna(..., inplace=True)`: that form operates on a temporary
# under pandas Copy-on-Write and leaves `test` unchanged, and
# `fillna(method=...)` is deprecated in favour of `.ffill()`.
# NOTE(review): the medians come from the test set itself, not from the
# training data -- confirm this is intended.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Embarked"] = test["Embarked"].ffill()

# Encode with the already-fitted encoder; numeric columns come first,
# followed by the one-hot columns.
cat_df = ohencoder.transform(test[['Class', 'Sex', 'Embarked']]).toarray()
num_df = test[['Age', 'No_of_siblings', 'No_of_parents', 'Fare']].values

# `test` is rebound to the encoded array because the prediction cell reads it
# under this name.
test = np.concatenate((num_df, cat_df), axis=1)

In [None]:
# Predict on the encoded Kaggle test matrix with the tuned model.
pred=model.predict(test)

In [None]:
# Pair each passenger id with its prediction and write the submission file.
submission_columns = {
    'PassengerId': p_id,
    "Survived": pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

# Model Persisting

In [None]:
# Persist the fitted one-hot encoder, the tuned model and the label encoder.
# Each file is opened in a `with` block so the handle is flushed and closed
# once the dump finishes (the bare `open(...)` calls were never closed).
for artifact, path in ((ohencoder, "ohencoder.pkl"),
                       (model, "model.pkl"),
                       (lencoder, "lencoder.pkl")):
    with open(path, "wb") as handle:
        pickle.dump(artifact, handle)