In [None]:
import os
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the training CSV into `dat`; the relative path assumes the dataset is
# available under ../input/titanicdataset-traincsv/.
dat=pd.read_csv("../input/titanicdataset-traincsv/train.csv")

In [None]:
dat.head()

In [None]:
# Encode Sex as a binary indicator: 1 where the value is "male", 0 for anything else.
dat['Sex'] = dat['Sex'].eq("male").astype("int64")

In [None]:
dat.isnull().sum()

In [None]:
dat.info()

### Missing value treatment

In [None]:
dat['Embarked'].value_counts()

In [None]:
dat[dat["Embarked"].isnull()]

In [None]:
# Box plot of Fare grouped by Embarked, used to compare fare levels across ports.
dat.boxplot(column="Fare",by = "Embarked")
plt.show()

In [None]:
# Fill the missing Embarked values with "C": going by the Fare-by-Embarked box
# plot, the fares of the rows lacking Embarked were judged closest to the
# median fare of port C.
dat["Embarked"]=dat["Embarked"].fillna("C")

In [None]:
dat.shape

In [None]:
# Box plot of Age for each Sex value.
# `sns.factorplot` was renamed to `sns.catplot` and has since been removed from
# seaborn, and the "seaborn-whitegrid" matplotlib style name is no longer
# shipped by recent matplotlib releases, so the white-grid look is requested
# through seaborn itself.
sns.set_style("whitegrid")
sns.catplot(x="Sex", y="Age", data=dat, kind="box")
plt.show()

In [None]:
# Impute missing ages with the median age of the passenger's own Pclass group.
class_median_age = dat.groupby('Pclass')['Age'].transform('median')
dat['Age'] = dat['Age'].fillna(class_median_age)

In [None]:
# Median age per Pclass, shown as a compact three-row summary. This is computed
# on the Age column as it stands when the cell runs, i.e. after any imputation
# already applied to `dat`.
# The previous version also summed ages per class, but that result was never
# displayed (it was not the cell's last expression) and it then printed the
# median broadcast to every row instead of one value per class.
dat.groupby('Pclass')['Age'].median()

In [None]:
dat["Age"].describe()

### EDA

In [None]:
dat.head()

In [None]:
import re

# First run of ASCII letters that is immediately followed by a period; the
# period is kept in the match. Raw string avoids the invalid "\." escape, and
# the pattern is compiled once instead of once per row.
TITLE_PATTERN = re.compile(r"[A-Za-z]+\.")


def _extract_title(name):
    """Return the first "<letters>." token found in `name`, or NaN when there is none."""
    found = TITLE_PATTERN.search(name)
    # Guard against names without a title instead of failing on None.group().
    return found.group() if found else np.nan


# na_action='ignore' leaves missing names as NaN rather than passing them to the regex.
dat['Title'] = dat['Name'].map(_extract_title, na_action='ignore')

In [None]:
dat["Title"].unique()

In [None]:
# Collapse uncommon or variant titles into a smaller set of categories.
# Keys are the target category, values are the raw titles folded into it.
title_groups = {
    'Rare.': ['Capt.', 'Col.', 'Don.', 'Dr.', 'Major.', 'Rev.', 'Jonkheer.', 'Dona.'],
    'Royal.': ['Countess.', 'Lady.', 'Sir.'],
    'Miss.': ['Mlle.', 'Ms.'],
    'Mrs.': ['Mme.'],
}
title_replacements = {
    raw_title: category
    for category, raw_titles in title_groups.items()
    for raw_title in raw_titles
}
dat['Title'] = dat['Title'].replace(title_replacements)

In [None]:
# Number of passengers for each (Title, Survived) combination.
# A list with the "size" alias replaces the set literal `{np.size}`: sets are
# unordered and are not a documented way to pass aggregations. The result is
# still a one-column frame named "size".
dat.groupby(["Title","Survived"])["Survived"].agg(["size"])

In [None]:
dat["Title"].unique().tolist()

In [None]:
# Title mapping: integer code assigned to each title category.
title_mapping = {"Mr.": 1, "Miss.": 2, "Mrs.": 3, "Master.": 4, "Royal.": 5, "Rare.": 6}

In [None]:
# Replace each title string with its integer code from title_mapping.
# NOTE(review): Series.map yields NaN for values that are not dictionary keys,
# so re-running this cell on already-mapped data would blank the column.
dat["Title"]=dat["Title"].map(title_mapping)

In [None]:
# Family-size feature: SibSp + Parch + 1 (the +1 presumably counts the passenger).
dat['fam'] = dat['SibSp'].add(dat['Parch']).add(1)

In [None]:
# Drop the columns already folded into `fam` plus the columns not used as features.
unused_columns = ["SibSp", "Parch", "PassengerId", "Name", "Cabin", "Ticket"]
dat = dat.drop(columns=unused_columns)

In [None]:
dat.head()

### Dividing the dataset into X and Y

In [None]:
# Feature matrix: every column of `dat` except the target.
X = dat.drop(columns=["Survived"])

In [None]:
# Target vector: the Survived column of `dat`.
y=dat["Survived"]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
# One-hot encode X with pandas' default column selection (non-numeric columns
# are expanded into indicator columns; numeric columns pass through).
X=pd.get_dummies(X)

In [None]:
X.head()

In [None]:
X.info()

In [None]:
def bar_chart(feature, data=None):
    """Draw a stacked bar chart comparing a column's value counts by outcome.

    One bar is drawn for rows with ``Survived == 1`` and one for rows with
    ``Survived == 0``; each bar is stacked by the distinct values of `feature`.

    Parameters
    ----------
    feature : str
        Name of the column whose value counts are compared.
    data : pandas.DataFrame, optional
        Frame holding `feature` and a ``Survived`` column. When omitted, the
        notebook-level `dat` frame is used, which matches the original
        behaviour of this helper.
    """
    frame = dat if data is None else data
    survived = frame.loc[frame['Survived'] == 1, feature].value_counts()
    dead = frame.loc[frame['Survived'] == 0, feature].value_counts()

    counts = pd.DataFrame([survived, dead])
    counts.index = ['Survived', 'Dead']
    # A value that occurs in only one outcome group would otherwise be NaN.
    counts = counts.fillna(0)

    ax = counts.plot(kind='bar', stacked=True, figsize=(10, 5), rot=0)
    # Label the figure so it can be read without the surrounding cells.
    ax.set_title("Survival counts by {}".format(feature))
    ax.set_xlabel("Outcome")
    ax.set_ylabel("Number of passengers")
    ax.legend(title=feature)

In [None]:
bar_chart('Sex')

In [None]:
bar_chart('Pclass')

In [None]:
# Histogram of Age in 40 bins; the title is set on the axes the plot returns.
age_axes = dat['Age'].hist(bins=40, color='salmon')
age_axes.set_title("AGE", size=20)

In [None]:
bar_chart('Title')

In [None]:
bar_chart('fam')

In [None]:
bar_chart('Embarked')

#### Creating training and testing sets and fitting baseline models

In [None]:
import sklearn.model_selection as model_selection

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
split_parts = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=200
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Accuracy score without hyperparameter tuning
import sklearn.metrics as metrics
def fit_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    The value of ``metrics.accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    return metrics.accuracy_score(y_test, model.predict(X_test))

In [None]:
import sklearn.tree as tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Baseline models with library-default hyperparameters. All three estimators
# use randomness internally, so a fixed seed is passed to make the reported
# baseline accuracies reproducible from one run to the next.
BASELINE_SEED = 200

dt_classifier = tree.DecisionTreeClassifier(random_state=BASELINE_SEED)
rf_classifier = RandomForestClassifier(random_state=BASELINE_SEED)
gb_classifier = GradientBoostingClassifier(random_state=BASELINE_SEED)

# Each call fits the model on the training split and scores it on the test split.
dt_accuracy = fit_evaluate_model(dt_classifier, X_train, y_train, X_test, y_test)
rf_accuracy = fit_evaluate_model(rf_classifier, X_train, y_train, X_test, y_test)
gb_accuracy = fit_evaluate_model(gb_classifier, X_train, y_train, X_test, y_test)

In [None]:
# Report the baseline (untuned) test accuracies, one model per line.
baseline_scores = (
    ("Decision Tree : ", dt_accuracy),
    ("Random Forest : ", rf_accuracy),
    ("GradientBoosting : ", gb_accuracy),
)
for label, score in baseline_scores:
    print(label, score)

In [None]:
#Now accuracy score with hyperparameter tuning


#### Decision Tree

In [None]:
# Decision tree limited to depth 3 with a fixed seed, fitted on the training split.
clf=tree.DecisionTreeClassifier(max_depth=3,random_state=200)
clf.fit(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validated search over tree depth, using `clf` as the base estimator.
depth_grid = {'max_depth': [2, 3, 4, 5, 6]}
mod = GridSearchCV(clf, param_grid=depth_grid)
mod.fit(X_train, y_train)

In [None]:
mod.best_estimator_

In [None]:
mod.best_params_

In [None]:
# Finalizing max_depth as 3: rebuild the decision tree with that depth and the
# same seed, then refit it on the training split.
clf=tree.DecisionTreeClassifier(max_depth=3,random_state=200)
clf.fit(X_train,y_train)

In [None]:
# Confusion matrix for the decision tree; arguments are (actual labels, predicted labels).
metrics.confusion_matrix( y_test, clf.predict(X_test))

In [None]:
# Test-set accuracy of the decision tree `clf`.
mod1=metrics.accuracy_score(y_test, clf.predict(X_test))

In [None]:
mod1

In [None]:
# Random Forest: 80 trees, out-of-bag scoring enabled, all CPU cores (n_jobs=-1),
# fixed seed; fitted on the training split.
rf=RandomForestClassifier(n_estimators=80,oob_score=True,n_jobs=-1,random_state=400)
rf.fit(X_train,y_train)

In [None]:
rf.oob_score_

In [None]:
# Compare out-of-bag scores for forests of 10, 20, ..., 140 trees.
# `rf` is rebound on every iteration, so it ends up holding the last forest fitted.
for n_trees in range(10, 150, 10):
    rf = RandomForestClassifier(
        n_estimators=n_trees,
        oob_score=True,
        n_jobs=-1,
        random_state=400,
    )
    rf.fit(X_train, y_train)
    print('For n_estimators = ' + str(n_trees))
    print('OOB score is ' + str(rf.oob_score_))
    print('************************')

In [None]:
# Finalizing n_estimators as 70: rebuild the forest with that size (other
# settings unchanged) and refit it on the training split.
rf=RandomForestClassifier(n_estimators=70,oob_score=True,n_jobs=-1,random_state=400)
rf.fit(X_train,y_train)

In [None]:
rf.oob_score_

In [None]:
rf.feature_importances_

In [None]:
# Random-forest feature importances labelled by feature name; the cell
# displays them from largest to smallest (imp_feat itself stays unsorted).
imp_feat = pd.Series(data=rf.feature_importances_, index=list(X.columns))
imp_feat.sort_values(ascending=False)

In [None]:
# Test-set accuracy of the random forest `rf`.
mod2=metrics.accuracy_score(y_test, rf.predict(X_test))

In [None]:
mod2

In [None]:
# Gradient Boosting: 80 boosting stages, trees of depth 2, fixed seed; fitted on
# the training split.
gb=GradientBoostingClassifier(n_estimators=80,random_state=400, max_depth=2)
gb.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the number of boosting stages: 20, 40, ..., 200.
n_estimators_grid = {'n_estimators': list(range(20, 201, 20))}
mod = GridSearchCV(gb, param_grid=n_estimators_grid)
mod.fit(X_train, y_train)

In [None]:
mod.best_estimator_

In [None]:
mod.best_params_

In [None]:
# Rebuild gradient boosting with 140 boosting stages (depth 2, same seed) and
# refit it on the training split.
gb=GradientBoostingClassifier(n_estimators=140,random_state=400, max_depth=2)
gb.fit(X_train,y_train)

In [None]:
gb.feature_importances_

In [None]:
# Gradient-boosting feature importances labelled by feature name; the cell
# displays them from largest to smallest (feature_imp itself stays unsorted).
feature_imp=pd.Series(gb.feature_importances_,index=X.columns)
feature_imp.sort_values(ascending=False)

In [None]:
#For depth
from sklearn.model_selection import GridSearchCV

# Search tree depth for the gradient-boosting model. The base estimator must
# be `gb`: passing `clf` here would re-tune the decision tree and say nothing
# about gradient boosting.
mod = GridSearchCV(gb, param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8]})
mod.fit(X_train, y_train)

In [None]:
mod.best_estimator_

In [None]:
# Finalizing max_depth=2 and n_estimators=140: rebuild gradient boosting with
# those settings and refit it on the training split.
gb=GradientBoostingClassifier(n_estimators=140,random_state=400, max_depth=2)
gb.fit(X_train,y_train)

In [None]:
# Test-set accuracy of the gradient-boosting model. Predictions must come from
# `gb`; scoring `clf` here would just repeat the decision-tree accuracy.
mod3 = metrics.accuracy_score(y_test, gb.predict(X_test))

In [None]:
mod3

In [None]:
# Report the test accuracies stored in mod1, mod2 and mod3, one model per line.
final_scores = (
    ("Decision Tree: ", mod1),
    ("Random Forest: ", mod2),
    ("Gradient Boosting: ", mod3),
)
for label, score in final_scores:
    print(label, score)

In [None]:
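# Hence, on this test split, the Decision Tree appears to be the most accurate model for this dataset.
# NOTE(review): confirm that each score above was computed from its own fitted model before relying on this conclusion.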