In [None]:
import pandas as pd
import seaborn as sns
import re
sns.set()
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
plt.rc("font", size=14)
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Seaborn colour palettes shared by the plots in this notebook.
# Six-colour qualitative palettes:
palette_1, palette_2, palette_5 = (
    sns.color_palette(name, 6) for name in ('Accent', 'Set1', 'Paired')
)
# Palettes at seaborn's default length:
palette_3, palette_4, palette_6 = (
    sns.color_palette(name) for name in ('BrBG', 'CMRmap', 'RdYlBu')
)
# Two-colour palettes for binary hues:
palette_binary_1, palette_binary_2, palette_binary_3 = (
    sns.color_palette(name, 2) for name in ('Accent_r', 'Set1', 'Set2')
)

# Render one swatch strip per palette so the choices can be compared visually.
all_palettes = (palette_1, palette_2, palette_3, palette_4, palette_5,
                palette_6, palette_binary_1, palette_binary_2, palette_binary_3)
for palette in all_palettes:
    sns.palplot(palette)

In [None]:
# Load the Titanic training CSV; header=0 takes the column names from the first row.
Train = pd.read_csv("../input/titanic/train.csv", header=0)

In [None]:
# Load the Titanic test CSV the same way (first row supplies the column names).
test = pd.read_csv("../input/titanic/test.csv", header=0)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the training frame.
Train.head()

In [None]:
# Stack the training rows on top of the test rows so that imputation and encoding
# below are applied to both at once. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it both inputs keep their own row labels, and the
# resulting duplicate labels can break index-aligned operations and plotting.
# The row order is unchanged, so positional slicing of the combined frame still
# separates the two parts.
train = pd.concat([Train, test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the combined frame.
train.shape

In [None]:
# Summary statistics of the combined frame's numeric columns.
train.describe()

In [None]:
# Pair plot of the attributes of the combined frame, coloured by survival outcome.
sns.pairplot(train, hue='Survived')
# NOTE(review): the output name looks like a leftover palette variable name — confirm
# whether a palette argument or a more descriptive file name was intended.
plt.savefig(fname="palette_binary_1")

## Missing Values

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train.info() 

In [None]:
# Replace missing fares with the mean fare of the combined frame.
train['Fare'] = train['Fare'].fillna(train['Fare'].mean())

In [None]:
# Share of rows (in percent) whose "Embarked" value is missing.
embarked_missing = train['Embarked'].isnull().sum()
print('Percent of missing "Embarked" records is %.2f%%' % ((embarked_missing / train.shape[0]) * 100))

In [None]:
# Replace missing ages with the median age of the combined frame.
train['Age'] = train['Age'].fillna(train['Age'].median())

###### The most common port of embarkation is S, so we replace the NaN values with S

In [None]:
# Fill missing ports with the most frequent one. The result is assigned back
# explicitly: calling fillna(..., inplace=True) on a column selection is a chained
# operation that pandas does not guarantee will write through to the frame.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].value_counts().idxmax())

In [None]:
# Share of rows (in percent) whose "Cabin" value is missing.
cabin_missing = train['Cabin'].isnull().sum()
print('Percent of missing "Cabin" records is %.2f%%' % ((cabin_missing / train.shape[0]) * 100))

###### Cabin has about 77% missing values, so we ignore this column

In [None]:
# Re-check non-null counts after the imputation cells above.
train.info()

## Visualization

In [None]:
# Bucket ages into intervals and compare survival counts per bucket.
# Note: this adds an 'Age_cat' column to `train` itself.
age_bins = [0, 5, 24, 30, 36, 40, 50, 60, 70, 80]
train['Age_cat'] = pd.cut(train.Age, bins=age_bins)

fig, ax = plt.subplots(figsize=(16, 7))
sns.countplot(data=train, x='Age_cat', hue='Survived', palette=palette_binary_3, ax=ax)

plt.show()

In [None]:
cat_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

plt.figure(figsize=(16, 14))
sns.set(font_scale=1.2)
sns.set_style('ticks')

# One count plot per categorical feature on a 2x3 grid; five features, so the
# sixth slot stays empty.
for panel, feature in enumerate(cat_features, start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(data=train, x=feature, hue='Survived', palette=palette_5)

sns.despine()

In [None]:
# Percentage of male passengers who survived.
# `train` is the concatenation of the training and test frames, so rows that came
# from the test frame carry NaN in 'Survived'. Summing with the builtin sum()
# propagates that NaN and the rate prints as nan; restrict to rows with a known
# outcome and let pandas compute the mean instead.
male = train.loc[(train.Sex == 'male') & train.Survived.notna(), "Survived"]
rate_male = male.mean() * 100
print(rate_male)

In [None]:
# Percentage of female passengers who survived.
# `train` is the concatenation of the training and test frames, so rows that came
# from the test frame carry NaN in 'Survived'. Summing with the builtin sum()
# propagates that NaN and the rate prints as nan; restrict to rows with a known
# outcome and let pandas compute the mean instead.
Female = train.loc[(train.Sex == 'female') & train.Survived.notna(), "Survived"]
rate_Female = Female.mean() * 100
print(rate_Female)

In [None]:
temp = train.copy()
# Keep only the first capital letter found in each cabin code; expand=False
# returns a Series rather than a one-column DataFrame.
temp['Cabin'] = temp.Cabin.str.extract(pat='([A-Z])', expand=False)

fig, ax = plt.subplots(1, 2, figsize=(16, 8))
sns.set_style('ticks')

# Left panel: passenger class split for rows that have a cabin letter.
sns.countplot(data=temp, x='Cabin', hue='Pclass', ax=ax[0], palette=palette_1)
ax[0].set_title('Pclass-Cabin Proportions', x=0.28, y=1.04, size=25)

# Assign the filled column back instead of a chained inplace fillna, which is not
# guaranteed to modify `temp`.
temp['Cabin'] = temp['Cabin'].fillna('missing')
temp_missing = temp.loc[temp.Cabin == 'missing']

# Right panel: passenger class split for rows without a cabin. The target axes is
# passed explicitly rather than relying on whichever axes pyplot considers current.
sns.countplot(data=temp_missing, x='Cabin', hue='Pclass', ax=ax[1], palette=palette_1)
ax[1].set_title('Missing Cabin proportions', x=0.27, y=1.04, size=25)

sns.despine()
plt.show()

In [None]:
num_features = ['Fare', 'Age']
sns.set_style('white')

plt.figure(figsize=(16, 14))
# Stacked histogram per numeric feature, split by survival outcome. Rows with an
# unknown outcome match neither comparison and are left out.
for slot, feature in enumerate(num_features, start=1):
    plt.subplot(2, 2, slot)
    survived_values = train[feature][train['Survived'] == 1]
    died_values = train[feature][train['Survived'] == 0]
    plt.hist(x=[survived_values, died_values], stacked=True, bins=20,
             label=['Survived', 'Not Survived'], color=['orange', 'b'])
    plt.legend()
    plt.xlabel(f'{feature}', fontsize=15)
    plt.ylabel('Count', fontsize=15)

In [None]:
# Age density for survivors versus non-survivors.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
plt.figure(figsize=(15,8))
ax = sns.kdeplot(train["Age"][train.Survived == 1], color="red", fill=True)
sns.kdeplot(train["Age"][train.Survived == 0], color="lightcoral", fill=True)
# Labels are matched to the curves in drawing order: survivors first.
plt.legend(['Survived', 'Died'])
plt.title('Density Plot of Age for Surviving Population and Deceased Population')
ax.set(xlabel='Age')
plt.xlim(-10,85)
plt.show()

In [None]:
# Fare density for survivors versus non-survivors.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
plt.figure(figsize=(15,8))
ax = sns.kdeplot(train["Fare"][train.Survived == 1], color="green", fill=True)
sns.kdeplot(train["Fare"][train.Survived == 0], color="blue", fill=True)
# Labels are matched to the curves in drawing order: survivors first.
plt.legend(['Survived', 'Died'])
plt.title('Density Plot of Fare for Surviving Population and Deceased Population')
ax.set(xlabel='Fare')
# NOTE(review): these limits are identical to the Age plot's; fares above 85 are
# cut off — confirm this zoom is intentional.
plt.xlim(-10,85)
plt.show()

In [None]:
temp = train.copy()
# Family size counts the passenger too, hence the +1: someone with no siblings,
# spouse, parents or children aboard gets a family size of 1.
temp['Family_size'] = temp.SibSp + temp.Parch + 1
sns.set_style('ticks')

fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(x='Family_size', hue='Survived', data=temp, palette=palette_6, ax=ax)
ax.set_title('Family Size - Survived Plot', size=25, loc='Left', y=1.04)

sns.despine()
plt.show()

In [None]:
# Collapse family size into three groups: 1 -> alone, 2-4 -> small, 5-11 -> large.
# Relies on `temp['Family_size']` created in the previous cell.
family_groups = {1: 'alone'}
family_groups.update({size: 'small_family' for size in (2, 3, 4)})
family_groups.update({size: 'large_family' for size in range(5, 12)})
temp['Family_size_cat'] = temp['Family_size'].replace(family_groups)

fig, ax = plt.subplots(figsize=(15, 8))
sns.set_style('ticks')

sns.countplot(x='Family_size_cat', hue='Survived', data=temp, palette=palette_5, ax=ax)
ax.set_title('Family Category - Survived Plot', size=25, loc='Left', y=1.04)

sns.despine()
plt.show()

In [None]:
temp = train.copy()
# Count only the letters of each name. regex=True is required: without it current
# pandas treats the pattern as a literal string, nothing is removed, and the
# "length" silently includes spaces and punctuation.
temp['Name_length'] = temp.Name.str.replace(pat='[^a-zA-Z]', repl='', regex=True).str.len()
sns.set_style('ticks')

fig, ax = plt.subplots(1, 1, figsize=(16, 6))
sns.histplot(data=temp, x='Name_length', hue='Survived', kde=True, fill=True, ax=ax, palette=palette_binary_2)
ax.set_title('Name Length - Survived Plot', size=20, loc='Left', y=1.03)

sns.despine()
plt.show()

###### Great! We can see that the survival rate increases with name length: beyond a name length of about 26, more passengers survived than did not.

In [None]:
temp = train.copy()

# Extract the first run of letters followed by a period (e.g. "Mr."). The raw
# string avoids the invalid "\." escape warning, and expand=False yields a Series.
temp['Title'] = temp.Name.str.extract(pat=r'([a-zA-Z]+\.)', expand=False)

# Group every other title under 'rare'. A single .loc assignment is used instead of
# chained indexing (temp.Title[mask] = ...), which may write to a temporary copy.
common_titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.']
temp.loc[~temp.Title.isin(common_titles), 'Title'] = 'rare'

In [None]:
# Survival counts per title group; relies on `temp['Title']` from the previous cell.
fig, ax = plt.subplots(figsize=(15, 8))
sns.set_style('ticks')

sns.countplot(x='Title', hue='Survived', data=temp, palette=palette_6, ax=ax)
ax.set_title('Title - Survived Plot', loc='Left', size=25, y=1.03)

sns.despine()
plt.show()

In [None]:
# Preview the working copy, including the derived Title column.
temp.head()

## Create Dummy variables

In [None]:
# One-hot encode the categorical columns. dtype=int keeps the indicators numeric:
# recent pandas versions emit bool indicators by default, which turns the design
# matrix into an object array when it is later handed to statsmodels.
df = pd.get_dummies(train, columns=["Pclass", "Embarked", "Sex"], dtype=int)

# Drop, in one non-mutating call:
#  - one indicator per encoded variable (Sex_female, Pclass_3, Embarked_C), which
#    is implied by the remaining indicators of that variable;
#  - columns not used as model features (Ticket, SibSp, Parch, Age_cat, Cabin, Name).
df = df.drop(columns=['Sex_female', 'Ticket', 'SibSp', 'Parch', 'Age_cat',
                      'Cabin', 'Name', 'Pclass_3', 'Embarked_C'])

df.head()

## Correlation Heatmap

In [None]:
# Annotated correlation matrix of the encoded frame.
fig, ax = plt.subplots(figsize=(16, 6))
heatmap = sns.heatmap(df.corr(), annot=True, cmap='Greens', ax=ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 20}, pad=14);

## Logistic Regression

In [None]:
from sklearn.metrics import classification_report
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score
from sklearn.metrics import roc_curve
from statsmodels.tools import add_constant
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Split the encoded frame back into its two parts. The boundary is taken from the
# original training frame rather than a hard-coded row count, and each part is
# copied so that later column assignments do not act on a slice of `df`.
n_labelled = len(Train)
new_train = df.iloc[:n_labelled, :].copy()
new_test = df.iloc[n_labelled:, :].copy()

In [None]:
# Store the target as integers rather than floats.
new_train['Survived'] = new_train['Survived'].astype('int64')

In [None]:
# Target as a one-column frame; features are every remaining column.
y = new_train[['Survived']]
x = new_train.drop(columns=['Survived'])

In [None]:
# Hold out part of the labelled rows for evaluation; the fixed random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42) # 70% training and 30% test

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only and apply the same
# transformation to the hold-out split. Fitting a second scaler on the hold-out
# data would standardise it with different means and variances than the model was
# trained on, and would leak hold-out statistics into the evaluation.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Logistic regression on the current split, scored on both parts of it.
logReg = LogisticRegression()
logReg.fit(x_train, y_train)
train_pred, test_pred = logReg.predict(x_train), logReg.predict(x_test)

print('train set accuracy:', accuracy_score(y_train, train_pred))
print(' test set accuracy:', accuracy_score(y_test, test_pred))

## Logit

In [None]:
# statsmodels does not add an intercept on its own; add the constant column explicitly.
x_cons = sm.add_constant(x)

In [None]:
# Fit a statsmodels logit on all labelled rows (features plus constant) and show
# the coefficient table.
result = sm.Logit(y, x_cons).fit()
result.summary()

In [None]:
# Refit without Fare. The drop is non-mutating and tolerant of a missing column,
# so re-running this cell no longer raises a KeyError.
x = x.drop(columns=['Fare'], errors='ignore')

# Keep the intercept so this summary is comparable with the previous fit, which
# was estimated on the features plus a constant column.
# NOTE(review): the original refit passed `x` without a constant — confirm that
# dropping the intercept was not intended.
result = sm.Logit(y, sm.add_constant(x)).fit()
result.summary()

In [None]:
# Fresh split of the current feature frame, then refit the logistic regression
# on the new training part.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=254, test_size=0.25)

logReg = LogisticRegression()
logReg.fit(x_train, y_train)
train_pred, test_pred = logReg.predict(x_train), logReg.predict(x_test)

print('New train set accuracy:', accuracy_score(y_train, train_pred))
print('New test set accuracy:', accuracy_score(y_test, test_pred))
# Class probabilities for the hold-out rows.
pred_prob = logReg.predict_proba(x_test)

## Confusion Matrix Logistic

In [None]:
# Hold-out accuracy of the logistic regression; reused in the model comparison table.
log_reg = accuracy_score(y_test, test_pred)
cm = confusion_matrix(y_test, test_pred)
conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
plt.figure(figsize=(8, 5))
# fmt='d' prints the cell counts as plain integers.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")

all_sample_title = 'Accuracy Score: {0}'.format(log_reg)

plt.title(all_sample_title, size=19)
# Model-specific file name: every confusion-matrix cell in this notebook used to
# write "pne.png", so each figure overwrote the previous one.
plt.savefig("confusion_matrix_logistic_regression.png")

# Ensemble Techniques

In [None]:
from sklearn.metrics import classification_report
import statsmodels.api as sm
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, auc
from sklearn.metrics import roc_curve
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install pydotplus

In [None]:
#for decision tree object
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.tree import plot_tree
from sklearn import tree
from IPython.display import Image
import pydotplus

# Decision Tree

In [None]:
# Decision tree: Gini criterion, depth capped at 5, fixed seed.
tree_params = dict(criterion="gini", random_state=42, max_depth=5,
                   min_samples_split=5, min_samples_leaf=1,
                   min_impurity_decrease=0.001)
clf = tree.DecisionTreeClassifier(**tree_params)
clf.fit(x_train, y_train)

# Evaluate on the hold-out split.
y_pred1 = clf.predict(x_test)
print("Classification report - \n", classification_report(y_test, y_pred1))
Dtree = accuracy_score(y_test, y_pred1)

In [None]:
# Render the fitted tree: export to Graphviz DOT text, convert to PNG, display inline.
# `x_train` must still be a DataFrame here, since its column names label the nodes.
graphviz_options = dict(out_file=None, feature_names=x_train.columns,
                        filled=True, rounded=True, special_characters=True)
dot_data = tree.export_graphviz(clf, **graphviz_options)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# Confusion matrix of the decision tree on the hold-out split.
cm = confusion_matrix(y_test, y_pred1)
plt.figure(figsize=(8, 5))

conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for larger counts.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Hold-out accuracy shown as the figure title.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred1))

plt.title(all_sample_title, size=19)
# Model-specific file name so the figures of the other models are not overwritten.
plt.savefig("confusion_matrix_decision_tree.png")

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag 300 copies of the decision tree configured above, resampling both rows and
# features. The base estimator is passed positionally because its keyword was
# renamed from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form is accepted by both.
bag_clf = BaggingClassifier(clf, n_estimators=300,
                            bootstrap=True, bootstrap_features=True, n_jobs=-1,
                            random_state=42)
# Fit on the training split and score on the hold-out split.
bag_clf.fit(x_train, y_train)
y_pred2 = bag_clf.predict(x_test)
bagging = accuracy_score(y_test, y_pred2)

In [None]:
# Confusion matrix of the bagging ensemble on the hold-out split.
cm = confusion_matrix(y_test, y_pred2)
plt.figure(figsize=(8, 5))

conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for larger counts.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Hold-out accuracy shown as the figure title.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred2))

plt.title(all_sample_title, size=19)
# Model-specific file name so the figures of the other models are not overwritten.
plt.savefig("confusion_matrix_bagging.png")

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 700 Gini trees with out-of-bag scoring enabled and a fixed seed.
rf_params = dict(
    criterion='gini',
    n_estimators=700,
    oob_score=True,
    max_features='log2',
    min_samples_split=10,
    min_samples_leaf=3,
    bootstrap=True,
    n_jobs=-1,
    random_state=1,
)
rf_clf = RandomForestClassifier(**rf_params)

# Fit on the training split and score on the hold-out split.
rf_clf.fit(x_train, y_train)
y_pred3 = rf_clf.predict(x_test)
RF = accuracy_score(y_test, y_pred3)

In [None]:
# Confusion matrix of the random forest on the hold-out split.
cm = confusion_matrix(y_test, y_pred3)
plt.figure(figsize=(8, 5))

conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for larger counts.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Hold-out accuracy shown as the figure title.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred3))

plt.title(all_sample_title, size=19)
# Model-specific file name so the figures of the other models are not overwritten.
plt.savefig("confusion_matrix_random_forest.png")

# Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Weak learner for boosting: a depth-3 Gini tree with a fixed seed.
dtree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases; the positional
# form is accepted by both.
adaclass = AdaBoostClassifier(dtree,
                              n_estimators=300,
                              learning_rate=0.01,
                              algorithm='SAMME',
                              random_state=42)

# Fit on the training split and score on the hold-out split.
adaclass.fit(x_train, y_train)
y_pred4 = adaclass.predict(x_test)
Ada = accuracy_score(y_test, y_pred4)

In [None]:
# Confusion matrix of the AdaBoost ensemble on the hold-out split.
cm = confusion_matrix(y_test, y_pred4)
plt.figure(figsize=(8, 5))

conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for larger counts.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Hold-out accuracy shown as the figure title.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred4))

plt.title(all_sample_title, size=19)
# Model-specific file name so the figures of the other models are not overwritten.
plt.savefig("confusion_matrix_adaboost.png")

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees. Compared with the original call:
#  - `min_impurity_split=None` is omitted: None was only the default, and the
#    argument is rejected by scikit-learn releases that removed it;
#  - `loss='deviance'` is omitted so that each release uses its own name for the
#    default classification loss;
#  - `max_features='sqrt'` replaces the deprecated alias 'auto', which selected
#    sqrt(n_features) for the classifier;
#  - `random_state` is set so the per-split feature subsampling, and therefore the
#    reported score, is reproducible.
gradclass = GradientBoostingClassifier(learning_rate=0.3, max_depth=5,
                                       max_features='sqrt', min_impurity_decrease=0.001,
                                       min_samples_leaf=1, min_samples_split=4,
                                       n_estimators=300, random_state=42)

# Fit on the training split and score on the hold-out split.
gradclass.fit(x_train, y_train)
y_pred5 = gradclass.predict(x_test)
Grad = accuracy_score(y_test, y_pred5)

In [None]:
# Confusion matrix of the gradient-boosting model on the hold-out split.
y_pred = gradclass.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 5))

conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for larger counts.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Hold-out accuracy shown as the figure title.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred))

plt.title(all_sample_title, size=19)
# Model-specific file name so the figures of the other models are not overwritten.
plt.savefig("confusion_matrix_gradient_boosting.png")

# XG Boost 

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import model_selection

In [None]:
# XGBoost classifier; the hyper-parameters are collected in one mapping and
# forwarded unchanged to the constructor.
xgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 1,
    'gamma': 0.1,
    'alpha': 1e-05,
    'subsample': 0.9,
    'colsample_bytree': 0.6,
    'objective': 'binary:logistic',
    'nthread': 5,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb = XGBClassifier(**xgb_params)

# Fit on the training split, score on the hold-out split, and display the accuracy.
xgb.fit(x_train, y_train)
y_pred6 = xgb.predict(x_test)
XGB = accuracy_score(y_test, y_pred6)
XGB

In [None]:
# Confusion matrix of the XGBoost model on the hold-out split.
cm = confusion_matrix(y_test, y_pred6)
plt.figure(figsize=(8, 5))

conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
# fmt='d' prints the counts as plain integers; the default annotation format
# switches to scientific notation for larger counts.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Hold-out accuracy shown as the figure title.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, y_pred6))

plt.title(all_sample_title, size=19)
# Model-specific file name so the figures of the other models are not overwritten.
plt.savefig("confusion_matrix_xgboost.png")

In [None]:
# Hold-out accuracy of every model, listed from lowest to highest score.
# The two lists are positional: each score must stay aligned with its model name.
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Bagging', 'Random Forest',
              'Gradient boosting', 'Adaboost', 'XG Boost'],

    'Score': [log_reg, Dtree, bagging, RF, Grad, Ada, XGB]})
models.sort_values(by='Score', ascending=True)

In [None]:
# Reference curve for tpr = fpr: a constant score for every row gives a
# classifier with no ranking ability.
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

# (legend label, fitted model, line colour) for every classifier trained above.
roc_models = [
    ('logReg', logReg, 'grey'),
    ('tree', clf, 'red'),
    ('bag', bag_clf, 'blue'),
    ('rf', rf_clf, 'green'),
    ('ada', adaclass, 'pink'),
    ('grad', gradclass, 'black'),
    ('XGb', xgb, 'orange'),
]

plt.figure(figsize=(8, 5), dpi=100)
# One ROC curve per model, scored by the predicted probability of class 1 on the
# hold-out split; the area under each curve is reported in the legend.
for model_label, model, line_color in roc_models:
    positive_scores = model.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    plt.plot(fpr, tpr, linestyle='--', color=line_color,
             label='%s(auc = %0.3f)' % (model_label, auc(fpr, tpr)))

# The baseline gets its own colour, line style and legend entry so it cannot be
# mistaken for the bagging curve.
plt.plot(p_fpr, p_tpr, linestyle=':', color='navy', label='chance')

# x label y label
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.title('ROC curves on the hold-out split')

plt.legend(loc='best')
plt.show();

In [None]:
# Build the submission from the competition test rows (`new_test`), not from the
# internal hold-out split: `x_test` / `y_pred2` only cover labelled rows that were
# set aside for validation. The feature columns are taken from `x_train` so they
# match, in name and order, what the bagging model was fitted on.
submission_features = new_test[x_train.columns]
submit = pd.DataFrame({
    "PassengerId": new_test["PassengerId"],
    "Survived": bag_clf.predict(submission_features).astype(int),
})
submit.to_csv("submissionT.csv", index=False)