# Titanic Survival Prediction 
### This is my first kernel in Python. Hope you guys like it


In [None]:
import numpy as np 
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")
import os
print(os.listdir("../input"))
import matplotlib.pyplot as plt
import seaborn as sns

## Importing and exploring data

In [None]:
train=pd.read_csv("../input/train.csv")

In [None]:
train.head()


In [None]:
train.describe()

## Don't forget to explore object type columns

In [None]:
train.describe(include=['O'])

In [None]:
train.hist(figsize=(10,8))

In [None]:
train.info()

## So the Age, Cabin and Embarked columns have missing values. Let's start with the Age column.

In [None]:
train.Age.hist(bins=20)

## The Age distribution is positively skewed. We need more information to fill the missing data, so let's plot Age against Pclass.

In [None]:
sns.boxplot(x='Pclass',y='Age',data=train,hue='Survived')

In [None]:
# Print the mean age of each passenger class (1, 2, 3), one value per line.
# Grouping by the plain column name (not a one-element list) keeps scalar
# keys valid for get_group() on newer pandas; the groupby is built only once.
age_by_class = train.groupby('Pclass')
for pclass in (1, 2, 3):
    print(age_by_class.get_group(pclass).Age.mean())

## We could fill the missing ages with the overall mean, but age varies by Pclass, so a single mean would not be appropriate. Let's fill Age according to Pclass.

In [None]:
# Impute missing ages with the mean age of the passenger's class.
# The target column (Survived) is deliberately NOT a grouping key: it is the
# value being predicted, so using it here would leak the label into a feature
# and the same imputation could not be applied to unlabeled data.
train['Age']=train.groupby('Pclass')['Age'].transform(lambda x:x.fillna(x.mean()))

In [None]:
train.info()

# Now let's take a look at the Fare column. Maybe it wants to tell us something.

In [None]:
sns.stripplot(y='Fare',x='Pclass',hue='Survived',data=train)

## From the figure above we can say that people who paid more travelled in Pclass 1, and their chances of survival are better than in the other Pclasses.
## Let's see average Fare based on class

In [None]:
train.groupby(['Pclass','Survived'])['Fare'].mean()

In [None]:
plt.hist(train.Fare,bins=30)
plt.xlabel('Fare')
plt.ylabel('count')

## Most people paid a fare between 0 and 80. Fare varies with Pclass and survival: survivors paid higher fares than those who died, so we need to make use of the Fare column. Since Fare is not very useful as a raw numeric column, let's make it categorical.

In [None]:
# Round every fare up to a whole number, then bucket it into seven
# left-closed bands labelled 'a' (lowest) through 'g' (highest).
fare_edges = [0, 8, 13, 20, 30, 50, 80, 600]
fare_labels = list('abcdefg')
train['Fare'] = np.ceil(train['Fare'])
train['fare'] = pd.cut(train['Fare'], bins=fare_edges, labels=fare_labels, right=False)

In [None]:
sns.countplot(x='fare',hue='Survived',data=train)

## That looks nice!!! As Fare increases (a to g), the chances of survival increase.
## Fare really wanted to tell us something :)

## Lets see now SibSp and Parch Columns

In [None]:
sns.countplot(x='SibSp',hue='Survived',data=train)

In [None]:
sns.countplot(x='Parch',hue='Survived',data=train)

## Lets combine both columns. As both column represent members

In [None]:
train['members']=train['SibSp']+train['Parch']

In [None]:
sns.countplot(x='members',hue='Survived',data=train)

In [None]:
train.members.value_counts()

In [None]:
train[train.members>6].Survived.value_counts()


## Families with more than 6 members never survived in our training dataset, so let's merge the 6+ groups (7 and 10 members) into a single 7-member group.

In [None]:
# Fold the 10-member families into the 7-member bucket. Assigning the result
# back to the column (instead of an in-place replace on `train.members`)
# guarantees the DataFrame itself is updated, including under pandas
# Copy-on-Write, where the in-place call would only touch a temporary.
train['members']=train['members'].replace({10:7})

In [None]:
train.head()

## Now let's choose our feature attributes. Name does not give us any useful information, so let's drop it. The Cabin column has many missing values and filling them may affect our prediction, so drop it too. Ticket is also not needed, so drop it as well.

In [None]:
attributes=['Survived','Pclass','Sex','Age','Embarked','fare','members']

In [None]:
# Keep only the selected columns. The explicit copy makes `train` an
# independent DataFrame, so later column assignments act on it directly
# rather than on something pandas may treat as a slice of the original.
train=train[attributes].copy()

In [None]:
train.head()

## Wait!!! Embarked also has 2 missing values, so let's fill them. But first we need to explore the Embarked column.

In [None]:
sns.countplot(x='Embarked',hue='Survived',data=train)

In [None]:
train[train.Embarked.isnull()]

## The two missing values belong to the same Pclass and the same Sex, with the same fare category (g). Let's explore further.

In [None]:
sns.catplot(kind='point',x='Embarked',y='Pclass',hue='Sex',data=train)

In [None]:
train.groupby(['Pclass','Sex']).get_group((1,'female')).Embarked.value_counts()

## From the exploration above we can say that females in Pclass 1 most probably embarked at C. Let's fill it in.

In [None]:
# Fill the missing embarkation ports with 'C' and assign the result back to
# the column; an in-place fillna on `train.Embarked` may act on a temporary
# object and leave the DataFrame unchanged (e.g. under pandas Copy-on-Write).
train['Embarked']=train['Embarked'].fillna('C')

In [None]:
train.info()

## No missing value finally. Now lets do type conversion

In [None]:
def func(x):
    """Return ``x`` converted to a categorical if it has object dtype.

    Anything whose dtype is not object is handed back untouched (the very
    same object, no copy).
    """
    return x.astype('category') if x.dtype == 'O' else x

In [None]:
train=train.apply(func,axis=0)

In [None]:
train.info()

In [None]:
# Cast the discrete columns to categoricals and Age to 64-bit integers,
# all in a single astype call driven by a column -> dtype mapping.
train = train.astype({
    'members': 'category',
    'Survived': 'category',
    'Pclass': 'category',
    'Age': 'int64',
})

In [None]:
train.info()

## Now let's convert the categorical values into dummy variables and scale the features

In [None]:
# Detach the target column from the features, then one-hot encode what is left.
df_label = train.pop('Survived')
df = pd.get_dummies(train)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every feature column, then rebuild a DataFrame that keeps the
# original index and column names (fit_transform returns a bare array).
# NOTE(review): the scaler is fit on all rows before the train/test split that
# follows, so held-out rows contribute to the scaling statistics.
scaled=StandardScaler().fit_transform(df)
df=pd.DataFrame(scaled,index=df.index,columns=df.columns)


In [None]:
df=pd.concat([df,df_label],axis=1)

In [None]:
df.head()

# Now that our data is ready, it's time to use it for model building and prediction

In [None]:
from sklearn.model_selection import  train_test_split

In [None]:
train,test= train_test_split(df,test_size=0.25,random_state=43)

In [None]:
# Separate the target from the features in both partitions; pop() returns
# the column and removes it from the frame in one step.
train_label = train.pop('Survived')
test_label = test.pop('Survived')

# 1. Linear Classifier

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

In [None]:
from sklearn.linear_model import SGDClassifier
# `n_iter` is no longer accepted by SGDClassifier. `max_iter=50` together with
# `tol=None` (no early stopping) keeps the intended 50 passes over the data.
sgd=SGDClassifier(max_iter=50,tol=None,penalty='l2')
# Out-of-fold predictions from 10-fold CV, summarized as a confusion matrix.
score=cross_val_predict(sgd,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
from sklearn.metrics import accuracy_score
acc_lc=accuracy_score(train_label,score)
acc_lc

# 2. logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
score=cross_val_predict(lr,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
from sklearn.metrics import accuracy_score
acc_lr=accuracy_score(train_label,score)
acc_lr

# 3. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc=DecisionTreeClassifier()
score=cross_val_predict(dtc,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_dtc=accuracy_score(train_label,score)
acc_dtc

# 4. SVM

In [None]:
from sklearn.svm import SVC
svm=SVC(kernel='rbf',C=50)
score=cross_val_predict(svm,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_svm=accuracy_score(train_label,score)
acc_svm

# 5.Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=300,max_depth=4)
score=cross_val_predict(rf,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_rf=accuracy_score(train_label,score)
acc_rf

## From the classification algorithms above we can say that Logistic Regression and Random Forest perform better than the other algorithms. So now let's use Grid Search to find better hyperparameters.

# GRID Search on Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for five random-forest hyper-parameters
# (4 * 5 * 4 * 3 * 5 = 1200 combinations in total).
param=[{'max_depth':[3,4,5,6],'min_samples_split':[20,30,40,50,60],'min_samples_leaf':[8,10,20,30]
       ,'max_features':[15,20,22],'max_leaf_nodes':[6,8,10,14,18]}]
# Evaluate every combination with 5-fold cross-validation on the training split.
grid=GridSearchCV(RandomForestClassifier(),param,cv=5)
grid.fit(train,train_label)
# Last expression of the cell: shows the best parameter combination found.
grid.best_params_

# 6. Using these hyperparameters, let's train Random Forest again

In [None]:
# Rebuild a 300-tree forest with the tuned hyper-parameters. The grid searched
# exactly the five parameters passed here (max_depth, max_features,
# max_leaf_nodes, min_samples_leaf, min_samples_split), so the best-parameter
# dict can be unpacked directly into the constructor.
rf = RandomForestClassifier(n_estimators=300, **grid.best_params_)
# Out-of-fold predictions from 10-fold CV, summarized as a confusion matrix.
score = cross_val_predict(rf, train, train_label, cv=10)
confusion_matrix(train_label, score)

In [None]:
acc_rf_grid=accuracy_score(train_label,score)
acc_rf_grid

# 7.Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
vc=VotingClassifier(estimators=[('sgd',SGDClassifier()),('rf',RandomForestClassifier(n_estimators=300)),
                                ('lr',LogisticRegression())],voting='hard')
score=cross_val_predict(vc,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_vc=accuracy_score(train_label,score)
acc_vc

# 8. ADA BOOSTING

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Boost 400 decision trees of depth 4 with a learning rate of 0.01.
# NOTE(review): the 'SAMME.R' algorithm option appears to be deprecated in
# recent scikit-learn releases — confirm against the installed version.
ada=AdaBoostClassifier(DecisionTreeClassifier(max_depth=4),n_estimators=400,
algorithm='SAMME.R',learning_rate=0.01)
# Out-of-fold predictions from 10-fold CV, summarized as a confusion matrix.
score=cross_val_predict(ada,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_ada=accuracy_score(train_label,score)
acc_ada

# 9. Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier(n_estimators=400,
                          max_depth=grid.best_params_['max_depth'],
                          max_features=grid.best_params_['max_features'],
                          max_leaf_nodes=grid.best_params_['max_leaf_nodes'],
                          min_samples_leaf=grid.best_params_['min_samples_leaf'],
                          min_samples_split=grid.best_params_['min_samples_split'],learning_rate=0.001)
score=cross_val_predict(gb,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_gb=accuracy_score(train_label,score)
acc_gb

# 10. Bagging And Pasting

In [None]:
from sklearn.ensemble import BaggingClassifier
bp=BaggingClassifier(DecisionTreeClassifier(max_depth=grid.best_params_['max_depth'],
                          max_features=grid.best_params_['max_features'],
                          max_leaf_nodes=grid.best_params_['max_leaf_nodes'],
                          min_samples_leaf=grid.best_params_['min_samples_leaf'],
                          min_samples_split=grid.best_params_['min_samples_split']),
                            n_estimators=300, bootstrap=True ,n_jobs=-1 )
score=cross_val_predict(bp,train,train_label,cv=10)
confusion_matrix(train_label,score)

In [None]:
acc_bp=accuracy_score(train_label,score)
acc_bp

In [None]:
# Accuracy (in percent) of every model, one row per model, in the same order
# as the index labels. The 'SGD' row must use acc_lc (the linear classifier's
# score); acc_lr belongs only to the 'Logistic' row.
pd.DataFrame([acc_lc*100,acc_lr*100,acc_svm*100,acc_dtc*100,acc_rf*100,
             acc_rf_grid*100,acc_vc*100,acc_ada*100,acc_gb*100,acc_bp*100],
            index=['SGD','Logistic','SVM','Decision Tree','Random Forest',
                  'Random Forest with Hyper parameter','Voting Classifier',
                  'ADA boost','Gradient Boost','Bagging and pasting'])

# Thanks. Please share your suggestions, and upvote if you like it.