# Table of Contents

* [Section  1 - Importing Library & Data](#section-one)  
* [Section 2 - Exploratory Data Analysis](#section-two)
* [Section 3 - Visualizing Data via Graphs](#section-three)
* [Section 4 - Linear Support Vector Machine(Linear-SVM)](#section-four)
* [Section 5 - Radial Support Vector Machine(Radial-SVM)](#section-five)
* [Section 6 - K Fold Cross Validation](#section-six)
* [Section 7 - Box Plot of Accuracy](#section-seven)
* [Section 8 - Bar Plot of Accuracy](#section-eight)
* [Section 9 - Decision tree](#section-nine)


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="section-one"></a>
# Section  1 - Importing Library & Data

In [None]:
# import library
import ggplot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px 

In [None]:
# Import data from CSV files

df_train=pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/train.csv')
df_test=pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/test.csv')
df_train.head()

<a id="section-two"></a>
# Section  2 - Exploratory Data Analysis

In [None]:
# Passenger and survivor totals, both read from a single value_counts() tally
survival_tally = df_train['Survived'].value_counts()
Survived_passengers = survival_tally[1]
total_Passengers = Survived_passengers + survival_tally[0]
print("Total Passengers in Titanic : ", total_Passengers)
print("Survived Passengers in Titanic : ", Survived_passengers)

In [None]:
df_test.head()  # to have a look to data

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
# let's find out the missing values and fill them with appropriate values

df_train.isnull().sum()

In [None]:
df_train.isnull()

In [None]:
# Draw a heat map to check for null values in the train data
sns.heatmap(df_train.isnull())

In [None]:
# Draw a heat map to check for null values in the test data
sns.heatmap(df_test.isnull())

In [None]:
# Number of male and female passengers, read from one value_counts() tally
sex_tally = df_train['Sex'].value_counts()
print("No of males tavelled in Titanic : ", sex_tally['male'])
print("No of Females travelled in Titanic : ", sex_tally['female'])

In [None]:
# Survival counts (0 = died, 1 = survived) among male passengers.
# The gender mask must select rows of df_train *before* counting; applying it to the
# two-row value_counts() result does not produce per-gender figures.
df_train.loc[df_train['Sex']=='male','Survived'].value_counts()

In [None]:
# Survival counts (0 = died, 1 = survived) among female passengers.
# The gender mask must select rows of df_train *before* counting; applying it to the
# two-row value_counts() result does not produce per-gender figures.
df_train.loc[df_train['Sex']=='female','Survived'].value_counts()

In [None]:
# we can also visualise the survival rate in gender category using count plot

plt.subplot(1,2,1)
sns.countplot(x='Sex',data=df_train)

In [None]:
# plot to visualize Survived stats
plt.subplot(1,2,1)
sns.countplot(data=df_train,x='Survived')

In [None]:
# check survived count as per sex

sns.countplot(x='Survived',data=df_train,palette='rainbow',hue='Sex')

In [None]:
# check survived count as per Pclass

sns.countplot(x='Survived',data=df_train,palette='rainbow',hue='Pclass')
# Passengers travelling in the better passenger classes were more likely to survive, while most passengers in the lowest class died

In [None]:
# check survived count as per Embarked

sns.countplot(x='Survived',data=df_train,palette='rainbow',hue='Embarked')

In [None]:
# check survived count as per Parch

sns.countplot(x='Survived',data=df_train,palette='rainbow',hue='Parch')

In [None]:
df_train['Fare']//10

In [None]:
# check survived count as per Fare
# Fare is continuous, so using it directly as hue yields one colour per distinct fare.
# Group it into bands of 10 (the Fare//10 value previewed in the previous cell) instead.
sns.countplot(data=df_train,x='Survived',hue=df_train['Fare']//10,palette='rainbow')

In [None]:
# From the missing-value check above we learnt that around 20% of the Age values are missing. The distribution below shows that the largest group of passengers is between 15 and 35 years old.
sns.set_style('whitegrid')
sns.distplot(df_train['Age'],kde=False,bins=20,color='g')

In [None]:
# Mean survival rate for each SibSp value (siblings/spouses aboard), largest SibSp first
sibsp_survival = df_train.groupby('SibSp', as_index=False)['Survived'].mean()
sibsp_survival.sort_values(by='SibSp', ascending=False)

In [None]:
# Visualize a count plot between Survived & Sibsp
sns.countplot(data=df_train,x='Survived',hue='SibSp',palette='rainbow')

In [None]:
# Draw scatter plot between Survived & SibSp
import matplotlib.pyplot as plt
x=df_train['SibSp']
y=df_train['Survived']
# Draw on the axes returned by plt.subplots(). Calling plt.subplot(1,3,1) afterwards would
# create a second, overlapping axes, and the title would land on the empty one.
fig,Axes=plt.subplots()
fig.suptitle('SibSp vs Survived')
Axes.scatter(x,y,marker='*',color='r',linewidth=5,s=25,edgecolor='g')
Axes.set_title('using scatterplot')
Axes.set_xlabel('SibSp')
Axes.set_ylabel('Survived')


In [None]:
# Draw different plot between Survived & SibSp
# This cell builds its own figure: a figure opened in an earlier cell is not reliably still
# current when this cell runs, so each panel is drawn and titled through its own axes object.
# x and y are the SibSp / Survived columns assigned in the scatter-plot cell.
fig,(line_ax,bar_ax)=plt.subplots(1,2,figsize=(12,4))
fig.suptitle('SibSp vs Survived')

line_ax.plot(x,y,'g*',linestyle='dashdot',linewidth=2,markersize=10)
line_ax.set_title('using plot')
line_ax.set_xlabel('SibSp')
line_ax.set_ylabel('Survived')

bar_ax.bar(x,y,align='center',color='black')
bar_ax.set_title('using bar')
bar_ax.set_xlabel('SibSp')
bar_ax.set_ylabel('Survived')

In [None]:
# Mean survival rate for each sex, listed in descending order of the Sex label
sex_survival = df_train.groupby('Sex', as_index=False)['Survived'].mean()
sex_survival.sort_values(by='Sex', ascending=False)

In [None]:
# cleansing of the data
sns.boxplot(x='SibSp',y='Age',data=df_train)

In [None]:
# From the box plot above we can map an estimated age to the null values based on the passenger's SibSp value

def fill_age(cols):
    """Return the age for one passenger row, estimating it from SibSp when it is missing.

    Parameters
    ----------
    cols : row-like object indexable by label (e.g. a pandas Series from
        ``DataFrame.apply(..., axis=1)``) with 'Age' and 'SibSp' entries.
        Values are read by label, so the column order used by the caller
        does not matter.

    Returns
    -------
    The row's own Age when it is not null. Otherwise the estimated age for
    the row's SibSp value; for a SibSp value that has no estimate, the
    estimate of the closest SibSp value that does have one.
    """
    # Estimated age per number of siblings/spouses aboard
    estimated_age_by_sibsp = {0: 29, 1: 30, 2: 25, 3: 21, 4: 17, 5: 11}

    Age = cols['Age']
    if not pd.isnull(Age):
        return Age

    SibSp = cols['SibSp']
    if SibSp in estimated_age_by_sibsp:
        return estimated_age_by_sibsp[SibSp]

    # No estimate for this SibSp value: fall back to the nearest tabulated one
    # rather than failing, so every missing Age gets a value.
    nearest_sibsp = min(estimated_age_by_sibsp, key=lambda known: abs(known - SibSp))
    return estimated_age_by_sibsp[nearest_sibsp]

In [None]:
# Overwrite Age with the result of fill_age applied row by row to the Age and SibSp columns
df_train['Age']=df_train[["Age","SibSp"]].apply(fill_age,axis=1)

In [None]:
# Draw heat map to display null values
sns.heatmap(df_train.isnull())

In [None]:
df_train.isnull().sum()

In [None]:
# The heatmap above no longer shows gaps in Age, but isnull().sum() still reports a few
# missing values in the Embarked column, so those are filled here.
# NOTE(review): this writes the literal label 'bfill' into the missing cells; it is not a
# backward fill. A later cell drops the 'bfill' dummy column that results, so switching to a
# real .bfill() has to be done together with that cell.
# The result is assigned back to the column: fillna(inplace=True) on a column selection
# operates on a temporary object and may leave df_train unchanged.
df_train['Embarked']=df_train['Embarked'].fillna('bfill')

In [None]:
df_train.isnull().sum()

In [None]:
# so we now resolved all the missing values in the dataset, we can use the data provided efficiently only if the data is categorical format

df_train.info()

In [None]:
# we can represent the given values except Name,Sex,Embarked ,Ticket
# so we will convert object datatype into categorical values if possible or we will drop the unnecessary columns
pd.get_dummies(df_train)

In [None]:
# so we will create a copy of train_df and proceed accordingly
train_copy=df_train.copy()
train_copy


In [None]:
# Drop the Name and Ticket columns because they can't be converted into valid categorical columns.
# The frame is rebuilt from df_train instead of being mutated in place, so re-running this
# cell does not fail on columns that are already gone.
train_copy=df_train.drop(columns=['Name','Ticket'])

In [None]:
train_copy

In [None]:
# so we will convert the Embarked and Sex to categorical values using get_dummies()
Sex_category=pd.get_dummies(train_copy['Sex'],drop_first=True)
Embarked_category=pd.get_dummies(train_copy['Embarked'],drop_first=True)

In [None]:
# Remove the original Sex and Embarked text columns from train_copy
train_copy=train_copy.drop(columns=['Sex','Embarked'])

In [None]:
# now we will add Sex_category and Embarked_category into the train_copy DataFrame
train=pd.concat([train_copy,Sex_category,Embarked_category],axis=1)

In [None]:
train.head()

In [None]:
# Discard the 'bfill' dummy column (it comes from the placeholder label written into Embarked)
train=train.drop(columns=['bfill'])

In [None]:
train.info()

In [None]:
# Visualise using some seaborn plotting techniques
sns.rugplot(train['Age'].isnull())

# the below figure depicts that there are no missing values present in Age column

<a id="section-three"></a>
# Section  3 - Visualizing Data via Graphs

In [None]:
sns.jointplot(data=train,x=train['Survived'],y=train['Pclass'],kind='kde')

In [None]:
# create pair plot for visibility across data
sns.pairplot(train)

In [None]:
sns.distplot(train[['Survived','Pclass']],kde=True,bins=10)

In [None]:
sns.jointplot(x=train['male'],y=train['Pclass'],kind='kde')

In [None]:
# correlation of the train data
# Only numeric/boolean columns are correlated: train still carries the text column Cabin,
# and DataFrame.corr() raises on non-numeric data in current pandas.
sns.heatmap(train.select_dtypes(include=['number','bool']).corr())

In [None]:
# visualize through categorical plottings
sns.boxplot(x='male',y='Pclass',data=train,color='r')
sns.boxenplot(x='male',y='Pclass',data=train,color='g')

In [None]:
# Plot swarm plot between male & Pclass
sns.swarmplot(x='male',y='Pclass',data=train,color='k')


In [None]:
# Plot violine plot between male & Pclass
sns.violinplot(x='male',y='Pclass',data=train,color='g')

In [None]:
# Plot strip plot between male & Pclass
sns.stripplot(x='male',y='Pclass',data=train,color='r')

In [None]:
# Using all the categorical plotting in a single figure

sns.stripplot(x='Survived',y='SibSp',data=train,color='b')

In [None]:
# Plot swarm plot between Survived & SibSp
sns.swarmplot(x='Survived',y='SibSp',data=train,color='k')

In [None]:
# Plot violine plot between Survived & SibSp
sns.violinplot(x='Survived',y='SibSp',data=train,palette='rainbow')

In [None]:
# Plot boxen plot between Survived & SibSp
sns.boxenplot(data=train,x='Survived',y='SibSp',color='m')

In [None]:
# Plot bar plot between Survived & SibSp
sns.barplot(data=train,y='SibSp',x='Survived',color='y')

In [None]:
# Plot box plot between Survived & SibSp
sns.boxplot(x='Survived',y='SibSp',data=train,palette='dark')

In [None]:
# Plot count plot between SibSp
sns.countplot(data=train,y='SibSp',color='red')

In [None]:
# Plot factor (point) plot between Pclass & SibSp
# sns.factorplot was renamed to sns.catplot; kind='point' keeps the point-plot style
# that factorplot drew by default.
sns.catplot(x='Pclass',y='SibSp',data=train,kind='point')

In [None]:
# We have explored the data to a great extent. Now we can use machine learning techniques to predict who survived or died on the Titanic. We can use different algorithms to predict this, and we will also try to quantify which algorithm gives us the highest accuracy.

# #importing all the required ML packages

from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier  #Random Forest
from sklearn.neighbors import KNeighborsClassifier   # KNN
from sklearn.tree import DecisionTreeClassifier   # Decision tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
# 2x2 grid of Embarked count plots: overall, then split by Sex, Survived and Pclass.
# x is passed by keyword; current seaborn no longer accepts it positionally alongside data=.
f,ax=plt.subplots(2,2,figsize=(20,15))
embarked_panels=[
    (ax[0,0],None,'No. Of Passengers Boarded'),
    (ax[0,1],'Sex','Male-Female Split for Embarked'),
    (ax[1,0],'Survived','Embarked vs Survived'),
    (ax[1,1],'Pclass','Embarked vs Pclass'),
]
for panel_ax,panel_hue,panel_title in embarked_panels:
    sns.countplot(x='Embarked',hue=panel_hue,data=df_train,ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.subplots_adjust(wspace=0.2,hspace=0.5)
plt.show()

In [None]:
# Factor (point) plot based on Class, Survival, Sex and Embarked
# sns.factorplot was renamed to sns.catplot; kind='point' keeps the point-plot style that
# factorplot drew by default, and x/y are passed by keyword as current seaborn requires.
sns.catplot(x='Pclass',y='Survived',hue='Sex',col='Embarked',data=df_train,kind='point')
plt.show()

In [None]:
data=pd.read_csv('/kaggle/input/titanic-machine-learning-from-disaster/train.csv')
data.head()

In [None]:
data.isnull().sum()

In [None]:
# Filling Age Missing Values
# We could fill the missing values with the mean of the Age column, but that would, for example, give a child (5 years) the mean age (around 30 years), which would increase the error in the data. However, the Name column contains titles such as Mr and Mrs. So we get the mean age of each title group and then replace the missing ages with the mean value of the matching title.

# Extract the salutation from Name: the run of letters immediately followed by a '.'.
# One vectorised call covers every row (no loop is needed), the pattern is a raw string so
# '\.' is a regex escape rather than an invalid string escape, and expand=False returns a
# Series that can be assigned directly to the new column.
data['Title']=data['Name'].str.extract(r'([A-Za-z]+)\.',expand=False)

In [None]:
# Here we are using the regex ([A-Za-z]+)\. : it looks for a run of letters (A-Z or a-z) that is followed by a . (dot), which is how we extract the title (initials) from the Name.
pd.crosstab(data.Title,data.Sex).T.style.background_gradient(cmap='summer_r') #Checking the Initials with the Sex

In [None]:
# In the dataset initials like Mlle or Mme stand for Miss. We will replace them with Miss,
# and fold the remaining rare titles into Mr, Mrs or Other.
title_replacements={
    'Mlle':'Miss','Mme':'Miss','Ms':'Miss',
    'Dr':'Mr','Major':'Mr','Capt':'Mr','Sir':'Mr','Don':'Mr',
    'Lady':'Mrs','Countess':'Mrs',
    'Jonkheer':'Other','Col':'Other','Rev':'Other',
}
# Assigned back to the column: replace(inplace=True) on a column selection operates on a
# temporary object and may leave `data` unchanged.
data['Title']=data['Title'].replace(title_replacements)

In [None]:
pd.crosstab(data.Title,data.Sex).T.style.background_gradient(cmap='summer_r') #Checking the Initials with the Sex

In [None]:
# We have converted the Titles into five categories: Master, Miss, Mr, Mrs and Other.
data.groupby('Title')['Age'].mean() #lets check the average age by Initials

In [None]:
# Now we have got the mean Age of the different groups in the Title.Now we can use this values to imput the missing values in the Age Column in our Titanic Dataset.

# Fill each still-missing Age with the age chosen for the passenger's Title group
age_by_title=(('Mr',33),('Mrs',36),('Master',5),('Miss',22),('Other',46))
for title_label,title_age in age_by_title:
    age_missing_for_title=data['Age'].isnull() & (data['Title']==title_label)
    data.loc[age_missing_for_title,'Age']=title_age

In [None]:
data.Age.isnull().any() #So no null values left finally 

In [None]:
# Filling Embarked NaN
# The Embarked column has two missing values. Most passengers boarded at port S, so the
# missing values are replaced with S.
# Assigned back to the column: fillna(inplace=True) on a column selection operates on a
# temporary object and may leave `data` unchanged.
data['Embarked']=data['Embarked'].fillna('S')

In [None]:
data.Embarked.isnull().any()# Finally No NaN values

In [None]:
# Fare Bin
# Here we will use qcut this will split the data into bins based on the number of bins requested by us.
data['Fare_Range']=pd.qcut(data['Fare'],4)
data.groupby(['Fare_Range'])['Survived'].mean().to_frame().style.background_gradient(cmap='summer_r')

In [None]:
# We can clearly see that as the fare increases, the survival percentage increases.
# Fare_Group: 0 by default (this covers fares up to 7.91), then 1-3 for the higher ranges.
# Each range is (lower bound exclusive, upper bound inclusive).
fare_values=data['Fare']
data['Fare_Group']=0
fare_ranges=[(7.91,14.454),(14.454,31),(31,513)]
for fare_group,(fare_low,fare_high) in enumerate(fare_ranges,start=1):
    data.loc[fare_values.gt(fare_low) & fare_values.le(fare_high),'Fare_Group']=fare_group

In [None]:
data

In [None]:
# Age Binning
# In case of a continous variable like Age we can get better insight by making bins of the Age data.

# Age_band: 0 by default (this covers ages up to 16), then 1-4 for the older bands.
# Each band is (lower bound exclusive, upper bound inclusive); band 4 has no upper bound.
age_values=data['Age']
data['Age_band']=0
age_ranges=[(16,32),(32,48),(48,64)]
for band,(age_low,age_high) in enumerate(age_ranges,start=1):
    data.loc[age_values.gt(age_low) & age_values.le(age_high),'Age_band']=band
data.loc[age_values.gt(64),'Age_band']=4
data.head(10)

In [None]:
data['Age_band'].value_counts().to_frame().style.background_gradient(cmap='summer')#checking the number of passenegers in each band

In [None]:
# Embarked: 2x2 grid of count plots - overall, then split by Sex, Survived and Pclass.
# x is passed by keyword; current seaborn no longer accepts it positionally alongside data=.
f,ax=plt.subplots(2,2,figsize=(20,15))
embarked_panels=[
    (ax[0,0],None,'No. Of Passengers Boarded'),
    (ax[0,1],'Sex','Male-Female Split for Embarked'),
    (ax[1,0],'Survived','Embarked vs Survived'),
    (ax[1,1],'Pclass','Embarked vs Pclass'),
]
for panel_ax,panel_hue,panel_title in embarked_panels:
    sns.countplot(x='Embarked',hue=panel_hue,data=data,ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.subplots_adjust(wspace=0.2,hspace=0.5)
plt.show()

In [None]:
# Encode the Sex, Embarked and Title labels as integer codes.
# Each result is assigned back to its column: replace(inplace=True) on a column selection
# operates on a temporary object and may leave `data` unchanged.
data['Sex'] = data['Sex'].astype(str).replace({'male':0,'female':1})
data['Embarked'] = data['Embarked'].astype(str).replace({'S':0,'C':1,'Q':2})
data['Title'] = data['Title'].replace({'Mr':0,'Mrs':1,'Miss':2,'Master':3,'Other':4})
data.head(2)

In [None]:
# Modelling frame: `data` without the identifier, free-text and raw continuous columns
unused_columns=['Name','Age','Ticket','Fare','Cabin','Fare_Range','PassengerId']
df=data.drop(columns=unused_columns)

# Annotated correlation heatmap of the remaining columns, enlarged for readability
correlations=df.corr()
sns.heatmap(correlations,annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':20})
fig=plt.gcf()
fig.set_size_inches(18,15)
for set_tick_font in (plt.xticks,plt.yticks):
    set_tick_font(fontsize=14)
plt.show()

In [None]:
# Remove SibSp and Parch from the modelling frame, then preview it
df=df.drop(columns=['SibSp','Parch'])
df.head()

In [None]:
df.isnull().sum()

In [None]:
#importing all the required ML packages
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
import cv2
import glob
import os
import matplotlib.pyplot as plt
import string
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
from sklearn.utils.multiclass import unique_labels
from sklearn import metrics
from sklearn.svm import SVC
from mlxtend.plotting import plot_decision_regions
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import plot_confusion_matrix

In [None]:
X = df.drop(labels='Survived',axis=1)
y = df['Survived']

In [None]:
# Test Train Split

from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1234, stratify=y
)

# Report the size of each part
split_parts = (
    ('Training Set:', X_train),
    ('Test Set:', X_test),
    ('Training labels:', y_train),
    ('Test labels:', y_test),
)
for caption, part in split_parts:
    print(caption, len(part))

<a id="section-four"></a>
# Section  4 - Linear Support Vector Machine(Linear-SVM)

In [None]:
# Linear-kernel SVM fitted on the training split and scored on the held-out split.
# gamma is not passed: it only parameterises the rbf/poly/sigmoid kernels and has no
# effect on a linear kernel.
model=svm.SVC(kernel='linear',C=0.2)
model.fit(X_train,y_train)
prediction1=model.predict(X_test)
# accuracy_score expects (y_true, y_pred)
print('Accuracy for linear SVM is',metrics.accuracy_score(y_test,prediction1))


<a id="section-five"></a>
# Section  5 - Radial Support Vector Machine(Radial-SVM)

In [None]:
# RBF-kernel SVM fitted on the training split and scored on the held-out split.
model=svm.SVC(kernel='rbf',C=0.8,gamma=0.4)
model.fit(X_train,y_train)
prediction2=model.predict(X_test)
# accuracy_score expects (y_true, y_pred)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(y_test,prediction2))

<a id="section-six"></a>
# Section  6 - K Fold Cross Validation



In [None]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

# k=10: split the data into 10 folds. shuffle=True is required for random_state to have
# any effect; scikit-learn rejects a random_state on an unshuffled KFold.
kfold = KFold(n_splits=10, shuffle=True, random_state=22)

xyz=[]       # mean CV accuracy per classifier
accuracy=[]  # per-fold accuracies per classifier (used by the box plot below)
std=[]       # standard deviation of the fold accuracies per classifier
classifiers=['Linear Svm','Radial Svm']
models=[svm.SVC(kernel='linear'),svm.SVC(kernel='rbf')]
for model in models:
    cv_result = cross_val_score(model,X,y, cv = kfold,scoring = "accuracy")
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)

# One row per classifier with its mean CV accuracy and spread
new_models_dataframe2=pd.DataFrame({'CV Mean':xyz,'Std':std},index=classifiers)
new_models_dataframe2

<a id="section-seven"></a>
# Section  7 - Box Plot of Accuracy



In [None]:
# Box plot of the per-fold accuracies, one box per classifier.
# index=classifiers (a flat list) keeps plain string labels; wrapping it in another list
# builds a one-level MultiIndex instead.
box_fig,box_ax=plt.subplots(figsize=(12,6))
box=pd.DataFrame(accuracy,index=classifiers)
box.T.boxplot(ax=box_ax)
box_ax.set_ylabel('Accuracy')
plt.show()

<a id="section-eight"></a>
# Section  8 - Bar Plot of Accuracy


In [None]:
new_models_dataframe2['CV Mean'].plot.bar(width=0.5)
plt.title('Average CV Mean Accuracy')
fig=plt.gcf()
fig.set_size_inches(9,6)
plt.show()

<a id="section-nine"></a>
# Section  9 - Decision tree

In [None]:
# Decision tree
from sklearn import tree
# Create Decision Tree with max_depth = 3.
# random_state is fixed (same seed as the train/test split) so tie-breaking between
# equally good splits is reproducible across runs.
decision_tree = tree.DecisionTreeClassifier(max_depth = 3, random_state = 1234)
decision_tree.fit(X_train, y_train)

# Predicted Survived labels for the held-out rows
y_pred = decision_tree.predict(X_test)
print(y_pred)

In [None]:
# Accuracy (in percent) on the rows the tree was fitted on; this is an optimistic figure.
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
# Accuracy (in percent) on the held-out rows, which is the figure that reflects generalisation.
acc_decision_tree_test = round(decision_tree.score(X_test, y_test) * 100, 2)
print('Decision tree accuracy (%) - train:', acc_decision_tree, 'test:', acc_decision_tree_test)