# Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 

# Reading the dataset


In [None]:
# Download the Titanic training CSV from GitHub (needs network access) and preview the first 10 rows.
url = "https://raw.github.com/mattdelhey/kaggle-titanic/master/Data/train.csv"
titanic = pd.read_csv(url) 
titanic.head(10)

# Data Analysis

In [None]:
titanic.shape

There are **891** rows & **11 columns**. This means there are 891 datapoints in the dataset & 11 features.

In [None]:
titanic.columns

Out of these features, the feature **'survived' is the target feature**. 

In [None]:
titanic.info()

*  There are **5 object fields** which need to be encoded.

*  'age', 'cabin' & 'embarked' have some **missing values**.


So I need to know how many NaN values there are in each column.

In [None]:
titanic.isna().sum()

# Data Visualization

In [None]:
import matplotlib.pyplot as plt


plt.figure(figsize=(10,10))
# Correlation is only defined for numeric data. The frame still holds string columns at this
# point, and pandas >= 2.0 raises on them instead of silently skipping them, so keep only the
# numeric/bool columns before correlating.
# The heatmap draws the correlation matrix as an annotated, colour-encoded grid.
sns.heatmap(
    titanic.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
    linewidths=0.5,
    fmt='.3f',
)

In [None]:
# Correlate only the numeric/bool columns; pandas >= 2.0 raises if string columns are included.
titanic.select_dtypes(include=["number", "bool"]).corr()


By the previous knowledge we have, let's create a new feature telling **whether the passenger is man, woman or a child.**

In [None]:
def woman_child_or_man(passenger):
    """Label a passenger as "child", "man" or "woman".

    Parameters
    ----------
    passenger : sequence of (age, sex)
        Two values that unpack to the age and the sex ("male" or "female").

    Returns
    -------
    str
        "child" when age is below 16; otherwise "man" for "male" and
        "woman" for "female".

    Raises
    ------
    KeyError
        If the passenger is not a child and sex is neither "male" nor "female".
    """
    age, sex = passenger
    if age < 16:
        return "child"
    # A missing age (NaN) fails the comparison above, so such passengers
    # are labelled by sex like any adult.
    adult_labels = {"male": "man", "female": "woman"}
    return adult_labels[sex]

In [None]:
# There is no "who" column yet, so this assignment creates it.
# DataFrame.apply is a method: with axis=1 it calls woman_child_or_man once per row
# (axis=0 would pass whole columns instead). Each row here holds only age and sex.
titanic["who"] = titanic[["age", "sex"]].apply(woman_child_or_man, axis=1)
titanic.head()

We will create another feature to see whether a person was an adult male or not.

In [None]:
# Boolean flag: True exactly where the "who" label is "man".
titanic["adult_male"] = titanic["who"].eq("man")
titanic.head()

We can have another feature with the deck information.

In [None]:
# The deck is the first character of the cabin string; rows with no cabin stay NaN.
titanic["deck"] = titanic["cabin"].str.get(0)
titanic.head()

Now one more feature can be created, whether the passenger was alone or not. So let's do this.

In [None]:
# A passenger is alone when parch + sibsp adds up to zero.
titanic["alone"] = (titanic["parch"] + titanic["sibsp"]).eq(0)
titanic.head()

Now let's try to look at the trends in different features.

In [None]:
# factorplot was renamed to catplot and later removed from seaborn; kind="point" reproduces
# factorplot's default point plot. x and y are passed by keyword, as current seaborn requires.
sns.catplot(x="pclass", y="survived", data=titanic, kind="point").set(ylim=(0, 1))


From here we see that if a passenger travelled in 1st class, the survival rate is highest and equal to 0.63. If a passenger travelled in 2nd class, the survival rate is medium and equal to 0.5. If a passenger travelled in 3rd class, the survival rate is lowest and equal to 0.3

Let's see how the above case is dependent on the **sex of the passenger.**

In [None]:
# catplot(kind="point") replaces the removed factorplot; x/y/hue are passed by keyword.
sns.catplot(x="pclass", y="survived", hue="sex", data=titanic, kind="point")

It's pretty clear that the survival rate of female passengers is much higher than that of male passengers. From here we see that a female passenger travelling in 1st class had the highest chance of survival. On the other hand, a male passenger travelling in 3rd class had the lowest chance of survival. So we can combine these two features to **create a new feature**.

Let's have a similar observation with the features **'class' & 'who'**  

In [None]:
# catplot(kind="point") replaces the removed factorplot; x/y/hue are passed by keyword.
sns.catplot(x="pclass", y="survived", hue="who", data=titanic, kind="point")

From here also we can have similar observation. We get 9 cases from here and we will be building a feature based on it in a while.

Let's try to find the trends with **the feature 'alone' & 'adult_male'**.

In [None]:
# catplot(kind="point") replaces the removed factorplot; x/y/hue are passed by keyword.
sns.catplot(x="alone", y="survived", hue="sex", data=titanic, kind="point")

In [None]:
# catplot(kind="point") replaces the removed factorplot; the y axis is pinned to the full 0-1 range.
sns.catplot(x="adult_male", y="survived", hue="sex", data=titanic, kind="point").set(ylim=(0, 1))

Now let's see what effect the feature **'deck'** has.

In [None]:
# x and y must be keyword arguments in current seaborn; `order` fixes the bar order to decks A-G.
sns.barplot(x="deck", y="survived", data=titanic, order=['A','B','C','D','E','F','G'])

Now let's try to combine 3 features together.

In [None]:
# One point-plot panel per passenger class (col="pclass"); catplot replaces the removed factorplot.
sns.catplot(x="alone", y="survived", hue="sex", col="pclass", data=titanic, kind="point")

# Data Preprocessing

Let's have the object fields encoded.

In [None]:
# Encode the deck letter as an ordinal number: A -> 1, B -> 2, ..., G -> 7.
dk = {letter: code for code, letter in enumerate("ABCDEFG", start=1)}
# Series.map looks each value up in dk; values with no entry (including missing decks) become NaN.
titanic['deck'] = titanic['deck'].map(dk)
titanic.head()

In [None]:
# encoding embarked


titanic['embarked'].value_counts()

In [None]:
# Integer codes for the embarkation port letters.
e = dict(C=1, Q=2, S=3)
titanic['embarked'] = titanic['embarked'].map(e)
titanic.head()

In [None]:
# Encode sex as a binary flag: male -> 0, female -> 1.
genders = dict(male=0, female=1)
titanic['sex'] = titanic['sex'].map(genders)
titanic.head()

In [None]:
# Encode the man / woman / child label as an integer code.
wh = dict(man=1, woman=2, child=3)
titanic['who'] = titanic['who'].map(wh)

In [None]:
titanic.head()

Now we need to impute the **Missing Values**

There are a lot of missing values in 'deck'. So we will simply fill them with **0**.

In [None]:
# Imputing deck: every missing deck value is replaced by 0.
titanic = titanic.fillna({'deck': 0})
titanic.head()

There are only 2 missing values in 'embarked'. So we will find out which of the values in 'embarked' has the **maximum occurrence** and fill the missing values with **that value**.

In [None]:
#imputing embarked

titanic['embarked'].value_counts()

In [None]:
# Fill missing ports with the most frequent code, taken from the data rather than hard-coded.
# The fill value must be numeric: filling with the string '3.0' would leave the column holding
# a mix of floats and strings.
titanic['embarked'] = titanic['embarked'].fillna(titanic['embarked'].mode()[0])
titanic.head(10)

Now we will impute the missing values in **'age'**.

In [None]:
# Imputing age: compute the mean of the known ages (Series.mean skips NaN by default).
# The value is displayed here and used as the fill value in the next cell.

m=titanic['age'].mean()
m


In [None]:
# Replace every missing age with m, the mean age computed in the previous cell.
titanic['age']=titanic['age'].fillna(m)
titanic.head(10)

# Adding New Features

In [None]:
def process_family(parameters):
    """Bucket a passenger's family size into three ordinal classes.

    Parameters
    ----------
    parameters : sequence of two numbers
        The two relative counts for one passenger (the notebook passes
        ``parch`` and ``sibsp``).

    Returns
    -------
    int
        1 for a singleton (family size 1), 2 for a small family
        (size 2 to 4), 3 for anything else.
    """
    first_count, second_count = parameters

    # The family size counts the passenger as well, hence the + 1.
    family_size = first_count + second_count + 1

    if family_size == 1:
        return 1
    if 2 <= family_size <= 4:
        return 2
    return 3

In [None]:
# Row-wise apply (axis=1): process_family receives each passenger's (parch, sibsp) pair
# and returns the family-size class stored in the new FAM_SIZE column.
titanic['FAM_SIZE']= titanic[['parch','sibsp']].apply(process_family, axis=1)
titanic.head()

In [None]:
# Collect the distinct titles found in the names: the title is the text
# between the first comma and the following period, stripped of spaces.
titles = {
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in titanic['name']
}

In [None]:
titles #all the salutations present in my dataset.

In [None]:
len(titles)

In [None]:
# Maps each raw title extracted from a passenger name to a coarser title group.
Title_Dictionary = {
    # -> "Officer"
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    # -> "Royalty"
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Lady": "Royalty",
    "the Countess": "Royalty",
    # -> "Mrs"
    "Mrs": "Mrs",
    "Mme": "Mrs",
    "Ms": "Mrs",
    # -> "Miss"
    "Miss": "Miss",
    "Mlle": "Miss",
    # unchanged
    "Mr": "Mr",
    "Master": "Master",
}

In [None]:
def get_titles():
    """Add an aggregated ``title`` column to the global ``titanic`` frame.

    The raw title is the text between the first comma and the following
    period of each name (names shaped like "Surname, Title. Given names").
    Each raw title is then looked up in ``Title_Dictionary``; a title with
    no entry there becomes NaN.

    Returns
    -------
    pandas.DataFrame
        The same global ``titanic`` frame, modified in place.
    """
    def extract_title(full_name):
        after_comma = full_name.split(',')[1]
        return after_comma.split('.')[0].strip()

    raw_titles = titanic['name'].map(extract_title)
    titanic['title'] = raw_titles.map(Title_Dictionary)
    return titanic

In [None]:
# get_titles() adds the 'title' column to the global frame and returns that same frame.
titanic = get_titles()
titanic.head(10)

Now we need to encode these titles. Right now I will use one-hot encoding with this.

In [None]:
# One-hot encode the aggregated title: one indicator column per category, named title_<category>.
titles_dummies = pd.get_dummies(titanic['title'], prefix='title')
# Append the indicator columns to the frame side by side (axis=1).
titanic = pd.concat([titanic, titles_dummies], axis=1)
titanic.head()

And finally the Feature that we observed during the visualization.

In [None]:
def new_fe(parameters):
    """Combine passenger class and 'who' code into one code from 1 to 9.

    Parameters
    ----------
    parameters : sequence of (p, w)
        ``p`` is the passenger class (1, 2 or 3) and ``w`` is the encoded
        'who' value (1, 2 or 3).

    Returns
    -------
    int or None
        ``3 * (p - 1) + w`` for the nine valid combinations, i.e. class 1
        gives 1-3, class 2 gives 4-6 and class 3 gives 7-9. Any other
        combination returns None.
    """
    p, w = parameters

    # Lookup table for the nine (class, who) pairs; unknown pairs fall through to None.
    combined_codes = {
        (pclass, who): 3 * (pclass - 1) + who
        for pclass in (1, 2, 3)
        for who in (1, 2, 3)
    }
    return combined_codes.get((p, w))

In [None]:
# Row-wise apply (axis=1): new_fe receives each passenger's (pclass, who) pair and
# returns the combined code stored in the new pcl_wh column.
titanic['pcl_wh']= titanic[['pclass','who']].apply(new_fe, axis=1)
titanic.head()

Now we will drop all the features which I don't want.

In [None]:
titanic.columns

In [None]:
# Columns removed from the frame before modelling.
drop_list = ['name', 'ticket', 'fare', 'cabin', 'title']
titanic = titanic.drop(columns=drop_list)
titanic.head()

In [None]:
# Annotated correlation heatmap of the engineered frame, drawn on an explicit 20x20 axes.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(titanic.corr(), annot=True, linewidths=0.5, fmt='.3f', ax=ax)

# Build the Models

The first task will be to **split the dataset** into train set and test set.

In [None]:
# Separate the target column from the predictors. Despite the names, both still cover
# the whole dataset; they are passed to train_test_split afterwards.
Y_train = titanic["survived"]
X_train = titanic.drop(columns=["survived"])

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into a training set (70%) and a test set (30%).
# random_state pins the shuffle so every metric below is reproducible across
# Restart & Run All; without it each run scores a different split.
x_train, x_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.3, random_state=42
)

## Logistic Regression

In [None]:
x_train.shape

In [None]:
titanic.isna().sum()

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the model with default settings and fit it on the training split;
# fit() returns the estimator itself, so the two steps can be chained.
lr = LogisticRegression().fit(x_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score

# Predict once and reuse the predictions for all four metrics instead of
# re-running the model for each score.
lr_train_pred = lr.predict(x_train)

act = accuracy_score(y_train, lr_train_pred)
print('Training Accuracy is: ',(act*100))
p = precision_score(y_train, lr_train_pred)
print('Training Precision is: ',(p*100))
r = recall_score(y_train, lr_train_pred)
print('Training Recall is: ',(r*100))
f = f1_score(y_train, lr_train_pred)
print('Training F1 Score is: ',(f*100))



In [None]:
# Predict once on the held-out split and reuse the predictions for all four metrics.
lr_test_pred = lr.predict(x_test)

act = accuracy_score(y_test, lr_test_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test, lr_test_pred)   # of the passengers predicted to survive, the share that did
print('Test Precision is: ',(p*100))
r = recall_score(y_test, lr_test_pred)      # of the passengers who survived, the share that was predicted
print('Test Recall is: ',(r*100))
f = f1_score(y_test, lr_test_pred)          # harmonic mean: (2*p*r)/(p+r)
print('Test F1 Score is: ',(f*100))



## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest of 100 gini trees, fitted on the training split.
# oob_score=True also computes an out-of-bag score during fit;
# n_jobs=-1 uses every available core.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_features=0.5,
    min_samples_split=10,
    min_samples_leaf=3,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
).fit(x_train, y_train)

In [None]:
# Predict once and reuse the predictions for all four metrics instead of
# re-running the forest for each score.
rf_train_pred = rf.predict(x_train)

act = accuracy_score(y_train, rf_train_pred)
print('Training Accuracy is: ',(act*100))
p = precision_score(y_train, rf_train_pred)
print('Training Precision is: ',(p*100))
r = recall_score(y_train, rf_train_pred)
print('Training Recall is: ',(r*100))
f = f1_score(y_train, rf_train_pred)
print('Training F1 Score is: ',(f*100))

In [None]:
# Predict once on the held-out split and reuse the predictions for all four metrics.
rf_test_pred = rf.predict(x_test)

act = accuracy_score(y_test, rf_test_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test, rf_test_pred)
print('Test Precision is: ',(p*100))
r = recall_score(y_test, rf_test_pred)
print('Test Recall is: ',(r*100))
f = f1_score(y_test, rf_test_pred)
print('Test F1 Score is: ',(f*100))

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same tree (candidate features are
# shuffled at each split); the seed matches the random forest's.
dt = DecisionTreeClassifier(random_state=1)
dt = dt.fit(x_train, y_train)

In [None]:
# Predict once and reuse the predictions for all four metrics instead of
# re-running the tree for each score.
dt_train_pred = dt.predict(x_train)

act = accuracy_score(y_train, dt_train_pred)
print('Training Accuracy is: ',(act*100))
p = precision_score(y_train, dt_train_pred)
print('Training Precision is: ',(p*100))
r = recall_score(y_train, dt_train_pred)
print('Training Recall is: ',(r*100))
f = f1_score(y_train, dt_train_pred)
print('Training F1 Score is: ',(f*100))

In [None]:
# Predict once on the held-out split and reuse the predictions for all four metrics.
dt_test_pred = dt.predict(x_test)

act = accuracy_score(y_test, dt_test_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test, dt_test_pred)
print('Test Precision is: ',(p*100))
r = recall_score(y_test, dt_test_pred)
print('Test Recall is: ',(r*100))
f = f1_score(y_test, dt_test_pred)
print('Test F1 Score is: ',(f*100))

## K-Nearest Neighbour (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier 
# n_neighbors=1: each prediction takes the label of the single closest training point.
# NOTE(review): with k=1 a training row is normally its own nearest neighbour, so the
# training metrics are expected to be near-perfect; rely on the test metrics. Confirm on the output.
knn = KNeighborsClassifier(n_neighbors = 1) 
knn.fit(x_train, y_train) 

In [None]:
# Predict once and reuse the predictions for all four metrics; KNN prediction
# searches the training set on every call, so repeating it is wasteful.
knn_train_pred = knn.predict(x_train)

act = accuracy_score(y_train, knn_train_pred)
print('Training Accuracy is: ',(act*100))
p = precision_score(y_train, knn_train_pred)
print('Training Precision is: ',(p*100))
r = recall_score(y_train, knn_train_pred)
print('Training Recall is: ',(r*100))
f = f1_score(y_train, knn_train_pred)
print('Training F1 Score is: ',(f*100))

In [None]:
# Predict once on the held-out split and reuse the predictions for all four metrics.
knn_test_pred = knn.predict(x_test)

act = accuracy_score(y_test, knn_test_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test, knn_test_pred)
print('Test Precision is: ',(p*100))
r = recall_score(y_test, knn_test_pred)
print('Test Recall is: ',(r*100))
f = f1_score(y_test, knn_test_pred)
print('Test F1 Score is: ',(f*100))

## Support Vector Machine (SVM)










In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, fitted on the training split.
svc = SVC(kernel='linear')
svc.fit(x_train, y_train)

In [None]:
# Predict once and reuse the predictions for all four metrics instead of
# re-running the classifier for each score.
svc_train_pred = svc.predict(x_train)

act = accuracy_score(y_train, svc_train_pred)
print('Training Accuracy is: ',(act*100))
p = precision_score(y_train, svc_train_pred)
print('Training Precision is: ',(p*100))
r = recall_score(y_train, svc_train_pred)
print('Training Recall is: ',(r*100))
f = f1_score(y_train, svc_train_pred)
print('Training F1 Score is: ',(f*100))

In [None]:
# Predict once on the held-out split and reuse the predictions for all four metrics.
svc_test_pred = svc.predict(x_test)

act = accuracy_score(y_test, svc_test_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test, svc_test_pred)
print('Test Precision is: ',(p*100))
r = recall_score(y_test, svc_test_pred)
print('Test Recall is: ',(r*100))
f = f1_score(y_test, svc_test_pred)
print('Test F1 Score is: ',(f*100))