# **Importing the Datasets**

In [None]:
from IPython.display import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the two Titanic CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')       # training data
test = pd.read_csv('/kaggle/input/titanic/test.csv')         # testing data

# Exploratory data analysis

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# (rows, columns) of the training set.
train.shape

We can see that there are 891 rows and 12 columns in our training dataset.

In [None]:
# Column dtypes and non-null counts of the training set.
train.info()

From this we can see that there are some categorical (text) columns and that several columns (Age, Cabin and Embarked) contain missing values.

In [None]:
# Number of missing values in each column of the training set.
train.isnull().sum()

In [None]:
# Summary statistics of the training set.
train.describe()

Out of 891 rows, only 714 values are present in the Age column, i.e. 177 values are missing.
Similarly, the Cabin column has many missing values: only 204 of the 891 values are present.

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# Column dtypes and non-null counts of the test set.
test.info()

In [None]:
# Number of missing values in each column of the test set.
test.isnull().sum()

In [None]:
# Summary statistics of the test set (the training set is summarised in an earlier cell).
test.describe()

In [None]:
# Inspect low-cardinality columns: for every training column with fewer than
# 30 distinct values, print those values followed by their frequencies.
for column_name in train.columns:
    distinct_values = train[column_name].unique()
    if len(distinct_values) >= 30:
        continue
    print(column_name, distinct_values)
    print(column_name, train[column_name].value_counts(), sep='\n')
    print("------------------------")

**visualization**

In [None]:
# Bar chart of how many training rows fall into each `Survived` value.
survival_counts = train["Survived"].value_counts()
survival_counts.plot.bar(stacked=True, figsize=(8, 5), color=['red', 'green'])

In [None]:
def bar_charts(col):
    """Plot and print survivor vs. non-survivor counts for one column of ``train``.

    Reads the notebook-level ``train`` DataFrame, counts the values of ``col``
    separately for rows with ``Survived == 1`` and ``Survived == 0``, draws the
    two sets of counts as a grouped bar chart and prints both count tables.

    Parameters
    ----------
    col : str
        Name of the ``train`` column to break down by survival.
    """
    survived = train[train['Survived'] == 1][col].value_counts()
    dead = train[train['Survived'] == 0][col].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']

    # The hand-picked palette has only three colours. pandas repeats a colour
    # list that is shorter than the number of plotted columns, which would give
    # different categories the same colour, so only use the palette when it is
    # long enough and otherwise let the default colour cycle apply.
    palette = ["orange", "crimson", "royalblue"]
    plot_kwargs = {"kind": "bar", "figsize": (10, 6)}
    if df.shape[1] <= len(palette):
        plot_kwargs["color"] = palette
    ax = df.plot(**plot_kwargs)
    ax.set_title(f"Survival by {col}")
    ax.set_ylabel("Number of passengers")

    # Print the same count tables that were plotted.
    print("survived", survived, sep='\n')
    print("dead", dead, sep='\n')

In [None]:
# Survival counts broken down by passenger class.
bar_charts("Pclass")

So, people from **1st class** were more likely to survive than those in 2nd and 3rd class. Also,
we can see that **3rd class** passengers were more likely to die than passengers in the other classes.


In [None]:
# Survival counts broken down by the Sex column.
bar_charts("Sex")

This confirms that females were more likely to survive than males.

In [None]:
# Survival counts broken down by the Embarked column.
bar_charts("Embarked")

In [None]:
# Survival counts broken down by the SibSp column.
bar_charts("SibSp")

This suggests that passengers travelling with siblings or a spouse were more likely to survive, whereas passengers without any siblings or spouse aboard were more likely to have died.

In [None]:
# Survival counts broken down by the Parch column.
bar_charts("Parch")

#  Feature engineering

Feature engineering is the process of using domain knowledge to extract features from raw data.
It takes raw data and transforms it into features that can be used to build a predictive model. Here we are going to transform categorical features into numerical features and fill in the missing values.
Feature engineering helps us improve the performance of the machine learning model.

**How Titanic sank???**

In [None]:
# Embed a remotely hosted Titanic survival infographic in the cell output.
Image(url= "https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/t/5090b249e4b047ba54dfd258/1351660113175/TItanic-Survival-Infographic.jpg?format=1500w")

The ship started to sink from the end where the 3rd class was located, so the Pclass column is particularly important.

In [None]:
# Change the Sex column to numeric codes: male -> 0, female -> 1.
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on `train["Sex"]` operates on an intermediate Series, which pandas warns about
# and which does not update `train` once Copy-on-Write is in effect.
train["Sex"] = train["Sex"].replace({"male": 0, "female": 1}).astype(int)

In [None]:
# Apply the same Sex encoding to the test set: male -> 0, female -> 1.
# Assigned back to the column rather than replaced in place on an intermediate
# Series, which would not update `test` under pandas Copy-on-Write.
test["Sex"] = test["Sex"].replace({"male": 0, "female": 1}).astype(int)

In [None]:
# Frequency of each value in the Embarked column of the training set.
train["Embarked"].value_counts()

Filling the missing `Embarked` values with "S", the most frequent port of embarkation.

In [None]:
# Fill missing Embarked values with the most frequent one, then encode the
# port letters as integers. Each step assigns its result back to the column:
# fillna/replace with inplace=True on `train['Embarked']` would act on an
# intermediate Series and is not guaranteed to update `train`.
embark = {"S": 0, "C": 1, "Q": 2}
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
train["Embarked"] = train["Embarked"].replace(embark).astype(int)
# Last expression, so the cell displays it: number of values still missing.
train["Embarked"].isnull().sum()

In [None]:
# Fill missing Embarked values in the test set with its most frequent value,
# then encode the port letters as integers. Results are assigned back to the
# column instead of using inplace=True on an intermediate Series, which is not
# guaranteed to update `test`.
embark = {"S": 0, "C": 1, "Q": 2}
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])
test["Embarked"] = test["Embarked"].replace(embark).astype(int)
# Last expression, so the cell displays it: number of values still missing.
test["Embarked"].isnull().sum()

In [None]:
# Extract the title from each name: the run of letters that follows a space
# and is immediately followed by a full stop (e.g. " Mr.").
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# without Python flagging it as an invalid string escape.
train["identity"] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
train["identity"].value_counts()


In [None]:
# Turn the extracted titles into numeric codes:
#   0 = Ms / Miss, 1 = Mrs / Lady, 2 = Mr / Sir, 3 = Master,
#   4 = every other title listed below.
title_codes = {
    "Ms": 0, "Miss": 0,
    "Mrs": 1, "Lady": 1,
    "Mr": 2, "Sir": 2,
    "Master": 3,
}
title_codes.update(dict.fromkeys(
    ["Dr", "Rev", "Major", "Mlle", "Col", "Mme", "Capt", "Don", "Jonkheer", "Countess", "Dona"],
    4,
))
# One mapping pass, assigned back to the column. replace(..., inplace=True) on
# `train["identity"]` would act on an intermediate Series and is not
# guaranteed to update `train`.
train["identity"] = train["identity"].replace(title_codes)
train["identity"].isnull().sum()

In [None]:
# Extract the title from each name in the test set: the run of letters that
# follows a space and is immediately followed by a full stop (e.g. " Mr.").
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# without Python flagging it as an invalid string escape.
test["identity"] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test["identity"].value_counts()

In [None]:
# Turn the extracted titles in the test set into numeric codes:
#   0 = Ms / Miss, 1 = Mrs / Lady, 2 = Mr / Sir, 3 = Master,
#   4 = every other title listed below.
title_codes = {
    "Ms": 0, "Miss": 0,
    "Mrs": 1, "Lady": 1,
    "Mr": 2, "Sir": 2,
    "Master": 3,
}
title_codes.update(dict.fromkeys(
    ["Dr", "Rev", "Major", "Mlle", "Col", "Mme", "Capt", "Don", "Jonkheer", "Countess", "Dona"],
    4,
))
# One mapping pass, assigned back to the column. replace(..., inplace=True) on
# `test["identity"]` would act on an intermediate Series and is not guaranteed
# to update `test`.
test["identity"] = test["identity"].replace(title_codes)
test["identity"].isnull().sum()

In [None]:
# Print the median age for each of the title codes 0-4 in the training set.
for i in range(5):
    print(train[train["identity"]==i]["Age"].median())

# Fill each missing age with the median age of the rows that share the same
# title code, computed separately within each dataset. The result is assigned
# back to the column: fillna(..., inplace=True) on `train["Age"]` would act on
# an intermediate Series and is not guaranteed to update the frame.
train["Age"] = train["Age"].fillna(train.groupby("identity")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("identity")["Age"].transform("median"))


In [None]:
# Fill missing fares in the test set with the median fare of the same Pclass.
# Assigned back to the column instead of fillna(..., inplace=True) on an
# intermediate Series, which is not guaranteed to update `test`.
test["Fare"] = test["Fare"].fillna(test.groupby("Pclass")["Fare"].transform("median"))

In [None]:
# Remove the columns that are no longer needed from the training set.
unused_columns = ['Name', 'Cabin', 'Ticket']
train.drop(columns=unused_columns, inplace=True)

In [None]:
# Remove the columns that are no longer needed from the test set.
unused_columns = ['Name', 'Cabin', 'Ticket']
test.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the training set after feature engineering.
train.head()

In [None]:
# Preview the test set after feature engineering.
test.head()

# Model and Prediction

In [None]:
# Separate the features from the target.
# errors="ignore" keeps this cell re-runnable: once PassengerId has been
# dropped, a second run would otherwise raise KeyError.
train = train.drop(columns=["PassengerId"], errors="ignore")
x = train.drop(columns=["Survived"])   # feature matrix
y = train["Survived"]                  # target labels

In [None]:
# Shapes of the feature matrix and the target vector.
print(x.shape,y.shape)

In [None]:
# Hold out 20% of the labelled rows; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Shuffled, seeded 10-fold splitter shared by the cross-validation cells below.
k_fold = KFold(n_splits=10, shuffle=True, random_state=1)

## DecisionTree

In [None]:
# Decision tree scored by cross-validated accuracy on the training split.
# random_state is fixed so the reported score is the same on every run;
# scikit-learn's tree fitting is otherwise not deterministic.
clf = DecisionTreeClassifier(random_state=1)
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
dt = round(np.mean(score)*100, 2)   # mean fold accuracy as a percentage
print(dt)

## KNN

In [None]:
# k-nearest-neighbours classifier (k = 10), scored by accuracy on each fold of `k_fold`.
knn_model = KNeighborsClassifier(n_neighbors=10)
fold_accuracy = cross_val_score(
    knn_model, X_train, y_train, cv=k_fold, n_jobs=1, scoring='accuracy'
)
print(fold_accuracy)
# Mean fold accuracy as a percentage, rounded to two decimals.
kn = round(fold_accuracy.mean() * 100, 2)
print(kn)

## RandomForest

In [None]:
# Random forest with 13 trees, scored by cross-validated accuracy on the training split.
# random_state is fixed so the reported score is the same on every run;
# an unseeded forest gives a different score each time the cell is executed.
clf = RandomForestClassifier(n_estimators=13, random_state=1)
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
rf = round(np.mean(score)*100, 2)   # mean fold accuracy as a percentage
print(rf)


##  SVM

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, scored by accuracy on each fold of `k_fold`.
svc_model = SVC()
fold_accuracy = cross_val_score(
    svc_model, X_train, y_train, cv=k_fold, n_jobs=1, scoring='accuracy'
)
print(fold_accuracy)
# Mean fold accuracy as a percentage, rounded to two decimals.
svm = round(fold_accuracy.mean() * 100, 2)
print(svm)


## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier, scored by accuracy on each fold of `k_fold`.
nb_model = GaussianNB()
fold_accuracy = cross_val_score(
    nb_model, X_train, y_train, cv=k_fold, n_jobs=1, scoring='accuracy'
)
print(fold_accuracy)
# Mean fold accuracy as a percentage, rounded to two decimals.
nb = round(fold_accuracy.mean() * 100, 2)
print(nb)


In [None]:
# Collect the cross-validation scores of all models and rank them, best first.
cv_scores = {
    'Decision Tree': dt,
    'KNN': kn,
    'Random Forest': rf,
    'Support Vector Machines': svm,
    'Naive Bayes': nb,
}
model = pd.DataFrame({
    'Model': list(cv_scores.keys()),
    'Score': list(cv_scores.values()),
})
model.sort_values(by='Score', ascending=False)

In [None]:
# Final model: a seeded random forest with depth capped at 10, fitted on the training split.
ran = RandomForestClassifier(max_depth=10, random_state=1)
ran.fit(X_train, y_train)

# Evaluate the fitted model on the hold-out split, which no earlier cell uses,
# so there is an accuracy figure for the exact model that gets submitted.
holdout_accuracy = round(accuracy_score(y_test, ran.predict(X_test)) * 100, 2)
print("hold-out accuracy:", holdout_accuracy)

# Drop the identifier and put the test columns in the same order as the
# training features, so a column mismatch fails here rather than in predict().
test_data = test.drop(columns=["PassengerId"])[X_train.columns]
prediction = ran.predict(test_data)

# One row per test passenger: the original id plus the predicted label.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": prediction
    })

submission.to_csv('submission.csv', index=False)

In [None]:
# Read the saved submission file back in and preview its first rows.
submission = pd.read_csv('submission.csv')
submission.head()


**Your comments and feedback are most welcome.**