In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



Let's get started with Titanic Problem

# IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# IMPORTING THE 'TRAIN' AND 'TEST' DATASETS

In [None]:
# Read the Titanic training and test splits from the Kaggle input directory.
train_csv = '../input/titanic/train.csv'
test_csv = '../input/titanic/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Target vector for the models: the 'Survived' column as a NumPy array.
# Selected by name rather than by position (previously iloc[:, 1]) so the
# target stays correct even if the column order of the CSV changes.
y_train = train['Survived'].values

In [None]:
# Preview the first rows of the test data.
test.head()

# Exploratory Data Analysis 

We will explore the features in the datasets before jumping into modelling the data. Here, our main objective is to learn as much as we can about the data. We will use the Seaborn library to get most of these insights.

Analysing data with graphs.

In [None]:
# Count of passengers per sex, split by survival outcome.
sns.catplot(
    data=train,
    kind="count",
    x="Sex",
    hue="Survived",
)

From the graph above, we can say that women were more likely to survive than men, as their survival rate is higher. Hence, gender (male or female) plays an important role in determining whether a passenger will survive.

In [None]:
# Passenger counts for every (Pclass, Survived) pair, reshaped so that rows
# are classes and columns are survival outcomes, then drawn as an annotated heatmap.
pclass_survived = train.groupby(['Pclass', 'Survived']).size().unstack()

sns.heatmap(pclass_survived, fmt="d", annot=True)

This heatmap helps determine whether higher-class passengers had a higher survival rate than lower-class ones, or vice versa. Class 1 passengers have a higher survival chance compared to classes 2 and 3. This implies that Pclass contributes a lot to a passenger’s survival rate.

In [None]:
# Point plots of survival rate against Family_Size and against Alone.

# Family_Size is the sum of the Parch and SibSp columns.
train['Family_Size'] = train['Parch'] + train['SibSp']

# Alone is 1 for passengers whose Family_Size is 0, and 0 otherwise.
train['Alone'] = 0
train.loc[train.Family_Size == 0, 'Alone'] = 1

# sns.factorplot was renamed to sns.catplot and is no longer available in
# current seaborn releases; kind='point' draws the point plot that
# factorplot produced by default.
sns.catplot(x='Family_Size', y='Survived', kind='point', data=train)

sns.catplot(x='Alone', y='Survived', kind='point', data=train)

Family_Size denotes the number of people in a passenger’s family. It is calculated by summing the SibSp and Parch columns of a respective passenger. Also, another column Alone is added to check the chances of survival of alone passenger against the one with a family.

Important observations –

If a passenger is alone, the survival rate is lower.
If the family size is greater than 5, the chances of survival decrease considerably.

In [None]:
# Bar plot of survival against fare bracket.

# Cut Fare into four quantile-based bins and store the bin of each passenger.
train['Fare_Range'] = pd.qcut(train['Fare'], q=4)

# One bar per fare bin; the bar height summarises Survived within that bin.
sns.barplot(data=train, x='Fare_Range', y='Survived')

Fare denotes the fare paid by a passenger. As the values in this column are continuous, they need to be put in separate bins(as done for Age feature) to get a clear idea. It can be concluded that if a passenger paid a higher fare, the survival rate is more.

In [None]:
# Histogram of passenger ages (missing ages excluded), 15 bins.
# sns.distplot is deprecated; histplot draws the same count histogram and
# does not add a KDE curve unless asked to, matching the old kde=False.
sns.histplot(train['Age'].dropna(), bins=15)

Many passengers are between 15 and 40 years old.

In [None]:
# Count plots for the Embarked feature: one panel per passenger class,
# bars per port of embarkation, coloured by survival outcome.
sns.catplot(
    data=train,
    kind='count',
    x='Embarked',
    hue='Survived',
    col='Pclass',
)

**Some notable observations are:**

* Majority of the passengers boarded from S. 

* Majority of class 3 passengers boarded from Q.

# Overall Conclusions from EDA:


1. Women survived more than men.
2. Class 1 passengers were more lucky than Class 2 and 3.
3. Unfortunately, Class 3 was most affected.
4. Alone passengers had less survival rate.
5. Survival rate is more for passengers who paid higher fare.
6. Most of the passengers were between 20 and 40 years old.
7. The majority of the passengers boarded from 'S'.

We are done with EDA. Now, we will perform Data Preprocessing on both train and test dataset followed by Feature Scaling and then finally we will train our datasets on various models.

**Excited......????**
      
  So let's get one step closer to solve this problem...

In [None]:
# Preview the training data again; it now also carries the columns added during EDA.
train.head()

First we will drop unnecessary columns because they do not contribute to final output.

In [None]:
# Remove the family-count columns together with the helper columns that were
# added to the training frame during EDA.
extra_eda_cols = ['SibSp', 'Parch', 'Family_Size', 'Fare_Range', 'Alone']
train = train.drop(columns=extra_eda_cols)
train.head()

In [None]:
# Drop the remaining columns that are not used as model inputs.
extra_cols = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
train = train.drop(columns=extra_cols)
train.head()

In [None]:
# Feature matrix: every remaining training column except the target.
x_train = train.drop(columns='Survived')
print(x_train)

## Checking the missing values

In [None]:
# Heatmap of the boolean null mask of x_train: shows where values are missing.
sns.heatmap(x_train.isnull())

In [None]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

Checking how many missing values are there.

'Age' has 177 and 'Embarked' has 2 missing values.

### Filling the missing values column by column using scikit-learn.

In [None]:
from sklearn.impute import SimpleImputer

# 'Age': replace missing entries with the mean of the column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train[['Age']] = imputer.fit_transform(x_train[['Age']])

# 'Embarked': replace missing entries with the most frequent value of the column.
imputers = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_train[['Embarked']] = imputers.fit_transform(x_train[['Embarked']])

In [None]:
# True if any column of x_train still contains at least one missing value.
x_train.isnull().sum().any()

As we can see, there are no longer any missing values in any column.

In [None]:
# Preview the training features after imputation.
x_train.head()

# Encoding Categorical Data 

In [None]:
from sklearn import preprocessing

# Replace the text categories in 'Sex' and 'Embarked' with integer codes.
# The same encoder object is re-fitted for each column in turn.
label_encoder = preprocessing.LabelEncoder()
for column in ('Sex', 'Embarked'):
    x_train[column] = label_encoder.fit_transform(x_train[column])

In [None]:
# Preview the training features after label encoding.
x_train.head()

# Applying Feature Scaling on training data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features, then apply them.
# After this cell x_train is the array returned by the scaler, not a DataFrame.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)

# Preprocessing on TEST Dataset

In [None]:
# Preview the first rows of the raw test data before preprocessing.
test.head()

## Checking for missing values

In [None]:
# True if any column of the test set contains at least one missing value.
test.isnull().sum().any()

In [None]:
# Heatmap of the boolean null mask of the test set: shows where values are missing.
sns.heatmap(test.isnull())

'Cabin' has the largest number of missing values. The 'Age' column also has many NaN values, and the 'Fare' column has 1 missing value.

## Filling missing values column by column

In [None]:
# Fill the gaps in the test set with the statistics learned from the training
# data. `imputer` (mean of 'Age') and `imputers` (most frequent 'Embarked')
# were fitted on x_train earlier in the notebook. Re-fitting fresh imputers on
# the test set, as this cell used to do, would fill training and test rows
# with different values and preprocess the two sets inconsistently.

#For 'Age' column
test[['Age']] = imputer.transform(test[['Age']])

#For 'Embarked' column
test[['Embarked']] = imputers.transform(test[['Embarked']])

# Dropping unnecessary columns

In [None]:
# Keep only the columns that the training features also contain.
extra_cols_test = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin']
test = test.drop(columns=extra_cols_test)
test.head()

# Encoding Categorical Data

In [None]:
from sklearn import preprocessing

# Encode the test categories with the mapping defined by the TRAINING data.
# Fitting a new encoder on the test set (as this cell used to do) assigns
# codes from whatever categories happen to be present in the test set, so the
# integers could silently mean something different from the training codes.
# `train` still holds the original text columns; missing 'Embarked' entries
# are dropped before fitting so that only real categories receive a code.
sex_encoder = preprocessing.LabelEncoder().fit(train['Sex'])
embarked_encoder = preprocessing.LabelEncoder().fit(train['Embarked'].dropna())

#Sex Column
test['Sex'] = sex_encoder.transform(test['Sex'])

#Embarked Column
# transform() raises ValueError if the test set has a category unseen in training.
test['Embarked'] = embarked_encoder.transform(test['Embarked'])

In [None]:
# Preview the test features after imputation, column removal and encoding.
test.head()

# Applying Feature Scaling on Test Set

In [None]:
# Scale the test features with `sc`, the scaler that was fitted on the
# training features. Fitting a second StandardScaler on the test set (as this
# cell used to do) standardises it with its own mean and variance, so the
# models would see test inputs on a different scale from the one they were
# trained on. Only transform() is applied here; nothing is learned from test.
test = sc.transform(test)

# Building Various Classification Models

We will build a number of Classification models and at the end we will take the model having highest accuracy.
So let's get started........

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the scaled training features.
classifier = LogisticRegression(random_state=0).fit(x_train, y_train)
classifier

In [None]:
from sklearn.model_selection import cross_val_score

# Predict the test set with the logistic-regression classifier fitted above,
# then report its mean accuracy over 10 cross-validation folds.
y_pred = classifier.predict(test)
fold_scores = cross_val_score(classifier, x_train, y_train, cv=10, scoring='accuracy')
acc_Tree = fold_scores.mean()
acc_Tree

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 5-nearest-neighbours classifier and predict the test set with it.
classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred = classifier.predict(test)

In [None]:
from sklearn.model_selection import cross_val_score

# Predict the test set with the KNN classifier fitted above, then report its
# mean accuracy over 10 cross-validation folds.
y_pred = classifier.predict(test)
fold_scores = cross_val_score(classifier, x_train, y_train, cv=10, scoring='accuracy')
acc_Tree = fold_scores.mean()
acc_Tree

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree and predict the test set with it.
# random_state is fixed, as in the logistic-regression and random-forest
# cells, so the fitted tree, its predictions and its cross-validation score
# are the same on every run of the notebook.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(test)

In [None]:
from sklearn.model_selection import cross_val_score

# Predict the test set with the decision tree fitted above, then report its
# mean accuracy over 10 cross-validation folds.
y_pred = classifier.predict(test)
fold_scores = cross_val_score(classifier, x_train, y_train, cv=10, scoring='accuracy')
acc_Tree = fold_scores.mean()
acc_Tree

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest of 10 entropy-criterion trees with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(x_train, y_train)
classifier

In [None]:
from sklearn.model_selection import cross_val_score

# Predict the test set with the random forest fitted above, then report its
# mean accuracy over 10 cross-validation folds.
y_pred = classifier.predict(test)
fold_scores = cross_val_score(classifier, x_train, y_train, cv=10, scoring='accuracy')
acc_Tree = fold_scores.mean()
acc_Tree

## SVC

In [None]:
from sklearn.svm import SVC

# Train a support-vector classifier with default settings and predict the test set.
classifier = SVC().fit(x_train, y_train)
y_pred = classifier.predict(test)

In [None]:
from sklearn.model_selection import cross_val_score

# Predict the test set with the SVC fitted above, then report its mean
# accuracy over 10 cross-validation folds. y_pred from this cell is what the
# submission cell further down writes out.
y_pred = classifier.predict(test)
fold_scores = cross_val_score(classifier, x_train, y_train, cv=10, scoring='accuracy')
acc_Tree = fold_scores.mean()
acc_Tree

Now we are done with all the models. Now let's make a dataframe showing models with their cross_val_score for visualizing in a good way.

In [None]:
# Imports are repeated locally so that this cell does not depend on which of
# the model cells above have been executed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Compute the comparison table instead of hand-copying numbers from earlier
# outputs: hard-coded scores go stale as soon as the data or preprocessing
# changes. Each model uses the same settings as its own section above; the
# decision tree gets a fixed seed so that the table is reproducible.
candidate_models = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'K- Nearest Neighbor': KNeighborsClassifier(n_neighbors=5),
    'SVC': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Random Forest': RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
}

# Mean accuracy over 10 cross-validation folds for every candidate model.
accuracy = {
    'Model': list(candidate_models),
    'Accuracy': [
        cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy').mean()
        for model in candidate_models.values()
    ],
}
all_cross_val_scores = pd.DataFrame(accuracy, columns = ['Model', 'Accuracy'])
all_cross_val_scores.head()

Alright as we can see, '**SVC**' has highest score. So, here we have best model.

In [None]:
# `test` has been overwritten by the preprocessed feature array, so the
# passenger ids are read again from the original test file.
test_df = pd.read_csv('../input/titanic/test.csv')

# Two-column submission frame: the passenger id and the prediction currently
# held in y_pred (produced by the last model cell that was run).
submission = test_df[['PassengerId']].assign(Survived=y_pred)

submission.to_csv('titanic_prediction.csv', index=False)
print('File Saved')

In [None]:
# Display the submission frame that was written to titanic_prediction.csv.
submission