# Setting the environment

In [175]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # plotting (FacetGrid, heatmap and barplot are used further down)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in ../input (raises if the directory does not exist).
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Loading Data ...

In [176]:
# Read the training CSV into a DataFrame and preview its first 10 rows.
train = pd.read_csv('../input/train.csv')
train.head(10)

In [177]:
# Read the test CSV into a DataFrame and preview its first 10 rows.
test = pd.read_csv('../input/test.csv')
test.head(10)

# Data Manipulation and Analysis

**Label Encoding of datatypes from objects to categories and then further into numeric datatypes**

In [178]:
# Replace the 'Sex' column of each frame with integer category codes.
# Each frame is encoded on its own; cat.codes assigns -1 to missing values.
for frame in (train, test):
    frame['Sex'] = frame['Sex'].astype('category').cat.codes

In [179]:
# Number of missing values per column of the training frame.
train.isna().sum()

In [180]:
# Summary statistics of the training frame's columns.
train.describe()

**One hot encoding for categorical variables**

In [181]:
# Columns expanded into indicator (dummy) columns; drop_first=True omits the
# first level of each so the remaining indicators are not redundant.
encoded_columns = ['Embarked', 'Pclass']

# Work on copies so the raw `train` / `test` frames stay untouched.
train_df = pd.get_dummies(train.copy(), columns=encoded_columns, drop_first=True)
test_df = pd.get_dummies(test.copy(), columns=encoded_columns, drop_first=True)

In [182]:
# Number of missing values per column of the test frame.
test.isna().sum()

In [183]:
# Column dtypes and non-null counts of the encoded training frame.
train_df.info()
# Preview the encoded test frame. Leaving it as the cell's last expression lets
# the notebook render it as a table instead of the plain text print() produces.
test_df.head(5)

**Inspecting the mean age of the train and test datasets, then imputing missing (NaN) ages with the median age**

In [184]:
# Mean of the non-missing ages in the training frame.
# NOTE(review): no visible cell uses this value; the imputation below fills with the median.
age_mean = train_df.Age.mean()

In [185]:
# Fill missing ages with the median age of the training frame.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the `train_df['Age']` selection is a chained assignment, which pandas 2.x
# warns about and which leaves the frame unchanged under Copy-on-Write.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

Taking a peek at the dataframe, its dimensions and datatypes

In [186]:
# First 10 rows of the encoded training frame.
train_df.head(10)

In [187]:
# Column dtypes and non-null counts of the encoded training frame.
train_df.info()

In [188]:
# Mean of the non-missing ages in the test frame, printed for reference only.
age_mean_test = test_df['Age'].mean()
print(age_mean_test)
# Fill missing ages with the test frame's own median age.
# Assigned back to the column instead of fillna(..., inplace=True) on the
# selection: that chained form warns in pandas 2.x and is a no-op under
# Copy-on-Write, which would leave the NaNs in place.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

In [189]:
# Fill missing fares in the test frame with its mean fare.
# Assigned back to the column instead of fillna(..., inplace=True) on the
# selection: that chained form warns in pandas 2.x and is a no-op under
# Copy-on-Write, which would leave the NaNs in place.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [190]:
# Distinct values found in the test frame's 'Cabin' column.
test_df['Cabin'].unique()

In [191]:
# Remove the columns that are not used as model features.
# Re-binding the result (rather than inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError for columns that are already gone. The redundant axis
# argument is dropped because `columns=` already selects the axis.
test_df = test_df.drop(columns = ['Cabin','Name','Ticket'], errors = 'ignore')

In [192]:
# Remove the columns that are not used as model features.
# Re-binding the result (rather than inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError for columns that are already gone. The redundant axis
# argument is dropped because `columns=` already selects the axis.
train_df = train_df.drop(columns = ['Cabin','Name','Ticket'], errors = 'ignore')

In [193]:
# Summary statistics of the cleaned training frame. The call parentheses are
# required: a bare `train_df.describe` only displays the bound method.
train_df.describe()

# Data Visualization using matplotlib and seaborn

Pearson's correlation coefficient between all columns in the dataframe

In [194]:
# Pairwise Pearson correlation between the columns of the cleaned training frame;
# the resulting matrix is displayed and reused by the heatmap and bar plot cells.
corr = train_df.corr(method = 'pearson')
corr

In [195]:
import matplotlib.pyplot as plt

# One bar chart per categorical feature, side by side.
cat_columns = ['Sex','Pclass','Embarked']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15,5))

for axis, feature in zip(axes, cat_columns):
    # Passenger counts per (feature value, Survived) pair, with the Survived
    # levels spread into columns so each becomes its own bar series.
    counts = train.groupby([feature, 'Survived'])['PassengerId'].count().unstack()
    counts.plot(kind = 'bar', stacked = False, ax = axis)
    axis.legend(['Dead','Survived'])

In [196]:
# Grid of age histograms: one column per 'Survived' value and one row per
# 'Pclass_3' indicator value.
grid = sns.FacetGrid(train_df, col = 'Survived', row = 'Pclass_3', height = 2.2, aspect = 1.6)
# Draw a 20-bin, half-transparent histogram of 'Age' in every facet.
grid.map(plt.hist, 'Age', alpha = 0.5, bins = 20)
# Trailing semicolon suppresses the returned object's repr in the cell output.
grid.add_legend();

In [197]:
# Configure the seaborn theme used by this and later plots.
sns.set(context = 'paper', style = 'whitegrid', palette = 'muted', font = 'sans-serif', font_scale = 1, color_codes = True, rc = None)
# Heatmap of the correlation matrix computed earlier, with thin lines between cells.
sns.heatmap(corr,linewidths = 0.5)

In [198]:
# Horizontal bars: each column's correlation with 'Survived'.
sns.barplot(x = corr['Survived'], y = corr.columns)

# Importing scikit learn library and regression, classification bundles 

In [199]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier

# Seed for the estimators that draw random numbers while fitting. Without it
# their scores change from one run of the notebook to the next. The value
# matches the random_state passed to train_test_split further down.
RANDOM_STATE = 42

# One instance of every candidate classifier; each is fitted in its own cell below.
logReg=LogisticRegression(solver='liblinear')
sgdcls=SGDClassifier(random_state=RANDOM_STATE)
nbcls=GaussianNB()
knn=KNeighborsClassifier()
desclr=DecisionTreeClassifier(random_state=RANDOM_STATE)
svc=SVC()
ada=AdaBoostClassifier(random_state=RANDOM_STATE)

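Although the data already comes as separate train and test files, a hold-out portion of the training data is split off below (X_df_test, y_df_test) so that the models can be checked on rows they were not fitted on; X_train and y_train are what the models are fitted on.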

# Preparing the data to train ML models

In [200]:
from sklearn.model_selection import train_test_split

# Target: the 'Survived' column. Features: every other column of train_df.
y = train_df['Survived']
X = train_df.drop(columns='Survived')

# Hold out 20% of the labelled rows; the fixed seed makes the split repeatable.
X_train, X_df_test, y_train, y_df_test = train_test_split(X, y, test_size=0.20, random_state=42)

Fitting a logistic regression model on X_train and y_train and using it to make predictions

# Selecting, training and validating multiple ML models

In [201]:
# Fit on the training part, then predict the hold-out part (fit returns the estimator).
prediction = logReg.fit(X_train, y_train).predict(X_df_test)
# 5-fold cross-validation scores computed on the training part.
score = cross_val_score(logReg, X_train, y_train, cv=5)
print("Score:",score)
# Accuracy on the training rows themselves, as a percentage rounded to 2 decimals.
acc_log = round(100 * logReg.score(X_train, y_train), 2)
acc_log

Using a Stochastic gradient descent classifier to fit, train and predict using a gradient descent optimization method

In [202]:
#SGDClassifier
sgdcls.fit(X_train,y_train)
prediction_sgdcls=sgdcls.predict(X_df_test)
score=cross_val_score(sgdcls,X_train,y_train,cv=5)
print("Score:",score)
acc_log_sgdcls = round(sgdcls.score(X_train,y_train) * 100, 2)
acc_log_sgdcls

Naive Bayes classifier to train the model and make predictions

In [203]:
# Fit on the training part, then predict the hold-out part (fit returns the estimator).
prediction_nbcls = nbcls.fit(X_train, y_train).predict(X_df_test)
# 5-fold cross-validation scores computed on the training part.
score = cross_val_score(nbcls, X_train, y_train, cv=5)
print("Score:", score)
# Accuracy on the training rows themselves, as a percentage rounded to 2 decimals.
acc_log_nbcls = round(100 * nbcls.score(X_train, y_train), 2)
acc_log_nbcls

K-Nearest Neighbors method to make predictions

In [204]:
# Fit on the training part, then predict the hold-out part (fit returns the estimator).
prediction_knn = knn.fit(X_train, y_train).predict(X_df_test)
# 5-fold cross-validation scores computed on the training part.
score = cross_val_score(knn, X_train, y_train, cv=5)
print("Score:", score)
# Accuracy on the training rows themselves, as a percentage rounded to 2 decimals.
acc_log_knn = round(100 * knn.score(X_train, y_train), 2)
acc_log_knn

Using a Decision Tree classifier to make predictions

In [205]:
#DecisionTreeClassifier
desclr.fit(X_train, y_train)
prediction_desclr = desclr.predict(X_df_test)
score = cross_val_score(desclr, X_train, y_train, cv = 5)
print("Score:", score)
acc_log_desclr = round(desclr.score(X_train, y_train) * 100, 2)
acc_log_desclr

Using a Support Vector Classifier to build a model and make predictions

In [206]:
# Fit on the training part, then predict the hold-out part (fit returns the estimator).
prediction_svc = svc.fit(X_train, y_train).predict(X_df_test)
# 5-fold cross-validation scores computed on the training part.
score = cross_val_score(svc, X_train, y_train, cv=5)
print("Score:", score)
# Accuracy on the training rows themselves, as a percentage rounded to 2 decimals.
acc_log_svc = round(100 * svc.score(X_train, y_train), 2)
acc_log_svc

AdaBoost classifier for making predictions using train and test data

In [207]:
# Fit AdaBoost on the training part.
ada.fit(X_train, y_train)
# Hold-out predictions get their own name: storing them in `prediction_svc`
# (as this cell used to) overwrote the SVC predictions from the previous cell.
prediction_ada = ada.predict(X_df_test)
# 5-fold cross-validation scores computed on the training part.
score = cross_val_score(ada, X_train, y_train, cv = 5)
print("Score:", score)
# Accuracy on the training rows themselves, as a percentage rounded to 2 decimals.
acc_log_ada = round(ada.score(X_train, y_train) * 100, 2)
acc_log_ada

Logistic Regression for prediction

In [208]:
# Predict labels for the cleaned test frame with the logistic regression model
# fitted earlier; test_df must expose the same feature columns the model was fitted on.
prediction_test = logReg.predict(test_df)

# Fine-tuning the model: trying to reduce overfitting (if there is any) by reducing the number of features used for training

In [209]:
y_new = train_df['Survived']
X_new = train_df.drop('Survived', axis = 1)
X_new = train_df.drop('SibSp', axis = 1)

In [210]:
ada.fit(X_new, y_new)
prediction_svc = ada.predict(X_df_test)
score = cross_val_score(ada, X_new, y_new, cv = 5)
print("Score:", score)
acc_log_ada = round(ada.score(X_new, y_new) * 100, 2)
acc_log_ada