In [None]:
import numpy as np
import pandas as pd

import os

# Print the full path of every file found beneath the input directory.
for root, _subdirs, names in os.walk('../input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

plt.rcParams['figure.figsize'] = 10, 8
sns.set(style='whitegrid', palette='muted',
       rc={'figure.figsize': (15, 10)})

In [None]:
# Scikit-Learn dependencies for Data Science
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the train and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/titanic-machine-learning-from-disaster/train.csv",
        "../input/titanic-machine-learning-from-disaster/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Print dtype / non-null summaries for both frames, with a rule between them.
for position, frame in enumerate((train_df, test_df)):
    if position:
        print('-'*20)
    frame.info()

Drop Unnecessary Columns to Clean Data

In [None]:
# Drop PassengerId, Name and Ticket from train; the test frame keeps PassengerId.
train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket'])
test_df = test_df.drop(columns=['Name', 'Ticket'])

## Examine Data

In [None]:
# Pie chart of how many training rows fall into each Survived value.
survived_counts = train_df['Survived'].value_counts()
survived_counts.plot.pie(shadow=True,
                         colors=('darkgreen', 'orange'),
                         autopct='%.2f%%', figsize=(8,6))
plt.title("Survived")
plt.show()

* 62% of passengers died 
* 38% survived

## Check For Missing Values

In [None]:
# Per-column count of missing values: train first, then test.
print(train_df.isnull().sum(),
      '-'*20,
      test_df.isnull().sum(),
      sep='\n')

In [None]:
# Fraction (0-1) of missing Age and Cabin values in the training frame.
train_age_missing = train_df['Age'].isnull().mean()
train_cabin_missing = train_df['Cabin'].isnull().mean()
print(train_age_missing, '-'*20, train_cabin_missing, sep='\n')

In [None]:
# Fraction (0-1) of missing Age and Cabin values in the test frame.
test_age_missing = test_df['Age'].isnull().mean()
test_cabin_missing = test_df['Cabin'].isnull().mean()
print(test_age_missing, '-'*20, test_cabin_missing, sep='\n')

In [None]:
# Show the training rows whose Embarked value is missing.
train_df.loc[train_df['Embarked'].isna()]

## Feature Engineering

### Engineer Age Features

Fill null values with the median

In [None]:
# Add an Age_median column to each frame: Age with gaps filled by that
# frame's own median age.
for frame in (train_df, test_df):
    frame['Age_median'] = frame['Age'].fillna(frame['Age'].median())

Check Standard Deviation Difference

In [None]:
# Standard deviation of Age before and after the median fill.
std_before = train_df['Age'].std()
std_after = train_df['Age_median'].std()
print(std_before, '-'*20, std_after, sep='\n')

Compare Engineered Features of Ages

In [None]:
# Overlay the density of the raw Age column and the median-filled column.
fig, ax = plt.subplots()
for column, line_kwargs in (('Age', {}), ('Age_median', {'color': 'red'})):
    train_df[column].plot(kind='kde', ax=ax, **line_kwargs)
handles, names = ax.get_legend_handles_labels()
ax.legend(handles, names, loc='best')

In [None]:
# Remove the temporary comparison column from both frames.
train_df.drop(['Age_median'], axis=1, inplace=True)
test_df.drop(['Age_median'], axis=1, inplace=True)

# Fill missing ages with each frame's own median. The result is assigned back
# to the column instead of calling fillna(inplace=True) on a column selection:
# that chained in-place form raises a FutureWarning on pandas 2.x and does not
# update the parent frame when Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

train_df['Age'].std()

### Engineer Test Fare 

In [None]:
# Fill missing test fares with 0. Assign the result back to the column: the
# chained fillna(inplace=True) on a column selection is deprecated and does
# not update the frame under pandas Copy-on-Write.
# NOTE(review): 0 is the fill value used here; confirm it is intended rather
# than e.g. a median fare.
test_df['Fare'] = test_df['Fare'].fillna(0)

### Drop Cabin
* Not necessary for Model Training

In [None]:
# Remove the Cabin column from both frames in place.
for frame in (train_df, test_df):
    frame.drop(columns=['Cabin'], inplace=True)

### Engineer Embarked Features

In [None]:
# Frequency of each Embarked value in the training frame.
train_df.Embarked.value_counts()

In [None]:
# Frequency of each Embarked value in the test frame.
test_df.Embarked.value_counts()

* Mode for both train and test is 'S' embarked

In [None]:
# Fill missing Embarked values with 'S' in both frames. The result is assigned
# back to the column: fillna(inplace=True) on a column selection is a chained
# in-place call that warns on pandas 2.x and does not update the frame under
# Copy-on-Write.
train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Embarked'] = test_df['Embarked'].fillna('S')

## Visualize Data for Examination

In [None]:
from itertools import product
import plotly.express as px

### Sex

In [None]:
# Two panels on the top row of a 2x2 grid. Axes get their own names instead
# of rebinding `fig`, and the commented-out draft code that used to trail
# this cell has been removed (the plotly histogram runs live in the next cell).
fig = plt.figure(figsize=(16,16))

# Left: share of each sex among all training passengers.
ax_share = fig.add_subplot(2, 2, 1)
train_df['Sex'].value_counts().plot(kind='pie', ax=ax_share,
                                    colors=('lightgreen','lightblue'),
                                    autopct='%.2f%%')
ax_share.legend()
ax_share.set_xlabel('Sex')

# Right: passenger counts per sex, split by the Survived flag.
ax_counts = fig.add_subplot(2, 2, 2)
sns.countplot(x='Sex', data=train_df, hue='Survived', ax=ax_counts)
plt.show()

* Most of the passengers are males (65%)
* Most that died are Males
* There are more females that survived than died

In [None]:
# Interactive grouped histogram of Sex, coloured by Survived.
fig = px.histogram(data_frame=train_df, x='Sex',
                   color='Survived', barmode='group')
fig.show()

### Age

In [None]:
fig = plt.figure(figsize=(16,16))
outcomes = ((0, 'Not'), (1, 'Survived'))

# Top panel: Age density for each Survived value.
plt.subplot(2, 1, 1)
for flag, outcome in outcomes:
    sns.kdeplot(train_df.loc[train_df['Survived'] == flag, 'Age'],
                shade=True, label=outcome)
plt.legend()
plt.xlabel('Age')

# NOTE(review): this overwrites train_df['Fare'] with log1p(Fare) inside a
# plotting cell, so re-running the cell applies the transform again, and no
# matching transform of test_df['Fare'] is visible here - confirm intent.
train_df['Fare']= np.log1p(train_df['Fare'])

# Bottom panel: (log-transformed) Fare density for each Survived value.
plt.subplot(2, 1, 2)
for flag, outcome in outcomes:
    sns.kdeplot(train_df.loc[train_df['Survived'] == flag, 'Fare'],
                shade=True, label=outcome)
plt.legend()
plt.xlabel('Fare')

* Most passengers that died are between ages 20 and 40
* Most survivors are between ages 20 and 40
* Passengers with low to average fare (poor to average passengers) are most that died
* Most that survived do not vary much from the fare

### PClass (1,2,3) - Socio-Economic Class

In [None]:
fig = plt.figure(figsize=(16,16))

# Left: share of passengers in each Pclass.
ax_share = fig.add_subplot(2, 2, 1)
train_df['Pclass'].value_counts().plot.pie(ax=ax_share, autopct='%.2f%%')
ax_share.legend()
ax_share.set_xlabel("Pclass")

# Right: passenger counts per Pclass, split by the Survived flag.
ax_counts = fig.add_subplot(2, 2, 2)
sns.countplot(data=train_df, x='Pclass', hue='Survived', ax=ax_counts)
plt.show()

### Number of Siblings (SibSp) & Number of Parents/Children Onboard (Parch)

In [None]:
# Count plots of SibSp and Parch split by Survived, side by side.
# Axis labels now name both groups each column counts (SibSp = siblings and
# spouses, Parch = parents and children); the old labels mentioned only one.
fig = plt.figure(figsize=(16,16))
panels = (
    (221, 'SibSp', "Number of Siblings/Spouses"),
    (222, 'Parch', "Number of Parents/Children"),
)
for position, column, axis_label in panels:
    plt.subplot(position)
    sns.countplot(x=column, data=train_df, hue='Survived')
    plt.legend(loc='upper right')
    plt.xlabel(axis_label)
# Render the figure explicitly so the cell does not echo a Text repr.
plt.show()


* Most passengers are in Pclass 3 (the lowest socio-economic class)
* Death toll is highest in Pclass 3
* Most survivors are in Pclass 1 (the highest socio-economic class)

### Family Size and Has Family

In [None]:
combined = [train_df, test_df]

# Add two columns to each frame in place:
#   Family_Size = SibSp + Parch
#   has_Family  = 1 when Family_Size > 0, else 0 (int32)
for frame in combined:
    relatives = frame['SibSp'] + frame['Parch']
    frame['Family_Size'] = relatives
    frame['has_Family'] = relatives.gt(0).astype('int32')

# Bottom half: share of passengers with / without family.
plt.subplot(2, 1, 2)
train_df['has_Family'].value_counts().plot.pie(colors=('lightblue','lightgreen'),
                                               autopct='%.2f%%')
plt.xlabel("has_Family")
plt.legend()

# Top-left: counts per family size, split by Survived.
plt.subplot(2, 2, 1)
sns.countplot(data=train_df, x='Family_Size', hue='Survived')
plt.xlabel("Family Size")

# Top-right: counts per has_Family flag, split by Survived.
plt.subplot(2, 2, 2)
sns.countplot(data=train_df, x="has_Family", hue='Survived')
plt.xlabel("has family")

* Most of those who died travelled alone and had no siblings
* Most passengers have no family aboard
* Most of those who survived have no siblings as well
* Survivors with no families and those with families are almost proportional

### Embarked

* Embarked is the port where the traveller boarded
* Southampton (S), Cherbourg (C), Queenstown (Q) 

In [None]:
# Fill missing Embarked values with each frame's most frequent value.
# The previous data.fillna(data['Embarked'].mode(), inplace=True) passed a
# Series indexed by [0] to DataFrame.fillna, which matches fill values to
# column labels; no column is named 0, so nothing was ever filled.
for data in combined:
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().iloc[0])

# Pie chart of the share of passengers per Embarked value.
fig = plt.subplot(111)
train_df['Embarked'].value_counts().plot(kind='pie',
                                        autopct='%.2f%%')
plt.legend()
# Interactive grouped histogram of Embarked, coloured by Survived.
px.histogram(train_df, x='Embarked', color='Survived', barmode='group')

* Majority of the passengers boarded from Southampton (73%)
* Over 400 passengers from Southampton died 
* Most survivors are from Southampton as well

## Correlation Matrix

* To see the strength of correlation of each individual column with the target column (Survived)

In [None]:
# Frequency of each Pclass value in the training frame.
train_df.Pclass.value_counts()

one-hot encode values

In [None]:
# Build indicator (dummy) columns from Pclass for each frame, then print the
# test-set result.
pclass_train_dummies, pclass_test_dummies = (
    pd.get_dummies(frame['Pclass']) for frame in (train_df, test_df)
)
print(pclass_test_dummies)

In [None]:
from sklearn.preprocessing import OneHotEncoder

