All the code is taken from https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy

In [None]:
from sklearn import tree, linear_model, gaussian_process, ensemble, naive_bayes, neighbors, svm, discriminant_analysis
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection 
from sklearn import model_selection
from sklearn import metrics

import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib as mpl

import seaborn as sns
from pandas.tools.plotting import scatter_matrix

%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [None]:
import sys
import pandas as pd 
import numpy as np
import scipy as sp
import sklearn 
import matplotlib
import IPython

import time 
import random

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings, which makes silent data-cleaning problems harder to spot.
warnings.filterwarnings('ignore')

# Report the interpreter and library versions this notebook is running on.
print('sys version: {}'.format(sys.version))
print('pandas version: {}'.format(pd.__version__))
print('numpy version: {}'.format(np.__version__))
print('scipy version: {}'.format(sp.__version__))
print('sklearn version: {}'.format(sklearn.__version__))
print('matplotlib version: {}'.format(matplotlib.__version__))
print('IPython version: {}'.format(IPython.__version__))

print('-'*25)

# List the files present in the dataset directory (shells out to `ls`).
from subprocess import check_output
print(check_output(['ls', '/home/ginkobab/Documents/ML/Kaggle/titanic']).decode('utf-8'))

In [None]:
# Load the raw frame and the validation frame from disk.
# NOTE(review): data_val is read from train.csv, the same file as data_raw,
# although other cells print it under the heading "Test data". Confirm
# whether test.csv was intended here.
data_raw = pd.read_csv('/home/ginkobab/Documents/ML/Kaggle/titanic/train.csv')
data_val = pd.read_csv('/home/ginkobab/Documents/ML/Kaggle/titanic/train.csv')

# Work on an independent (deep) copy so data_raw stays untouched.
data1 = data_raw.copy()

# Both frames in one list so cleaning steps can loop over them.
data_cleaner = [data1, data_val]

# Column dtypes / non-null counts, then a random 10-row preview.
data_raw.info()
data_raw.sample(10)

In [None]:
# Print the per-column count of missing values for each frame, with a dashed
# rule after each report.
for caption, frame in (('Train data null values:', data1),
                       ('Test data null values:', data_val)):
    print(caption, frame.isnull().sum())
    print(10*'-')

# Summary statistics over every column, numeric or not.
data_raw.describe(include='all')

In [None]:
# Impute missing values in both frames.
#
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on dataset[col]: that form mutates an intermediate
# Series, pandas warns about it, and with Copy-on-Write enabled the frame
# itself is left unchanged.
for dataset in data_cleaner:
    # Numeric column: fill with the median of that frame.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Categorical column: fill with the most frequent value of that frame.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # Fare is binned and label-encoded in later cells, so fill it as well.
    # This is a no-op when the frame has no missing Fare values.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Columns removed from the training frame only; data_val keeps them.
drop_column = ['PassengerId', 'Ticket', 'Cabin']
data1.drop(drop_column, axis=1, inplace=True)

# Re-check the missing-value counts after cleaning.
print('Train data null values:', data1.isnull().sum())
print(10*'-')
print('Test data null values:', data_val.isnull().sum())

In [None]:
# Derive new features on both frames.
for dataset in data_cleaner:
    # Family_Size = SibSp + Parch + 1 (the +1 presumably counts the passenger).
    dataset['Family_Size'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Title: the text between ', ' and the first '.' in Name.
    dataset['Title'] = dataset['Name'].str.split(', ', expand = True)[1].str.split('.', expand = True)[0]

    # IsAlone defaults to 1 and is cleared where Family_Size exceeds 1.
    # A single .loc[rows, column] call writes into the frame itself; the
    # chained form dataset['IsAlone'].loc[...] = 0 assigns into an
    # intermediate Series and is not guaranteed to reach the frame.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['Family_Size'] > 1, 'IsAlone'] = 0

    # Fare in 4 quantile-based bins; integer-truncated Age in 5 equal-width bins.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)


# Collapse titles that occur fewer than stats_min times into 'Misc'.
# title_names is a boolean Series indexed by title: True means "rare".
# NOTE(review): only data1 is collapsed; data_val keeps its raw titles.
stats_min = 10
title_names = (data1['Title'].value_counts() < stats_min)
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] else x)

print(data1['Title'].value_counts())
print('-'*10)

# Inspect the resulting frames.
data1.info()
print('-'*10)
data_val.info()
print('-'*10)
data1.sample(10)


In [None]:
# Integer-encode the categorical columns of both frames.
# NOTE(review): fit_transform refits the encoder for every column of every
# frame, so the integer codes are not guaranteed to agree between data1 and
# data_val when their category sets differ.
label = LabelEncoder()
encoded_columns = (
    ('Sex', 'Sex_Code'),
    ('Embarked', 'Embarked_Code'),
    ('Title', 'Title_Code'),
    ('FareBin', 'FareBin_Code'),
    ('AgeBin', 'AgeBin_Code'),
)
for dataset in data_cleaner:
    for source_column, code_column in encoded_columns:
        dataset[code_column] = label.fit_transform(dataset[source_column])

# Name of the target column, kept as a one-element list.
Target = ['Survived']

# Feature sets: original columns, and a numeric/encoded variant.
data1_x = ['Pclass', 'Sex', 'Embarked', 'Title', 'SibSp', 'Parch', 'Fare', 'Family_Size', 'IsAlone',]
data1_x_calc = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'SibSp', 'Parch', 'Age', 'Fare']
data1_xy = Target + data1_x
print('Original X and y: ', data1_xy, '\n')

# Feature set built from the encoded and binned columns.
data1_x_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FareBin_Code','AgeBin_Code']
data1_xy_bin = Target + data1_x_bin
print('Bin X and y: ', data1_xy_bin, '\n')

# Dummy-encoded version of the original feature set.
original_features = data1[data1_x]
data1_dummy = pd.get_dummies(original_features)
data1_x_dummy = data1_dummy.columns.tolist()
data1_xy_dummy = Target + data1_x_dummy
print('Dummy X and y', data1_xy_dummy, '\n')

data1_dummy.head(5)

In [None]:
# Sanity check after cleaning: missing-value counts and frame summaries.
rule = '-'*25

print('Null val in train set')
print(data1.isnull().sum())
print(rule)
data1.info()
print(rule, '\n')

print('Null val in test set')
print(data_val.isnull().sum())
print(rule)
data_val.info()

# Numeric summary of the untouched raw frame.
data_raw.describe()

In [None]:
# Split each feature representation (original, binned, dummy) into train and
# test parts. All three calls use random_state=0 and leave the split size at
# the library default.
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    data1[data1_x], data1[Target], random_state=0)
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(
    data1[data1_x_bin], data1[Target], random_state=0)
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = model_selection.train_test_split(
    data1_dummy[data1_x_dummy], data1[Target], random_state=0)

# Report the shape of the full frame and of the first split.
for template, frame in (("Data1 Shape: {}", data1),
                        ("Train1 Shape: {}", train1_x),
                        ("Test1 Shape: {}", test1_x)):
    print(template.format(frame.shape))

train1_x_bin.head()

In [None]:
# For every non-float feature, print the mean of the target per feature value.
for col in data1_x:
    if data1[col].dtype == 'float64':
        # Float-valued columns are skipped.
        continue
    print('Survival Correlation by:', col)
    mean_by_value = data1[[col, Target[0]]].groupby(col, as_index=False).mean()
    print(mean_by_value)
    print('-'*10, '\n')


# Contingency table of Title against the target.
title_vs_target = pd.crosstab(data1['Title'],data1[Target[0]])
print(title_vs_target)

In [None]:
# 2 x 3 grid: the top row holds one boxplot per numeric feature, the bottom
# row the matching histogram stacked by survival outcome.
plt.figure(figsize=[16,12])

# (column in data1, title prefix, axis label) for each grid column.
# The family-size feature is stored as 'Family_Size'; data1 has no
# 'FamilySize' column, so that spelling raises KeyError.
panels = [
    ('Fare', 'Fare', 'Fare ($)'),
    ('Age', 'Age', 'Age (Years)'),
    ('Family_Size', 'Family Size', 'Family Size (#)'),
]

# Row subsets reused by every histogram.
survivors = data1[data1['Survived'] == 1]
casualties = data1[data1['Survived'] == 0]

for position, (column, heading, axis_label) in enumerate(panels, start=1):
    # Top row (subplots 1-3): distribution of the feature with its mean line.
    plt.subplot(2, 3, position)
    plt.boxplot(x=data1[column], showmeans=True, meanline=True)
    plt.title(heading + ' Boxplot')
    plt.ylabel(axis_label)

    # Bottom row (subplots 4-6): same feature, stacked by outcome.
    plt.subplot(2, 3, position + 3)
    plt.hist(x=[survivors[column], casualties[column]],
             stacked=True, color=['g', 'r'], label=['Survived', 'Dead'])
    plt.title(heading + ' Histogram by Survival')
    plt.xlabel(axis_label)
    plt.ylabel('# of Passengers')
    plt.legend()

In [None]:
# 2 x 3 grid of mean-survival plots, one per categorical or binned feature.
fig, saxis = plt.subplots(2, 3,figsize=(16,12))

# Top row: bar plots with an explicit category order where one is given.
sns.barplot(x = 'Embarked', y = 'Survived', data=data1, ax = saxis[0,0])
sns.barplot(x = 'Pclass', y = 'Survived', order=[1,2,3], data=data1, ax = saxis[0,1])
sns.barplot(x = 'IsAlone', y = 'Survived', order=[1,0], data=data1, ax = saxis[0,2])

# Bottom row: point plots over the binned features and family size.
# The family-size feature is stored as 'Family_Size'; data1 has no
# 'FamilySize' column, so that spelling raises an error.
sns.pointplot(x = 'FareBin', y = 'Survived',  data=data1, ax = saxis[1,0])
sns.pointplot(x = 'AgeBin', y = 'Survived',  data=data1, ax = saxis[1,1])
sns.pointplot(x = 'Family_Size', y = 'Survived', data=data1, ax = saxis[1,2])