In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Purpose:
## 1_overview on data;
## 2_a little bit of feature engineering;
## 3_use of simple classifiers: decision tree, random forest.

## Load datasets.
#### Load train and test files, apply the same steps to each dataset

In [None]:
# df holds the training data, dt holds the test data.
df = pd.read_csv('/kaggle/input/titanic/train.csv')
dt = pd.read_csv('/kaggle/input/titanic/test.csv')

# Print the column / dtype / non-null summary of each frame (train first, then test).
for frame in (df, dt):
    frame.info()

#### Cabin feature has too few values, Ticket feature provides no useful information.
#### Ticket and Cabin will be dropped out.
#### Convert Sex feature to numeric in order to visualize its correlation with Survived.

In [None]:
# Working copies of train (df1) and test (dt1) without the Ticket and Cabin columns.
df1 = df.drop(columns=['Ticket', 'Cabin'])
dt1 = dt.drop(columns=['Ticket', 'Cabin'])

# Encode Sex as an integer: male -> 0, female -> 1.
# The column is addressed by name and mapped in one vectorised step, so the result does
# not depend on the column's position in the frame (which differs between train and test).
# Any value other than 'male'/'female' becomes NaN and makes astype(int) raise.
SEX_CODES = {'male': 0, 'female': 1}
df1['Sex'] = df1['Sex'].map(SEX_CODES).astype(int)
dt1['Sex'] = dt1['Sex'].map(SEX_CODES).astype(int)

#### Plot heatmap graph for correlations of features vs. Survived.

In [None]:
#dfx = df.drop('PassengerId', axis=1)
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute pairwise correlations of the numeric columns only. Name and Embarked are still
# text at this point; recent pandas raises on them instead of silently skipping them.
corr = df1.drop('PassengerId', axis=1).select_dtypes(include='number').corr().abs()

# Mask the lower triangle and the diagonal so each pair is annotated once.
mask = np.tril(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(6,6))
# Draw on the axes created above rather than relying on the implicit "current axes".
ax = sns.heatmap(corr, mask=mask, annot=True, cmap='YlGnBu', ax=ax)

#### Bar graph with ordered values of correlations.

In [None]:
# Absolute correlation of every numeric column with Survived, strongest first.
# Restricting to numeric columns keeps this working on pandas versions that refuse
# to correlate text columns.
l = df1.select_dtypes(include='number').corr()['Survived'].abs().sort_values(ascending=False)

# Leave out the target itself and the PassengerId row by label, instead of trusting
# that they are the first and last entries after sorting.
l.drop(['Survived', 'PassengerId']).plot(kind='bar', color='r');

#### Sex, Pclass and Fare show best correlation with Survived.
#### Initially, I thought Age would be a good predictor, but its correlation is almost the same as that of Parch and SibSp, and all three are under 0.1.
#### So, keep Sex, Pclass and Fare and try to manipulate Age, Parch and SibSp in order to create better predictors.

# Fill missing values.
#### First of all, fill missing values in train and test datasets.
#### In Age feature replace missing data with substituted values.
#### An initial approach might be to impute the mean age in every empty slot; I prefer to infer the value from the title contained in the Name feature: as a rule, a “Ms” should be younger than a “Mrs”; I hope this keeps a better distribution in Age.
#### Divide Name column into three columns: name, surname and title; name feature is useless, surname feature may be used to analyze family groups, but with regards to age, what matters is the title, so I'll use this feature and discard the other two.


In [None]:
def _extract_title(names):
    """Return the text between the first comma and the following period of each name.

    The value is taken from the second comma-separated piece of the name, cut at its
    first '.', and stripped of surrounding whitespace.
    Assumes names follow the "Surname, Title. Given names" layout - TODO confirm for
    any data other than the Kaggle Titanic files.
    """
    after_comma = names.str.split(',', expand=True)[1]
    return after_comma.str.split('.', expand=True)[0].str.strip()


# Only the title is kept. Name is dropped, and the new 'title' column ends up as the
# last column of each frame.
df1 = df1.assign(title=_extract_title(df1.Name)).drop(columns=['Name'])
dt1 = dt1.assign(title=_extract_title(dt1.Name)).drop(columns=['Name'])

In [None]:
# Preview the first three rows of the training frame.
df1.head(3)

In [None]:
# Preview the first three rows of the test frame.
dt1.head(3)

In [None]:
# Distinct titles found in each dataset, with their count.
train_titles = df1.title.unique()
test_titles = dt1.title.unique()

print('Number of different titles in train dataset:', len(train_titles))
print(train_titles)
print('****************')
print('Number of different titles in test dataset:', len(test_titles))
print(test_titles)

In [None]:
# Number of non-null Age values per title, for train and test.
df1.groupby('title')['Age'].count(), dt1.groupby('title')['Age'].count()

#### I'll keep only the most frequent titles. The remaining titles and related data will be allocated according to the pattern as detailed below:

##### Capt, Col, Don, Major, Rev, Sir ---> Mr
##### Dr: 1 is female 5 are male.
##### ---------------------------Dr male ---> Mr
##### ---------------------------Dr female ---> Mrs
##### Jonkheer ---> Mr
##### Lady, the Countess, Dona ---> Mrs
##### Mlle, Mme, Ms ---> Miss
##### Master is used for children (male) under 13


In [None]:
# Rare titles folded into the four titles that are kept (Mr, Mrs, Miss, Master).
TITLE_GROUPS = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Lady': 'Mrs', 'the Countess': 'Mrs', 'Dona': 'Mrs',
    'Dr': 'Mr', 'Jonkheer': 'Mr', 'Sir': 'Mr', 'Rev': 'Mr',
    'Major': 'Mr', 'Don': 'Mr', 'Col': 'Mr', 'Capt': 'Mr',
}


def _group_titles(frame):
    """Return frame's 'title' column with rare titles replaced per TITLE_GROUPS.

    A 'Dr' whose Sex is 1 (female) becomes 'Mrs'; this is applied before the general
    mapping, which sends every remaining 'Dr' to 'Mr'. Titles that are not keys of
    TITLE_GROUPS are returned unchanged.
    """
    female_doctor = (frame['title'] == 'Dr') & (frame['Sex'] == 1)
    return frame['title'].mask(female_doctor, 'Mrs').replace(TITLE_GROUPS)


# The same rules are applied to train and test. Each frame is filtered with its own
# conditions, and the replacement is vectorised instead of using the scalar-only
# `.at` accessor with a boolean mask.
df1['title'] = _group_titles(df1)
dt1['title'] = _group_titles(dt1)

In [None]:
# Mean age per title in the training data, rounded to whole years. 'Age' is selected
# before aggregating so the text columns never reach mean().
mean_age_by_title = df1.groupby('title')['Age'].mean().round(0)

# Fill every missing Age with the mean age of the passenger's title.
df1['Age'] = df1['Age'].fillna(df1['title'].map(mean_age_by_title))

In [None]:
# Mean age per title computed on the test data itself, rounded to whole years.
# 'Age' is selected before aggregating so the text columns never reach mean().
mean_age_by_title = dt1.groupby('title')['Age'].mean().round(0)

# Fill every missing Age with the mean age of the passenger's title.
dt1['Age'] = dt1['Age'].fillna(dt1['title'].map(mean_age_by_title))

In [None]:
# Print the non-null summary of both frames to see which columns still have gaps.
df1.info(), dt1.info()

#### At this point, we still have missing values in Embarked in train dataset and in Fare in test dataset. In this case, I’ll replace the empty values with the most common value in Embarked and with the mean value in Fare.

In [None]:
# Number of training passengers per Embarked value.
df1.groupby('Embarked')['PassengerId'].count()

In [None]:
# Replace the missing Embarked entries with 'S', then re-check the null counts.
df1['Embarked'] = df1['Embarked'].fillna('S')
df1.info()

In [None]:
#dt1.Fare.mean()

In [None]:
# Replace the missing Fare entries in the test frame with that frame's mean fare.
mean_fare = dt1.Fare.mean()
dt1['Fare'] = dt1['Fare'].fillna(mean_fare)
dt1.info()

#### Now, the two datasets show no missing values and we can start to manipulate them.
#### Pclass and Sex, no change is required: they're already numeric features with acceptable correlation with Survived.
#### Embarked is a categorical feature that can be easily converted into a numeric.


In [None]:
# Encode Embarked as an integer: Q -> 0, C -> 1, S -> 2.
# The column is mapped by name in one vectorised step, so the result does not depend
# on where Embarked sits in the frame. Any other value becomes NaN and makes the
# integer cast raise.
df1['Embarked'] = df1['Embarked'].map({'Q': 0, 'C': 1, 'S': 2}).astype('int64')
#df1.info()

In [None]:
# Encode Embarked as an integer: Q -> 0, C -> 1, S -> 2.
# The column is mapped by name in one vectorised step, so the result does not depend
# on where Embarked sits in the frame. Any other value becomes NaN and makes the
# integer cast raise.
dt1['Embarked'] = dt1['Embarked'].map({'Q': 0, 'C': 1, 'S': 2}).astype('int64')
#dt1.info()

#### SibSp and Parch have low correlations with Survived; basically, they tell us if a passenger is travelling alone or not. I summarized this information in a single and more basic feature, 'solo', in order to see if this new feature has a stronger correlation with Survived.

In [None]:
# solo = 1 when the passenger has neither siblings/spouse nor parents/children aboard,
# 0 otherwise. Built directly from the boolean condition; `.at` only accepts a single
# row label and cannot take a mask.
df1['solo'] = ((df1.SibSp == 0) & (df1.Parch == 0)).astype(int)
#df1.head(6)

In [None]:
# solo = 1 when the passenger has neither siblings/spouse nor parents/children aboard,
# 0 otherwise. Built directly from the boolean condition; `.at` only accepts a single
# row label and cannot take a mask.
dt1['solo'] = ((dt1.SibSp == 0) & (dt1.Parch == 0)).astype(int)
#dt1.head(6)

In [None]:
# Absolute pairwise correlations of the numeric columns only. 'title' is still text
# here; recent pandas raises on it instead of silently skipping it.
corr = df1.drop('PassengerId', axis=1).select_dtypes(include='number').corr().abs()

# Mask the lower triangle and the diagonal so each pair is annotated once.
mask = np.tril(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(6,6))
# Draw on the axes created above rather than relying on the implicit "current axes".
ax = sns.heatmap(corr, mask=mask, annot=True, cmap='YlGnBu', ax=ax)

# Signed correlations of the two original family features and of the derived 'solo' flag.
print(' Correlation Survived/SibSp: ', df1.Survived.corr(df1.SibSp), '\n',
      'Correlation Survived/Parch: ', df1.Survived.corr(df1.Parch), '\n',
      'Correlation Survived/alone: ', df1.Survived.corr(df1.solo), '\n'
     )

In [None]:
# Absolute correlation of every numeric column with Survived, strongest first.
# Restricting to numeric columns keeps this working while 'title' is still text.
l = df1.select_dtypes(include='number').corr()['Survived'].abs().sort_values(ascending=False)

# Compare the derived 'solo' flag with the two features it summarises.
l[['solo', 'Parch', 'SibSp']].plot(kind='bar', color='r');

#### It seems that “solo” has a better correlation with Survived, so I’ll keep it and discard the two original features (SibSp and Parch). 

In [None]:
# SibSp and Parch are replaced by 'solo'; remove them from the training frame.
df1 = df1.drop(columns=['SibSp', 'Parch'])
df1.head(1)

In [None]:
# SibSp and Parch are replaced by 'solo'; remove them from the test frame.
dt1 = dt1.drop(columns=['SibSp', 'Parch'])
dt1.head(1)

#### Convert title into numerical feature.

In [None]:
# Encode title as an integer: Mr -> 0, Master -> 1, Miss -> 2, Mrs -> 3.
# The column is mapped by name in one vectorised step, so the result does not depend
# on where 'title' sits in the frame. A title outside these four becomes NaN and
# makes the integer cast raise.
df1['title'] = df1['title'].map({'Mr': 0, 'Master': 1, 'Miss': 2, 'Mrs': 3}).astype('int64')
df1.head(3)

In [None]:
# Encode title as an integer: Mr -> 0, Master -> 1, Miss -> 2, Mrs -> 3.
# The column is mapped by name in one vectorised step, so the result does not depend
# on where 'title' sits in the frame. A title outside these four becomes NaN and
# makes the integer cast raise.
dt1['title'] = dt1['title'].map({'Mr': 0, 'Master': 1, 'Miss': 2, 'Mrs': 3}).astype('int64')
dt1.head(3)

#### Now look at Age: it has a low correlation with Survived. Convert its values to log scale and see whether that improves the correlation.

#### This way, I can drop out Age and keep agelog instead. 


In [None]:
# Natural logarithm of Age, stored as a new 'agelog' column.
df1['agelog'] = np.log(df1.Age)
df1.head(3)

In [None]:
# Correlation of Survived with the raw age and with its logarithm.
corr_age = df1.Survived.corr(df1.Age)
corr_agelog = df1.Survived.corr(df1.agelog)

print(' Correlation Survived/Age: ', corr_age, '\n',
      'Correlation Survived/agelog: ', corr_agelog, '\n'
     )

In [None]:
# Natural logarithm of Age, stored as a new 'agelog' column.
dt1['agelog'] = np.log(dt1.Age)
#dt1.head(6)

#### Not a big improvement, but I can drop out Age and keep agelog.

In [None]:
# agelog replaces Age in the training frame.
df1 = df1.drop(columns=['Age'])
df1.head(1)

In [None]:
# agelog replaces Age in the test frame.
dt1 = dt1.drop(columns=['Age'])
dt1.head(1)

#### Set proper index.

In [None]:
# Use PassengerId as the row index of the training frame.
df1 = df1.set_index('PassengerId')
df1.head(3)

In [None]:
# Use PassengerId as the row index of the test frame.
dt1 = dt1.set_index('PassengerId')
dt1.head(3)

## Scale values.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column of the training frame; df2 keeps df1's labels.
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(df1)

df2 = pd.DataFrame(scaled_train, index=df1.index, columns=df1.columns)

# print('_______________________')
# print('Original data:')
# print('Standard Deviation:')
# print(np.std(df1, axis = 0))
# print('_______________________')
# print('Scaled data:')
# print('Standard Deviation:')
# print(np.std(df2, axis = 0))

In [None]:
# Scale the test features with the min/max learned from the TRAINING features, so the
# same raw value maps to the same scaled value in both sets. Re-fitting on the test
# set would put the two frames on different scales. A dedicated scaler is fitted on
# the training columns that also exist in the test frame (df1 additionally holds
# 'Survived', which dt1 lacks). Test values outside the training range fall outside
# [0, 1].
test_scaler = MinMaxScaler().fit(df1[dt1.columns])

dt2 = pd.DataFrame(test_scaler.transform(dt1),
                   columns = dt1.columns,
                   index = dt1.index)

# print('_______________________')
# print('Original data:')
# print('Standard Deviation:')
# print(np.std(dt1, axis = 0))
# print('_______________________')
# print('Scaled data:')
# print('Standard Deviation:')
# print(np.std(dt2, axis = 0))

### Plot graphs of new correlations.

In [None]:
# Absolute pairwise correlations of the scaled training frame.
corr = df2.corr().abs()

# 1 marks the cells to hide: the lower triangle including the diagonal.
mask = np.tril(np.ones(corr.shape))

fig, ax = plt.subplots(figsize=(6,6))
ax = sns.heatmap(corr, annot=True, cmap='YlGnBu', mask=mask, ax=ax)

In [None]:
# Absolute correlation of each scaled column with Survived, strongest first.
survived_corr = df2.corr()['Survived'].abs()
l = survived_corr.sort_values(ascending=False)

# Skip the first entry of the sorted series before plotting.
l.iloc[1:].plot(kind='bar', color='r');

# Setup models for machine learning.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def dtree(x_train, y_train):
    """Fit a decision tree classifier (max depth 9) and return it.

    Args:
        x_train: training predictors, passed to ``fit``.
        y_train: training labels, passed to ``fit``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # random_state is fixed, as in rfc below, so repeated runs build the same tree.
    clf = DecisionTreeClassifier(max_depth=9, random_state=1) #some tuning is possible here
    clf.fit(x_train, y_train)
    return clf

def rfc(x_train, y_train):
    """Fit a random forest (200 trees, max depth 9, random_state 1) and return it.

    Args:
        x_train: training predictors, passed to ``fit``.
        y_train: training labels, passed to ``fit``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(random_state=1, max_depth=9, n_estimators=200) #some tuning is possible here
    return forest.fit(x_train, y_train)


def setup_model(x_train, y_train, cl_type, x_eval=None, y_eval=None):
    """Train a classifier and print its accuracy on the training and hold-out data.

    Args:
        x_train: training predictors.
        y_train: training labels.
        cl_type: callable taking ``(x_train, y_train)`` and returning a fitted model
            that provides ``predict`` and ``score`` (e.g. ``dtree`` or ``rfc``).
        x_eval: hold-out predictors. When omitted, the notebook-level ``x_test``
            is used, which keeps existing three-argument calls working.
        y_eval: hold-out labels. When omitted, the notebook-level ``y_test`` is used.

    Returns:
        None. The two scores are printed.
    """
    # Fall back to the split created elsewhere in the notebook only when the caller
    # did not pass the hold-out data explicitly.
    if x_eval is None:
        x_eval = x_test
    if y_eval is None:
        y_eval = y_test

    model = cl_type(x_train, y_train)
    y_pred = model.predict(x_eval)

    train_score = model.score(x_train, y_train)
    test_score = accuracy_score(y_eval, y_pred)
    print('Score on train data: ', train_score)
    print('Score on test data: ', test_score)

## Prepare training data.

In [None]:
# Columns selected from the scaled frames: the target 'Survived' first, then the predictors.
col = ['Survived', 'title', 'Sex', 'Pclass', 'Fare', 'solo', 'agelog', 'Embarked']
# modify this list for changing predictors in model

In [None]:
# Keep only the selected columns of the scaled training frame.
data = df2[col]

# Predictors and target.
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore the printed scores, are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Choose model, train it, evaluate it.

In [None]:
# Train a random forest on the training split and print its train / hold-out accuracy.
setup_model(x_train, y_train, rfc)

## Set up test data and make prediction.

In [None]:
# Select the predictor columns from the scaled test frame. 'Survived' is removed by
# name, so the selection stays correct if the entries of `col` are reordered.
test = dt2[[c for c in col if c != 'Survived']]
test.head(3)

In [None]:
# Fit the random forest on the training split and predict the test set.
model = rfc(x_train, y_train)
# Cast the predicted labels to integers for the submission file.
prediction = model.predict(test).astype(int)
#prediction

## Create file for submission.

In [None]:
#next lines from my first tutorial, thanks to Alexis Cook!
# One row per test passenger: the id taken from the index plus the predicted label.
submission = {'PassengerId': test.index, 'Survived': prediction}
output = pd.DataFrame(submission)
output.to_csv('submission_00.csv', index=False)
print("Your submission was successfully saved!")


# Your most recent submission
### Name: submission_00.csv
### Submitted: a few seconds ago (07/04/2020 21:51)
### Wait time: 1 seconds
### Execution time: 1 seconds
### Score: 0.78468
### Complete

### Used: rfc RandomForestClassifier(n_estimators=200, max_depth=9, random_state=1)
### Predictors: 'title', 'Pclass', 'Fare', 'solo', 'agelog', 'Embarked'