In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [198]:
# Read the train and test CSV files into raw frames and preview the first training rows.
train_csv_path = 'data/train.csv'
test_csv_path = 'data/test.csv'

raw_data = pd.read_csv(train_csv_path)
raw_test = pd.read_csv(test_csv_path)

raw_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


###  Column Definitions copied from Kaggle


| Variable | Definition | Key |
| :- | -: | :-: |
|survival|Survival|0 = No, 1 = Yes|
|pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd|
|sex|Sex|  |	
|Age|Age in years	||
|sibsp|# of siblings / spouses aboard the Titanic	||
|parch|# of parents / children aboard the Titanic	||
|ticket|Ticket number	||
|fare|Passenger fare	||
|cabin|	Cabin number	||
|embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampton|

In [199]:
# Build the working training frame from raw_data without modifying raw_data:
# set_index (without inplace) returns a new frame.
# PassengerId becomes the row index; no other column is removed here, so the
# nominal columns (Name, Ticket) are still present after this cell.

train_data = raw_data.set_index('PassengerId')
print('Shape : ', train_data.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
train_data.info()

Shape :  (891, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None


In [200]:
# Build the working test frame from raw_test without modifying raw_test:
# set_index (without inplace) returns a new frame.
# PassengerId becomes the row index; no other column is removed here, so the
# nominal columns (Name, Ticket) are still present after this cell.

test_data = raw_test.set_index('PassengerId')
print('Shape : ', test_data.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
test_data.info()

Shape :  (418, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB
None


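- ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare'] => these columns have data for every record in the training set (the test set is missing one Fare value)
- Embarked is missing 2 values in the training set
- Cabin is populated for only about 23% of the training records (204 of 891). Cabin may correlate with survival, but with this much missing data it is of little use here
- About 20% of the Age values are missing (177 of 891 in train, 86 of 418 in test) -> we need to fill in these values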


#### Age Adjustments 

reference : https://www.kaggle.com/allohvk/titanic-missing-age-imputation-tutorial-advanced

In [209]:
# Separate working copies for the age-imputation steps, so that changes made
# there do not alter train_data / test_data.
age_train_data = train_data.copy(deep=True)
age_test_data = test_data.copy(deep=True)

In [210]:
def Create_salutation(df):
    """Add a ``salutation`` column holding the title parsed from ``df['Name']``.

    The title is the text between the first comma and the next period of the
    name (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``), with every space
    removed. A name without a comma yields a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Name`` column. It is modified in place.

    Returns
    -------
    None
    """
    # .str[i] (instead of expand=True followed by column lookup) keeps this
    # working when the frame is empty or no name contains a comma.
    after_surname = df['Name'].str.split(',').str[1]
    title = after_surname.str.split('.').str[0]
    # regex=False: the space is a literal character, not a pattern.
    df['salutation'] = title.str.replace(' ', '', regex=False)
    
# Add the salutation column to both working copies (each frame is modified in place).
Create_salutation(age_test_data)
Create_salutation(age_train_data)

In [211]:
# Salutations that occur among the rows with a missing Age, for each split.
train_missing_age = age_train_data['Age'].isnull()
test_missing_age = age_test_data['Age'].isnull()
print('Train Data : ', age_train_data.loc[train_missing_age, 'salutation'].unique())
print('Data : ', age_test_data.loc[test_missing_age, 'salutation'].unique())

Train Data :  ['Mr' 'Mrs' 'Miss' 'Master' 'Dr']
Data :  ['Mr' 'Mrs' 'Miss' 'Ms' 'Master']


In [217]:
# Mean age and number of non-null ages per (salutation, Pclass), restricted to
# the salutations found among training rows with a missing Age.
train_target_salutations = ['Mr', 'Mrs', 'Miss', 'Master', 'Dr']
train_has_target_salutation = age_train_data['salutation'].isin(train_target_salutations)
train_sample = age_train_data[train_has_target_salutation]
train_age_summary = (
    train_sample
    .groupby(['salutation', 'Pclass'])['Age']
    .agg(['mean', 'count'])
)
train_age_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
salutation,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Dr,1,43.75,5
Dr,2,38.5,2
Master,1,5.306667,3
Master,2,2.258889,9
Master,3,5.350833,28
Miss,1,30.0,46
Miss,2,22.390625,34
Miss,3,16.123188,102
Mr,1,41.58046,107
Mr,2,32.768293,91


In [213]:
# Group means from train_age_summary as a plain list, in the summary's row order.
list(train_age_summary['mean'])

[43.75,
 38.5,
 5.306666666666667,
 2.2588888888888885,
 5.350833333333333,
 30.0,
 22.390625,
 16.1231884057971,
 41.58045977011494,
 32.76829268292683,
 28.724890829694324,
 40.88235294117647,
 33.68292682926829,
 33.515151515151516]

In [214]:
# Non-null counts per column of age_train_data (Age is the column to watch here).
age_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Name        891 non-null    object 
 3   Sex         891 non-null    object 
 4   Age         714 non-null    float64
 5   SibSp       891 non-null    int64  
 6   Parch       891 non-null    int64  
 7   Ticket      891 non-null    object 
 8   Fare        891 non-null    float64
 9   Cabin       204 non-null    object 
 10  Embarked    889 non-null    object 
 11  salutation  891 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 90.5+ KB


In [215]:
# Impute missing ages in age_train_data with the mean known age of passengers
# that share the same salutation and ticket class.
#
# The means are computed from the frame itself rather than pasted in from an
# earlier cell's output, so they cannot drift from the data, and a
# (salutation, Pclass) combination with no known ages stays missing instead of
# being filled with a placeholder age of 0.
salutations = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs']
pclasses = [1, 2, 3]

in_scope = (
    age_train_data['salutation'].isin(salutations)
    & age_train_data['Pclass'].isin(pclasses)
)
# transform('mean') ignores NaN ages and broadcasts each group's mean back onto
# the rows of that group, so the result is indexed like the in-scope rows.
train_group_mean_age = (
    age_train_data[in_scope]
    .groupby(['salutation', 'Pclass'])['Age']
    .transform('mean')
)
# fillna aligns on the index: only missing ages of in-scope rows are replaced,
# known ages and out-of-scope rows are left as they are.
# Assumes the PassengerId index is unique — TODO confirm if the input changes.
age_train_data['Age'] = age_train_data['Age'].fillna(train_group_mean_age)

Salutation :  Dr , Pclass :  1 , Age :  43.75
Salutation :  Dr , Pclass :  2 , Age :  38.5
Salutation :  Dr , Pclass :  3 , Age :  0
Salutation :  Master , Pclass :  1 , Age :  5.306666666666667
Salutation :  Master , Pclass :  2 , Age :  2.2588888888888885
Salutation :  Master , Pclass :  3 , Age :  5.350833333333333
Salutation :  Miss , Pclass :  1 , Age :  30.0
Salutation :  Miss , Pclass :  2 , Age :  22.390625
Salutation :  Miss , Pclass :  3 , Age :  16.1231884057971
Salutation :  Mr , Pclass :  1 , Age :  41.58045977011494
Salutation :  Mr , Pclass :  2 , Age :  32.76829268292683
Salutation :  Mr , Pclass :  3 , Age :  28.724890829694324
Salutation :  Mrs , Pclass :  1 , Age :  40.88235294117647
Salutation :  Mrs , Pclass :  2 , Age :  33.68292682926829
Salutation :  Mrs , Pclass :  3 , Age :  33.515151515151516


In [216]:
# Re-inspect the non-null counts of age_train_data after the imputation cell above.
age_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Name        891 non-null    object 
 3   Sex         891 non-null    object 
 4   Age         891 non-null    float64
 5   SibSp       891 non-null    int64  
 6   Parch       891 non-null    int64  
 7   Ticket      891 non-null    object 
 8   Fare        891 non-null    float64
 9   Cabin       204 non-null    object 
 10  Embarked    889 non-null    object 
 11  salutation  891 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 90.5+ KB


In [227]:
# Salutations that occur among test rows with a missing Age.
age_test_data.loc[age_test_data['Age'].isnull(), 'salutation'].unique()

array(['Mr', 'Mrs', 'Miss', 'Ms', 'Master'], dtype=object)

In [228]:
# Mean age and number of non-null ages per (salutation, Pclass), restricted to
# the salutations found among test rows with a missing Age.
test_target_salutations = ['Mr', 'Mrs', 'Miss', 'Ms', 'Master']
test_has_target_salutation = age_test_data['salutation'].isin(test_target_salutations)
test_sample = age_test_data[test_has_target_salutation]
test_age_summary = (
    test_sample
    .groupby(['salutation', 'Pclass'])['Age']
    .agg(['mean', 'count'])
)
test_age_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
salutation,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Master,1,9.5,2
Master,2,5.0,2
Master,3,7.454615,13
Miss,1,31.428571,14
Miss,2,17.37,16
Miss,3,19.872647,34
Mr,1,41.2,45
Mr,2,31.718182,55
Mr,3,27.198795,83
Mrs,1,45.606061,33


In [229]:
# Group means from test_age_summary as a plain list, in the summary's row order.
list(test_age_summary['mean'])

[9.5,
 5.0,
 7.454615384615384,
 31.428571428571427,
 17.37,
 19.872647058823528,
 41.2,
 31.71818181818182,
 27.198795180722893,
 45.60606060606061,
 33.0,
 29.875,
 nan]

In [None]:
# Hard-coded ages for the test set, three values per row below.
# The first twelve values repeat the listing of test_age_summary['mean'] shown
# above; that listing's trailing nan is replaced by 29.875 and the list is
# padded to 15 entries.
# NOTE(review): the salutation labels on each row are inferred from the
# (salutation, Pclass) order of that listing — confirm before relying on them.
ages_test = (
    [9.5, 5.0, 7.454615384615384]                          # presumably Master, Pclass 1-3
    + [31.428571428571427, 17.37, 19.872647058823528]      # presumably Miss, Pclass 1-3
    + [41.2, 31.71818181818182, 27.198795180722893]        # presumably Mr, Pclass 1-3
    + [45.60606060606061, 33.0, 29.875]                    # presumably Mrs, Pclass 1-3
    + [29.875, 29.875, 29.875]                             # padding where the listing had nan
)

In [156]:
# Rows of train_data whose Age is missing.
train_data[train_data['Age'].isnull()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,salutation
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,Mr
20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,Mrs
27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,Mr
29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,Miss
...,...,...,...,...,...,...,...,...,...,...,...,...
860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,Mr
864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,Miss
869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,Mr
879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,Mr


In [None]:
# Impute missing ages in `data` with the mean known age of passengers sharing
# the same salutation and ticket class.
#
# Computing the means from the frame replaces a block of hand-typed constants
# that contained a mistyped value (15.12 for Miss / class 3, where the training
# summary earlier in this notebook shows 16.12), truncated the other means to
# two decimals and skipped several (salutation, Pclass) combinations.
# NOTE(review): `data` is not defined in any visible cell — confirm which frame
# this refers to before running.
data_group_mean_age = data.groupby(['salutation', 'Pclass'])['Age'].transform('mean')
# fillna aligns on the index and only touches rows whose Age is missing; groups
# without any known age keep their missing values.
data['Age'] = data['Age'].fillna(data_group_mean_age)

In [41]:
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Name        891 non-null    object 
 3   Sex         891 non-null    object 
 4   Age         891 non-null    float64
 5   SibSp       891 non-null    int64  
 6   Parch       891 non-null    int64  
 7   Ticket      891 non-null    object 
 8   Fare        891 non-null    float64
 9   Cabin       204 non-null    object 
 10  Embarked    891 non-null    object 
 11  salutation  891 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 122.8+ KB
None


In [21]:
# Rows of data whose Embarked value is missing.
data[data['Embarked'].isnull()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,salutation
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss
830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,Mrs


In [39]:
# Importing the submodule explicitly binds `scipy` and guarantees that
# `scipy.stats` is loaded; a bare `import scipy` does not promise that on every
# SciPy version.
import scipy.stats


def fare_mode(fares):
    """Return the most frequent fare of a group as a plain scalar."""
    # scipy.stats.mode(...)[0] is a length-1 array in older SciPy releases and a
    # scalar in newer ones; np.ravel turns either into a 1-D array.
    return np.ravel(scipy.stats.mode(fares)[0])[0]


# Fare profile per (port of embarkation, ticket class). Passing a named function
# labels the first column "fare_mode" instead of "<lambda_0>".
data.groupby(['Embarked', 'Pclass'])['Fare'].agg([fare_mode, 'min', 'max', 'mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,<lambda_0>,min,max,mean,count
Embarked,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C,1,27.7208,26.55,512.3292,104.718529,85
C,2,41.5792,12.0,41.5792,25.358335,17
C,3,7.2292,4.0125,22.3583,11.214083,66
Q,1,90.0,90.0,90.0,90.0,2
Q,2,12.35,12.35,12.35,12.35,3
Q,3,7.75,6.75,29.125,11.183393,72
S,1,26.55,0.0,263.0,70.364862,127
S,2,13.0,0.0,73.5,20.327439,164
S,3,8.05,0.0,69.55,14.644083,353


In [40]:
# Fill the missing ports of embarkation with 'S' (Southampton in the column key).
data['Embarked'] = data['Embarked'].fillna('S')

In [None]:
# data['Sex'] = data['Sex'].map({'male':0,'female':1})


In [None]:
### Primary Observations

- Except 

In [None]:
# Descriptive statistics for `data`.
data.describe()