<p align="center">
<img style="width:80%" src="https://c4.wallpaperflare
.com/wallpaper/378/267/803/titanic-ship-cruise-ship-drawing-night-hd-wallpaper-preview.jpg">
</p>

[Image source](https://www.wallpaperflare.com/titanic-ship-cruise-ship-drawing-night-hd-digital-artwork-wallpaper-mzpsf/)

<h1 style="text-align: center; color:#01872A; font-size: 80px;
background:#daf2e1; border-radius: 20px;
">Titanic.</h1>

## Please use nbviewer to read this notebook to use all it's features:

https://nbviewer.org/github/sersonSerson/Projects/blob/master/TimeSeries/LSTMAirlinePassengers/LstmAirlinePassengers.ipynb


In [1]:
# Imports.
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.api.types import is_numeric_dtype

from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, \
    StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [2]:
cheat = pd.read_csv('data/test_cheat.csv', delimiter=',',
                        index_col='PassengerId')
from sklearn.metrics import accuracy_score
def cheat_score(preds):
    cheat['Preds'] = preds
    males = cheat[cheat['Sex'] == 'male']
    male_accuracy = accuracy_score(males['Cheat'], males['Preds'])
    print(f'Male accuracy: {male_accuracy}')
    females = cheat[cheat['Sex'] == 'female']
    female_accuracy = accuracy_score(females['Cheat'], females['Preds'])
    print(f'Female accuracy: {female_accuracy}')
    overall_accuracy = accuracy_score(cheat['Cheat'], cheat['Preds'])
    print(f'Overall accuracy: {overall_accuracy}')
    # return accuracy


In [3]:
pd.options.display.max_columns = 80
pd.options.display.max_rows = 30
train = pd.read_csv('data/train.csv', index_col='PassengerId')
test = pd.read_csv('data/test.csv', index_col='PassengerId')
# filled_df.sort_index(inplace=True)
filled_df = pd.concat([train, test])
filled_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
manual = pd.read_csv('data/2 age.csv', index_col='PassengerId')
filled_df['ManualParents'] = manual['Parents']
filled_df['ManualChildren'] = manual['Children']


# Fare
According to:
https://autumnmccordckp.weebly.com/tickets-and-accomodations.html
Titanic pricing policy was:
1. First Class - £870 to £30.
2. Second Class- £12.
3. Third Class- £3 to £8.

## Check dataset fares

In [5]:
filled_df.groupby('Pclass').agg({'Fare': ['mean', 'max', 'min',
                                                 'count']})

Unnamed: 0_level_0,Fare,Fare,Fare,Fare
Unnamed: 0_level_1,mean,max,min,count
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,87.508992,512.3292,0.0,323
2,21.179196,73.5,0.0,277
3,13.302889,69.55,0.0,708


## Ideas:
1. There are passengers with 0 Fare - they are probably some additional crew
members.
2. The mean price of all classes is higher than expected, so maybe the
price is stated for all passengers with the same ticket number.

## Define number of passenger for each ticket and append it to the DataFrame.

In [6]:
ticket_passengers = \
    filled_df.groupby('Ticket')['Name'].agg('count')
ticket_passengers

Ticket
110152         3
110413         3
110465         2
110469         1
110489         1
              ..
W./C. 6608     5
W./C. 6609     1
W.E.P. 5734    2
W/C 14208      1
WE/P 5735      2
Name: Name, Length: 929, dtype: int64

In [7]:
filled_df['PassengersCount'] = filled_df['Ticket'].map(ticket_passengers)
filled_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ManualParents,ManualChildren,PassengersCount
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,2
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,2
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0,0,1
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0,0,3
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0,0,1


Fill missing 'Fare' value

In [8]:
filled_df['Fare'].fillna(0, inplace=True)

## Calculate fare per passenger.

In [9]:
filled_df['FarePerPassenger'] = filled_df['Fare'] / filled_df['PassengersCount']
filled_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ManualParents,ManualChildren,PassengersCount,FarePerPassenger
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,1,7.250000
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,2,35.641650
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1,7.925000
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,0,2,26.550000
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,1,8.050000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0,0,1,8.050000
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,0,0,3,36.300000
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1,7.250000
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0,0,1,8.050000


In [10]:
filled_df.groupby('Pclass').agg({'FarePerPassenger': ['mean', 'max', 'min',
                                                 'count']})

Unnamed: 0_level_0,FarePerPassenger,FarePerPassenger,FarePerPassenger,FarePerPassenger
Unnamed: 0_level_1,mean,max,min,count
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,33.9105,128.0823,0.0,323
2,11.41101,16.0,0.0,277
3,7.318808,19.9667,0.0,709


## Now the fares are closer to expected values.
## Outline passengers with zero ticket price.

In [11]:
filled_df['ZeroPrice'] = np.where(filled_df['FarePerPassenger'] == 0, 1, 0)
filled_df['ZeroPrice'].value_counts()

0    1291
1      18
Name: ZeroPrice, dtype: int64

# Age
## Age seems to be an important feature, it's worth spending time on filling
it properly.

In [12]:
filled_df['FirstName'] = filled_df['Name'].str.split(',').str[0].str.strip()
filled_df['LastName'] = \
    filled_df['Name'].str.split('.').str[1].str.split('(').str[0].str.strip()
filled_df['MaidenName'] = \
    filled_df['Name'].str.split('(').str[1].str.split(')').str[0].str.strip()
filled_df['Title'] = filled_df['Name'].str.split(',').str[1].str.split('.').str[0].str\
    .strip()
filled_df['Title'].value_counts()
filled_df[['Name', 'FirstName', 'LastName', 'Title', 'MaidenName']]


Unnamed: 0_level_0,Name,FirstName,LastName,Title,MaidenName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Braund, Mr. Owen Harris",Braund,Owen Harris,Mr,
2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Cumings,John Bradley,Mrs,Florence Briggs Thayer
3,"Heikkinen, Miss. Laina",Heikkinen,Laina,Miss,
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Futrelle,Jacques Heath,Mrs,Lily May Peel
5,"Allen, Mr. William Henry",Allen,William Henry,Mr,
...,...,...,...,...,...
1305,"Spector, Mr. Woolf",Spector,Woolf,Mr,
1306,"Oliva y Ocana, Dona. Fermina",Oliva y Ocana,Fermina,Dona,
1307,"Saether, Mr. Simon Sivertsen",Saether,Simon Sivertsen,Mr,
1308,"Ware, Mr. Frederick",Ware,Frederick,Mr,


In [13]:
filled_df['Spounce'] = 0
females_with_sibsp = filled_df[(filled_df['Sex'] == 'female')
                               & (filled_df['SibSp'] > 0)
                               &  (filled_df['Title'] != 'Miss')].copy()
males_with_sibsp = filled_df[(filled_df['Sex'] == 'male')
                                & (filled_df['SibSp'] > 0)].copy()
for a, row in females_with_sibsp.iterrows():
    potential_husband = \
        males_with_sibsp[
            (males_with_sibsp['Ticket'] == row['Ticket'])
            & (males_with_sibsp['LastName'].str.split(' ').str[0] ==
            row['LastName'].split(' ')[0])
            & (males_with_sibsp['Title'] != 'Master')]
    if len(potential_husband) > 0:
        filled_df.loc[potential_husband.index, 'Spounce'] = 1
        filled_df.loc[a, 'Spounce'] = 1
# exceptions = ['Skoog, Mrs. William (Anna Bernhardina Karlsson)',
#               'Skoog, Mr. Wilhelm',
#               'Duff Gordon, Lady. (Lucille Christiana Sutherland) ("Mrs '
#               'Morgan")',
#               'Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan")',
#               'Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)',
#               'Faunthorpe, Mr. Harry',
#               'Nakid, Mr. Sahid',
#               'Nakid, Mrs. Said (Waika Mary" Mowad)"',
#               'Goodwin, Mrs. Frederick (Augusta Tyler)',
#               'Goodwin, Mr. Charles Frederick',
#               'Ware, Mr. John James',
#               'Ware, Mrs. John James (Florence Louise Long)',
#               'Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)',
#               'Crosby, Capt. Edward Gifford'
#               ]
# exceptions_mask = filled_df['Name'].isin(exceptions)
# filled_df.loc[exceptions_mask, 'Spounce'] = 1

In [14]:
filled_df['Siblings'] = filled_df['SibSp'] - filled_df['Spounce']
filled_df['Siblings']

PassengerId
1       1
2       0
3       0
4       0
5       0
       ..
1305    0
1306    0
1307    0
1308    0
1309    1
Name: Siblings, Length: 1309, dtype: int64

In [15]:
# filled_df.loc[filled_df['Name'] == 'Andersson, Miss. Erna Alexandra',
#           ['SibSp', 'Siblings', 'Parch']] = 0
# filled_df.loc[filled_df['Name'] == 'Ford, Mr. William Neal',
#           ['SibSp', 'Siblings', 'Parch']] = [3, 3, 1]
# filled_df.loc[filled_df['Name'] == 'Baxter, Mr. Quigg Edmond',
#           ['SibSp', 'Siblings']] = 1
# filled_df.loc[filled_df['Name'] == 'Ford, Mr. Edward Watson',
#           ['SibSp', 'Siblings', 'Spounce', 'Parch',
#            'Parents']] = [3, 3, 0, 1, 1]
# filled_df.loc[filled_df['Name'] == 'Ware, Mr. William Jeffery',
#           ['SibSp', 'Siblings']] = [0, 0]
# filled_df.loc[filled_df['Name'] == 'Abbott, Master. Eugene Joseph',
#           ['SibSp', 'Siblings', 'Parch']] = [1, 1, 1]
# filled_df.loc[filled_df['Name'] == 'Ford, Miss. Robina Maggie "Ruby"',
#           ['SibSp', 'Siblings', 'Parch']] = [3, 3, 1]
# filled_df.loc[filled_df['Name'] == 'Natsch, Mr. Charles H',
#           ['Parch']] = [0]
# filled_df.loc[filled_df['Name'] == 'Abbott, Mrs. Stanton (Rosa Hunt)',
#           ['SibSp', 'Siblings', 'Parch', 'Children']] = [0, 0, 2, 2]
# filled_df.loc[filled_df['Name'] == 'Lahtinen, Mrs. William (Anna Sylfven)',
#           ['SibSp', 'Siblings', 'Spounce', 'Parch']] = [2, 1, 1, 0]
# filled_df.loc[filled_df['Name'] == 'Ware, Mrs. John James (Florence Louise Long)',
#           ['SibSp', 'Siblings', 'Spounce', 'Parch']] = [1, 0, 1, 0,]
# filled_df.loc[filled_df['Name'] == 'Lahtinen, Rev. William',
#           ['Parch']] = [0]
# filled_df.loc[filled_df['Name'] == 'Silven, Miss. Lyyli Karoliina',
#           ['SibSp', 'Siblings', 'Parch']] = [1, 1, 0]
# filled_df.loc[filled_df['Name'] == 'Ford, Miss. Doolina Margaret "Daisy"',
#           ['SibSp', 'Siblings', 'Parch', 'Parents']] = [3, 3, 1, 1]
# filled_df.loc[filled_df['Name'] == 'Ford, Mrs. Edward (Margaret Ann Watson)',
#           ['Siblings', 'Spounce', 'Parch']] = [1, 0, 4]
# filled_df.loc[filled_df['Name'] == 'Johnston, Mrs. Andrew G (Elizabeth Lily" Watson)"',
#           ['SibSp', 'Siblings', 'Spounce']] = [2, 1, 1]
# filled_df

# Parents and children

In [16]:
def divide_in_parents_and_children(group):
        parents_group = None
        children_group = None

        if group['Age'].isna().sum() == 0:
            age_filled = True
            max_age = group['Age'].max()
            max_children_age = max_age - 12
        else:
            age_filled = False

        if any(group['Spounce'] > 0):
            spounce = True
        else:
            spounce = False
        if spounce:
            spounce_grouped = group.groupby(['Spounce'])
            for name, spounce_group in spounce_grouped:
                if all(spounce_group['Spounce'] == 1):
                    parents_group = spounce_group
                elif all(spounce_group['Spounce'] != 1):
                    children_group = spounce_group
        if children_group is None or parents_group is None:
            if age_filled:
                children_group = group[group['Age'] <= max_children_age]
                parents_group = group[group['Age'] > max_children_age]
            else:
                parch_grouped = group.groupby(['Parch'])
                for name, parch_group in parch_grouped:
                    if all(parch_group['Title'].isin(['Dr', 'Mr', 'Mrs',
                                                      'Capt'])):
                        if parents_group is None:
                            parents_group = parch_group
                    elif all(parch_group['Title'].isin(['Miss', 'Mr', 'Master'])):
                        if children_group is None:
                            children_group = parch_group
        return parents_group, children_group

In [17]:
filled_parch = filled_df[filled_df['Parch'] > 0]
filled_df['Parents'] = 0
filled_df['Children'] = 0
ticket_grouped = filled_parch.sort_values('Ticket').groupby(['Ticket'])
for name, ticket_group in ticket_grouped:
    parents_group, children_group = divide_in_parents_and_children(ticket_group)
    if children_group is not None and parents_group is not None\
            and len(children_group) > 0 and len(parents_group) > 0:
        filled_df.loc[parents_group.index, 'Children'] = parents_group['Parch']
        filled_df.loc[children_group.index, 'Parents'] = children_group['Parch']

In [18]:
incorrect_parch = filled_df[(filled_df['Children'] + filled_df['Parents']) !=
            filled_df['Parch']]
incorrect_parch.shape

(29, 24)

In [19]:
name_grouped = incorrect_parch.sort_values('FirstName').groupby(['FirstName'])
for name, name_group in name_grouped:
    parents_group, children_group = divide_in_parents_and_children(name_group)
    if children_group is not None and parents_group is not None\
            and len(children_group) > 0 and len(parents_group) > 0:
        filled_df.loc[parents_group.index, 'Children'] = parents_group['Parch']
        filled_df.loc[children_group.index, 'Parents'] = children_group['Parch']

In [20]:
# filled_df.loc[filled_df['Name'] == 'Chibnall, Mrs. (Edith Martha Bowerman)',
#           ['Children']] = [1]
# filled_df.loc[filled_df['Name'] == 'Bowerman, Miss. Elsie Edith',
#           ['Parents']] = [1]
# filled_df.loc[filled_df['Name'] == 'Klasen, Mr. Klas Albin',
#           ['Parents']] = [1]
# filled_df.loc[filled_df['Name'] == 'Newsom, Miss. Helen Monypeny',
#           ['Parents']] = [2]
# filled_df.loc[filled_df['Name'] == 'Beckwith, Mr. Richard Leonard',
#           ['Children']] = [1]
# filled_df.loc[filled_df['Name'] == 'Beckwith, Mrs. Richard Leonard (Sallie Monypeny)',
#           ['Children']] = [1]
# filled_df.loc[filled_df['Name'] == 'Hocking, Mrs. Elizabeth (Eliza Needs)',
#           ['Children']] = [3]
# filled_df.loc[filled_df['Name'] == 'Hocking, Mr. Richard George',
#           ['Parents']] = [1]
# filled_df.loc[filled_df['Name'] == 'Hays, Mr. Charles Melville',
#           ['Children']] = [1]
# filled_df.loc[filled_df['Name'] == 'Hays, Mrs. Charles Melville (Clara Jennings Gregg)',
#           ['Children']] = [1]
# filled_df.loc[filled_df['Name'] == 'Davidson, Mrs. Thornton (Orian Hays)',
#           ['Parents']] = [2]
# filled_df.loc[filled_df['Name'] == 'Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)',
#           ['Children']] = [1]
# filled_df.loc[filled_df['Name'] == 'Frolicher, Miss. Hedwig Margaritha',
#           ['Parents']] = [2]
# filled_df.loc[filled_df['Name'] == 'Hiltunen, Miss. Marta',
#           ['Parch']] = [0]
# filled_df.loc[filled_df['Name'] == 'Andersson, Miss. Ida Augusta Margareta',
#           ['Parch']] = [0]
# filled_df.loc[filled_df['Name'] == 'Newell, Mr. Arthur Webster',
#           ['Children']] = [2]
# filled_df.loc[filled_df['Name'] == 'Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)',
#           ['Parents']] = [1]
# filled_df[(filled_df['Children'] + filled_df['Parents']) !=
#             filled_df['Parch']]

In [21]:
filled_df[(filled_df['ManualParents'] != filled_df['Parents'])]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ManualParents,ManualChildren,PassengersCount,FarePerPassenger,ZeroPrice,FirstName,LastName,MaidenName,Title,Spounce,Siblings,Parents,Children
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
69,1.0,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,S,0,0,1,7.925,0,Andersson,Erna Alexandra,,Miss,0,4,2,0
87,0.0,3,"Ford, Mr. William Neal",male,16.0,1,3,W./C. 6608,34.375,,S,1,0,5,6.875,0,Ford,William Neal,,Mr,0,1,3,0
137,1.0,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,11752,26.2833,D47,S,2,0,1,26.2833,0,Newsom,Helen Monypeny,,Miss,0,0,0,0
148,0.0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9.0,2,2,W./C. 6608,34.375,,S,1,0,5,6.875,0,Ford,"Robina Maggie ""Ruby""",,Miss,0,2,2,0
176,0.0,3,"Klasen, Mr. Klas Albin",male,18.0,1,1,350404,7.8542,,S,1,0,1,7.8542,0,Klasen,Klas Albin,,Mr,0,1,0,0
357,1.0,1,"Bowerman, Miss. Elsie Edith",female,22.0,0,1,113505,55.0,E33,S,1,0,2,27.5,0,Bowerman,Elsie Edith,,Miss,0,0,0,0
437,0.0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21.0,2,2,W./C. 6608,34.375,,S,1,0,5,6.875,0,Ford,"Doolina Margaret ""Daisy""",,Miss,0,2,2,0
438,1.0,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24.0,2,3,29106,18.75,,S,1,2,3,6.25,0,Richards,Sidney,Emily Hocking,Mrs,0,2,0,3
530,0.0,2,"Hocking, Mr. Richard George",male,23.0,2,1,29104,11.5,,S,1,0,1,11.5,0,Hocking,Richard George,,Mr,0,2,0,0
540,1.0,1,"Frolicher, Miss. Hedwig Margaritha",female,22.0,0,2,13568,49.5,B39,C,2,0,1,49.5,0,Frolicher,Hedwig Margaritha,,Miss,0,0,0,0


In [22]:
filled_df[(filled_df['ManualChildren'] != filled_df['Children'])]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ManualParents,ManualChildren,PassengersCount,FarePerPassenger,ZeroPrice,FirstName,LastName,MaidenName,Title,Spounce,Siblings,Parents,Children
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
167,1.0,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55.0,E33,S,0,1,2,27.5,0,Chibnall,,Edith Martha Bowerman,Mrs,0,0,0,0
249,1.0,1,"Beckwith, Mr. Richard Leonard",male,37.0,1,1,11751,52.5542,D35,S,0,1,2,26.2771,0,Beckwith,Richard Leonard,,Mr,1,0,0,0
280,1.0,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S,0,2,3,6.75,0,Abbott,Stanton,Rosa Hunt,Mrs,0,1,0,1
438,1.0,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24.0,2,3,29106,18.75,,S,1,2,3,6.25,0,Richards,Sidney,Emily Hocking,Mrs,0,2,0,3
660,0.0,1,"Newell, Mr. Arthur Webster",male,58.0,0,2,35273,113.275,D48,C,0,2,3,37.758333,0,Newell,Arthur Webster,,Mr,0,0,0,0
679,0.0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S,6,0,8,5.8625,0,Goodwin,Frederick,Augusta Tyler,Mrs,0,1,0,6
737,0.0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48.0,1,3,W./C. 6608,34.375,,S,0,4,5,6.875,0,Ford,Edward,Margaret Ann Watson,Mrs,1,0,0,3
821,1.0,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S,0,1,4,23.375,0,Hays,Charles Melville,Clara Jennings Gregg,Mrs,1,0,0,0
872,1.0,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,0,1,2,26.2771,0,Beckwith,Richard Leonard,Sallie Monypeny,Mrs,1,0,0,0
1031,,3,"Goodwin, Mr. Charles Frederick",male,40.0,1,6,CA 2144,46.9,,S,6,0,8,5.8625,0,Goodwin,Charles Frederick,,Mr,0,1,0,6


In [23]:
filled_df = filled_df[['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Ticket',
                     'Fare',
           'Cabin', 'Embarked',
           'FirstName', 'LastName', 'Title', 'SibSp', 'Siblings',
           'Spounce', 'Parch',
                       'Parents', 'Children',
                       'PassengersCount',
           'FarePerPassenger', 'ZeroPrice']]
filled_df.to_csv('data/Before age no hands.csv', index=True, header=True,
                 index_label='PassengerId')

In [24]:
def fill_age_by_mask(mask, df):
    mean = df[mask]['Age'].mean()
    print(f'Mean: {mean}')
    masked_copy = df.loc[mask].copy()
    masked_copy['Age'].fillna(mean, inplace=True)
    df.loc[mask] = masked_copy
    print(filled_df['Age'].isna().sum())
    return df

'Master' title seems to be used for a little boy.

In [25]:
filled_df.groupby('Title').agg({'Age': ['min', 'mean', 'max', 'count']})

Unnamed: 0_level_0,Age,Age,Age,Age
Unnamed: 0_level_1,min,mean,max,count
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Capt,70.0,70.0,70.0,1
Col,47.0,54.0,60.0,4
Don,40.0,40.0,40.0,1
Dona,39.0,39.0,39.0,1
Dr,23.0,43.571429,54.0,7
Jonkheer,38.0,38.0,38.0,1
Lady,48.0,48.0,48.0,1
Major,45.0,48.5,52.0,2
Master,0.33,5.482642,14.5,53
Miss,0.17,21.774238,63.0,210


In [26]:
master_mask = filled_df['Title'] == 'Master'
filled_df = fill_age_by_mask(master_mask, filled_df)
filled_df.loc[master_mask]['Age'].value_counts(dropna=False)

Mean: 5.482641509433963
255


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


5.482642     8
2.000000     5
4.000000     5
1.000000     5
9.000000     5
3.000000     4
6.000000     4
13.000000    3
7.000000     3
8.000000     3
0.830000     3
10.000000    2
11.000000    2
0.920000     1
12.000000    1
0.670000     1
0.420000     1
11.500000    1
0.330000     1
0.750000     1
14.500000    1
5.000000     1
Name: Age, dtype: int64

In [27]:
filled_df['TravelsAlone'] = \
    np.where((filled_df['Parch'] + filled_df['SibSp']) == 0, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filled_df['TravelsAlone'] = \


In [28]:
travels_alone_mask = filled_df['TravelsAlone'] == 1
filled_df = fill_age_by_mask(travels_alone_mask, filled_df)
filled_df.loc[travels_alone_mask]['Age'].value_counts(dropna=False)

Mean: 31.467821728442363
56


31.467822    199
22.000000     31
30.000000     30
21.000000     30
24.000000     28
            ... 
70.000000      1
20.500000      1
80.000000      1
66.000000      1
38.500000      1
Name: Age, Length: 79, dtype: int64

In [29]:
married_mask = filled_df['Spounce'] == 1
filled_df = fill_age_by_mask(married_mask, filled_df)
filled_df.loc[married_mask]['Age'].value_counts(dropna=False)

Mean: 36.84615384615385
36


36.846154    20
31.000000     8
24.000000     8
36.000000     8
29.000000     8
             ..
67.000000     1
76.000000     1
61.000000     1
36.500000     1
53.000000     1
Name: Age, Length: 48, dtype: int64

In [30]:
travels_with_parents_mask = (filled_df['Parents'] > 0) & (filled_df['Spounce']
                             == 0)
filled_df = fill_age_by_mask(travels_with_parents_mask, filled_df)
filled_df.loc[travels_with_parents_mask]['Age'].value_counts(dropna=False)

Mean: 10.520188679245283
23


10.520189    13
2.000000     12
4.000000     10
1.000000     10
9.000000     10
             ..
39.000000     1
0.420000      1
0.670000      1
31.000000     1
14.500000     1
Name: Age, Length: 43, dtype: int64

In [31]:
not_married_mask = filled_df['Spounce'] == 0
filled_df = fill_age_by_mask(not_married_mask, filled_df)
filled_df.loc[not_married_mask]['Age'].value_counts(dropna=False)

Mean: 28.59076578308643
0


31.467822    199
21.000000     41
22.000000     39
24.000000     39
30.000000     33
            ... 
24.500000      1
55.500000      1
80.000000      1
20.500000      1
38.500000      1
Name: Age, Length: 100, dtype: int64

In [32]:
# filled_df['Age'].fillna(filled_df['Age'].mean(), inplace=True)

In [33]:
filled_df['Age'].isna().sum()

0

# Embarked

In [34]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
filled_df['Embarked'] = imputer.fit_transform(filled_df[['Embarked']])
filled_df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filled_df['Embarked'] = imputer.fit_transform(filled_df[['Embarked']])


Survived             418
Pclass                 0
Name                   0
Sex                    0
Age                    0
Ticket                 0
Fare                   0
Cabin               1014
Embarked               0
FirstName              0
LastName               0
Title                  0
SibSp                  0
Siblings               0
Spounce                0
Parch                  0
Parents                0
Children               0
PassengersCount        0
FarePerPassenger       0
ZeroPrice              0
TravelsAlone           0
dtype: int64

# Feature encoding

In [35]:
encoder = OneHotEncoder(sparse=False)
encoded_sex = encoder.fit_transform(filled_df[['Pclass']])
for column_index, category in enumerate(encoder.categories_[0]):
    print(category, column_index)
    if category == 1:
        category = 'First'
    elif category == 2:
        category = 'Second'
    elif category == 3:
        category = 'Third'
    filled_df['Pclass' + category.capitalize()] = encoded_sex[:, column_index]
filled_df.drop('Pclass', axis=1, inplace=True)
filled_df

1 0
2 1
3 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filled_df['Pclass' + category.capitalize()] = encoded_sex[:, column_index]


Unnamed: 0_level_0,Survived,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,FirstName,LastName,Title,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,0.0,"Braund, Mr. Owen Harris",male,22.000000,A/5 21171,7.2500,,S,Braund,Owen Harris,Mr,1,1,0,0,0,0,1,7.250000,0,0,0.0,0.0,1.0
2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,PC 17599,71.2833,C85,C,Cumings,John Bradley,Mrs,1,0,1,0,0,0,2,35.641650,0,0,1.0,0.0,0.0
3,1.0,"Heikkinen, Miss. Laina",female,26.000000,STON/O2. 3101282,7.9250,,S,Heikkinen,Laina,Miss,0,0,0,0,0,0,1,7.925000,0,1,0.0,0.0,1.0
4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,113803,53.1000,C123,S,Futrelle,Jacques Heath,Mrs,1,0,1,0,0,0,2,26.550000,0,0,1.0,0.0,0.0
5,0.0,"Allen, Mr. William Henry",male,35.000000,373450,8.0500,,S,Allen,William Henry,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,"Spector, Mr. Woolf",male,31.467822,A.5. 3236,8.0500,,S,Spector,Woolf,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0
1306,,"Oliva y Ocana, Dona. Fermina",female,39.000000,PC 17758,108.9000,C105,C,Oliva y Ocana,Fermina,Dona,0,0,0,0,0,0,3,36.300000,0,1,1.0,0.0,0.0
1307,,"Saether, Mr. Simon Sivertsen",male,38.500000,SOTON/O.Q. 3101262,7.2500,,S,Saether,Simon Sivertsen,Mr,0,0,0,0,0,0,1,7.250000,0,1,0.0,0.0,1.0
1308,,"Ware, Mr. Frederick",male,31.467822,359309,8.0500,,S,Ware,Frederick,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0


In [36]:
encoder = OneHotEncoder(sparse=False)
encoded_sex = encoder.fit_transform(filled_df[['Sex']])
for column_index, category in enumerate(encoder.categories_[0]):
    print(category, column_index)
    filled_df['Sex' + category.capitalize()] = encoded_sex[:, column_index]
filled_df.drop('Sex', axis=1, inplace=True)
filled_df

female 0
male 1


Unnamed: 0_level_0,Survived,Name,Age,Ticket,Fare,Cabin,Embarked,FirstName,LastName,Title,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird,SexFemale,SexMale
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,0.0,"Braund, Mr. Owen Harris",22.000000,A/5 21171,7.2500,,S,Braund,Owen Harris,Mr,1,1,0,0,0,0,1,7.250000,0,0,0.0,0.0,1.0,0.0,1.0
2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,PC 17599,71.2833,C85,C,Cumings,John Bradley,Mrs,1,0,1,0,0,0,2,35.641650,0,0,1.0,0.0,0.0,1.0,0.0
3,1.0,"Heikkinen, Miss. Laina",26.000000,STON/O2. 3101282,7.9250,,S,Heikkinen,Laina,Miss,0,0,0,0,0,0,1,7.925000,0,1,0.0,0.0,1.0,1.0,0.0
4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,113803,53.1000,C123,S,Futrelle,Jacques Heath,Mrs,1,0,1,0,0,0,2,26.550000,0,0,1.0,0.0,0.0,1.0,0.0
5,0.0,"Allen, Mr. William Henry",35.000000,373450,8.0500,,S,Allen,William Henry,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,"Spector, Mr. Woolf",31.467822,A.5. 3236,8.0500,,S,Spector,Woolf,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0
1306,,"Oliva y Ocana, Dona. Fermina",39.000000,PC 17758,108.9000,C105,C,Oliva y Ocana,Fermina,Dona,0,0,0,0,0,0,3,36.300000,0,1,1.0,0.0,0.0,1.0,0.0
1307,,"Saether, Mr. Simon Sivertsen",38.500000,SOTON/O.Q. 3101262,7.2500,,S,Saether,Simon Sivertsen,Mr,0,0,0,0,0,0,1,7.250000,0,1,0.0,0.0,1.0,0.0,1.0
1308,,"Ware, Mr. Frederick",31.467822,359309,8.0500,,S,Ware,Frederick,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0


In [37]:
encoder =OneHotEncoder(sparse=False)
encoded_embarked = encoder.fit_transform(filled_df[['Embarked']])
for column_index, category in enumerate(encoder.categories_[0]):
    print(category, column_index)
    filled_df['Embarked' + category] = encoded_embarked[:, column_index]
filled_df.drop('Embarked', axis=1, inplace=True)
filled_df

C 0
Q 1
S 2


Unnamed: 0_level_0,Survived,Name,Age,Ticket,Fare,Cabin,FirstName,LastName,Title,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird,SexFemale,SexMale,EmbarkedC,EmbarkedQ,EmbarkedS
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
1,0.0,"Braund, Mr. Owen Harris",22.000000,A/5 21171,7.2500,,Braund,Owen Harris,Mr,1,1,0,0,0,0,1,7.250000,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,PC 17599,71.2833,C85,Cumings,John Bradley,Mrs,1,0,1,0,0,0,2,35.641650,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,"Heikkinen, Miss. Laina",26.000000,STON/O2. 3101282,7.9250,,Heikkinen,Laina,Miss,0,0,0,0,0,0,1,7.925000,0,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,113803,53.1000,C123,Futrelle,Jacques Heath,Mrs,1,0,1,0,0,0,2,26.550000,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5,0.0,"Allen, Mr. William Henry",35.000000,373450,8.0500,,Allen,William Henry,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,"Spector, Mr. Woolf",31.467822,A.5. 3236,8.0500,,Spector,Woolf,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1306,,"Oliva y Ocana, Dona. Fermina",39.000000,PC 17758,108.9000,C105,Oliva y Ocana,Fermina,Dona,0,0,0,0,0,0,3,36.300000,0,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1307,,"Saether, Mr. Simon Sivertsen",38.500000,SOTON/O.Q. 3101262,7.2500,,Saether,Simon Sivertsen,Mr,0,0,0,0,0,0,1,7.250000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1308,,"Ware, Mr. Frederick",31.467822,359309,8.0500,,Ware,Frederick,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


Total relatives.

In [38]:
filled_df['TotalRelatives'] = filled_df['Parch'] + filled_df['SibSp']
filled_df['TotalRelatives']

PassengerId
1       1
2       1
3       0
4       1
5       0
       ..
1305    0
1306    0
1307    0
1308    0
1309    2
Name: TotalRelatives, Length: 1309, dtype: int64

In [39]:
filled_df['ALone'] = np.where(filled_df['Parch'] + filled_df['SibSp'] == 0,
                              1, 0)
filled_df['ALone']

PassengerId
1       0
2       0
3       1
4       0
5       1
       ..
1305    1
1306    1
1307    1
1308    1
1309    0
Name: ALone, Length: 1309, dtype: int32

Deck from the cabin

In [40]:
filled_df['Deck'] = filled_df['Cabin'].str[0]
filled_df['Deck'].fillna('Empty', inplace=True)
encoder = OneHotEncoder(sparse=False)
encoded_sex = encoder.fit_transform(filled_df[['Deck']])
for column_index, category in enumerate(encoder.categories_[0]):
    print(category, column_index)
    filled_df['Deck' + category.capitalize()] = encoded_sex[:, column_index]
filled_df.drop('Deck', axis=1, inplace=True)
filled_df

A 0
B 1
C 2
D 3
E 4
Empty 5
F 6
G 7
T 8


Unnamed: 0_level_0,Survived,Name,Age,Ticket,Fare,Cabin,FirstName,LastName,Title,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird,SexFemale,SexMale,EmbarkedC,EmbarkedQ,EmbarkedS,TotalRelatives,ALone,DeckA,DeckB,DeckC,DeckD,DeckE,DeckEmpty,DeckF,DeckG,DeckT
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
1,0.0,"Braund, Mr. Owen Harris",22.000000,A/5 21171,7.2500,,Braund,Owen Harris,Mr,1,1,0,0,0,0,1,7.250000,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,PC 17599,71.2833,C85,Cumings,John Bradley,Mrs,1,0,1,0,0,0,2,35.641650,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,"Heikkinen, Miss. Laina",26.000000,STON/O2. 3101282,7.9250,,Heikkinen,Laina,Miss,0,0,0,0,0,0,1,7.925000,0,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,113803,53.1000,C123,Futrelle,Jacques Heath,Mrs,1,0,1,0,0,0,2,26.550000,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,"Allen, Mr. William Henry",35.000000,373450,8.0500,,Allen,William Henry,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,"Spector, Mr. Woolf",31.467822,A.5. 3236,8.0500,,Spector,Woolf,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1306,,"Oliva y Ocana, Dona. Fermina",39.000000,PC 17758,108.9000,C105,Oliva y Ocana,Fermina,Dona,0,0,0,0,0,0,3,36.300000,0,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1307,,"Saether, Mr. Simon Sivertsen",38.500000,SOTON/O.Q. 3101262,7.2500,,Saether,Simon Sivertsen,Mr,0,0,0,0,0,0,1,7.250000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1308,,"Ware, Mr. Frederick",31.467822,359309,8.0500,,Ware,Frederick,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Women with only husband

In [41]:
filled_df['OnlyHusband'] = np.where((filled_df['SexFemale'] == 1)
                                    & (filled_df['Spounce'] == 1)
                                    & (filled_df['Parents'] == 0)
                                    & (filled_df['Children'] == 0), 1, 0)
filled_df['OnlyHusband'].value_counts()

0    1239
1      70
Name: OnlyHusband, dtype: int64

Families that have many children.

In [42]:
filled_df['ManyChildren'] = np.where(filled_df['Children'] > 3, 1, 0)

In [43]:
from templates.data_transform.feature_encoding import encode_df
columns_to_encode = ['Ticket', 'Cabin', 'Title', 'SibSp', 'Siblings', 'Spounce',
                    'Parch', 'Parents', 'Children', 'PassengersCount',
                    'FarePerPassenger', 'TravelsAlone', 'PclassFirst',
                    'PclassSecond', 'PclassThird', 'SexFemale', 'SexMale',
                    'EmbarkedC', 'EmbarkedQ', 'EmbarkedS', 'TotalRelatives',
                    'ALone', 'OnlyHusband', 'ManyChildren', 'DeckA', 'DeckB',
                    'DeckC', 'DeckD', 'DeckE', 'DeckEmpty', 'DeckF', 'DeckG',
                    'DeckT']
new_df, new_columns = encode_df(filled_df.drop('Survived', axis=1),
                               filled_df['Survived'], type='MeanEncoding',
                                columns=columns_to_encode)
new_column_names = [column + 'MeanEncoded' for column in new_columns]
filled_df[new_column_names] =  new_df[new_columns]
for column in new_column_names:
    filled_df[column].fillna(-1, inplace=True)
filled_df

Unnamed: 0_level_0,Survived,Name,Age,Ticket,Fare,Cabin,FirstName,LastName,Title,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird,SexFemale,SexMale,EmbarkedC,EmbarkedQ,EmbarkedS,TotalRelatives,ALone,DeckA,DeckB,DeckC,DeckD,DeckE,DeckEmpty,DeckF,DeckG,DeckT,OnlyHusband,ManyChildren,TicketMeanEncoded,CabinMeanEncoded,TitleMeanEncoded,SibSpMeanEncoded,SiblingsMeanEncoded,SpounceMeanEncoded,ParchMeanEncoded,ParentsMeanEncoded,ChildrenMeanEncoded,PassengersCountMeanEncoded,FarePerPassengerMeanEncoded,TravelsAloneMeanEncoded,PclassFirstMeanEncoded,PclassSecondMeanEncoded,PclassThirdMeanEncoded,SexFemaleMeanEncoded,SexMaleMeanEncoded,EmbarkedCMeanEncoded,EmbarkedQMeanEncoded,EmbarkedSMeanEncoded,TotalRelativesMeanEncoded,ALoneMeanEncoded,OnlyHusbandMeanEncoded,ManyChildrenMeanEncoded,DeckAMeanEncoded,DeckBMeanEncoded,DeckCMeanEncoded,DeckDMeanEncoded,DeckEMeanEncoded,DeckEmptyMeanEncoded,DeckFMeanEncoded,DeckGMeanEncoded,DeckTMeanEncoded
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1
1,0.0,"Braund, Mr. Owen Harris",22.000000,A/5 21171,7.2500,,Braund,Owen Harris,Mr,1,1,0,0,0,0,1,7.250000,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.165450,0.545455,0.611111,0.374793,0.355140,0.373984,0.390698,0.284211,0.187500,0.522491,0.317254,0.381206,0.261538,0.201299,0.201299,0.356522,0.396024,0.348837,0.565891,0.522491,0.373887,0.401418,0.395448,0.378900,0.383459,0.383721,0.384615,0.310909,0.394286,0.397183,0.398317
2,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,PC 17599,71.2833,C85,Cumings,John Bradley,Mrs,1,0,1,0,0,0,2,35.641650,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,-1.0,-1.0,0.791667,0.526946,0.373942,0.486239,0.344444,0.361564,0.377504,0.496599,-1.000000,0.492908,0.611765,0.360424,0.547468,0.744939,0.744939,0.527132,0.379630,0.492228,0.539683,0.492908,0.810811,0.386364,0.381636,0.365385,0.545455,0.371720,0.367496,0.644172,0.379801,0.382228,0.383966
3,1.0,"Heikkinen, Miss. Laina",26.000000,STON/O2. 3101282,7.9250,,Heikkinen,Laina,Miss,0,0,0,0,0,0,1,7.925000,0,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.682759,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.411765,0.294393,0.295327,0.365248,0.230570,0.746094,0.746094,0.339655,0.388633,0.337838,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.367625,0.371720,0.371925,0.293578,0.384068,0.385915,0.386236
4,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,113803,53.1000,C123,Futrelle,Jacques Heath,Mrs,1,0,1,0,0,0,2,26.550000,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,-1.0,-1.0,0.790000,0.545455,0.383959,0.527273,0.355140,0.373984,0.390698,0.514085,0.500000,0.522491,0.649425,0.381206,0.563467,0.760956,0.760956,0.356522,0.396024,0.348837,0.565891,0.522491,0.820513,0.401418,0.395448,0.378900,0.604167,0.383721,0.384615,0.693252,0.394286,0.397183,0.398317
5,0.0,"Allen, Mr. William Henry",35.000000,373450,8.0500,,Allen,William Henry,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.158654,0.338809,0.373514,0.357025,0.347826,0.367576,0.373660,0.269231,0.121951,0.304850,0.308688,0.357016,0.237852,0.189130,0.189130,0.349736,0.386154,0.345850,0.304850,0.304850,0.358209,0.389518,0.385164,0.369118,0.367868,0.370803,0.374090,0.306715,0.382311,0.385915,0.386236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,"Spector, Mr. Woolf",31.467822,A.5. 3236,8.0500,,Spector,Woolf,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.156673,0.345395,0.374659,0.359684,0.343658,0.363165,0.374384,0.270270,0.134615,0.303538,0.305185,0.360679,0.242363,0.188908,0.188908,0.344398,0.383292,0.339009,0.303538,0.303538,0.358244,0.387060,0.382420,0.363744,0.368990,0.369464,0.370198,0.299854,0.380410,0.383315,0.384270
1306,,"Oliva y Ocana, Dona. Fermina",39.000000,PC 17758,108.9000,C105,Oliva y Ocana,Fermina,Dona,0,0,0,0,0,0,3,36.300000,0,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.5,-1.0,-1.000000,0.345395,0.374659,0.359684,0.343658,0.363165,0.374384,0.653465,0.500000,0.303538,0.629630,0.360679,0.557500,0.742038,0.742038,0.553571,0.383292,0.502041,0.303538,0.303538,0.358244,0.387060,0.382420,0.363744,0.593220,0.369464,0.370198,0.666667,0.380410,0.383315,0.384270
1307,,"Saether, Mr. Simon Sivertsen",38.500000,SOTON/O.Q. 3101262,7.2500,,Saether,Simon Sivertsen,Mr,0,0,0,0,0,0,1,7.250000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.156673,0.345395,0.374659,0.359684,0.343658,0.363165,0.374384,0.270270,0.157895,0.303538,0.305185,0.360679,0.242363,0.188908,0.188908,0.344398,0.383292,0.339009,0.303538,0.303538,0.358244,0.387060,0.382420,0.363744,0.368990,0.369464,0.370198,0.299854,0.380410,0.383315,0.384270
1308,,"Ware, Mr. Frederick",31.467822,359309,8.0500,,Ware,Frederick,Mr,0,0,0,0,0,0,1,8.050000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.156673,0.345395,0.374659,0.359684,0.343658,0.363165,0.374384,0.270270,0.134615,0.303538,0.305185,0.360679,0.242363,0.188908,0.188908,0.344398,0.383292,0.339009,0.303538,0.303538,0.358244,0.387060,0.382420,0.363744,0.368990,0.369464,0.370198,0.299854,0.380410,0.383315,0.384270


In [44]:
new_columns

['Ticket',
 'Cabin',
 'Title',
 'SibSp',
 'Siblings',
 'Spounce',
 'Parch',
 'Parents',
 'Children',
 'PassengersCount',
 'FarePerPassenger',
 'TravelsAlone',
 'PclassFirst',
 'PclassSecond',
 'PclassThird',
 'SexFemale',
 'SexMale',
 'EmbarkedC',
 'EmbarkedQ',
 'EmbarkedS',
 'TotalRelatives',
 'ALone',
 'OnlyHusband',
 'ManyChildren',
 'DeckA',
 'DeckB',
 'DeckC',
 'DeckD',
 'DeckE',
 'DeckEmpty',
 'DeckF',
 'DeckG',
 'DeckT']

# Save the modified data

In [45]:
filled_df.to_csv('data/No hands data.csv', index=True, header=True,
                 index_label='PassengerId')

check Decision tree

In [46]:
train_df = filled_df[filled_df['Survived'].isna() == False]
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

In [47]:
test_df = filled_df[filled_df['Survived'].isna() == True]
X_test = test_df.drop('Survived', axis=1)


In [48]:
redundant_columns = ['Name', 'FirstName', 'LastName', 'Ticket', 'Cabin',
                     'Title']
X = X.drop(redundant_columns, axis=1)
X_test = X_test.drop(redundant_columns, axis=1)

In [49]:
X

Unnamed: 0_level_0,Age,Fare,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird,SexFemale,SexMale,EmbarkedC,EmbarkedQ,EmbarkedS,TotalRelatives,ALone,DeckA,DeckB,DeckC,DeckD,DeckE,DeckEmpty,DeckF,DeckG,DeckT,OnlyHusband,ManyChildren,TicketMeanEncoded,CabinMeanEncoded,TitleMeanEncoded,SibSpMeanEncoded,SiblingsMeanEncoded,SpounceMeanEncoded,ParchMeanEncoded,ParentsMeanEncoded,ChildrenMeanEncoded,PassengersCountMeanEncoded,FarePerPassengerMeanEncoded,TravelsAloneMeanEncoded,PclassFirstMeanEncoded,PclassSecondMeanEncoded,PclassThirdMeanEncoded,SexFemaleMeanEncoded,SexMaleMeanEncoded,EmbarkedCMeanEncoded,EmbarkedQMeanEncoded,EmbarkedSMeanEncoded,TotalRelativesMeanEncoded,ALoneMeanEncoded,OnlyHusbandMeanEncoded,ManyChildrenMeanEncoded,DeckAMeanEncoded,DeckBMeanEncoded,DeckCMeanEncoded,DeckDMeanEncoded,DeckEMeanEncoded,DeckEmptyMeanEncoded,DeckFMeanEncoded,DeckGMeanEncoded,DeckTMeanEncoded
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
1,22.000000,7.2500,1,1,0,0,0,0,1,7.25000,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.165450,0.545455,0.611111,0.374793,0.355140,0.373984,0.390698,0.284211,0.187500,0.522491,0.317254,0.381206,0.261538,0.201299,0.201299,0.356522,0.396024,0.348837,0.565891,0.522491,0.373887,0.401418,0.395448,0.378900,0.383459,0.383721,0.384615,0.310909,0.394286,0.397183,0.398317
2,38.000000,71.2833,1,0,1,0,0,0,2,35.64165,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,-1.0,-1.0,0.791667,0.526946,0.373942,0.486239,0.344444,0.361564,0.377504,0.496599,-1.000000,0.492908,0.611765,0.360424,0.547468,0.744939,0.744939,0.527132,0.379630,0.492228,0.539683,0.492908,0.810811,0.386364,0.381636,0.365385,0.545455,0.371720,0.367496,0.644172,0.379801,0.382228,0.383966
3,26.000000,7.9250,0,0,0,0,0,0,1,7.92500,0,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.682759,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.411765,0.294393,0.295327,0.365248,0.230570,0.746094,0.746094,0.339655,0.388633,0.337838,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.367625,0.371720,0.371925,0.293578,0.384068,0.385915,0.386236
4,35.000000,53.1000,1,0,1,0,0,0,2,26.55000,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,-1.0,-1.0,0.790000,0.545455,0.383959,0.527273,0.355140,0.373984,0.390698,0.514085,0.500000,0.522491,0.649425,0.381206,0.563467,0.760956,0.760956,0.356522,0.396024,0.348837,0.565891,0.522491,0.820513,0.401418,0.395448,0.378900,0.604167,0.383721,0.384615,0.693252,0.394286,0.397183,0.398317
5,35.000000,8.0500,0,0,0,0,0,0,1,8.05000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.158654,0.338809,0.373514,0.357025,0.347826,0.367576,0.373660,0.269231,0.121951,0.304850,0.308688,0.357016,0.237852,0.189130,0.189130,0.349736,0.386154,0.345850,0.304850,0.304850,0.358209,0.389518,0.385164,0.369118,0.367868,0.370803,0.374090,0.306715,0.382311,0.385915,0.386236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,27.000000,13.0000,0,0,0,0,0,0,1,13.00000,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.000000,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.416667,0.294393,0.295327,0.463087,0.568807,0.183807,0.183807,0.339655,0.388633,0.337838,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.367625,0.371720,0.371925,0.293578,0.384068,0.385915,0.386236
888,19.000000,30.0000,0,0,0,0,0,0,1,30.00000,0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.703448,0.349794,0.373942,0.364842,0.344444,0.361564,0.377504,0.287206,0.714286,0.311628,0.611765,0.360424,0.547468,0.744939,0.744939,0.351630,0.379630,0.342967,0.311628,0.311628,0.360000,0.386364,0.381636,0.722222,0.372754,0.371720,0.367496,0.644172,0.379801,0.382228,0.383966
889,10.520189,23.4500,1,1,0,2,2,0,4,5.86250,0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,3,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0.0,-1.0,0.734694,0.545455,0.611111,0.374793,0.492754,0.458333,0.390698,0.777778,0.000000,0.522491,0.317254,0.381206,0.261538,0.760956,0.760956,0.356522,0.396024,0.348837,0.769231,0.522491,0.373887,0.401418,0.395448,0.378900,0.383459,0.383721,0.384615,0.310909,0.394286,0.397183,0.398317
890,26.000000,30.0000,0,0,0,0,0,0,1,30.00000,0,1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.154412,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.800000,0.294393,0.657303,0.365248,0.568807,0.183807,0.183807,0.586466,0.388633,0.512821,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.615385,0.371720,0.371925,0.684524,0.384068,0.385915,0.386236


In [50]:
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled_knn = \
    pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
X_test_scaled_knn = \
    pd.DataFrame(scaler.transform(X_test), columns=X_test.columns,
                 index=X_test.index)

In [51]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns,
                        index=X.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns,
                             index=X_test.index)

In [52]:
def generate_submission(model, X, y, X_test):
    model.fit(X, y)
    preds = model.predict(X_test).astype(int)
    submission_df = pd.DataFrame({'PassengerId': X_test.index,
                                'Survived': preds})
    submission_df.to_csv('data/DecisionTreeSubmission.csv', index=False)
    print('Ready')
    return preds

In [53]:
def search_grid(model, grid, X, y):
    grid_search = GridSearchCV(model, param_grid=grid,
                               n_jobs=-1, cv=2)
    grid_search.fit(X, y)
    return grid_search.best_params_, grid_search.best_score_

In [54]:
X

Unnamed: 0_level_0,Age,Fare,SibSp,Siblings,Spounce,Parch,Parents,Children,PassengersCount,FarePerPassenger,ZeroPrice,TravelsAlone,PclassFirst,PclassSecond,PclassThird,SexFemale,SexMale,EmbarkedC,EmbarkedQ,EmbarkedS,TotalRelatives,ALone,DeckA,DeckB,DeckC,DeckD,DeckE,DeckEmpty,DeckF,DeckG,DeckT,OnlyHusband,ManyChildren,TicketMeanEncoded,CabinMeanEncoded,TitleMeanEncoded,SibSpMeanEncoded,SiblingsMeanEncoded,SpounceMeanEncoded,ParchMeanEncoded,ParentsMeanEncoded,ChildrenMeanEncoded,PassengersCountMeanEncoded,FarePerPassengerMeanEncoded,TravelsAloneMeanEncoded,PclassFirstMeanEncoded,PclassSecondMeanEncoded,PclassThirdMeanEncoded,SexFemaleMeanEncoded,SexMaleMeanEncoded,EmbarkedCMeanEncoded,EmbarkedQMeanEncoded,EmbarkedSMeanEncoded,TotalRelativesMeanEncoded,ALoneMeanEncoded,OnlyHusbandMeanEncoded,ManyChildrenMeanEncoded,DeckAMeanEncoded,DeckBMeanEncoded,DeckCMeanEncoded,DeckDMeanEncoded,DeckEMeanEncoded,DeckEmptyMeanEncoded,DeckFMeanEncoded,DeckGMeanEncoded,DeckTMeanEncoded
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
1,22.000000,7.2500,1,1,0,0,0,0,1,7.25000,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.165450,0.545455,0.611111,0.374793,0.355140,0.373984,0.390698,0.284211,0.187500,0.522491,0.317254,0.381206,0.261538,0.201299,0.201299,0.356522,0.396024,0.348837,0.565891,0.522491,0.373887,0.401418,0.395448,0.378900,0.383459,0.383721,0.384615,0.310909,0.394286,0.397183,0.398317
2,38.000000,71.2833,1,0,1,0,0,0,2,35.64165,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,-1.0,-1.0,0.791667,0.526946,0.373942,0.486239,0.344444,0.361564,0.377504,0.496599,-1.000000,0.492908,0.611765,0.360424,0.547468,0.744939,0.744939,0.527132,0.379630,0.492228,0.539683,0.492908,0.810811,0.386364,0.381636,0.365385,0.545455,0.371720,0.367496,0.644172,0.379801,0.382228,0.383966
3,26.000000,7.9250,0,0,0,0,0,0,1,7.92500,0,1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.682759,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.411765,0.294393,0.295327,0.365248,0.230570,0.746094,0.746094,0.339655,0.388633,0.337838,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.367625,0.371720,0.371925,0.293578,0.384068,0.385915,0.386236
4,35.000000,53.1000,1,0,1,0,0,0,2,26.55000,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,-1.0,-1.0,0.790000,0.545455,0.383959,0.527273,0.355140,0.373984,0.390698,0.514085,0.500000,0.522491,0.649425,0.381206,0.563467,0.760956,0.760956,0.356522,0.396024,0.348837,0.565891,0.522491,0.820513,0.401418,0.395448,0.378900,0.604167,0.383721,0.384615,0.693252,0.394286,0.397183,0.398317
5,35.000000,8.0500,0,0,0,0,0,0,1,8.05000,0,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.158654,0.338809,0.373514,0.357025,0.347826,0.367576,0.373660,0.269231,0.121951,0.304850,0.308688,0.357016,0.237852,0.189130,0.189130,0.349736,0.386154,0.345850,0.304850,0.304850,0.358209,0.389518,0.385164,0.369118,0.367868,0.370803,0.374090,0.306715,0.382311,0.385915,0.386236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,27.000000,13.0000,0,0,0,0,0,0,1,13.00000,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.000000,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.416667,0.294393,0.295327,0.463087,0.568807,0.183807,0.183807,0.339655,0.388633,0.337838,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.367625,0.371720,0.371925,0.293578,0.384068,0.385915,0.386236
888,19.000000,30.0000,0,0,0,0,0,0,1,30.00000,0,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.703448,0.349794,0.373942,0.364842,0.344444,0.361564,0.377504,0.287206,0.714286,0.311628,0.611765,0.360424,0.547468,0.744939,0.744939,0.351630,0.379630,0.342967,0.311628,0.311628,0.360000,0.386364,0.381636,0.722222,0.372754,0.371720,0.367496,0.644172,0.379801,0.382228,0.383966
889,10.520189,23.4500,1,1,0,2,2,0,4,5.86250,0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,3,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0.0,-1.0,0.734694,0.545455,0.611111,0.374793,0.492754,0.458333,0.390698,0.777778,0.000000,0.522491,0.317254,0.381206,0.261538,0.760956,0.760956,0.356522,0.396024,0.348837,0.769231,0.522491,0.373887,0.401418,0.395448,0.378900,0.383459,0.383721,0.384615,0.310909,0.394286,0.397183,0.398317
890,26.000000,30.0000,0,0,0,0,0,0,1,30.00000,0,1,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-1.0,-1.0,0.154412,0.340206,0.380546,0.351485,0.345656,0.368932,0.372881,0.253927,0.800000,0.294393,0.657303,0.365248,0.568807,0.183807,0.183807,0.586466,0.388633,0.512821,0.294393,0.294393,0.355655,0.388652,0.384286,0.359584,0.615385,0.371720,0.371925,0.684524,0.384068,0.385915,0.386236


In [55]:
dt_grid = {'min_samples_split': [2, 3, 5],
        'max_leaf_nodes': [None, 3, 5, 10, 15],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_leaf': [2, 3, 4, 5, 10, 15, 20]
        }
dt = DecisionTreeClassifier(random_state=0)
dt_best_params, dt_best_score = search_grid(dt, dt_grid, X, y)
dt.set_params(**dt_best_params)
print(dt_best_params, dt_best_score)
dt_preds = generate_submission(dt, X, y, X_test)
cheat_score(dt_preds)

{'max_depth': 4, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2} 0.8226684133622211
Ready
Male accuracy: 0.8195488721804511
Female accuracy: 0.7171052631578947
Overall accuracy: 0.7822966507177034


In [56]:
rf_grid = {
    'min_samples_split': [2, 3, 5],
        'max_leaf_nodes': [None, 3, 5, 10, 15],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11],
        'min_samples_leaf': [2, 3, 4, 5, 10, 15, 20],
        'n_estimators': [30, 40, 50, 60, 100],
        }
rf = RandomForestClassifier(random_state=0)
rf_best_params, rf_best_score = search_grid(rf, rf_grid, X, y)
rf.set_params(**rf_best_params)
print(rf_best_params, rf_best_score)
rf_preds = generate_submission(rf, X, y, X_test)
cheat_score(rf_preds)

{'max_depth': 11, 'max_leaf_nodes': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 60} 0.836138963067466
Ready
Male accuracy: 0.8007518796992481
Female accuracy: 0.756578947368421
Overall accuracy: 0.784688995215311


In [57]:
knn_grid = {'n_neighbors': [i for i in range(1, 30)],
        'metric': ['euclidean'],
        'p': [0.5, 1, 2, 3, 4, 5]}
knn = KNeighborsClassifier(n_jobs=-1)
knn_best_params, knn_best_score = search_grid(knn, knn_grid, X_scaled_knn, y)
knn.set_params(**knn_best_params)
print(knn_best_params, knn_best_score)
knn_preds = generate_submission(knn, X_scaled_knn, y, X_test_scaled_knn)
cheat_score(knn_preds)

{'metric': 'euclidean', 'n_neighbors': 7, 'p': 0.5} 0.8114551317579484
Ready
Male accuracy: 0.7969924812030075
Female accuracy: 0.7368421052631579
Overall accuracy: 0.7751196172248804


In [58]:
kbest = RFECV(LogisticRegression(random_state=0), cv=5).fit(X_scaled, y)
used_features = kbest.get_support()
X_scaled_rfe = X_scaled.loc[:, used_features]
X_test_scaled_rfe = X_test_scaled.loc[:, used_features]

lr_grid = {
        'penalty': ['l2', 'l1'],
        'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
        'C': [0.01, 0.1, 0.3, 0.5, 0.7, 1, 10, 100],
        'max_iter': [100, 500]
            }
lr = LogisticRegression(random_state=0, n_jobs=-1)
lr_best_params, lr_best_score = search_grid(lr, lr_grid, X_scaled_rfe, y)
lr.set_params(**lr_best_params)
print(lr_best_params, lr_best_score)
lr_preds = generate_submission(lr, X_scaled_rfe, y, X_test_scaled_rfe)
cheat_score(lr_preds)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 0.5, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'} 0.8125736887186981
Ready
Male accuracy: 0.8007518796992481
Female accuracy: 0.743421052631579
Overall accuracy: 0.7799043062200957


In [59]:
xgb_grid = {
    'n_estimators': [10, 20, 30],
    'learning_rate': [0.001, 0.1, 0.2, 0.3],
    'colsample_bytree': [0.4, 0.6, 0.8, 1],
    'colsample_bylevel': [0.4, 0.6, 0.8, 1],
    'max_depth': [1, 2, 3, 4, 5]
        }
xgb_cl = XGBClassifier(random_state=0, n_jobs=-1)
xgb_best_params, xgb_best_score = search_grid(xgb_cl, xgb_grid, X, y)
xgb_cl.set_params(**xgb_best_params)
print(xgb_best_params, xgb_best_score)
xgb_preds = generate_submission(xgb_cl, X, y, X_test)
cheat_score(xgb_preds)


{'colsample_bylevel': 0.4, 'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 10} 0.8439915352446213
Ready
Male accuracy: 0.7706766917293233
Female accuracy: 0.743421052631579
Overall accuracy: 0.7607655502392344




In [60]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
tf.random.set_seed(0)
model = Sequential()
model.add(Dense(units=20, activation='relu'))
model.add(Dropout(rate=0.15))
model.add(Dense(units=20, activation='relu'))
model.add(Dropout(rate=0.15))
model.add(Dense(units=1, activation='tanh'))
model.compile(optimizer='adam', loss='squared_hinge',
              metrics=['accuracy'])
model.fit(X_scaled, y, batch_size=1, epochs=40, verbose=False)
ann_preds = model.predict(X_test_scaled)

In [61]:
ann_preds = np.where(ann_preds >= 0.5, 1, 0)
cheat_score(ann_preds)
ann_preds = ann_preds.reshape(418,)

Male accuracy: 0.8007518796992481
Female accuracy: 0.7368421052631579
Overall accuracy: 0.777511961722488


In [62]:
full_preds = pd.DataFrame({'DT': dt_preds,
                          'KNN': knn_preds,
                          'LogReg': lr_preds,
                          'XGB': xgb_preds,
                          'RF': rf_preds,
                          'ANN': ann_preds,
                           },
                         index=X_test.index)
full_preds

Unnamed: 0_level_0,DT,KNN,LogReg,XGB,RF,ANN
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,0,0,0,0,0,0
893,0,0,0,1,0,0
894,0,0,0,0,0,0
895,0,0,0,0,0,0
896,1,0,1,1,1,1
...,...,...,...,...,...,...
1305,0,0,0,0,0,0
1306,0,1,1,1,1,1
1307,0,0,0,0,0,0
1308,0,0,0,0,0,0


In [63]:
cheat_score(full_preds['DT'].round(0))

Male accuracy: 0.8195488721804511
Female accuracy: 0.7171052631578947
Overall accuracy: 0.7822966507177034


In [64]:
cheat_score(full_preds['KNN'].round(0))

Male accuracy: 0.7969924812030075
Female accuracy: 0.7368421052631579
Overall accuracy: 0.7751196172248804


In [65]:
cheat_score(full_preds['LogReg'].round(0))

Male accuracy: 0.8007518796992481
Female accuracy: 0.743421052631579
Overall accuracy: 0.7799043062200957


In [66]:
cheat_score(full_preds['XGB'].round(0))

Male accuracy: 0.7706766917293233
Female accuracy: 0.743421052631579
Overall accuracy: 0.7607655502392344


In [67]:
cheat_score(full_preds['RF'].round(0))

Male accuracy: 0.8007518796992481
Female accuracy: 0.756578947368421
Overall accuracy: 0.784688995215311


In [68]:
cheat_score(full_preds['ANN'].round(0))

Male accuracy: 0.8007518796992481
Female accuracy: 0.7368421052631579
Overall accuracy: 0.777511961722488


In [69]:
full_preds['Pred'] = (
                    full_preds['DT']
                    + full_preds['KNN']
                    + full_preds['LogReg']
                    # + full_preds['RF']
                    + full_preds['XGB']
                    # + full_preds['ANN']
                    ) / 4

full_preds['PredRounded'] = full_preds['Pred'].round(0)
full_preds['PredRounded'].value_counts()
cheat_score(full_preds['PredRounded'])

Male accuracy: 0.8233082706766918
Female accuracy: 0.756578947368421
Overall accuracy: 0.7990430622009569


In [70]:
full_preds['PredRounded']

PassengerId
892     0.0
893     0.0
894     0.0
895     0.0
896     1.0
       ... 
1305    0.0
1306    1.0
1307    0.0
1308    0.0
1309    1.0
Name: PredRounded, Length: 418, dtype: float64

In [71]:
submission_df = pd.DataFrame({'PassengerId': X_test.index,
                              'Survived': full_preds['PredRounded'].astype
                              (int)})
submission_df.to_csv('data/FullSubmission.csv', index=False)