In [1]:
# Load packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [21]:
# Read the competition files and stack the feature columns (Pclass..Embarked)
# of train and test into a single frame, "dfall", so both are processed together.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
dfall = pd.concat([part.loc[:, 'Pclass':'Embarked'] for part in (train, test)])
# After concat the row labels repeat (0..890 and 0..417). Move them into an
# "index" column and use the new unique labels as the passenger identifier.
dfall = dfall.reset_index()
dfall['PassId'] = dfall.index
# Keep untouched copies of the family counters; SibSp/Parch are edited later on.
dfall['SibSp1'] = dfall.SibSp
dfall['Parch1'] = dfall.Parch
dfall.head()

Unnamed: 0,index,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassId,SibSp1,Parch1
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1,0
2,2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2,0,0
3,3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3,1,0
4,4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,4,0,0


In [None]:
# Summary statistics for the numeric columns of the test set
test.describe()

From a quick first analysis we can see that there are only 891 samples in the training set and 418 in the test set. The data set is quite small, so we should really strive to take advantage of all the data we have. My initial thought on survival is "women and children first", so age and gender may play a role. I also remember that the bad guy in the Titanic movie picked up a child to pass as a father and be saved; the point is that family perception may play a role. It is not the same to be a woman as to be the mother of three kids under 10. Another factor that I suspect may influence the outcome is the position of the cabin: the ones on the upper decks have a bit of an advantage, and since the iceberg hit one side, that side may have been at a disadvantage. The economic position of the passenger may also have played a role in both perception and position, so the fare, passenger class and title may count. In any case, we should try to deduce as much as we can from the data we have. But first, let's look at the missing values.

In [None]:
# Count the missing values in every column of the combined frame
total = dfall.isna().sum()
print(total)

Most of the missing data is in the Cabin field; there is probably not much to infer about it. Age has 263 missing values. We could fill Age with a simple average, but since we have the gender and can extract the titles (Miss, Mr, Major, etc.) from the Name field, we can use them to do something a bit more elegant and precise. Let's split the titles into their own column.

In [24]:
# Split Name ("Surname, Title. Given names (Other name)") into four fields.
# The column names are historical: NaFirst receives the text before the comma
# (the surname) and NaLast the given names that follow the title.
# The pattern is a raw string so that \. \w \s \( reach the regex engine untouched
# instead of being (invalid) Python string escapes.
dfall[['NaFirst','NaTitle','NaLast','NaRest']] = dfall['Name'].str.extract(
    r'^(.+), (.+?)\. ([\w\s"]+)?(?:\s*\((.*?)\))?'
)
# A bit of cleaning is easier here than in the regexp
dfall['NaLast'] = dfall['NaLast'].str.strip()
dfall['NaRest'] = dfall['NaRest'].str.replace('"','')
# How many rows have no value for each extracted part
dfall[['NaFirst','NaTitle','NaLast','NaRest']].isnull().sum()

NaFirst       0
NaTitle       0
NaLast       20
NaRest     1088
dtype: int64

In [None]:
# Let's see which distinct titles were extracted.
dfall['NaTitle'].unique()
# print(dfall.loc[dfall.Name.str.contains('Mrs. Martin')])

There are French, Italian and Spanish titles, and some others related to nobility or profession. All the cases found are aged 23 or over, so instead of translating them we set them to Mr or Mrs depending on the gender; that way we also handle tricky ones such as "Dr.", which can be male or female.

In [25]:
# Any title outside the four standard ones becomes Mr or Mrs according to Sex.
abnormal_title = ~dfall['NaTitle'].isin(['Mr', 'Mrs', 'Miss', 'Master'])
for sex, title in (('male', 'Mr'), ('female', 'Mrs')):
    dfall.loc[abnormal_title & (dfall.Sex == sex), 'NaTitle'] = title
# Show the youngest passengers whose title was rewritten
print(dfall.loc[abnormal_title].sort_values('Age').head(5))


     index  Pclass                                              Name     Sex  \
398    398       2                                  Pain, Dr. Alfred    male   
641    641       1                              Sagesser, Mlle. Emma  female   
369    369       1                     Aubart, Mme. Leontine Pauline  female   
710    710       1  Mayne, Mlle. Berthe Antonine ("Mrs de Villiers")  female   
886    886       2                             Montvila, Rev. Juozas    male   

      Age  SibSp  Parch    Ticket     Fare Cabin Embarked  PassId  SibSp1  \
398  23.0      0      0    244278  10.5000   NaN        S     398       0   
641  24.0      0      0  PC 17477  69.3000   B35        C     641       0   
369  24.0      0      0  PC 17477  69.3000   B35        C     369       0   
710  24.0      0      0  PC 17482  49.5042   C90        C     710       0   
886  27.0      0      0    211536  13.0000   NaN        S     886       0   

     Parch1   NaFirst NaTitle            NaLast         

**TA DAAAA!** We now have the names nicely split and with standard titles. Let's now assign the family position. For that, the interesting fields are Parch and SibSp, which aggregate the number of parents/children and siblings/spouses. Let's de-aggregate them. We will look for the married couples to separate them out of SibSp. In the next cell I do a bit of cleaning to fix errors in Name. I found these errors by finding couples (two cells below) and then looking for couples with relaxed restrictions.

In [None]:
# Inspect the rows whose Name contains 'Hocking, Mr'
dfall.loc[dfall.Name.str.contains('Hocking, Mr')]

Let's look for couples. It is easier to first find wives (NaTitle==Mrs, NaRest!=NaN) and then look for the husband (same NaFirst, same NaLast and NaTitle=Mr). We will take advantage of the same loop to look for kids of these suspected wives. 

In [26]:
def updateSpouse(couple, df=None):
    """Link two passengers as spouses.

    Parameters
    ----------
    couple : sequence of row labels
        Nothing happens unless it holds exactly two labels.
    df : pandas.DataFrame, optional
        Frame updated in place; defaults to the notebook-level ``dfall``.
        It must have the columns ``Sp`` and ``SibSp``.

    Each partner's ``Sp`` receives the other partner's label and each
    partner's ``SibSp`` is reduced by one.
    """
    if df is None:
        df = dfall
    if len(couple) != 2:
        return
    first, second = couple[0], couple[1]
    for person, partner in ((first, second), (second, first)):
        df.at[person, 'Sp'] = partner
        df.at[person, 'SibSp'] -= 1
    
def updateFamily(parents, kids, df=None):
    """Record a parent/child relationship between passengers.

    Parameters
    ----------
    parents : sequence of row labels
        The parent(s). A single parent is accepted.
    kids : sequence of row labels
        The children. Nothing happens when it is empty.
    df : pandas.DataFrame, optional
        Frame updated in place; defaults to the notebook-level ``dfall``.
        It must have the columns ``Ch``, ``Par``, ``Sib``, ``Parch``, ``SibSp``.

    Every parent gets the comma-separated kid labels in ``Ch`` and has ``Parch``
    reduced by the number of kids. Every kid gets the parent labels in ``Par``
    and has ``Parch`` reduced by the number of parents. When there are several
    kids, each gets the kid labels in ``Sib`` and has ``SibSp`` reduced by the
    number of other kids.
    """
    if df is None:
        df = dfall
    if len(kids) == 0:
        return
    parentsStr = ', '.join(map(str, parents))
    kidsStr = ', '.join(map(str, kids))
    # Iterate rather than index parents[0]/parents[1], so that a single
    # parent no longer raises IndexError.
    for parentIndex in parents:
        df.at[parentIndex, 'Ch'] = kidsStr
        df.at[parentIndex, 'Parch'] -= len(kids)
    for kidIndex in kids:
        df.at[kidIndex, 'Par'] = parentsStr
        df.at[kidIndex, 'Parch'] -= len(parents)
        if len(kids) > 1:
            df.at[kidIndex, 'Sib'] = kidsStr
            df.at[kidIndex, 'SibSp'] -= (len(kids)-1)
    
def updateSiblings(siblings, df=None):
    """Link a group of passengers as siblings.

    Parameters
    ----------
    siblings : sequence of row labels
        Nothing happens unless it holds at least two labels.
    df : pandas.DataFrame, optional
        Frame updated in place; defaults to the notebook-level ``dfall``.
        It must have the columns ``Sib`` and ``SibSp``.

    Every member gets the comma-separated labels of the whole group in ``Sib``
    and has ``SibSp`` reduced by the number of other members.
    """
    if df is None:
        df = dfall
    if len(siblings) < 2:
        return
    sibStr = ', '.join(map(str, siblings))
    others = len(siblings) - 1
    for sibIndex in siblings:
        df.at[sibIndex, 'Sib'] = sibStr
        df.at[sibIndex, 'SibSp'] -= others

In [27]:
def updateFamily(parents, kids, df=None):
    """Record a parent/child relationship between passengers.

    NOTE: updateFamily is defined more than once in this notebook; whichever
    definition was executed last is the one in effect.

    Parameters
    ----------
    parents : sequence of row labels
        The parent(s). A single parent is accepted.
    kids : sequence of row labels
        The children. Nothing happens when it is empty.
    df : pandas.DataFrame, optional
        Frame updated in place; defaults to the notebook-level ``dfall``.
        It must have the columns ``Ch``, ``Par``, ``Sib``, ``Parch``, ``SibSp``.

    Every parent gets the comma-separated kid labels in ``Ch`` and has ``Parch``
    reduced by the number of kids. Every kid gets the parent labels in ``Par``
    and has ``Parch`` reduced by the number of parents. When there are several
    kids, each gets the kid labels in ``Sib`` and has ``SibSp`` reduced by the
    number of other kids.
    """
    if df is None:
        df = dfall
    if len(kids) == 0:
        return
    parentsStr = ', '.join(map(str, parents))
    kidsStr = ', '.join(map(str, kids))
    # Iterate rather than index parents[0]/parents[1], so that a single
    # parent no longer raises IndexError.
    for parentIndex in parents:
        df.at[parentIndex, 'Ch'] = kidsStr
        df.at[parentIndex, 'Parch'] -= len(kids)
    for kidIndex in kids:
        df.at[kidIndex, 'Par'] = parentsStr
        df.at[kidIndex, 'Parch'] -= len(parents)
        if len(kids) > 1:
            df.at[kidIndex, 'Sib'] = kidsStr
            df.at[kidIndex, 'SibSp'] -= (len(kids)-1)

In [28]:
def updateSiblings(siblings, df=None):
    """Link a group of passengers as siblings.

    NOTE: updateSiblings is defined more than once in this notebook; whichever
    definition was executed last is the one in effect.

    Parameters
    ----------
    siblings : sequence of row labels
        Nothing happens unless it holds at least two labels.
    df : pandas.DataFrame, optional
        Frame updated in place; defaults to the notebook-level ``dfall``.
        It must have the columns ``Sib`` and ``SibSp``.

    Every member gets the comma-separated labels of the whole group in ``Sib``
    and has ``SibSp`` reduced by the number of other members.
    """
    if df is None:
        df = dfall
    if len(siblings) < 2:
        return
    sibStr = ', '.join(map(str, siblings))
    others = len(siblings) - 1
    for sibIndex in siblings:
        df.at[sibIndex, 'Sib'] = sibStr
        df.at[sibIndex, 'SibSp'] -= others
                

In [29]:
#let's start building from the wives, since those are easier to find and... are the center of the family. Right?
# Restore the counters saved at load time and clear the relation columns first,
# so that re-running this cell does not decrement SibSp a second time.
dfall['SibSp']=dfall['SibSp1']
dfall['Parch']=dfall['Parch1']
dfall['Sib']=""
dfall['Sp']=""
dfall['Par']=""
dfall['Ch']=""
# Candidate wives: title Mrs, a name in parentheses (NaRest) and SibSp > 0
wives = dfall[(dfall['NaTitle'] == 'Mrs') & (~dfall['NaRest'].isnull()) & (dfall['SibSp']>0)]
#Traditional couples share NaFirst and NaLast; the husband is a Mr with SibSp > 0
for wifeInd, wifeRow in wives.iterrows():
    parentsMask = ((dfall['NaFirst']==wifeRow['NaFirst']) & (dfall['NaTitle']=='Mr') & (dfall['SibSp']>0) & \
                   (dfall['NaLast']==wifeRow['NaLast']))
    # Wife plus matching husband(s); updateSpouse ignores anything but exactly two rows
    coupleDf=dfall.index[(parentsMask)|(dfall.index==wifeInd)]
    updateSpouse(coupleDf)

In [32]:
# Second pass: couples whose NaLast differs but who share NaFirst, Ticket and Pclass
wives = dfall[(dfall['NaTitle'] == 'Mrs') & dfall['NaRest'].notna() & (dfall['SibSp'] > 0)]
for wifeInd, wifeRow in wives.iterrows():
    parentsMask = (
        (dfall['NaFirst'] == wifeRow['NaFirst'])
        & (dfall['NaTitle'] == 'Mr')
        & (dfall['SibSp'] > 0)
        & (dfall.Ticket == wifeRow['Ticket'])
        & (dfall['Pclass'] == wifeRow['Pclass'])
    )
    # The wife's own row plus whichever husband candidates matched
    coupleDf = dfall.index[parentsMask | (dfall.index == wifeInd)]
    updateSpouse(coupleDf)

In [33]:
# Snapshot of dfall after couple matching, so later experiments can be rolled back
dfallbackup=dfall.copy()

In [37]:
# Inspect every row whose Name contains 'Andersson', ordered by Ticket
dfall.loc[dfall.Name.str.contains('Andersson')].sort_values('Ticket')

Unnamed: 0,index,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,SibSp1,Parch1,NaFirst,NaTitle,NaLast,NaRest,Sib,Sp,Par,Ch
68,68,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,...,4,2,Andersson,Miss,Erna Alexandra,,,,,
981,90,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judi...",female,22.0,0,0,347072,13.9,,...,1,0,Dyker,Mrs,Adolf Fredrik,Anna Elisabeth Judith Andersson,,1063.0,,
1211,320,3,"Andersson, Mr. Johan Samuel",male,26.0,0,0,347075,7.775,,...,0,0,Andersson,Mr,Johan Samuel,,,,,
13,13,3,"Andersson, Mr. Anders Johan",male,39.0,0,0,347082,31.275,,...,1,5,Andersson,Mr,Anders Johan,,,610.0,,"119, 541, 542, 813, 850"
119,119,3,"Andersson, Miss. Ellis Anna Maria",female,2.0,0,0,347082,31.275,,...,4,2,Andersson,Miss,Ellis Anna Maria,,"119, 541, 542, 813, 850",,"13, 610",
541,541,3,"Andersson, Miss. Ingeborg Constanzia",female,9.0,0,0,347082,31.275,,...,4,2,Andersson,Miss,Ingeborg Constanzia,,"119, 541, 542, 813, 850",,"13, 610",
542,542,3,"Andersson, Miss. Sigrid Elisabeth",female,11.0,0,0,347082,31.275,,...,4,2,Andersson,Miss,Sigrid Elisabeth,,"119, 541, 542, 813, 850",,"13, 610",
610,610,3,"Andersson, Mrs. Anders Johan (Alfrida Konstant...",female,39.0,0,0,347082,31.275,,...,1,5,Andersson,Mrs,Anders Johan,Alfrida Konstantia Brogren,,13.0,,"119, 541, 542, 813, 850"
813,813,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,0,0,347082,31.275,,...,4,2,Andersson,Miss,Ebba Iris Alfrida,,"119, 541, 542, 813, 850",,"13, 610",
850,850,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,0,0,347082,31.275,,...,4,2,Andersson,Master,Sigvard Harald Elias,,"119, 541, 542, 813, 850",,"13, 610",


In [36]:
# Mothers: women with a recorded spouse (Sp set) and Parch still above zero
moms = dfall[(dfall.Sex=='female') & (dfall.Sp!='') & (dfall.Parch>0)]
#Traditional families have kids with same NaFirst, 2 parents, kids'number of siblings= parent number of kids-1
# NOTE(review): dfall is modified by updateFamily while the loop runs, so each mask
# is evaluated against the current counters, whereas momRow holds the values from
# before the loop started.
# NOTE(review): str.contains presumably treats NaFirst as a regular expression and
# yields NaN where NaRest is missing -- confirm this is acceptable.
for momInd, momRow in moms.iterrows():
    # Rows of the mother and of the spouse recorded in her Sp field
    parentsDf = dfall.index[(dfall.index==momRow['Sp'])|(dfall.index==momInd)]
    # Kid candidates: same Pclass and Fare, Parch == 2, NaFirst equal to the mother's
    # (or contained in NaRest), and SibSp + 1 equal to the smaller Parch of the two parents
    kidsMask = (dfall.Pclass==momRow['Pclass']) & ((dfall.SibSp+1)==min(momRow['Parch'],dfall.at[momRow['Sp'],'Parch'])) &\
        ((dfall.NaFirst==momRow['NaFirst'])|(dfall.NaRest.str.contains(momRow['NaFirst']))) & (dfall.Parch==2) &\
        (dfall.Fare==momRow['Fare'])
    kidsDf=dfall.index[kidsMask]
    updateFamily(parentsDf, kidsDf)

In [39]:
#Same as before, but traveling separate: the Fare condition is dropped and the
#mother's NaFirst only has to appear somewhere in the kid's Name
moms = dfall[(dfall.Sex=='female') & (dfall.Sp!='') & (dfall.Parch>0)]
# NOTE(review): mothers fully matched by the previous cell should already have Parch == 0
# and be filtered out here -- confirm.
for momInd, momRow in moms.iterrows():
    # Rows of the mother and of the spouse recorded in her Sp field
    parentsDf = dfall.index[(dfall.index==momRow['Sp'])|(dfall.index==momInd)]
    # Kid candidates: same Pclass, Parch == 2, Name containing the mother's NaFirst, and
    # SibSp + 1 equal to the smaller Parch of the two parents
    kidsMask = (dfall.Pclass==momRow['Pclass']) & ((dfall.SibSp+1)==min(momRow['Parch'],dfall.at[momRow['Sp'],'Parch'])) &\
        (dfall.Name.str.contains(momRow['NaFirst'])) & (dfall.Parch==2)
    kidsDf=dfall.index[kidsMask]
    updateFamily(parentsDf, kidsDf)

In [40]:
# Siblings: same class, same remaining SibSp and same NaFirst (or NaFirst found in NaRest)
siblings = dfall[dfall.SibSp > 0]
for sibInd, sibRow in siblings.iterrows():
    sameSurname = (dfall.NaFirst == sibRow['NaFirst']) | dfall.NaRest.str.contains(sibRow['NaFirst'])
    sibMask = (dfall.Pclass == sibRow['Pclass']) & (dfall.SibSp == sibRow['SibSp']) & sameSurname
    sibDf = dfall.index[sibMask]
    # Accept the group only when its size agrees with the SibSp count
    if len(sibDf) == sibRow['SibSp'] + 1:
        updateSiblings(sibDf)

In [41]:
# Looser pass: the NaFirst of the row only has to appear anywhere in the other Name
siblings = dfall[dfall.SibSp > 0]
for sibInd, sibRow in siblings.iterrows():
    nameHasSurname = dfall.Name.str.contains(sibRow['NaFirst'])
    sibMask = (dfall.Pclass == sibRow['Pclass']) & (dfall.SibSp == sibRow['SibSp']) & nameHasSurname
    sibDf = dfall.index[sibMask]
    # Accept the group only when its size agrees with the SibSp count
    if len(sibDf) == sibRow['SibSp'] + 1:
        updateSiblings(sibDf)

In [65]:
# Let's now look for sisters who changed their name: take the last word of NaRest
# and look for other rows with the same SibSp whose Name contains it.
siblings = dfall[dfall.NaRest.notna() & (dfall.SibSp > 0) & (dfall.Sex == 'female')]
for sibInd, sibRow in siblings.iterrows():
    lastName = str(sibRow['NaRest']).split()
    nameHasLastWord = dfall.Name.str.contains(lastName[-1])
    sibMask = (dfall.SibSp == sibRow['SibSp']) & nameHasLastWord
    sibDf = dfall.index[sibMask]
    if len(sibDf) == sibRow['SibSp'] + 1:
        updateSiblings(sibDf)

In [62]:
# Debug listing: each candidate sister with her SibSp and the last word of NaRest
siblings = dfall[dfall.NaRest.notna() & (dfall.SibSp > 0) & (dfall.Sex == 'female')]
for sibInd, sibRow in siblings.iterrows():
    x = str(sibRow['NaRest']).split()
    print(f"{sibRow['Name']} {sibRow['SibSp']}")
    print(x[-1])

Ahlin, Mrs. Johan (Johanna Persdotter Larsson) 1
Larsson
Appleton, Mrs. Edward Dale (Charlotte Lamson) 2
Lamson
Hocking, Mrs. Elizabeth (Eliza Needs) 1
Needs
Wilkes, Mrs. James (Ellen Needs) 1
Needs
Cornell, Mrs. Robert Clifford (Malvina Helen Lamson) 2
Lamson
Douglas, Mrs. Frederick Charles (Mary Helene Baxter) 1
Baxter
Brown, Mrs. John Murray (Caroline Lane Lamson) 2
Lamson


In [None]:
# Roll dfall back to the snapshot stored in dfallbackup
dfall=dfallbackup.copy()

In [68]:
# Rows that still have SibSp > 0 (not every sibling/spouse has been matched), by Ticket
dfall.loc[dfall.SibSp>0].sort_values('Ticket')

Unnamed: 0,index,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,SibSp1,Parch1,NaFirst,NaTitle,NaLast,NaRest,Sib,Sp,Par,Ch
755,755,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,...,1,1,Hamalainen,Master,Viljo,,,,,
1129,238,2,"Hiltunen, Miss. Marta",female,18.0,1,1,250650,13.0,,...,1,1,Hiltunen,Miss,Marta,,,,,
68,68,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,...,4,2,Andersson,Miss,Erna Alexandra,,,,,
442,442,3,"Petterson, Mr. Johan Emil",male,25.0,1,0,347076,7.775,,...,1,0,Petterson,Mr,Johan Emil,,,,,
1105,214,3,"Andersson, Miss. Ida Augusta Margareta",female,38.0,4,2,347091,7.775,,...,4,2,Andersson,Miss,Ida Augusta Margareta,,,,,
40,40,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0,1,0,7546,9.475,,...,1,0,Ahlin,Mrs,Johan,Johanna Persdotter Larsson,,,,
145,145,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,C.A. 33112,36.75,,...,1,1,Nicholls,Mr,Joseph Charles,,,,,
549,549,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,...,1,1,Davies,Master,John Morgan Jr,,,,,
1075,184,1,"Douglas, Mrs. Frederick Charles (Mary Helene B...",female,27.0,1,1,PC 17558,247.5208,B58 B60,...,1,1,Douglas,Mrs,Frederick Charles,Mary Helene Baxter,,,,


In [None]:
# Mr/Mrs rows whose SibSp is now below the original value saved in SibSp1, ordered by NaFirst
dfall.loc[ (dfall.SibSp < dfall.SibSp1) & (dfall['NaTitle'].isin(['Mr','Mrs']))].sort_values('NaFirst')

In [None]:
def updateFamily(parents, kids, df=None):
    """Record spouse, parent/child and sibling links for one family.

    NOTE: updateFamily is defined more than once in this notebook; whichever
    definition was executed last is the one in effect.

    Parameters
    ----------
    parents : sequence of row labels
        One or two parents. Any other length (including an empty
        ``pd.DataFrame()``) is treated as "parents unknown".
    kids : sequence of row labels
        The children; may be empty.
    df : pandas.DataFrame, optional
        Frame updated in place; defaults to the notebook-level ``dfall``.
        It must have the columns ``Sp``, ``Ch``, ``Par``, ``Sib``, ``SibSp``, ``Parch``.

    Effects
    -------
    * Two parents are linked as spouses (``Sp`` set, ``SibSp`` reduced by one).
    * When there are kids, each known parent gets the kid labels in ``Ch`` and
      has ``Parch`` reduced by the number of kids.
    * Each kid gets the parent labels in ``Par`` and has ``Parch`` reduced by the
      number of known parents; with several kids, each also gets the kid labels
      in ``Sib`` and has ``SibSp`` reduced by the number of other kids.
    """
    if df is None:
        df = dfall
    kidsStr = ', '.join(map(str, kids))
    # More than two "parents" is ambiguous: leave the parent links untouched instead
    # of subtracting three or more from every kid's Parch.
    knownParents = list(parents) if len(parents) in (1, 2) else []
    parentsStr = ', '.join(map(str, knownParents))
    if len(knownParents) == 2:
        first, second = knownParents
        for person, partner in ((first, second), (second, first)):
            df.at[person, 'Sp'] = partner
            df.at[person, 'SibSp'] -= 1
    if len(kids) > 0:
        for parentIndex in knownParents:
            df.at[parentIndex, 'Ch'] = kidsStr
            # Subtract every linked kid (the two-parent branch used to subtract only 1,
            # unlike the single-parent branch).
            df.at[parentIndex, 'Parch'] -= len(kids)
    for kidIndex in kids:
        if knownParents:
            df.at[kidIndex, 'Par'] = parentsStr
            df.at[kidIndex, 'Parch'] -= len(knownParents)
        if len(kids) > 1:
            df.at[kidIndex, 'Sib'] = kidsStr
            df.at[kidIndex, 'SibSp'] -= (len(kids)-1)

In [None]:
# Start again from the original counters and with empty relation columns
dfall['SibSp']=dfall['SibSp1']
dfall['Parch']=dfall['Parch1']
dfall['Sib']=""
dfall['Sp']=""
dfall['Par']=""
dfall['Ch']=""
#let's start building from the wives, since those are easier to find and... are the center of the family. Right?
wives = dfall[(dfall['NaTitle'] == 'Mrs') & (~dfall['NaRest'].isnull()) & (dfall['SibSp']>0)]
for wifeInd, wifeRow in wives.iterrows():
    #couples have the same NaFirst and NaLast, the husband being a Mr with SibSp>0
    parentsMask = ((dfall['NaFirst']==wifeRow['NaFirst']) & (dfall['NaTitle']=='Mr') & (dfall['SibSp']>0) & \
                   (dfall['NaLast']==wifeRow['NaLast']))
    parentsDf=dfall.index[(parentsMask)|(dfall.index==wifeInd)]
    if wifeRow['Parch']>0:
        #Kids share NaFirst with mom and have a ticket containing hers minus its last two
        #characters; they either have a different NaLast or carry a Master/Miss title
        kidsMask = (dfall['NaFirst']==wifeRow['NaFirst']) & \
            (dfall.Ticket.str.contains(wifeRow['Ticket'][:-2], regex=False)) & \
            ( (dfall['NaLast']!=wifeRow['NaLast']) | (dfall['NaTitle'].isin(['Master','Miss'])) )
        kidsdf= dfall.index[kidsMask]
    else:
        # No children expected: pass an empty selection instead of a stale or undefined one
        kidsdf= dfall.index[:0]
    if len(parentsDf)==2:
        updateFamily(parentsDf,kidsdf)

In [None]:
# Spot check: rows whose Name contains 'Fortune'
# dfall.loc[(dfall.Sib!="") | (dfall.Par!= "")]
dfall.loc[dfall.Name.str.contains('Fortune')]

In [None]:
# Spot check of the rows at positions 1063 and 981
dfall.iloc[[1063, 981]]

In [None]:
# Candidate wives: title Mrs, a name in parentheses (NaRest) and SibSp > 0
wives = dfall[(dfall['NaTitle'] == 'Mrs') & dfall['NaRest'].notna() & (dfall['SibSp'] > 0)]
for _, wifeRow in wives.iterrows():
    sameSurname = dfall['NaFirst'] == wifeRow['NaFirst']
    # Parent candidates: Mr/Mrs with the same NaFirst, SibSp > 0 and an age within
    # 14 years of the wife's
    parentsMask = (
        sameSurname
        & dfall['NaTitle'].isin(['Mr', 'Mrs'])
        & (dfall['SibSp'] > 0)
        & ((dfall['Age'] - wifeRow['Age']).abs() < 14)
    )
    # Kid candidates: same NaFirst, ticket containing the wife's ticket minus its last
    # two characters, and either a different NaLast or a Master/Miss title
    kidsMask = (
        sameSurname
        & dfall.Ticket.str.contains(wifeRow['Ticket'][:-2])
        & ((dfall['NaLast'] != wifeRow['NaLast']) | dfall['NaTitle'].isin(['Master', 'Miss']))
    )
    parentsdf = dfall.index[parentsMask]
    kidsdf = dfall.index[kidsMask]
    # Only act when exactly two parent candidates were found
    if len(parentsdf) == 2:
        updateFamily(parentsdf, kidsdf)

In [None]:
#single parents: Mr/Mrs with no sibling/spouse left (SibSp == 0) but Parch > 0
singParents=dfall[(dfall.SibSp== 0) & (dfall.Parch>0) & (dfall.NaTitle.isin(['Mr','Mrs']))]
for parentInd, parentRow in singParents.iterrows():
    #Kid candidates: same NaFirst, Parch > 0, not the parent's own row, and either a ticket
    #containing the parent's ticket minus its last two characters or a Master/Miss title
    kidsMask = (dfall.NaFirst==parentRow['NaFirst']) & (dfall.Parch>0) & (dfall.index!=parentInd) & \
         ( (dfall.Ticket.str.contains(parentRow['Ticket'][:-2])) | (dfall['NaTitle'].isin(['Master','Miss'])) )
    # Select the parent by row label rather than by Name: two passengers with the same
    # Name would otherwise both be passed on as "parents".
    parentsdf= dfall.index[dfall.index==parentInd]
    kidsDf= dfall.index[kidsMask]
    # Only act when the number of candidates equals the parent's Parch
    if len(kidsDf) == parentRow['Parch']:
        updateFamily(parentsdf,kidsDf)

In [None]:
# Sanity check: list the rows with a missing Name
# dfall=dfall.dropna(subset=['Name'])
dfall.loc[dfall.Name.isna()]
# dfall.sample(30)
# dfall.sample(30)

In [None]:
#Siblings with existing age
siblings=dfall[(dfall.SibSp> 0) & (~dfall.Age.isna())]
# NOTE(review): dfall is modified by updateFamily inside the loop, so the masks use the
# current counters while sibRow holds the values from before the loop started.
for sibInd, sibRow in siblings.iterrows():
    #Siblings have same NaFirst, same Pclass, a known age within 15 years and SibSp>0
    #(the mask does not exclude the row itself)
    sibMask = (dfall.NaFirst==sibRow['NaFirst']) & (dfall.Pclass==sibRow['Pclass']) & (dfall.SibSp>0) \
    & (~dfall.Age.isna()) & (abs(dfall.Age - sibRow['Age'])<15)
    sibDf= dfall.index[sibMask]
    if (sibRow['Parch']>0):
        #the parent(s) of these kids have same Pclass, Parch>=len(sibDf) same NaFirst and are not in sibDf
        parentMask= (dfall.NaFirst==sibRow['NaFirst']) & (dfall.Pclass==sibRow['Pclass']) & \
        (dfall.Parch>=len(sibDf)) & (~dfall.index.isin(sibDf))
        parentDf=dfall.index[parentMask]
    else:
        # An empty DataFrame stands in for "no parents"; only its length (0) is used
        parentDf=pd.DataFrame()
    updateFamily(parentDf,sibDf)

In [None]:
#Siblings with NaN age: age cannot be compared here, so Parch equality is used instead
siblings=dfall[dfall.SibSp> 0]
# NOTE(review): dfall is modified by updateFamily inside the loop, so the masks use the
# current counters while sibRow holds the values from before the loop started.
for sibInd, sibRow in siblings.iterrows():
    #Siblings have same NaFirst, same Pclass, same Parch and SibSp>0; the row itself is
    #excluded, so sibDf only holds the *other* siblings
    sibMask = (dfall.NaFirst==sibRow['NaFirst']) & (dfall.Pclass==sibRow['Pclass']) & (dfall.SibSp>0) \
    & (dfall.index!=sibInd) & (dfall.Parch==sibRow['Parch'])
    sibDf= dfall.index[sibMask]
    if (sibRow['Parch']>0):
        #the parent(s) of these kids have same Pclass, Parch>=len(sibDf) same NaFirst and are not in sibDf
        # NOTE(review): the current row is not in sibDf, so it can match parentMask itself -- confirm intent
        parentMask= (dfall.NaFirst==sibRow['NaFirst']) & (dfall.Pclass==sibRow['Pclass']) & \
        (dfall.Parch>=len(sibDf)) & (~dfall.index.isin(sibDf))
        parentDf=dfall.index[parentMask]
    else:
        # An empty DataFrame stands in for "no parents"; only its length (0) is used
        parentDf=pd.DataFrame()
    updateFamily(parentDf,sibDf)

In [None]:
# Rows that still have SibSp > 0 after the matching passes
dfall.loc[dfall.SibSp>0]

In [None]:
#This is the number of travelers per ticket
dfall["TravNum"] = dfall.groupby("Ticket")["Ticket"].transform("count")
# Numeric tail of the ticket, needed for the sort below. The cabin/ticket cell further
# down derives the same column, but it runs after this one.
dfall['TicketNumber'] = dfall['Ticket'].str.extract(r'(\d+)$', expand=False).astype(float)

# Rows that share both NaFirst and Ticket with at least one other row
sharesTicket = dfall.duplicated(subset=['NaFirst','Ticket'], keep=False)
dfall.loc[(dfall.NaTitle=='Mrs')&(dfall.SibSp>0)&sharesTicket,'FamPos'] = 'Wife'
dfall.loc[(dfall.NaTitle=='Mr')&(dfall.SibSp>0)&sharesTicket,'FamPos'] = 'Husband'
# Two travelers on one ticket, no sibling/spouse and Parch == 1: a single parent with one kid.
# The traveler count lives in TravNum (there is no NumOfTrav column).
parentKidPair = (dfall.SibSp==0)&(dfall.Parch==1)&(dfall.TravNum==2)
# 'Mr' matches both Mr and Mrs, i.e. the adult of the pair
adultTitle = dfall.NaTitle.str.contains('Mr')
dfall.loc[parentKidPair & adultTitle,'FamPos']='Sparent'
dfall.loc[parentKidPair & ~adultTitle,'FamPos']='Skid'
print(dfall.loc[:,['TicketNumber','Name','FamPos','Age','SibSp','Parch','TravNum']].sort_values(['TravNum','TicketNumber'],ascending=True).tail(60))

In [None]:
#Now let's split Cabin and Ticket into something that may be more interesting
# Missing cabins get the placeholder 'X0' (letter X, number 0)
dfall.loc[dfall.Cabin.isnull(),'Cabin'] = 'X0'
# All patterns are raw strings so that \d reaches the regex engine untouched;
# expand=False makes extract return a Series rather than a one-column DataFrame.
dfall['CabinLetter'] = dfall['Cabin'].str.extract(r'([A-Z]+)', expand=False)
dfall['CabinNumber'] = dfall['Cabin'].str.extract(r'(\d{1,3})', expand=False).astype(float)
# Count the "letter + digits" cabins listed in the field; the 'X0' placeholder is
# outside A-W and therefore counts as zero cabins.
pattern = r'[A-W]\d+'
dfall['NumberOfCabins'] = dfall['Cabin'].str.count(pattern)
# Text before the final " <digits>" of the ticket, and the digits that end the ticket
dfall['TicketPrefix'] = dfall['Ticket'].str.extract(r'(.*) \d+', expand=False)
dfall['TicketNumber'] = dfall['Ticket'].str.extract(r'(\d+)$', expand=False).astype(float)
dfall.head()


Let's now fill in the missing values of Age. We will use the name title, SibSp, Parch, Fare, Pclass and Sex to try to predict Age.

In [None]:
# Create a data frame subset with its dummies.
# The title column is 'NaTitle' (dfall has no 'NameTitle' column). Its values were
# already reduced to Mr/Mrs/Miss/Master in the title-cleaning cell above, so no
# further replacements are needed here.
inputs = ['Age', 'NaTitle','SibSp', 'Parch', 'Fare', 'Pclass', 'Sex']
# .copy() gives an independent frame, so the edits below neither raise
# SettingWithCopyWarning nor leak back into dfall
df_subset=dfall[inputs].copy()
# Fare may contain missing values (TODO confirm); fill them so every row can be
# used for training or prediction
df_subset['Fare'] = df_subset['Fare'].fillna(df_subset['Fare'].median())
df_subset=pd.get_dummies(df_subset)
featureCols = df_subset.columns.drop('Age')
#Get the training set: rows with a known Age
dfTrain=df_subset.dropna(how='any')
print(dfTrain.columns)
print(dfTrain[dfTrain['NaTitle_Master'].astype(bool)])
trainY=dfTrain.loc[:,'Age']
trainX=dfTrain.loc[:,featureCols]
#Get the predict set: rows with a missing Age
dfFill=df_subset.loc[df_subset.Age.isnull(), featureCols]
print(dfFill.shape)
#indexes of nan ages
dtIndex=dfFill.index
print('age emptly',dtIndex)

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state =1)
rf_model.fit(trainX, trainY)

# create a copy of the original DataFrame
df_filled = dfall.copy()

if len(dfFill) > 0:
    rfPred = rf_model.predict(dfFill)
    # Assign by row label. fillna(pd.Series(rfPred)) aligned the predictions on a fresh
    # 0..n-1 index, so rows received a prediction made for a different passenger, or none.
    df_filled.loc[dtIndex, 'Age'] = rfPred

# print the first 30 rows that were filled
print(df_filled.loc[dtIndex].head(30))


In [None]:
# Pairs travelling on one ticket with SibSp == 0 and Parch == 1. The traveler count
# per ticket is stored in TravNum (there is no TravelsWithNumber column).
dfall.loc[(dfall.SibSp==0)&(dfall.Parch==1)&(dfall.TravNum==2)].sort_values('Ticket')

In [None]:
# Show the current working directory of the kernel
os.getcwd()

In [None]:
# agebytitle is not defined anywhere in the visible notebook, and the title column is
# NaTitle, not NameTitle. Rebuild the lookup here as the median known age per title.
# NOTE(review): the original statistic (mean or median) is unknown -- confirm.
agebytitle = dfall.groupby('NaTitle')['Age'].median()
# Fill only the missing ages, each with the value for that passenger's title
dfall['Age'] = dfall['Age'].fillna(dfall['NaTitle'].map(agebytitle))
dfall.head()

In [None]:
# Keep rows with a real cabin letter, a known port and a Cabin string shorter than
# four characters, then average Fare per (CabinLetter, Embarked).
knownDeck = dfall.CabinLetter != 'X'
knownPort = dfall.Embarked.notnull()
shortCabin = dfall.Cabin.str.len() < 4
df_clean = dfall.loc[knownDeck & knownPort & shortCabin]
print(df_clean.groupby(['CabinLetter', 'Embarked'])['Fare'].mean())

In [None]:
# Scatter Fare against CabinNumber, one colour per cabin letter
df_c = df_clean[df_clean.CabinLetter != 'X'].sort_values(by='Fare')
# Letters A..T paired, in order, with their colours
palette = ['red', 'blue', 'green', 'orange', 'purple',
           'brown', 'gray', 'pink', 'olive', 'cyan',
           'magenta', 'black', 'navy', 'teal', 'coral',
           'gold', 'plum', 'peru', 'crimson', 'darkgreen']
colors = dict(zip('ABCDEFGHIJKLMNOPQRST', palette))

# Colour of every plotted row, looked up from its CabinLetter
colors_list = list(map(colors.__getitem__, df_c['CabinLetter']))

plt.scatter(df_c['Fare'], df_c['CabinNumber'], c=colors_list)
plt.show()

In [None]:

# dfall['Name'] = dfall['Name'].str.replace(', Dona.', ', Miss.')
# dfall['Name'] = dfall['Name'].str.replace(', Dona.', ', Miss.')


In [None]:
# Number of passengers per title; the column is NaTitle (dfall has no NameTitle column)
dfall.NaTitle.value_counts()

In [None]:
# Rows whose Name contains ', Capt' and whose Age is known
dfall.loc[(dfall.Name.str.contains(', Capt'))&(dfall.Age.notnull()),['Age','Name','SibSp','Parch','Fare', 'Cabin']]


In [None]:
# Rows whose Name contains 'Mrs' and whose Age is known.
# Only the last expression of a cell is displayed, so this table is not shown.
dfall.loc[(dfall.Name.str.contains('Mrs'))&(dfall.Age.notnull()),[ 'Age','Name','SibSp','Parch']]
# Mean age of those rows
dfall.loc[(dfall.Name.str.contains('Mrs'))&(dfall.Age.notnull()),'Age'].mean()


In [None]:
# Fare against cabin letter, coloured by port of embarkation.
# NOTE(review): this cell used a frame called `data`, which is not defined anywhere in
# the visible notebook; the columns it reads (Embarked, CabinLetter, Fare) are on dfall.
colors = {'S': 'blue', 'C': 'red', 'Q': 'green'}
fig, ax = plt.subplots()
for embarked, color in colors.items():
    subdata = dfall[(dfall['Embarked'] == embarked) & (dfall.CabinLetter.notnull()) & (dfall.Fare>0)]
    ax.scatter(subdata['CabinLetter'], subdata['Fare'], color=color, label=embarked)

# set the labels and title
ax.set_xlabel('Cabin Letter')
ax.set_ylabel('Fare')
ax.set_title('Relationship between Cabin Letter and Fare')

# show the legend
ax.legend()

# show the plot
plt.show()

In [None]:
# Replace missing Cabin values with "G".
# NOTE(review): `data` is not defined anywhere in the visible notebook -- confirm which
# frame this is meant to be.
data.loc[data.Cabin.isna(),'Cabin'] = "G"


In [None]:
def get_div(cabin_letter):
    """Return a divisor chosen from the length of a cabin string.

    A length of 15 gives 4, 11 gives 3 and 7 gives 2; any other length gives 1.
    """
    divisor_by_length = {15: 4, 11: 3, 7: 2}
    return divisor_by_length.get(len(cabin_letter), 1)
    
# NOTE(review): `data` is not defined anywhere in the visible notebook -- confirm which
# frame this is meant to be.
# Create the newFare column; this first value is overwritten by the next line
data['newFare']=pd.Series(len(data['Fare']),index=data.index)
data['newFare']=0
# Not displayed: only the last expression of a cell is shown
data.head()
# Flag with 1 the rows that have a Cabin value
data.loc[data['Cabin'].notnull(),'newFare']=1
data.head()

In [None]:
# FIXME: this first attempt was left unfinished -- `data[]` is a SyntaxError that stops
# the notebook from running. It is kept commented out for reference; the following cell
# holds the working CabinNumber assignment.
# data.loc[data['newFare']==1,'CabinNumber']=data[]/data['Cabin'].apply(get_div).values

In [None]:
# For the rows flagged with newFare == 1, store get_div(Cabin) in CabinNumber.
# NOTE(review): `data` is not defined anywhere in the visible notebook -- confirm which
# frame this is meant to be.
data.loc[data['newFare'] == 1, 'CabinNumber'] = data[data['newFare'] == 1]['Cabin'].dropna().apply(get_div)
data.head(30)

In [None]:
# Fare divided by CabinNumber for the flagged rows.
# The original `data[mask]['newFare'] = ...` assigned into a temporary copy (so `data`
# never changed) and paired a filtered target with full-length values; use one .loc
# assignment with the same mask on both sides instead.
# NOTE(review): `data` is not defined anywhere in the visible notebook -- confirm which
# frame this is meant to be.
flagged = data['newFare'] == 1
# The column holds integer flags so far; make it float before storing ratios
data['newFare'] = data['newFare'].astype(float)
data.loc[flagged, 'newFare'] = data.loc[flagged, 'Fare'] / data.loc[flagged, 'CabinNumber']

In [None]:
# Pairwise scatter plots of the columns of `x`.
# NOTE(review): `x` is not a DataFrame anywhere in the visible notebook (it is only
# assigned a list of words in an earlier debug cell) -- confirm the intended input.
sns.set()
sns.pairplot(x ,height = 1.5)
plt.show()

In [None]:
# First 20 rows of the combined frame
dfall.head(20)

In [None]:
# Column names of `data`.
# NOTE(review): `data` is not defined anywhere in the visible notebook -- confirm.
data.columns