The ideas implemented in this notebook are based on a discussion about some rules to fill NaNs by Vincent Debout: https://www.kaggle.com/competitions/spaceship-titanic/discussion/315987

I would appreciate your feedback on the implementation, as well as your suggestions for more ideas to fill NaNs!

In [None]:
# importing needed libraries
# pandas for data analysis and manipulation
import pandas as pd
# matplotlib for visualization
import matplotlib.pyplot as plt
# Python data visualization library based on matplotlib
import seaborn as sns
# Numerical Python
import numpy as np

In [None]:
# Load the train and test splits of the Spaceship Titanic data from the Kaggle input folder
titanicDSTrain, titanicDSTest = (
    pd.read_csv(csvPath)
    for csvPath in ("../input/spaceship-titanic/train.csv", "../input/spaceship-titanic/test.csv")
)

In [None]:
# Show (rows, columns) for the train dataset, then for the test dataset
for dataset in (titanicDSTrain, titanicDSTest):
    print(dataset.shape)

In [None]:
# Print the column index of the train dataset (only the train columns are shown by this cell)
print(titanicDSTrain.columns)

In [None]:
# Total number of missing cells in each dataset (all columns combined).
# This baseline is recomputed after every filling rule and at the end of the notebook.
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: HomePlanet</h4> Passengers in the same group should be from the same HomePlanet

In [None]:
# Split PassengerId into its two numeric parts:
#   characters 0-3 -> PassengerGrId   (the group identifier)
#   characters 5-6 -> PassengerGrIdNb (the passenger's number inside the group)
passengerIdTrain = titanicDSTrain['PassengerId']
titanicDSTrain['PassengerGrId'] = passengerIdTrain.str[0:4].astype('int32')
titanicDSTrain['PassengerGrIdNb'] = passengerIdTrain.str[5:7].astype('int32')

In [None]:
# Same split for titanicDSTest:
#   characters 0-3 -> PassengerGrId   (the group identifier)
#   characters 5-6 -> PassengerGrIdNb (the passenger's number inside the group)
passengerIdTest = titanicDSTest['PassengerId']
titanicDSTest['PassengerGrId'] = passengerIdTest.str[0:4].astype('int32')
titanicDSTest['PassengerGrIdNb'] = passengerIdTest.str[5:7].astype('int32')

In [None]:
# Identify passengers travelling in a group by counting the other members of their group.
# Siblings == 0 means the passenger is alone, otherwise the passenger is in a group.
# The highest PassengerGrIdNb seen in a group is taken as the group size (this assumes
# members are numbered 1..k inside a group, as the original per-group loop did), so
# Siblings = group size - 1.
# A single groupby/transform replaces the Python loop, which rescanned the whole frame
# once per group and passed index labels to the positional .iloc indexer.
groupSize = titanicDSTrain.groupby('PassengerGrId')['PassengerGrIdNb'].transform('max')
# clip keeps the "never below 0" behaviour of the original loop; int64 keeps its dtype
titanicDSTrain['Siblings'] = (groupSize - 1).clip(lower=0).astype('int64')

In [None]:
# Same for titanicDSTest.
# Identify passengers travelling in a group by counting the other members of their group.
# Siblings == 0 means the passenger is alone, otherwise the passenger is in a group.
# The highest PassengerGrIdNb seen in a group is taken as the group size (this assumes
# members are numbered 1..k inside a group, as the original per-group loop did), so
# Siblings = group size - 1.
# A single groupby/transform replaces the Python loop, which rescanned the whole frame
# once per group and passed index labels to the positional .iloc indexer.
groupSize = titanicDSTest.groupby('PassengerGrId')['PassengerGrIdNb'].transform('max')
# clip keeps the "never below 0" behaviour of the original loop; int64 keeps its dtype
titanicDSTest['Siblings'] = (groupSize - 1).clip(lower=0).astype('int64')

In [None]:
# Check the observation: all passengers of a group share one HomePlanet.

# Passengers in a group. Siblings counts the *other* members of the group, so a
# two-person group has Siblings == 1; filtering on "> 1" would silently leave every
# pair out of the check, hence "> 0".
titanicDSTrainInGroup = titanicDSTrain[titanicDSTrain['Siblings'] > 0]

obser = True # let's suppose that observation is True
# Visit each group once (not once per passenger) and look at its known planets
for passengerGrId, groupHomePlanets in titanicDSTrainInGroup.groupby('PassengerGrId')['HomePlanet']:
    # distinct HomePlanet values of the group, NaN removed
    homePlanetGr = list(groupHomePlanets.dropna().unique())
    # more than one known planet in a group contradicts the observation
    if len(homePlanetGr) > 1:
        print(passengerGrId, homePlanetGr)
        obser = False
if obser:
    print ('Observation is True for titanicDSTrain')

In [None]:
# Also for titanicDSTest: check that all passengers of a group share one HomePlanet.

# Passengers in a group. Siblings counts the *other* members of the group, so a
# two-person group has Siblings == 1; filtering on "> 1" would silently leave every
# pair out of the check, hence "> 0".
titanicDSTestInGroup = titanicDSTest[titanicDSTest['Siblings'] > 0]

obser = True # let's suppose that observation is True
# Visit each group once (not once per passenger) and look at its known planets
for passengerGrId, groupHomePlanets in titanicDSTestInGroup.groupby('PassengerGrId')['HomePlanet']:
    # distinct HomePlanet values of the group, NaN removed
    homePlanetGr = list(groupHomePlanets.dropna().unique())
    # more than one known planet in a group contradicts the observation
    if len(homePlanetGr) > 1:
        print(passengerGrId, homePlanetGr)
        obser = False
if obser:
    print ('Observation is True for titanicDSTest')

In [None]:
# Fill a missing HomePlanet with the HomePlanet known for the passenger's group.

# Passengers in a group (Siblings > 0: a two-person group has Siblings == 1)
titanicDSTrainInGroup = titanicDSTrain[titanicDSTrain['Siblings'] > 0]
# Passengers in a group with null HomePlanet
titanicDSTrainInGroupWithNullHomePlanet = titanicDSTrainInGroup[titanicDSTrainInGroup.HomePlanet.isna()]

# Known planet of each group. Only groups with exactly one distinct known planet are
# used, so a group with no known planet (or with conflicting ones) is left untouched
# instead of raising on a list of the wrong length.
knownGroupPlanets = titanicDSTrainInGroup.dropna(subset=['HomePlanet']).groupby('PassengerGrId')['HomePlanet']
groupHomePlanet = knownGroupPlanets.first()[knownGroupPlanets.nunique() == 1]

# Look up the group's planet for every passenger missing one; passengers whose group
# has no usable planet map to NaN and are dropped.
fillValues = titanicDSTrainInGroupWithNullHomePlanet['PassengerGrId'].map(groupHomePlanet).dropna()
# Label-based assignment: does not rely on the index being the default 0..n-1
titanicDSTrain.loc[fillValues.index, 'HomePlanet'] = fillValues

In [None]:
# Same for titanicDSTest:
# fill a missing HomePlanet with the HomePlanet known for the passenger's group.

# Passengers in a group (Siblings > 0: a two-person group has Siblings == 1)
titanicDSTestInGroup = titanicDSTest[titanicDSTest['Siblings'] > 0]
# Passengers in a group with null HomePlanet
titanicDSTestInGroupWithNullHomePlanet = titanicDSTestInGroup[titanicDSTestInGroup.HomePlanet.isna()]

# Known planet of each group. Only groups with exactly one distinct known planet are
# used, so a group with no known planet (or with conflicting ones) is left untouched
# instead of raising on a list of the wrong length.
knownGroupPlanets = titanicDSTestInGroup.dropna(subset=['HomePlanet']).groupby('PassengerGrId')['HomePlanet']
groupHomePlanet = knownGroupPlanets.first()[knownGroupPlanets.nunique() == 1]

# Look up the group's planet for every passenger missing one; passengers whose group
# has no usable planet map to NaN and are dropped.
fillValues = titanicDSTestInGroupWithNullHomePlanet['PassengerGrId'].map(groupHomePlanet).dropna()
# Label-based assignment: does not rely on the index being the default 0..n-1
titanicDSTest.loc[fillValues.index, 'HomePlanet'] = fillValues

In [None]:
# Total number of missing cells left in each dataset after the group-based HomePlanet fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: cabin </h4> Passengers in the same group with the same family name should be in the same cabin.
<br/>P.S.: This observation was not mentioned by Vincent Debout

In [None]:
# Split Name on its first space: the left part is the first name (PassengerFN),
# everything after it is the last name (PassengerLN). The split is computed once.
nameParts = titanicDSTrain.Name.str.split(" ", n=1, expand=True)
titanicDSTrain['PassengerFN'] = nameParts[0]
titanicDSTrain['PassengerLN'] = nameParts[1]

In [None]:
# Same for titanicDSTest.
# Split Name on its first space: the left part is the first name (PassengerFN),
# everything after it is the last name (PassengerLN). The split is computed once.
nameParts = titanicDSTest.Name.str.split(" ", n=1, expand=True)
titanicDSTest['PassengerFN'] = nameParts[0]
titanicDSTest['PassengerLN'] = nameParts[1]

In [None]:
# Check the observation: a group whose members share one family name shares one cabin.

# Passengers in a group. Siblings counts the *other* members of the group, so a
# two-person group has Siblings == 1 and must be kept: "> 0", not "> 1".
titanicDSTrainInGroup = titanicDSTrain[titanicDSTrain['Siblings'] > 0]

obser = True # let's suppose that observation is True
# Visit each group once and compare its known cabins with its known family names
for passengerGrId, groupRows in titanicDSTrainInGroup.groupby('PassengerGrId'):
    # distinct cabins and distinct family names of the group, NaN removed
    cabin = groupRows.Cabin.dropna().unique()
    passengerLN = groupRows.PassengerLN.dropna().unique()
    # a single family name spread over several cabins contradicts the observation
    if len(cabin) > 1 and len(passengerLN) == 1:
        obser = False
if (obser):
    print ('Observation is True for titanicDSTrain')
else:
    print ('Observation is Wrong for titanicDSTrain')

## Observation is wrong

In [None]:
# Counter-example: the passengers of group 103 share a family name but not a cabin
isGroup103 = titanicDSTrainInGroup['PassengerGrId'] == 103
titanicDSTrainInGroup[isGroup103]

<h4>Observation: HomePlanet</h4>
<ul>
    <li>Passengers from Europa have cabins on decks: A B C D E T</li>
    <li>Passengers from Earth have cabins on decks: E F G</li>
    <li>Passengers from Mars have cabins on decks: D E F</li>
</ul>
<ol>
    <li>If the cabin deck is A, B, C or T, then HomePlanet is Europa</li>
    <li>If the cabin deck is G, then HomePlanet is Earth</li>
</ol>

In [None]:
# To check the observation, split Cabin ("deck/num/side") into three new columns:
# cabinDeck, cabinNum, cabinSide. The split is computed once and reused.
cabinParts = titanicDSTrain['Cabin'].str.split("/", n=2, expand=True)
titanicDSTrain['cabinDeck'] = cabinParts[0]
titanicDSTrain['cabinNum'] = cabinParts[1]
titanicDSTrain['cabinSide'] = cabinParts[2]

In [None]:
# HomePlanet vs cabinDeck: number of passengers for every (planet, deck) pair
planetDeckGroups = titanicDSTrain.groupby(['HomePlanet','cabinDeck'])
print(planetDeckGroups.size())
# To plot the same counts if needed:
# planetDeckGroups.size().unstack().plot(kind='bar')
# plt.show()

In [None]:
# Same for titanicDSTest.
# To check the observation, split Cabin ("deck/num/side") into three new columns:
# cabinDeck, cabinNum, cabinSide. The split is computed once and reused.
cabinParts = titanicDSTest['Cabin'].str.split("/", n=2, expand=True)
titanicDSTest['cabinDeck'] = cabinParts[0]
titanicDSTest['cabinNum'] = cabinParts[1]
titanicDSTest['cabinSide'] = cabinParts[2]

In [None]:
# HomePlanet vs cabinDeck in the test dataset: passengers per (planet, deck) pair
planetDeckGroups = titanicDSTest.groupby(['HomePlanet','cabinDeck'])
print(planetDeckGroups.size())

In [None]:
# Since the observation is True for both train and test datasets, fill the missing
# HomePlanet values: decks A, B, C and T -> Europa, deck G -> Earth.
# Boolean masks with .loc replace the five copy-pasted .iloc assignments, which
# passed index labels to a positional indexer.
europaDecks = ['A', 'B', 'C', 'T']
# computed once, before any fill; the Europa and Earth deck sets do not overlap
missingHomePlanet = titanicDSTrain.HomePlanet.isna()
titanicDSTrain.loc[missingHomePlanet & titanicDSTrain.cabinDeck.isin(europaDecks), 'HomePlanet'] = 'Europa'
titanicDSTrain.loc[missingHomePlanet & (titanicDSTrain.cabinDeck == 'G'), 'HomePlanet'] = 'Earth'

In [None]:
# Same for titanicDSTest.
# Since the observation is True for both train and test datasets, fill the missing
# HomePlanet values: decks A, B, C and T -> Europa, deck G -> Earth.
# Boolean masks with .loc replace the five copy-pasted .iloc assignments, which
# passed index labels to a positional indexer.
europaDecks = ['A', 'B', 'C', 'T']
# computed once, before any fill; the Europa and Earth deck sets do not overlap
missingHomePlanet = titanicDSTest.HomePlanet.isna()
titanicDSTest.loc[missingHomePlanet & titanicDSTest.cabinDeck.isin(europaDecks), 'HomePlanet'] = 'Europa'
titanicDSTest.loc[missingHomePlanet & (titanicDSTest.cabinDeck == 'G'), 'HomePlanet'] = 'Earth'

In [None]:
# Total number of missing cells left in each dataset after the deck-based HomePlanet fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: Luxury</h4> Luxury spending is 0 when CryoSleep is True or when Age is less than or equal to 12

In [None]:
# First let's check if the observation is correct:
# passengers in CryoSleep or younger than 13 should have no luxury spending.
luxury = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
noSpendingExpected = (titanicDSTrain.CryoSleep == True) | (titanicDSTrain.Age < 13)
# number of passengers concerned
print(noSpendingExpected.sum())
# Highest amount spent on each luxury by these passengers (expected: 0).
# Series.max() skips NaN; the builtin max() compares against NaN and its result
# then depends on where the NaN sits in the column.
for lux in luxury:
    print(titanicDSTrain.loc[noSpendingExpected, lux].max())

In [None]:
# Same for titanicDSTest.
# Let's check if the observation is correct:
# passengers in CryoSleep or younger than 13 should have no luxury spending.
luxury = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
noSpendingExpected = (titanicDSTest.CryoSleep == True) | (titanicDSTest.Age < 13)
# number of passengers concerned
print(noSpendingExpected.sum())
# Highest amount spent on each luxury by these passengers (expected: 0).
# Series.max() skips NaN; the builtin max() compares against NaN and its result
# then depends on where the NaN sits in the column.
for lux in luxury:
    print(titanicDSTest.loc[noSpendingExpected, lux].max())

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing luxury values with 0 when CryoSleep is True or when Age is <= 12.
luxury = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
noSpending = (titanicDSTrain.CryoSleep == True) | (titanicDSTrain.Age < 13)
# fillna touches only the missing cells: values that were actually recorded are
# never overwritten. .loc with a mask avoids passing index labels to .iloc.
titanicDSTrain.loc[noSpending, luxury] = titanicDSTrain.loc[noSpending, luxury].fillna(0)

In [None]:
# Same for titanicDSTest.
# Since the observation is correct for both train and test datasets, fill the
# missing luxury values with 0 when CryoSleep is True or when Age is <= 12.
luxury = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
noSpending = (titanicDSTest.CryoSleep == True) | (titanicDSTest.Age < 13)
# fillna touches only the missing cells: values that were actually recorded are
# never overwritten. .loc with a mask avoids passing index labels to .iloc.
titanicDSTest.loc[noSpending, luxury] = titanicDSTest.loc[noSpending, luxury].fillna(0)

In [None]:
# Total number of missing cells left in each dataset after the luxury zero fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: CryoSleep</h4> Passengers who have paid for any luxury are not sleeping. Which means, when luxury is greater than zero, cryoSleep should be False

In [None]:
# Add a new feature paidLuxury: the sum of the five luxury columns.
# Before adding it, fill the remaining missing luxury values with the column mean
# (truncated to an integer).
# The result is assigned back to the column: `df.col.fillna(..., inplace=True)` is a
# chained assignment that leaves the DataFrame unchanged under pandas Copy-on-Write.
# NOTE(review): a passenger whose spending was missing now carries the column mean, so
# paidLuxury may be > 0 although the real spending is unknown — confirm this is
# acceptable for the rules below that rely on paidLuxury.
luxuryColumns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for lux in luxuryColumns:
    titanicDSTrain[lux] = titanicDSTrain[lux].fillna(int(titanicDSTrain[lux].mean()))
titanicDSTrain['paidLuxury'] = titanicDSTrain.RoomService + titanicDSTrain.FoodCourt + titanicDSTrain.ShoppingMall + titanicDSTrain.Spa + titanicDSTrain.VRDeck

In [None]:
# CryoSleep vs paidLuxury: number of passengers for every (CryoSleep, amount) pair
cryoLuxuryGroups = titanicDSTrain.groupby(['CryoSleep','paidLuxury'])
print(cryoLuxuryGroups.size())

In [None]:
# Same for titanicDSTest.
# Add a new feature paidLuxury: the sum of the five luxury columns.
# Before adding it, fill the remaining missing luxury values with the column mean
# (truncated to an integer) — not with 0.
# The result is assigned back to the column: `df.col.fillna(..., inplace=True)` is a
# chained assignment that leaves the DataFrame unchanged under pandas Copy-on-Write.
# NOTE(review): a passenger whose spending was missing now carries the column mean, so
# paidLuxury may be > 0 although the real spending is unknown — confirm this is
# acceptable for the rules below that rely on paidLuxury.
luxuryColumns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for lux in luxuryColumns:
    titanicDSTest[lux] = titanicDSTest[lux].fillna(int(titanicDSTest[lux].mean()))
titanicDSTest['paidLuxury'] = titanicDSTest.RoomService + titanicDSTest.FoodCourt + titanicDSTest.ShoppingMall + titanicDSTest.Spa + titanicDSTest.VRDeck

In [None]:
# CryoSleep vs paidLuxury in the test dataset: passengers per (CryoSleep, amount) pair
cryoLuxuryGroups = titanicDSTest.groupby(['CryoSleep','paidLuxury'])
print(cryoLuxuryGroups.size())

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing CryoSleep values with False when paidLuxury is greater than zero.
# A boolean mask with .loc replaces .iloc fed with index labels.
awakeSpenders = titanicDSTrain.CryoSleep.isna() & (titanicDSTrain.paidLuxury > 0)
titanicDSTrain.loc[awakeSpenders, 'CryoSleep'] = False

In [None]:
# Same for titanicDSTest.
# Since the observation is correct for both train and test datasets, fill the
# missing CryoSleep values with False when paidLuxury is greater than zero.
# A boolean mask with .loc replaces .iloc fed with index labels.
awakeSpenders = titanicDSTest.CryoSleep.isna() & (titanicDSTest.paidLuxury > 0)
titanicDSTest.loc[awakeSpenders, 'CryoSleep'] = False

In [None]:
# Total number of missing cells left in each dataset after the CryoSleep fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: VIP</h4> VIPs come only from Europa or Mars, so when HomePlanet is Earth, fill a missing VIP with False

In [None]:
# First let's check if the observation is correct.
# VIP vs HomePlanet: number of passengers for every (VIP, planet) pair
vipPlanetGroups = titanicDSTrain.groupby(['VIP','HomePlanet'])
print(vipPlanetGroups.size())

In [None]:
# Same for titanicDSTest.
# VIP vs HomePlanet: number of passengers for every (VIP, planet) pair
vipPlanetGroups = titanicDSTest.groupby(['VIP','HomePlanet'])
print(vipPlanetGroups.size())

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False when HomePlanet is Earth.
# A boolean mask with .loc replaces .iloc fed with index labels.
earthUnknownVip = titanicDSTrain.VIP.isna() & (titanicDSTrain.HomePlanet == 'Earth')
titanicDSTrain.loc[earthUnknownVip, 'VIP'] = False

In [None]:
# Same for titanicDSTest.
# Fill the missing VIP values with False when HomePlanet is Earth.
# A boolean mask with .loc replaces .iloc fed with index labels.
earthUnknownVip = titanicDSTest.VIP.isna() & (titanicDSTest.HomePlanet == 'Earth')
titanicDSTest.loc[earthUnknownVip, 'VIP'] = False

In [None]:
# Total number of missing cells left in each dataset after the Earth VIP fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: VIP</h4> VIP is False for all passengers located in cabinDeck T (also for cabinDeck G, but all passengers are from Earth), so if cabinDeck is T, then VIP should be False

In [None]:
# First let's check if the observation is correct.
# VIP vs cabinDeck: number of passengers for every (VIP, deck) pair
vipDeckGroups = titanicDSTrain.groupby(['VIP','cabinDeck'])
print(vipDeckGroups.size())

In [None]:
# Same for titanicDSTest.
# Let's check if the observation is correct.
# VIP vs cabinDeck: number of passengers for every (VIP, deck) pair
vipDeckGroups = titanicDSTest.groupby(['VIP','cabinDeck'])
print(vipDeckGroups.size())

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False when cabinDeck is T.
# A boolean mask with .loc replaces .iloc fed with index labels.
deckTUnknownVip = titanicDSTrain.VIP.isna() & (titanicDSTrain.cabinDeck == 'T')
titanicDSTrain.loc[deckTUnknownVip, 'VIP'] = False

In [None]:
# Same for titanicDSTest.
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False when cabinDeck is T.
# A boolean mask with .loc replaces .iloc fed with index labels.
deckTUnknownVip = titanicDSTest.VIP.isna() & (titanicDSTest.cabinDeck == 'T')
titanicDSTest.loc[deckTUnknownVip, 'VIP'] = False

In [None]:
# Total number of missing cells left in each dataset after the deck T VIP fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: VIP</h4> All VIPs from Europa have Age >= 25, so VIP is False when HomePlanet is Europa and Age is under 25

In [None]:
# First let's check if the observation is correct:
# youngest VIP passenger from Europa (expected: at least 25).
# Series.min() skips NaN ages; the builtin min() compares against NaN and its result
# then depends on where the NaN sits. .loc with a mask replaces .iloc fed with labels.
europaVip = (titanicDSTrain.VIP == True) & (titanicDSTrain.HomePlanet == 'Europa')
titanicDSTrain.loc[europaVip, 'Age'].min()

In [None]:
# Same for titanicDSTest.
# Let's check if the observation is correct:
# youngest VIP passenger from Europa (expected: at least 25).
# Series.min() skips NaN ages; the builtin min() compares against NaN and its result
# then depends on where the NaN sits. .loc with a mask replaces .iloc fed with labels.
europaVip = (titanicDSTest.VIP == True) & (titanicDSTest.HomePlanet == 'Europa')
titanicDSTest.loc[europaVip, 'Age'].min()

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False when HomePlanet is Europa and Age is under 25.
# A boolean mask with .loc replaces .iloc fed with index labels.
youngEuropaUnknownVip = (
    titanicDSTrain.VIP.isna()
    & (titanicDSTrain.HomePlanet == 'Europa')
    & (titanicDSTrain.Age < 25)
)
titanicDSTrain.loc[youngEuropaUnknownVip, 'VIP'] = False

In [None]:
# Same for titanicDSTest.
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False when HomePlanet is Europa and Age is under 25.
# A boolean mask with .loc replaces .iloc fed with index labels.
youngEuropaUnknownVip = (
    titanicDSTest.VIP.isna()
    & (titanicDSTest.HomePlanet == 'Europa')
    & (titanicDSTest.Age < 25)
)
titanicDSTest.loc[youngEuropaUnknownVip, 'VIP'] = False

In [None]:
# Total number of missing cells left in each dataset after the Europa VIP fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: VIP</h4> Mars VIP have Age >= 18 and no CryoSleep and never goes to "55 Cancri e"

In [None]:
# First let's check if the observation is correct, looking at the VIPs from Mars:
# how many they are, their CryoSleep values, their minimum Age and their Destinations.
# .loc with a mask replaces .iloc fed with index labels, and Series.min() skips NaN
# ages (the builtin min() does not handle NaN reliably).
marsVip = (titanicDSTrain.VIP == True) & (titanicDSTrain.HomePlanet == 'Mars')
print(marsVip.sum())
print(titanicDSTrain.loc[marsVip, 'CryoSleep'].unique().tolist())
print(titanicDSTrain.loc[marsVip, 'Age'].min())
print(titanicDSTrain.loc[marsVip, 'Destination'].unique().tolist())

In [None]:
# Same for titanicDSTest.
# Let's check if the observation is correct, looking at the VIPs from Mars:
# how many they are, their CryoSleep values, their minimum Age and their Destinations.
# .loc with a mask replaces .iloc fed with index labels, and Series.min() skips NaN
# ages (the builtin min() does not handle NaN reliably).
marsVip = (titanicDSTest.VIP == True) & (titanicDSTest.HomePlanet == 'Mars')
print(marsVip.sum())
print(titanicDSTest.loc[marsVip, 'CryoSleep'].unique().tolist())
print(titanicDSTest.loc[marsVip, 'Age'].min())
print(titanicDSTest.loc[marsVip, 'Destination'].unique().tolist())

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False for the Mars passengers who cannot be VIP.
# Every Mars VIP has Age >= 18 AND is not in CryoSleep AND does not go to 55 Cancri e,
# so breaking ANY ONE of these conditions is enough to rule VIP out. The original
# filter required Age < 18 together with CryoSleep == False and another destination,
# which inverted two of the three tests and skipped most of the rows the rule covers.
marsUnknownVip = titanicDSTrain.VIP.isna() & (titanicDSTrain.HomePlanet == 'Mars')
cannotBeVip = (
    (titanicDSTrain.Age < 18)
    | (titanicDSTrain.CryoSleep == True)
    | (titanicDSTrain.Destination == '55 Cancri e')
)
# label-based assignment instead of .iloc fed with index labels
titanicDSTrain.loc[marsUnknownVip & cannotBeVip, 'VIP'] = False

In [None]:
# Same for titanicDSTest.
# Since the observation is correct for both train and test datasets, fill the
# missing VIP values with False for the Mars passengers who cannot be VIP.
# Every Mars VIP has Age >= 18 AND is not in CryoSleep AND does not go to 55 Cancri e,
# so breaking ANY ONE of these conditions is enough to rule VIP out. The original
# filter required Age < 18 together with CryoSleep == False and another destination,
# which inverted two of the three tests and skipped most of the rows the rule covers.
marsUnknownVip = titanicDSTest.VIP.isna() & (titanicDSTest.HomePlanet == 'Mars')
cannotBeVip = (
    (titanicDSTest.Age < 18)
    | (titanicDSTest.CryoSleep == True)
    | (titanicDSTest.Destination == '55 Cancri e')
)
rowIndex = titanicDSTest.index[marsUnknownVip & cannotBeVip].tolist()
# show which rows are concerned (may be an empty list)
print (rowIndex)
# Apply the fill as for the train dataset (the original cell only printed the rows);
# an empty selection makes this a no-op.
titanicDSTest.loc[marsUnknownVip & cannotBeVip, 'VIP'] = False

In [None]:
# Total number of missing cells left in each dataset after the Mars VIP fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: Destination</h4> Passengers who are not children (Age >= 18), are not in CryoSleep and have no bill all have Destination TRAPPIST-1e

In [None]:
# First let's check if the observation is correct:
# for Age >= 18, CryoSleep == False and bills == 0,
# the Destination should be TRAPPIST-1e or nan.
# .loc with a mask replaces .iloc fed with index labels.
adultAwakeNoBill = (titanicDSTrain.Age >= 18) & (titanicDSTrain.CryoSleep == False) & (titanicDSTrain.paidLuxury == 0)
# number of passengers concerned, then their distinct destinations
print(adultAwakeNoBill.sum())
print(titanicDSTrain.loc[adultAwakeNoBill, 'Destination'].unique().tolist())

In [None]:
# Same for titanicDSTest.
# Let's check if the observation is correct:
# for Age >= 18, CryoSleep == False and bills == 0,
# the Destination should be TRAPPIST-1e or nan.
# .loc with a mask replaces .iloc fed with index labels.
adultAwakeNoBill = (titanicDSTest.Age >= 18) & (titanicDSTest.CryoSleep == False) & (titanicDSTest.paidLuxury == 0)
# number of passengers concerned, then their distinct destinations
print(adultAwakeNoBill.sum())
print(titanicDSTest.loc[adultAwakeNoBill, 'Destination'].unique().tolist())

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing Destination values with TRAPPIST-1e
# when Age >= 18, CryoSleep == False and bills == 0.
adultAwakeNoBill = (titanicDSTrain.Age >= 18) & (titanicDSTrain.CryoSleep == False) & (titanicDSTrain.paidLuxury == 0)
# Only missing destinations are written; recorded destinations are never overwritten.
# .loc with a mask replaces .iloc fed with index labels.
titanicDSTrain.loc[adultAwakeNoBill & titanicDSTrain.Destination.isna(), 'Destination'] = 'TRAPPIST-1e'

# No Need to do the same for titanicDSTest
# NOTE(review): the notebook does not say why the test dataset is skipped here — confirm.

In [None]:
# Total number of missing cells left in each dataset after the Destination fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

<h4>Observation: HomePlanet</h4> Each surname (and each first name) is found on only one HomePlanet

In [None]:
# Let's check if the observation is correct.
# Pool the (family name, HomePlanet) pairs of both datasets, train rows first, so that
# families are visited in order of first appearance: train, then test-only ones.
surnamePlanets = pd.concat([
    titanicDSTrain[['PassengerLN', 'HomePlanet']],
    titanicDSTest[['PassengerLN', 'HomePlanet']],
])
# dictionary holding every family whose HomePlanet is unambiguous
dictFN = {}
# families without exactly one known HomePlanet
wrongFN = []
# list of the distinct family names found in the two datasets
distinctFN = []
# One pass over the groups replaces a full scan of both frames per family name.
# groupby skips passengers without a family name, so NaN is no longer handled as
# if it were a family of its own.
for fn, homePlanets in surnamePlanets.groupby('PassengerLN', sort=False)['HomePlanet']:
    distinctFN.append(fn)
    # distinct planets known for this family, NaN removed
    homePlanet = list(homePlanets.dropna().unique())
    # exactly one planet: the family is from one homePlanet, so add it to the dictionary
    if (len(homePlanet) == 1):
        dictFN[fn]=homePlanet[0]
    else: # either no known planet or a family name found on different planets
        wrongFN.append(fn)
        print (homePlanet)

# If every printed list is empty, the listed families simply have no known HomePlanet
# and the observation holds; a list with several planets would contradict it.

In [None]:
# Since the observation is correct for both train and test datasets, fill the
# missing HomePlanet values from dictFN (family name -> HomePlanet).
missingHomePlanet = titanicDSTrain.HomePlanet.isna() & titanicDSTrain.PassengerLN.notna()
# Look every family name up in dictFN; names absent from it map to NaN and are dropped
fillValues = titanicDSTrain.loc[missingHomePlanet, 'PassengerLN'].map(dictFN).dropna()
# One label-based assignment replaces the row loop that passed index labels to .iloc
titanicDSTrain.loc[fillValues.index, 'HomePlanet'] = fillValues

In [None]:
# Same for titanicDSTest.
# Since the observation is correct for both train and test datasets, fill the
# missing HomePlanet values from dictFN (family name -> HomePlanet).
missingHomePlanet = titanicDSTest.HomePlanet.isna() & titanicDSTest.PassengerLN.notna()
# Look every family name up in dictFN; names absent from it map to NaN and are dropped
fillValues = titanicDSTest.loc[missingHomePlanet, 'PassengerLN'].map(dictFN).dropna()
# One label-based assignment replaces the row loop that passed index labels to .iloc
titanicDSTest.loc[fillValues.index, 'HomePlanet'] = fillValues

In [None]:
# Total number of missing cells left in each dataset after the surname-based HomePlanet fill
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

Once we've finished filling NaNs, let's remove newly added columns and calculate the sum of all missing values

In [None]:
# Display the columns of the train dataset; the helper columns added by this notebook are still present here
titanicDSTrain.columns

In [None]:
# Remove the helper columns this notebook added, from both datasets, in place
helperColumns = ['PassengerGrId', 'PassengerGrIdNb', 'Siblings', 'PassengerFN', 'PassengerLN', 'cabinDeck', 'cabinNum', 'cabinSide', 'paidLuxury']
for dataset in (titanicDSTrain, titanicDSTest):
    dataset.drop(columns=helperColumns, inplace=True)

In [None]:
# Final total of missing cells in each dataset, once the helper columns are removed
nullTrain = titanicDSTrain.isna().sum().sum()
nullTest = titanicDSTest.isna().sum().sum()
print("nullTrain:", nullTrain, " - nullTest:", nullTest)

In [None]:
# Number of missing values remaining in each column of the train dataset
titanicDSTrain.isnull().sum()

In [None]:
# Number of missing values remaining in each column of the test dataset
titanicDSTest.isnull().sum()