# Machine learning project

source : https://www.kaggle.com/competitions/spaceship-titanic/data


In this project we predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's 
collision with the spacetime anomaly.

We are using classification accuracy as the evaluation metric.

## Details on dataset
- train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
    - PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    - HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    - CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    - Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    - Destination - The planet the passenger will be debarking to.
    - Age - The age of the passenger.
    - VIP - Whether the passenger has paid for special VIP service during the voyage.
    - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    - Name - The first and last names of the passenger.
    - Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the training data. A forward slash is used as the path separator
# because it is accepted on Windows, Linux and macOS alike, whereas a
# backslash-separated path only resolves on Windows.
df = pd.read_csv("data/train.csv")

In [3]:
# Preview the first rows of the raw training data.
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Summary statistics for the numeric columns of the raw data.
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [5]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
# Count the rows whose target label (Transported) is missing.
df["Transported"].isnull().sum()

0

There are no missing labels

In [7]:
# Number of rows in the raw training data.
len(df)

8693

In [8]:
# Work on a copy so the raw frame loaded above stays untouched.
df_tmp = df.copy()

In [9]:
# Number of missing values in each column.
df_tmp.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [10]:
# Remove every row that has at least one missing value; df_tmp is modified in place.
df_tmp.dropna(inplace = True)

In [11]:
# Shape of the working frame after removing incomplete rows.
df_tmp.shape

(6606, 14)

In [12]:
# Assumption: a passenger's name has no influence on whether they are
# transported, so the column is removed.
df_tmp = df_tmp.drop(columns=["Name"])

In [13]:
# Number of rows in the working frame.
len(df_tmp)

6606

In [14]:
# Summary statistics of the numeric columns in the working frame.
df_tmp.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0
mean,28.894036,222.991674,478.958523,178.356494,313.16152,303.780048
std,14.533429,644.987936,1678.592291,576.328407,1144.016291,1127.142166
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,82.75,30.0,65.0,52.0
max,79.0,9920.0,29813.0,12253.0,22408.0,20336.0


In [15]:
# Column names currently present in the working frame.
df_tmp.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Transported'],
      dtype='object')

In [16]:
# Preview the first rows of the working frame.
df_tmp.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [17]:
# Display the working frame (the notebook shows a truncated view).
df_tmp

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [18]:
# PassengerId has the form gggg_pp; keep the part after the underscore
# (the passenger's number within the group).
df_tmp['ID_last'] = df_tmp['PassengerId'].str.split("_").str.get(1)

In [19]:
# Inspect the newly created ID_last column.
df_tmp['ID_last']

0       01
1       01
2       01
3       02
4       01
        ..
8688    01
8689    01
8690    01
8691    01
8692    02
Name: ID_last, Length: 6606, dtype: object

In [20]:
# Display the working frame, which now includes the ID_last column.
df_tmp

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,ID_last
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,01
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,01
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,01
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,02
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,01
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,01
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,01
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,01


In [21]:
# Number of passengers per home planet.
df_tmp["HomePlanet"].value_counts()

Earth     3566
Europa    1673
Mars      1367
Name: HomePlanet, dtype: int64

### Converting the True/False columns to numeric

In [22]:
# Columns that hold True/False values.
Tr_fl_columns = ["CryoSleep", "VIP", "Transported"]
for col in Tr_fl_columns:
    # pd.to_numeric converts each column; the later df_tmp.info() output
    # reports these columns with dtype bool.
    df_tmp[col] = pd.to_numeric(df_tmp[col])

In [23]:
# Display the working frame after converting the True/False columns.
df_tmp

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,ID_last
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,01
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,01
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,01
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,02
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,01
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,01
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,01
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,01


In [24]:
# Inspect the Cabin column of the raw frame (df, not df_tmp), so rows that
# were dropped from df_tmp still appear here.
df["Cabin"]

0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object

In [25]:
# Cabin has the form deck/num/side: split it once and store each part in
# its own column.
cabin_parts = df_tmp["Cabin"].str.split("/")
df_tmp["cabin_deck"] = cabin_parts.str[0]
df_tmp["cabin_num"] = cabin_parts.str[1]
df_tmp["cabin_side"] = cabin_parts.str[2]

In [26]:
# Preview the working frame with the new cabin_deck, cabin_num and cabin_side columns.
df_tmp.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,ID_last,cabin_deck,cabin_num,cabin_side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,1,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,2,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1,F,1,S


In [27]:
# Preview the first rows of the working frame again.
df_tmp.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,ID_last,cabin_deck,cabin_num,cabin_side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,1,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,2,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1,F,1,S


In [28]:
# The first four characters of PassengerId (the gggg part of gggg_pp)
# identify the travel group.
df_tmp["ID_first"] = df_tmp["PassengerId"].str.slice(stop=4)

In [29]:
# Check dtypes and non-null counts after adding the ID and cabin columns.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6606 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   6606 non-null   object 
 1   HomePlanet    6606 non-null   object 
 2   CryoSleep     6606 non-null   bool   
 3   Cabin         6606 non-null   object 
 4   Destination   6606 non-null   object 
 5   Age           6606 non-null   float64
 6   VIP           6606 non-null   bool   
 7   RoomService   6606 non-null   float64
 8   FoodCourt     6606 non-null   float64
 9   ShoppingMall  6606 non-null   float64
 10  Spa           6606 non-null   float64
 11  VRDeck        6606 non-null   float64
 12  Transported   6606 non-null   bool   
 13  ID_last       6606 non-null   object 
 14  cabin_deck    6606 non-null   object 
 15  cabin_num     6606 non-null   object 
 16  cabin_side    6606 non-null   object 
 17  ID_first      6606 non-null   object 
dtypes: bool(3), float64(6), obje

In [30]:
# Both halves of PassengerId now live in ID_first / ID_last, so the raw
# identifier is removed (in place).
df_tmp.drop(columns=["PassengerId"], inplace=True)

In [31]:
# String columns derived from PassengerId and Cabin that are converted to numbers below.
potential_numeric = ["ID_last", "ID_first", "cabin_num"]
for col in potential_numeric:
    # Parse the digit strings into numbers (e.g. "01" becomes 1).
    df_tmp[col] = pd.to_numeric(df_tmp[col])

In [32]:
# Confirm the converted columns now have a numeric dtype.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6606 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6606 non-null   object 
 1   CryoSleep     6606 non-null   bool   
 2   Cabin         6606 non-null   object 
 3   Destination   6606 non-null   object 
 4   Age           6606 non-null   float64
 5   VIP           6606 non-null   bool   
 6   RoomService   6606 non-null   float64
 7   FoodCourt     6606 non-null   float64
 8   ShoppingMall  6606 non-null   float64
 9   Spa           6606 non-null   float64
 10  VRDeck        6606 non-null   float64
 11  Transported   6606 non-null   bool   
 12  ID_last       6606 non-null   int64  
 13  cabin_deck    6606 non-null   object 
 14  cabin_num     6606 non-null   int64  
 15  cabin_side    6606 non-null   object 
 16  ID_first      6606 non-null   int64  
dtypes: bool(3), float64(6), int64(3), object(5)
memory usage: 793.5+ KB


In [33]:
# Collect the columns that still hold non-numeric data, then print one name per line.
non_numeric_labels = [
    name
    for name, values in df_tmp.items()
    if not pd.api.types.is_numeric_dtype(values)
]
for name in non_numeric_labels:
    print(name)

HomePlanet
Cabin
Destination
cabin_deck
cabin_side


In [34]:
# Cabin has already been split into cabin_deck, cabin_num and cabin_side,
# so the combined column is removed (in place).
df_tmp.drop(columns=["Cabin"], inplace=True)

In [35]:
def change_cat_string_to_numeric(df, column_list):
    """One-hot encode the given categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the categorical columns. It is not modified.
    column_list : iterable of str
        Names of the columns to encode.

    Returns
    -------
    dict
        Maps ``"<column>_df"`` to the ``pd.get_dummies`` frame built from
        that column. The indicator columns are named after the category
        values, without any prefix.
    """
    # Read from the ``df`` argument: the function must encode the frame it
    # is given rather than a notebook-level variable.
    return {f"{col}_df": pd.get_dummies(df[col]) for col in column_list}

In [36]:
# Categorical string columns to one-hot encode.
column_list = ["cabin_side", "cabin_deck", "HomePlanet", "Destination"]
# One dummy-variable frame per column, stored under the key "<column>_df".
df_dict = change_cat_string_to_numeric(df_tmp, column_list)

In [37]:
# Inspect the dummy columns generated for cabin_side.
df_dict['cabin_side_df']

Unnamed: 0,P,S
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
8688,1,0
8689,0,1
8690,0,1
8691,0,1


In [38]:
# Append the one-hot frames to the right of the existing columns, in the
# order: cabin side, cabin deck, home planet, destination.
dummy_keys = ("cabin_side_df", "cabin_deck_df", "HomePlanet_df", "Destination_df")
df_tmp = pd.concat([df_tmp] + [df_dict[key] for key in dummy_keys], axis=1)

In [39]:
# Check the frame after appending the dummy columns.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6606 entries, 0 to 8692
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   HomePlanet     6606 non-null   object 
 1   CryoSleep      6606 non-null   bool   
 2   Destination    6606 non-null   object 
 3   Age            6606 non-null   float64
 4   VIP            6606 non-null   bool   
 5   RoomService    6606 non-null   float64
 6   FoodCourt      6606 non-null   float64
 7   ShoppingMall   6606 non-null   float64
 8   Spa            6606 non-null   float64
 9   VRDeck         6606 non-null   float64
 10  Transported    6606 non-null   bool   
 11  ID_last        6606 non-null   int64  
 12  cabin_deck     6606 non-null   object 
 13  cabin_num      6606 non-null   int64  
 14  cabin_side     6606 non-null   object 
 15  ID_first       6606 non-null   int64  
 16  P              6606 non-null   uint8  
 17  S              6606 non-null   uint8  
 18  A       

In [40]:
# The string columns listed in column_list have been one-hot encoded, so
# they are all removed with a single in-place drop.
df_tmp.drop(columns=column_list, inplace=True)

In [41]:
# Summary statistics of the encoded working frame.
df_tmp.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,ID_last,cabin_num,ID_first,P,...,E,F,G,T,Earth,Europa,Mars,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
count,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,...,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0
mean,28.894036,222.991674,478.958523,178.356494,313.16152,303.780048,1.516803,599.177414,4662.211777,0.493642,...,0.103391,0.325764,0.298668,0.000303,0.539812,0.253255,0.206933,0.212988,0.094308,0.692704
std,14.533429,644.987936,1678.592291,576.328407,1144.016291,1127.142166,1.054433,513.450585,2670.13394,0.499997,...,0.304492,0.468695,0.457709,0.017399,0.49845,0.434908,0.405138,0.40945,0.292279,0.461408
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,1.0,166.0,2368.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,1.0,424.0,4647.5,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,38.0,49.0,82.75,30.0,65.0,52.0,2.0,994.0,6910.75,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
max,79.0,9920.0,29813.0,12253.0,22408.0,20336.0,8.0,1894.0,9280.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
# Preview the first rows of the encoded working frame.
df_tmp.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,ID_last,...,E,F,G,T,Earth,Europa,Mars,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,...,0,0,0,0,0,1,0,0,0,1
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1,...,0,1,0,0,1,0,0,0,0,1
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,1,...,0,0,0,0,0,1,0,0,0,1
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,2,...,0,0,0,0,0,1,0,0,0,1
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1,...,0,1,0,0,1,0,0,0,0,1


In [46]:
# Print the name of every column that is still not numeric; nothing is
# printed when all columns are numeric.
for column_name, series in df_tmp.items():
    if pd.api.types.is_numeric_dtype(series):
        continue
    print(column_name)

#### Now our data is fully numeric