# kaggle competition: Space Titanic Prediction Model
In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system

Importing relevant libraries

In [131]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

Import train and test dataset

In [132]:
train_raw = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

### train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
#       PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
#       HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
#       CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
#       Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
#       Destination - The planet the passenger will be debarking to.
#       Age - The age of the passenger.
#       VIP - Whether the passenger has paid for special VIP service during the voyage.
#       RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
#       Name - The first and last names of the passenger.
#       Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [133]:
# Format raw data into pandas dataframe
# train_raw : Drop nulls from raw data as we consider it is irrelvant
# train_dataset : Dropped 'Transported' from raw data
# trian_result : Contains all the results from raw data

train_dataset = pd.DataFrame(train_raw)

train_result = pd.DataFrame(train_dataset['Transported'])
train_dataset = train_dataset.drop(['Transported'], axis = 1)

In [134]:
# Checking the correlation of each training dataset attributes to 'Transported'
# Changing column variable type from boolean to float32
train_dataset['CryoSleep'] = train_dataset['CryoSleep'].astype(np.float32)
train_dataset['VIP'] = train_dataset['VIP'].astype(np.float32)
# sns.pairplot(train_dataset)

In [135]:
# Decompose Cabin Column to Deck, side and num column. 
print(len(train_dataset['Cabin'].unique()))
train_dataset = train_dataset.drop(['Cabin'], axis=1)
# Dropping 'Name' columns as it creates bias to ML model 
train_dataset = train_dataset.drop(['Name'], axis = 1)
train_dataset = train_dataset.drop(['PassengerId'], axis = 1)
train_dataset.info()

6561
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   float32
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   float32
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
dtypes: float32(2), float64(6), object(2)
memory usage: 611.4+ KB


In [136]:
# Apply one-hot encoding to HomePlanet, Destination, Deck and Side
test = pd.get_dummies(train_dataset['HomePlanet']).astype('float32')
train_dataset = train_dataset.join(test)
train_dataset = train_dataset.drop(['HomePlanet'], axis=1)

test = pd.get_dummies(train_dataset['Destination']).astype('float32')
train_dataset = train_dataset.join(test)
train_dataset = train_dataset.drop(['Destination'], axis=1)

In [137]:
# Filling NA values with Means Approach
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(train_dataset)
train_dataset = pd.DataFrame(imp_mean.transform(train_dataset))

In [138]:
# Summary of train_dataset
# ensure all of the variables are converted to numeric fields
# Run Decision Tree Learning on this data frame
train_dataset.info()

# Output CSV file of the working dataset to mannually check if the dataframe is in correct format or not
train_dataset.to_csv('train_dataset.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       8693 non-null   float64
 1   1       8693 non-null   float64
 2   2       8693 non-null   float64
 3   3       8693 non-null   float64
 4   4       8693 non-null   float64
 5   5       8693 non-null   float64
 6   6       8693 non-null   float64
 7   7       8693 non-null   float64
 8   8       8693 non-null   float64
 9   9       8693 non-null   float64
 10  10      8693 non-null   float64
 11  11      8693 non-null   float64
 12  12      8693 non-null   float64
 13  13      8693 non-null   float64
dtypes: float64(14)
memory usage: 950.9 KB


# Applying DTL to DataSet

In [139]:
# Import Testing data
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [140]:
# Creating DTL from the formatted Data
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_dataset.values, train_result.values)

In [141]:
# format test data
# test_data = test_data.dropna(axis=0, how = 'any')

test_data['CryoSleep'] = test_data['CryoSleep'].astype(np.float32)
test_data['VIP'] = test_data['VIP'].astype(np.float32)

test_data = test_data.drop(['Name'], axis = 1)
test_data = test_data.drop(['Cabin'], axis=1)

test = pd.get_dummies(test_data['HomePlanet']).astype('float32')
test_data = test_data.join(test)
test_data = test_data.drop(['HomePlanet'], axis=1)

test = pd.get_dummies(test_data['Destination']).astype('float32')
test_data = test_data.join(test)
test_data = test_data.drop(['Destination'], axis=1)

id_row = test_data['PassengerId']
id_row.index = range(0, len(test_data['PassengerId']))
test_data = test_data.drop(['PassengerId'], axis = 1)

# Filling NA values with Means Approach
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(test_data)
test_data = pd.DataFrame(imp_mean.transform(test_data))

test_data.to_csv('test_dataset.csv', index=False)

In [142]:
predictions_test = []
for i in range(0, len(id_row)):
    output = str(clf.predict([test_data.iloc[i]]))[1:-1]
    
    if output[0] == ' ':
        output = output[1:]
            
    predictions_test.append(output)
print(predictions_test)

['True', 'False', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'True', 'True', 'False', 'False', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 

In [143]:
df4=pd.DataFrame({'PassengerId':id_row,'Transported':predictions_test})
df4.to_csv("submissions.csv", index=False)