In [1]:
'''
PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group 
the passenger is travelling with and pp is their number within the group. People in a group are often family members, 
but not always.

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the 
voyage. Passengers in cryosleep are confined to their cabins.

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P 
for Port or S for Starboard.

Destination - The planet the passenger will be debarking to.

Age - The age of the passenger.

VIP - Whether the passenger has paid for special VIP service during the voyage.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship 
Titanic's many luxury amenities.

Name - The first and last names of the passenger.

Transported - Whether the passenger was transported to another dimension. This is the target, the column you are 
trying to predict.
'''

"\nPassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group \nthe passenger is travelling with and pp is their number within the group. People in a group are often family members, \nbut not always.\n\nHomePlanet - The planet the passenger departed from, typically their planet of permanent residence.\n\nCryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the \nvoyage. Passengers in cryosleep are confined to their cabins.\n\nCabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P \nfor Port or S for Starboard.\n\nDestination - The planet the passenger will be debarking to.\n\nAge - The age of the passenger.\n\nVIP - Whether the passenger has paid for special VIP service during the voyage.\n\nRoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship \nTitanic's many luxury amenit

In [2]:
# Import Dependencies for Download and Clean
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file(
'/Users/spicious/Desktop/News Datasets/Resources/spaceship-titanic-387720-729aac731f9f.json')

In [3]:
# Import Dependencies for Random Forest Model
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [4]:
# Import for Data Imputation
from scipy import stats
import numpy as np
from sklearn import impute

In [5]:
project_id = 'spaceship-titanic-387720'
client = bigquery.Client(credentials= credentials,project=project_id)

In [6]:
query_Amenities = client.query("""
   SELECT *
   FROM Starship_Titanic.Amenities""")

query_PassengerInfo = client.query("""
   SELECT *
   FROM Starship_Titanic.PassengerInfo""")

query_PlanetInfo = client.query("""
   SELECT *
   FROM Starship_Titanic.PlanetInfo""")

results_Amenities = query_Amenities.result()
results_PassengerInfo = query_PassengerInfo.result()
results_PlanetInfo = query_PlanetInfo.result()

Amenities_df = pd.DataFrame(results_Amenities)
PassengerInfo_df = pd.DataFrame(results_PassengerInfo)
PlanetInfo_df = pd.DataFrame(results_PlanetInfo)

In [7]:
print(Amenities_df.loc[0][0])
print(PassengerInfo_df.loc[0][0])
print(PlanetInfo_df.loc[0][0])

Row(('4446_05', 'B/175/S', False, 0.0, 4017.0, None, None, 2260.0), {'PassengerId': 0, 'Cabin': 1, 'VIP': 2, 'RoomService': 3, 'FoodCourt': 4, 'ShoppingMall': 5, 'Spa': 6, 'VRDeck': 7})
Row(('6512_02', 'Photons Drivery', None, 'A/80/S', None, True), {'PassengerId': 0, 'Name': 1, 'HomePlanet': 2, 'Cabin': 3, 'Age': 4, 'Transported': 5})
Row(('0119_01', None, 'TRAPPIST-1e', False), {'PassengerId': 0, 'HomePlanet': 1, 'Destination': 2, 'CryoSleep': 3})


In [8]:
Amenities_sorted = pd.DataFrame()
i = 0
while i < 8:
    Amenities_sorted[i] = Amenities_df[0].apply(lambda x: x[i])
    i += 1

Amenities_sorted = Amenities_sorted.rename(columns={0: "PassengerId", 
                   1: "Cabin", 
                   2: "VIP", 
                   3: "RoomService",
                   4: "FoodCourt",
                   5: "ShoppingMall",
                   6: "Spa",
                   7: "VRDeck"})

In [9]:
PassengerInfo_sorted = pd.DataFrame()
i = 0
while i < 6:
    PassengerInfo_sorted[i] = PassengerInfo_df[0].apply(lambda x: x[i])
    i += 1
    
PassengerInfo_sorted = PassengerInfo_sorted.rename(columns={0: "PassengerId", 
                   1: "Name", 
                   2: "HomePlanet", 
                   3: "Cabin",
                   4: "Age",
                   5: "Transported"})

In [10]:
PlanetInfo_sorted = pd.DataFrame()
i = 0
while i < 4:
    PlanetInfo_sorted[i] = PlanetInfo_df[0].apply(lambda x: x[i])
    i += 1
    
PlanetInfo_sorted = PlanetInfo_sorted.rename(columns={0: "PassengerId", 
                   1: "HomePlanet", 
                   2: "Destination", 
                   3: "CryoSleep"})

In [11]:
PlanetInfo_sorted

Unnamed: 0,PassengerId,HomePlanet,Destination,CryoSleep
0,0119_01,,TRAPPIST-1e,False
1,0242_01,,TRAPPIST-1e,False
2,0321_01,,TRAPPIST-1e,False
3,0382_01,,,False
4,0444_02,,TRAPPIST-1e,False
...,...,...,...,...
12965,9177_01,Europa,55 Cancri e,True
12966,9177_02,Europa,55 Cancri e,True
12967,9206_03,Europa,55 Cancri e,True
12968,9215_01,Europa,TRAPPIST-1e,True


In [12]:
# Merging tables
merge_df = pd.merge(Amenities_sorted, PassengerInfo_sorted, on = 'PassengerId', how = 'inner')
df = pd.merge(merge_df, PlanetInfo_sorted, on = 'PassengerId', how = 'inner')

In [14]:
# Cleaning up Column Duplications
df = df.drop(['Cabin_y', 'HomePlanet_y'], axis=1)

In [15]:
# Renaming Columns
df = df.rename(columns={'Cabin_x': "Cabin", 
                   'HomePlanet_x': "HomePlanet"})

In [16]:
#check dataframe
print(df.shape)
df.head(5)

(12970, 14)


Unnamed: 0,PassengerId,Cabin,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,HomePlanet,Age,Transported,Destination,CryoSleep
0,4446_05,B/175/S,False,0.0,4017.0,,,2260.0,Phah Chocaters,Europa,33.0,True,TRAPPIST-1e,True
1,8906_01,F/1723/S,False,720.0,2.0,,,0.0,Rena Gainney,Earth,19.0,False,55 Cancri e,False
2,2122_01,E/154/S,False,0.0,618.0,,,0.0,Mollen Wolfaddox,Earth,24.0,True,PSO J318.5-22,False
3,3008_01,D/92/P,False,0.0,0.0,,,0.0,Ancham Timanable,Europa,17.0,True,55 Cancri e,True
4,0008_02,B/1/P,False,0.0,0.0,,0.0,0.0,Altardr Flatic,Europa,34.0,True,TRAPPIST-1e,True


In [17]:
# Check columns
columns = list(df.columns)
columns

['PassengerId',
 'Cabin',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'HomePlanet',
 'Age',
 'Transported',
 'Destination',
 'CryoSleep']

# Data Munging

In [18]:
# Make plans
'''
1) Check for mising values in all columns
1.5) Consider Imputation
2) remove unnecessary columns such as PassengerId and Name
3) Convert categorical variables into indicator variables for HomePlanet, CryoSleep, Deck, Side, Destination, VIP, Transported
4) Split up Cabin column into three different features
'''

'\n1) Check for mising values in all columns\n1.5) Consider Imputation\n2) remove unnecessary columns such as PassengerId and Name\n3) Convert categorical variables into indicator variables for HomePlanet, CryoSleep, Deck, Side, Destination, VIP, Transported\n4) Split up Cabin column into three different features\n'

In [19]:
# Split Cabin column in three different Columns
df[['Deck','RoomNum','Side']] = df.Cabin.str.split("/", expand = True)
del df['Cabin']

# Reorder dataset to make me happy
df = df[['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Deck',
 'RoomNum',
 'Side',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name',
 'Transported']]

# Check to see everything is going to plan
print(df.head())

  PassengerId HomePlanet  CryoSleep Deck RoomNum Side    Destination   Age  \
0     4446_05     Europa       True    B     175    S    TRAPPIST-1e  33.0   
1     8906_01      Earth      False    F    1723    S    55 Cancri e  19.0   
2     2122_01      Earth      False    E     154    S  PSO J318.5-22  24.0   
3     3008_01     Europa       True    D      92    P    55 Cancri e  17.0   
4     0008_02     Europa       True    B       1    P    TRAPPIST-1e  34.0   

     VIP  RoomService  FoodCourt  ShoppingMall  Spa  VRDeck              Name  \
0  False          0.0     4017.0           NaN  NaN  2260.0    Phah Chocaters   
1  False        720.0        2.0           NaN  NaN     0.0      Rena Gainney   
2  False          0.0      618.0           NaN  NaN     0.0  Mollen Wolfaddox   
3  False          0.0        0.0           NaN  NaN     0.0  Ancham Timanable   
4  False          0.0        0.0           NaN  0.0     0.0    Altardr Flatic   

   Transported  
0         True  
1        F

In [20]:
# Drop unnecessary columns
del df['PassengerId']
del df['Name']

In [21]:
# Check drops
columns = list(df.columns)
columns

['HomePlanet',
 'CryoSleep',
 'Deck',
 'RoomNum',
 'Side',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Transported']

In [22]:
# examining missing values
print("Missing values distribution: ")
print(df.isnull().mean())

Missing values distribution: 
HomePlanet      0.022205
CryoSleep       0.000000
Deck            0.023053
RoomNum         0.023053
Side            0.023053
Destination     0.021126
Age             0.020817
VIP             0.000000
RoomService     0.020278
FoodCourt       0.022282
ShoppingMall    0.023593
Spa             0.021897
VRDeck          0.020663
Transported     0.000000
dtype: float64


In [23]:
# check datatype in each column
print("Column datatypes: ")
print(df.dtypes)

Column datatypes: 
HomePlanet       object
CryoSleep          bool
Deck             object
RoomNum          object
Side             object
Destination      object
Age             float64
VIP                bool
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
dtype: object


In [24]:
# Exploring the data
print('HomePlanet', df["HomePlanet"].unique())
print('CryoSleep', df["CryoSleep"].unique())
print('Deck', df["Deck"].unique())
print('Side', df["Side"].unique())
print('Destination', df["Destination"].unique())
print('VIP', df["VIP"].unique())
print('Transported', df["Transported"].unique())

HomePlanet ['Europa' 'Earth' 'Mars' None]
CryoSleep [ True False]
Deck ['B' 'F' 'E' 'D' 'C' 'G' None 'A' 'T']
Side ['S' 'P' None]
Destination ['TRAPPIST-1e' '55 Cancri e' 'PSO J318.5-22' None]
VIP [False  True]
Transported [ True False]


In [49]:
df.isnull().sum()

HomePlanet      288
CryoSleep         0
Deck            299
RoomNum         299
Side            299
Destination     274
Age             270
VIP               0
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
dtype: int64

In [50]:
df.shape

(12970, 14)

In [53]:
df["Transported"].value_counts()

True     8655
False    4315
Name: Transported, dtype: int64

In [None]:
df["Transported"].value_counts()

# Dealing with NaNs

In [26]:
df_clean = df.dropna()
df_clean

Unnamed: 0,HomePlanet,CryoSleep,Deck,RoomNum,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
495,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
497,Europa,True,B,1,P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,True
498,Earth,False,F,4,P,55 Cancri e,24.0,False,0.0,1.0,0.0,0.0,637.0,False
499,Mars,True,F,5,P,TRAPPIST-1e,45.0,False,0.0,0.0,0.0,0.0,0.0,True
500,Earth,False,G,0,P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,Mars,False,F,465,P,TRAPPIST-1e,37.0,False,20.0,0.0,2559.0,20.0,0.0,True
12966,Earth,False,F,739,P,TRAPPIST-1e,20.0,False,0.0,0.0,2687.0,35.0,340.0,False
12967,Europa,False,B,153,S,55 Cancri e,42.0,False,0.0,7000.0,8251.0,1523.0,0.0,True
12968,Europa,False,C,257,P,TRAPPIST-1e,41.0,False,0.0,116.0,10705.0,9181.0,10.0,False


In [27]:
compression_opts = dict(method='zip',
                        archive_name='titanic.csv')  
df.to_csv('titanic.zip', index=False,
          compression=compression_opts)  

# Random Forest Model

In [28]:
# Define features set
X = df_clean.copy()
X.drop("Transported", axis=1, inplace=True)
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Deck,RoomNum,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
495,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
497,Europa,True,B,1,P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0
498,Earth,False,F,4,P,55 Cancri e,24.0,False,0.0,1.0,0.0,0.0,637.0
499,Mars,True,F,5,P,TRAPPIST-1e,45.0,False,0.0,0.0,0.0,0.0,0.0
500,Earth,False,G,0,P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0


In [29]:
# Define target vector
y = df_clean["Transported"].values.reshape(-1, 1)
y[:5]

array([[False],
       [ True],
       [False],
       [ True],
       [ True]])

In [30]:
# Dummy Categorical Variables
X = pd.get_dummies(X)

In [31]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [32]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [33]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [34]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [36]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train, y_train)

  


In [37]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

In [38]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [39]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,417,466
Actual 1,285,1486


Accuracy Score : 0.7170308967596082
Classification Report
              precision    recall  f1-score   support

       False       0.59      0.47      0.53       883
        True       0.76      0.84      0.80      1771

    accuracy                           0.72      2654
   macro avg       0.68      0.66      0.66      2654
weighted avg       0.71      0.72      0.71      2654



In [40]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:100]

[(0.0744555648143885, 'Age'),
 (0.06981689201196303, 'Spa'),
 (0.06837316885792351, 'VRDeck'),
 (0.06765946815000623, 'RoomService'),
 (0.058332370950318145, 'FoodCourt'),
 (0.0549888874603498, 'ShoppingMall'),
 (0.03275977169328835, 'CryoSleep'),
 (0.01088559113934193, 'HomePlanet_Earth'),
 (0.009431924545848357, 'Deck_F'),
 (0.008791962622155028, 'Deck_E'),
 (0.008342361709447093, 'Destination_TRAPPIST-1e'),
 (0.008294180096810206, 'HomePlanet_Europa'),
 (0.007455625430327392, 'Destination_55 Cancri e'),
 (0.0073762032676351974, 'Side_S'),
 (0.007250635221492544, 'Side_P'),
 (0.006876620257560048, 'Deck_G'),
 (0.006719317780139269, 'HomePlanet_Mars'),
 (0.005964778476840288, 'VIP'),
 (0.004953223822679059, 'Destination_PSO J318.5-22'),
 (0.004830249732634252, 'Deck_B'),
 (0.004156215620405723, 'Deck_C'),
 (0.003577799859542926, 'Deck_D'),
 (0.0021788723111566244, 'Deck_A'),
 (0.0017467049932325584, 'RoomNum_37'),
 (0.0014473387723955837, 'RoomNum_57'),
 (0.00133808077030804, 'RoomNum

# Trying Data Imputation

In [41]:
# Impute int values with mean.

titanic_df_impute = df.copy()

mean_imputer = impute.SimpleImputer(strategy='mean')

titanic_df_impute['Age'] = mean_imputer.fit_transform(titanic_df_impute['Age'].values.reshape(-1,1))
titanic_df_impute['RoomService'] = mean_imputer.fit_transform(titanic_df_impute['RoomService'].values.reshape(-1,1))
titanic_df_impute['FoodCourt'] = mean_imputer.fit_transform(titanic_df_impute['FoodCourt'].values.reshape(-1,1))
titanic_df_impute['ShoppingMall'] = mean_imputer.fit_transform(titanic_df_impute['ShoppingMall'].values.reshape(-1,1))
titanic_df_impute['Spa'] = mean_imputer.fit_transform(titanic_df_impute['Spa'].values.reshape(-1,1))
titanic_df_impute['VRDeck'] = mean_imputer.fit_transform(titanic_df_impute['VRDeck'].values.reshape(-1,1))

In [42]:
# Check to see if Mean Imputation was performed
titanic_df_impute.isnull().sum()

HomePlanet      288
CryoSleep         0
Deck            299
RoomNum         299
Side            299
Destination     274
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

In [43]:
## Use ffil and fillna to fill object values with the value nearest to it
titanic_df_impute['RoomNum'] = titanic_df_impute['RoomNum'].fillna(method='ffill')
titanic_df_impute['HomePlanet'] = titanic_df_impute['HomePlanet'].fillna(method='ffill')
titanic_df_impute['CryoSleep'] = titanic_df_impute['CryoSleep'].fillna(method='ffill')
titanic_df_impute['Deck'] = titanic_df_impute['Deck'].fillna(method='ffill')
titanic_df_impute['Side'] = titanic_df_impute['Side'].fillna(method='ffill')
titanic_df_impute['VIP'] = titanic_df_impute['VIP'].fillna(method='ffill')
titanic_df_impute['Destination'] = titanic_df_impute['Destination'].fillna(method='ffill')

In [44]:
# Check to see if categorical Imputation was performed
titanic_df_impute.isnull().sum()

HomePlanet      0
CryoSleep       0
Deck            0
RoomNum         0
Side            0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

In [47]:
titanic_df_impute

Unnamed: 0,HomePlanet,CryoSleep,Deck,RoomNum,Side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,True,B,175,S,TRAPPIST-1e,33.0,False,0.0,4017.0,174.906033,308.476904,2260.0,True
1,Earth,False,F,1723,S,55 Cancri e,19.0,False,720.0,2.0,174.906033,308.476904,0.0,False
2,Earth,False,E,154,S,PSO J318.5-22,24.0,False,0.0,618.0,174.906033,308.476904,0.0,True
3,Europa,True,D,92,P,55 Cancri e,17.0,False,0.0,0.0,174.906033,308.476904,0.0,True
4,Europa,True,B,1,P,TRAPPIST-1e,34.0,False,0.0,0.0,174.906033,0.000000,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,Mars,False,F,465,P,TRAPPIST-1e,37.0,False,20.0,0.0,2559.000000,20.000000,0.0,True
12966,Earth,False,F,739,P,TRAPPIST-1e,20.0,False,0.0,0.0,2687.000000,35.000000,340.0,False
12967,Europa,False,B,153,S,55 Cancri e,42.0,False,0.0,7000.0,8251.000000,1523.000000,0.0,True
12968,Europa,False,C,257,P,TRAPPIST-1e,41.0,False,0.0,116.0,10705.000000,9181.000000,10.0,False
