In this submission notebook, I will train the best model on the whole cleaned data set and use it to predict the actual test data set

In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib

In [2]:
training_df = pd.read_csv(r"../cleaned_data/cleaned_data.csv")

In [3]:
training_df.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,Destination,VIP,CryoSleep,Transported,cabin_group,cabin_number,cabin_side,passenger_group_size,RoomService_customer,FoodCourt_customer,ShoppingMall_customer,Spa_customer,VRDeck_customer
0,39.0,0.0,0.0,0.0,0.0,0.0,Europa,TRAPPIST-1e,False,False,False,B,0,P,1,False,False,False,False,False
1,24.0,109.0,9.0,25.0,549.0,44.0,Earth,TRAPPIST-1e,False,False,True,F,0,S,1,True,True,True,True,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,Europa,TRAPPIST-1e,True,False,False,A,0,S,2,True,True,False,True,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,Europa,TRAPPIST-1e,False,False,False,A,0,S,2,False,True,True,True,True
4,16.0,303.0,70.0,151.0,565.0,2.0,Earth,TRAPPIST-1e,False,False,True,F,1,S,1,True,True,True,True,True


In [4]:
#creating a list of columns to use later on the test set

training_cols = training_df.columns.tolist()

training_cols_final = [x for x in training_cols if x != "Transported"]

training_cols_final

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'HomePlanet',
 'Destination',
 'VIP',
 'CryoSleep',
 'cabin_group',
 'cabin_number',
 'cabin_side',
 'passenger_group_size',
 'RoomService_customer',
 'FoodCourt_customer',
 'ShoppingMall_customer',
 'Spa_customer',
 'VRDeck_customer']

Selectign Training Data

In [5]:
X_train = training_df[training_cols_final]
y_train = training_df['Transported']

X_train.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,Destination,VIP,CryoSleep,cabin_group,cabin_number,cabin_side,passenger_group_size,RoomService_customer,FoodCourt_customer,ShoppingMall_customer,Spa_customer,VRDeck_customer
0,39.0,0.0,0.0,0.0,0.0,0.0,Europa,TRAPPIST-1e,False,False,B,0,P,1,False,False,False,False,False
1,24.0,109.0,9.0,25.0,549.0,44.0,Earth,TRAPPIST-1e,False,False,F,0,S,1,True,True,True,True,True
2,58.0,43.0,3576.0,0.0,6715.0,49.0,Europa,TRAPPIST-1e,True,False,A,0,S,2,True,True,False,True,True
3,33.0,0.0,1283.0,371.0,3329.0,193.0,Europa,TRAPPIST-1e,False,False,A,0,S,2,False,True,True,True,True
4,16.0,303.0,70.0,151.0,565.0,2.0,Earth,TRAPPIST-1e,False,False,F,1,S,1,True,True,True,True,True


Loading processor

In [6]:
preprocessor = joblib.load(r"../artifacts/preprocessor.joblib")
preprocessor_1 = joblib.load(r"../artifacts/preprocessor_1.joblib")

Transforming training dataset

In [7]:
training_processed = preprocessor.transform(X_train)

Converting Processed Data Set into a Dataframe

In [8]:
training_processed_df = pd.DataFrame(training_processed, columns=preprocessor.get_feature_names_out())

In [9]:
training_processed_df.head()

Unnamed: 0,num__Age,num__RoomService,num__FoodCourt,num__ShoppingMall,num__Spa,num__VRDeck,num__cabin_number,num__passenger_group_size,one_hot__HomePlanet_Earth,one_hot__HomePlanet_Europa,...,one_hot__RoomService_customer_False,one_hot__RoomService_customer_True,one_hot__FoodCourt_customer_False,one_hot__FoodCourt_customer_True,one_hot__ShoppingMall_customer_False,one_hot__ShoppingMall_customer_True,one_hot__Spa_customer_False,one_hot__Spa_customer_True,one_hot__VRDeck_customer_False,one_hot__VRDeck_customer_True
0,0.70087,-0.332213,-0.286542,-0.298906,-0.265089,-0.265611,-1.199029,-0.645587,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,-0.33866,-0.172504,-0.280623,-0.254667,0.201747,-0.225655,-1.199029,-0.645587,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,2.017608,-0.269209,2.065252,-0.298906,5.444943,-0.221115,-1.199029,-0.01187,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.285058,-0.332213,0.557237,0.357594,2.565692,-0.090352,-1.199029,-0.01187,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,-0.893076,0.111749,-0.240506,-0.031705,0.215353,-0.263794,-1.197048,-0.645587,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [10]:


with open("../best_model/rf_best_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [11]:
new_best_model = loaded_model.fit(training_processed_df,y_train)

Loading the Test data

In [12]:
test_df = pd.read_csv(r"../data_source__spaceship-titanic/test.csv")

In [None]:
test_df['cabin_group'] = test_df['Cabin'].str[:1]
test_df['cabin_number'] = test_df['Cabin'].str.extract(r"/([^/]+)/")
test_df['cabin_side'] = test_df['Cabin'].str.extract(r"/([^/]+)$")

In [None]:
test_df['passenger_group'] = test_df['PassengerId'].str[:4]

In [None]:
passenger_group_size_dict = test_df['passenger_group'].value_counts().to_dict()
passenger_group_size_dict

In [None]:
test_df['passenger_group_size'] = test_df['passenger_group'].map(passenger_group_size_dict)

In [13]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [14]:
test_df_ = test_df.copy()

In [15]:
test_df_.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [16]:
#test_df_ = test_df_.drop(['PassengerId', 'Name'], axis=1)

In [17]:
#creating a dummy transported column
test_df_['Transported']= 'na'

In [18]:
 test_df_processed = preprocessor_1.transform(test_df_)

In [19]:
test_df_processed_df = pd.DataFrame(test_df_processed, columns=preprocessor_1.get_feature_names_out())

In [20]:
test_df_processed_df

Unnamed: 0,mode__Cabin,median__Age,zero__RoomService,zero__FoodCourt,zero__ShoppingMall,zero__Spa,zero__VRDeck,unknown__HomePlanet,unknown__Destination,unknown__Name,false__VIP,remainder__PassengerId,remainder__CryoSleep,remainder__Transported
0,G/3/S,27.0,0.0,0.0,0.0,0.0,0.0,Earth,TRAPPIST-1e,Nelly Carsoning,False,0013_01,True,na
1,F/4/S,19.0,0.0,9.0,0.0,2823.0,0.0,Earth,TRAPPIST-1e,Lerome Peckers,False,0018_01,False,na
2,C/0/S,31.0,0.0,0.0,0.0,0.0,0.0,Europa,55 Cancri e,Sabih Unhearfus,False,0019_01,True,na
3,C/1/S,38.0,0.0,6652.0,0.0,181.0,585.0,Europa,TRAPPIST-1e,Meratz Caltilter,False,0021_01,False,na
4,F/5/S,20.0,10.0,0.0,635.0,0.0,0.0,Earth,TRAPPIST-1e,Brence Harperez,False,0023_01,False,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,G/1496/S,34.0,0.0,0.0,0.0,0.0,0.0,Earth,TRAPPIST-1e,Jeron Peter,False,9266_02,True,na
4273,G/734/S,42.0,0.0,847.0,17.0,10.0,144.0,Earth,TRAPPIST-1e,Matty Scheron,False,9269_01,False,na
4274,D/296/P,27.0,0.0,0.0,0.0,0.0,0.0,Mars,55 Cancri e,Jayrin Pore,False,9271_01,True,na
4275,D/297/P,27.0,0.0,2680.0,0.0,0.0,523.0,Europa,unknown,Kitakan Conale,False,9273_01,False,na


In [21]:
test_df_processed_df.columns = test_df_processed_df.columns.str.replace(r".*__", "", regex=True)

In [22]:
test_df_processed_df['RoomService_customer'] = np.where(test_df_processed_df['RoomService']>0, True, False)
test_df_processed_df['FoodCourt_customer'] = np.where(test_df_processed_df['FoodCourt']>0, True, False)
test_df_processed_df['FoodCourt_customer'] = np.where(test_df_processed_df['FoodCourt']>0, True, False)
test_df_processed_df['ShoppingMall_customer'] = np.where(test_df_processed_df['ShoppingMall']>0, True, False)
test_df_processed_df['Spa_customer'] = np.where(test_df_processed_df['Spa']>0, True, False)
test_df_processed_df['VRDeck_customer'] = np.where(test_df_processed_df['VRDeck']>0, True, False)


In [23]:
test_df_processed_df['cabin_group'] = test_df_processed_df['Cabin'].str[:1]

In [24]:
test_df_processed_df = test_df_processed_df[training_cols_final]

KeyError: "['cabin_number', 'cabin_side', 'passenger_group_size'] not in index"

In [None]:
test_df_processed_df

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,Destination,VIP,CryoSleep,cabin_group,RoomService_customer,FoodCourt_customer,ShoppingMall_customer,Spa_customer,VRDeck_customer
0,27.0,0.0,0.0,0.0,0.0,0.0,Earth,TRAPPIST-1e,False,True,G,False,False,False,False,False
1,19.0,0.0,9.0,0.0,2823.0,0.0,Earth,TRAPPIST-1e,False,False,F,False,True,False,True,False
2,31.0,0.0,0.0,0.0,0.0,0.0,Europa,55 Cancri e,False,True,C,False,False,False,False,False
3,38.0,0.0,6652.0,0.0,181.0,585.0,Europa,TRAPPIST-1e,False,False,C,False,True,False,True,True
4,20.0,10.0,0.0,635.0,0.0,0.0,Earth,TRAPPIST-1e,False,False,F,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,34.0,0.0,0.0,0.0,0.0,0.0,Earth,TRAPPIST-1e,False,True,G,False,False,False,False,False
4273,42.0,0.0,847.0,17.0,10.0,144.0,Earth,TRAPPIST-1e,False,False,G,False,True,True,True,True
4274,27.0,0.0,0.0,0.0,0.0,0.0,Mars,55 Cancri e,False,True,D,False,False,False,False,False
4275,27.0,0.0,2680.0,0.0,0.0,523.0,Europa,unknown,False,False,D,False,True,False,False,True


In [None]:
test_df_processed_df_final = preprocessor.transform(test_df_processed_df)

In [None]:
new_best_model.predict(test_df_processed_df_final)



array([ True, False,  True, ...,  True,  True,  True], shape=(4277,))

In [None]:
test_df['Transported'] = new_best_model.predict(test_df_processed_df_final)



In [None]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,True
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,False
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,True
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,True
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,False


In [None]:
test_df[['PassengerId', 'Transported']].to_csv(r'../submission/submission.csv', index=False)