# PROJECT DESCRIPTION (KAGGLE -> SPACESHIP-TITANIC)

In [87]:
# https://www.kaggle.com/competitions/spaceship-titanic/overview

In [88]:
# Dataset Description
# In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.

# File and Data Field Descriptions

# train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
# PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
# HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
# CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
# Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
# Destination - The planet the passenger will be debarking to.
# Age - The age of the passenger.
# VIP - Whether the passenger has paid for special VIP service during the voyage.
# RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
# Name - The first and last names of the passenger.
# Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.


# test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

# sample_submission.csv - A submission file in the correct format.
# PassengerId - Id for each passenger in the test set.
# Transported - The target. For each passenger, predict either True or False.

# IMPORT STATEMENTS

In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error


# encoding
from sklearn.preprocessing import LabelEncoder

import statsmodels.api as sm
from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# GET RAW DATASET

In [90]:
raw_url_train = "https://raw.githubusercontent.com/shivansh-yashasvi/Projects-for-Practice---Machine_Learning/main/Space%20Titanic%20Classfication/Dataset/train.csv"
raw_url_test = "https://raw.githubusercontent.com/shivansh-yashasvi/Projects-for-Practice---Machine_Learning/main/Space%20Titanic%20Classfication/Dataset/test.csv"
raw_url_sample = "https://raw.githubusercontent.com/shivansh-yashasvi/Projects-for-Practice---Machine_Learning/main/Space%20Titanic%20Classfication/Dataset/sample_submission.csv"

# Download the CSV file from GitHub
!wget -O train.csv  $raw_url_train
!wget -O test.csv $raw_url_test
!wget -O sample.csv $raw_url_sample

--2024-01-06 11:41:54--  https://raw.githubusercontent.com/shivansh-yashasvi/Projects-for-Practice---Machine_Learning/main/Space%20Titanic%20Classfication/Dataset/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 805421 (787K) [text/plain]
Saving to: ‘train.csv’


2024-01-06 11:41:54 (25.6 MB/s) - ‘train.csv’ saved [805421/805421]

--2024-01-06 11:41:54--  https://raw.githubusercontent.com/shivansh-yashasvi/Projects-for-Practice---Machine_Learning/main/Space%20Titanic%20Classfication/Dataset/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting r

In [109]:
dfTrain_raw = pd.read_csv('train.csv')
dfTest_raw = pd.read_csv('test.csv')
dfSample_raw = pd.read_csv('sample.csv')

In [110]:
dfTrain_raw.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [111]:
dfTest_raw.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


# PIPELINE FOR DATASET PREPROCESSING

run preprocessing according to your needs, to remove the repeated steps.
Have to manually run dfTest and dfTrain according to our needs

### SELECT WHICH DATASET TO PASS THROUGH PIPELINE
df_pipeline = datasetOfOurChoice

In [153]:
df_pipeline = dfTest_raw.copy()

In [154]:
selected_drop_columns = ['PassengerId','Cabin']

### PIPELINING STEPS

In [155]:
df_pipeline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [156]:
"""""
Regression

Distinct -> encoding -> which makes it continous finally
Continuous->we need them in regression


Classification
Distinct->we need them in classification
Continous-> sort the numerical column and try to divide by the pivot point(in decision trees,ensemble learning(random forest)) but in SVM,Neural Network and Logistic we want continous only
"""

# Continous -> gggg,pp,num,age,roomService,foodCourt,shoppingMall,spa,vr_deck,sum(expenses)
# Distinct  -> homeplanet,cryosleep,destination,vip,deck,side,sir_name

# Output -> trasported

'""\nRegression\n\nDistinct -> encoding -> which makes it continous finally\nContinuous->we need them in regression\n\n\nClassification\nDistinct->we need them in classification\nContinous-> sort the numerical column and try to divide by the pivot point(in decision trees,ensemble learning(random forest)) but in SVM,Neural Network and Logistic we want continous only\n'

In [157]:
# SPLITTING THE PASSENGERID COLUMN WITH DELIMITER "_"
df_pipeline['PassengerId_gggg'] = df_pipeline['PassengerId'].str[:-3]
df_pipeline['PassengerId_pp'] = df_pipeline['PassengerId'].str[-2:]

In [158]:
# SPLITTING THE CABIN COLUMN WITH DELIMITER "/"
df_pipeline['CabinDeck']=df_pipeline['Cabin'].str[:1]
df_pipeline['CabinNum']=df_pipeline['Cabin'].str[2:-2]
df_pipeline['CabinSide']=df_pipeline['Cabin'].str[-1:]

In [159]:
# CREATING A NEW COLUMN TOTALEXPENSE WHICH CONTAINS THE SUM OF ALL OTHER EXPENSES
df_pipeline['TotalExpense'] = df_pipeline['RoomService'].fillna(0) + df_pipeline['FoodCourt'].fillna(0) + df_pipeline['ShoppingMall'].fillna(0) + df_pipeline['Spa'].fillna(0) + df_pipeline['VRDeck'].fillna(0);

In [160]:
#### CHECKING TOTAL NO. OF UNIQUE VALUES IN EACH COLUMN

# for col in df_pipeline.columns:
#   print(len(df_pipeline[col].unique()))

In [161]:
#change all null values in categorical column to nan
#change all null values in numerical column to 0
#change all null values in boolean column to 2

colBoolean=['CryoSleep','VIP']
colDistinct=['HomePlanet','Destination','Name','CabinDeck','CabinSide', 'PassengerId', 'Cabin']
colContinous=['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','PassengerId_gggg','PassengerId_pp','CabinNum','TotalExpense']


le = LabelEncoder()

for col in colBoolean:
  df_pipeline[col]=df_pipeline[col].fillna(2).astype(int)

for col in colDistinct:
  df_pipeline[col] = df_pipeline[col].fillna('NaN')
  df_pipeline[col] = le.fit_transform(df_pipeline[col])

for col in colContinous:
  df_pipeline[col] = df_pipeline[col].fillna('0')
  df_pipeline[col] = pd.to_numeric(df_pipeline[col], errors='coerce')

**COLUMNS DROPPED**

In [162]:
df_pipeline = df_pipeline.drop(columns=selected_drop_columns)

In [163]:
df_pipeline.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,PassengerId_gggg,PassengerId_pp,CabinDeck,CabinNum,CabinSide,TotalExpense
0,0,1,3,27.0,0,0.0,0.0,0.0,0.0,0.0,2913,13,1,6,3,2,0.0
1,0,0,3,19.0,0,0.0,9.0,0.0,2823.0,0.0,2406,18,1,5,4,2,2832.0
2,1,1,0,31.0,0,0.0,0.0,0.0,0.0,0.0,3377,19,1,2,0,2,0.0
3,1,0,3,38.0,0,0.0,6652.0,0.0,181.0,585.0,2711,21,1,2,1,2,7418.0
4,0,0,3,20.0,0,10.0,0.0,635.0,0.0,0.0,668,23,1,5,5,2,645.0


In [164]:
df_pipeline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   HomePlanet        4277 non-null   int64  
 1   CryoSleep         4277 non-null   int64  
 2   Destination       4277 non-null   int64  
 3   Age               4277 non-null   float64
 4   VIP               4277 non-null   int64  
 5   RoomService       4277 non-null   float64
 6   FoodCourt         4277 non-null   float64
 7   ShoppingMall      4277 non-null   float64
 8   Spa               4277 non-null   float64
 9   VRDeck            4277 non-null   float64
 10  Name              4277 non-null   int64  
 11  PassengerId_gggg  4277 non-null   int64  
 12  PassengerId_pp    4277 non-null   int64  
 13  CabinDeck         4277 non-null   int64  
 14  CabinNum          4277 non-null   int64  
 15  CabinSide         4277 non-null   int64  
 16  TotalExpense      4277 non-null   float64


### PIPELINE OUTPUT

In [165]:
dfTest = df_pipeline.copy()

# [ OPTIONAL ] - FEATURE SELECTION

### USING BACKWARD ELIMINATION

In [None]:
## FEATURE SELECTION 2 WAYS :
  ## 1. Forward Selection
  ## 2. Backward Elimination
  ## 3. Statistical Methods -> Correlation , Chi-Square Test , etc.

In [None]:
# # Assuming dfTrain is your DataFrame
# target_column = "Transported"
# all_columns = dfTrain.columns.tolist()
# all_columns.remove(target_column)

# bestAccuracy=0;

# # for r in range(1, len(all_columns) + 1):
# #     # Generate all possible combinations of columns
# column_combinations = combinations(all_columns, 14)

# for columns in column_combinations:
#     X = dfTrain[list(columns)]
#     y = dfTrain[target_column]

#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     # Create a Random Forest Classifier
#     xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=len(set(y)), random_state=42)

#     # Train the classifier on the training data
#     xgb_classifier.fit(X_train, y_train)

#     # Make predictions on the test data
#     y_pred = xgb_classifier.predict(X_test)

#     # Evaluate the performance
#     accuracy = accuracy_score(y_test, y_pred)

#     if(accuracy>bestAccuracy):
#       bestAccuracy=accuracy;
#       print(f"Columns: {columns}")
#       print(f"Accuracy: {accuracy:.3f}")
#       print("=" * 30)

Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'PassengerId_gggg', 'PassengerId_pp', 'CabinDeck')
Accuracy: 0.788
Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'PassengerId_gggg', 'PassengerId_pp', 'CabinNum')
Accuracy: 0.793
Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'CabinDeck', 'CabinNum', 'CabinSide')
Accuracy: 0.795
Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'PassengerId_gggg', 'PassengerId_pp', 'CabinDeck', 'CabinSide')
Accuracy: 0.798
Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'PassengerId_pp', 'CabinDeck', 'CabinNum', 'CabinSide')
Accuracy: 0.803
Co

In [None]:
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'PassengerId_gggg', 'PassengerId_pp', 'CabinDeck')
# Accuracy: 0.788
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'PassengerId_gggg', 'PassengerId_pp', 'CabinNum')
# Accuracy: 0.793
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'CabinDeck', 'CabinNum', 'CabinSide')
# Accuracy: 0.795
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'PassengerId_gggg', 'PassengerId_pp', 'CabinDeck', 'CabinSide')
# Accuracy: 0.798
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'PassengerId_pp', 'CabinDeck', 'CabinNum', 'CabinSide')
# Accuracy: 0.803
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'PassengerId_pp', 'CabinDeck', 'CabinNum', 'CabinSide', 'TotalExpense')
# Accuracy: 0.803
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'FoodCourt', 'ShoppingMall', 'Spa', 'Name', 'PassengerId_gggg', 'PassengerId_pp', 'CabinDeck', 'CabinSide', 'TotalExpense')
# Accuracy: 0.804
# ==============================
# Columns: ('HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'PassengerId_gggg', 'PassengerId_pp', 'CabinDeck', 'CabinNum', 'CabinSide', 'TotalExpense')
# Accuracy: 0.807
# ==============================

# MODELING

In [128]:
# dividing into the independent columns and target columns
X = dfTrain.drop(columns=['Transported'])
y = dfTrain['Transported']

In [129]:
# choose the columns which are required to be dropped
selected_columns_to_drop_from_X = []

In [130]:
X = X.drop(columns=selected_columns_to_drop_from_X)

In [131]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random Forest Classifier

In [132]:
# Create a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)


# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.791
Classification Report:
              precision    recall  f1-score   support

       False       0.78      0.80      0.79       861
        True       0.80      0.78      0.79       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



## XGBoost Classifier

In [133]:
# Create an XGBoost classifier
xgb_classifier = XGBClassifier(objective='multi:softmax', num_class=len(set(y)), random_state=42)

# Train the classifier on the training data
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_classifier.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.803
Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.81      0.80       861
        True       0.81      0.80      0.80       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



# Validation on Test Dataset

**RUN THE dfTest_raw through pipeline**

In [148]:
dfTest_raw.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


## SELECT THE MODEL FOR VALIDATION

In [149]:
# CHANGE THE MODEL HERE
y_validate = xgb_classifier.predict(dfTest)

In [150]:
y_validate = y_validate.astype(bool)
print(y_validate)

[ True False  True ...  True  True  True]


## CREATING SUBMISSION FILE

In [151]:
result_df = pd.DataFrame({'PassengerId': dfTest_raw['PassengerId'], 'Transported': y_validate})

print(" ------------------- INFO ABOUT RESULT -------------------")
print(result_df.info())
print()
print("  ------------------- HEAD of RESULT  -------------------")
print(result_df.head())

 ------------------- INFO ABOUT RESULT -------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  4277 non-null   object
 1   Transported  4277 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 37.7+ KB
None

  ------------------- HEAD of RESULT  -------------------
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


## DOWNLOAD FILE

In [None]:
# SAVED IN COLAB FOLDER
## OUTPUT WILL BE IN THE REQUIRED SAMPLE CSV FORMAT
result_df.to_csv('output.csv', index=False)