Problem statement

Welcome to the year 2912, where your data science skills are needed to solve a cosmic mystery. We've received a transmission from four lightyears away and things aren't looking good.

The Spaceship Titanic was an interstellar passenger liner launched a month ago. With almost 13,000 passengers on board, the vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud. Sadly, it met a fate similar to that of its namesake from 1000 years before. Though the ship stayed intact, almost half of the passengers were transported to an alternate dimension!



To help rescue crews and retrieve the lost passengers, you are challenged to predict which passengers were transported by the anomaly using records recovered from the spaceship’s damaged computer system.

Help save them and change history!

Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

List available input files

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

Read files

In [None]:
# All competition files live in one directory; keeping it in a single constant
# lets the notebook be pointed at a local copy by editing one line.
DATA_DIR = "/kaggle/input/spaceship-titanic"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
train

In [None]:
test

In [None]:
submission

Analyse train

In [None]:
train.isnull().sum()

In [None]:
# Cabin and Name are not used as features. Re-assign instead of mutating in
# place, and ignore columns that are already gone so the cell can be re-run
# without raising KeyError.
unused_columns = ['Cabin', 'Name']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')
train

In [None]:
test

In [None]:
train1 = train.dropna()
train1

Analyse target

In [None]:
# Encode the boolean target as 0/1. A new frame is built instead of calling
# replace(..., inplace=True) on a column selection: that is chained assignment,
# which raises SettingWithCopyWarning and leaves train1 unchanged under pandas
# copy-on-write.
train1 = train1.assign(Transported=train1['Transported'].astype('int64'))
train1['Transported']

In [None]:
sns.displot(train1['Transported'])

In [None]:
trans_count = train1['Transported'].value_counts()
trans_count

In [None]:
trans_percent = trans_count / len(train1)
trans_percent

In [None]:
# Age vs. FoodCourt spend, marker size taken from VRDeck spend, coloured by outcome
# (green = transported, red = not transported).
# Each subset is selected once from train1 so x, y and s always refer to the
# same rows; the previous version masked train1 with a mask built from `train`,
# whose index includes the rows that dropna() removed.
plt.figure(figsize=(25, 7))
ax = plt.subplot()
for outcome, colour in ((1, 'green'), (0, 'red')):
    subset = train1[train1['Transported'] == outcome]
    ax.scatter(subset['Age'], subset['FoodCourt'], c=colour, s=subset['VRDeck'])
ax.set_xlabel('Age')
ax.set_ylabel('FoodCourt');


Drop target

In [None]:
# Keep the label aside, then continue with a feature-only frame.
target = train1['Transported']

# drop() returns a new frame; re-binding avoids mutating a derived frame in place.
train1 = train1.drop(columns=['Transported'])
train1

Combine train and test

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the frames. ignore_index=True gives the combined frame a unique
# 0..n-1 index (train and test labels would otherwise overlap); the later
# train/test split is positional, so it is unaffected.
combi = pd.concat([train1, test], ignore_index=True)
combi

Analyse combi

In [None]:
combi.info()

In [None]:
combi.describe()

Check for null values

In [None]:
combi.isnull().sum()

Impute null values

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=42)

# Placeholder value for missing timestamps.
date = pd.Timestamp('2200-01-01')

# Fill missing values column by column, choosing the strategy from the dtype.
# Every result is assigned back to combi[col]: fillna(..., inplace=True) on a
# column selection is chained assignment and does not reliably update combi.
for col in combi.columns:
    col_dtype = combi[col].dtype
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        combi[col] = combi[col].fillna("not listed")
    elif pd.api.types.is_integer_dtype(col_dtype):
        combi[col] = combi[col].fillna(combi[col].mean())
    elif pd.api.types.is_float_dtype(col_dtype):
        # NOTE(review): the imputer is fitted on one column at a time, so it
        # cannot draw on the other features — confirm whether a joint fit over
        # all numeric columns was intended.
        imputed = imp.fit_transform(combi[col].to_numpy().reshape(-1, 1))
        # fit_transform returns shape (n, 1); flatten before storing as a column.
        combi[col] = imputed.ravel()
    elif pd.api.types.is_datetime64_any_dtype(col_dtype):
        combi[col] = combi[col].fillna(date)
combi

In [None]:
combi.isnull().sum()

Analyse features

Home Planet

In [None]:
sns.displot(combi['HomePlanet'])

In [None]:
home_count = combi['HomePlanet'].value_counts()
home_count

In [None]:
home_percent = home_count / len(combi)
home_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(home_percent.index)
plt.pie(home_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the home planet; 4 marks passengers whose planet was missing.
home_planet_codes = {"Earth": 1, "Europa": 2, "Mars": 3, "not listed": 4}
# Assign the result back (in-place replace on a column selection is chained
# assignment) and cast so the column is guaranteed to be numeric.
combi['HomePlanet'] = combi['HomePlanet'].replace(home_planet_codes).astype(int)
combi['HomePlanet']

Cryo Sleep

In [None]:
# Treat passengers with no recorded CryoSleep status as not in cryosleep.
# .loc writes into combi directly; combi[col][mask] = ... is chained indexing
# and may assign to a temporary copy.
combi.loc[combi['CryoSleep'] == 'not listed', 'CryoSleep'] = False

In [None]:
print(combi.iloc[6674])

In [None]:
# Store the 0/1 encoding back on the frame; previously the replace() result was
# only displayed and then discarded, so the column stayed un-encoded for the
# cells that follow.
combi['CryoSleep'] = combi['CryoSleep'].astype(int)
combi['CryoSleep']

In [None]:
# sns.distplot is deprecated in seaborn; use displot like the other
# distribution plots in this notebook.
sns.displot(combi['CryoSleep'])

In [None]:
sleep_count = combi['CryoSleep'].value_counts()
sleep_count

In [None]:
sleep_percent = sleep_count / len(combi)
sleep_percent

In [None]:
combi['CryoSleep'] = combi['CryoSleep'].astype(int)
combi['CryoSleep']

Destination

In [None]:
sns.displot(combi['Destination'])

In [None]:
dest_count = combi['Destination'].value_counts()
dest_count

In [None]:
dest_percent = dest_count / len(combi)
dest_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(dest_percent.index)
plt.pie(dest_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the destination; 4 marks passengers whose destination was missing.
destination_codes = {"TRAPPIST-1e": 1, "55 Cancri e": 2, "PSO J318.5-22": 3, "not listed": 4}
# Assign the result back (in-place replace on a column selection is chained
# assignment) and cast so the column is guaranteed to be numeric.
combi['Destination'] = combi['Destination'].replace(destination_codes).astype(int)
combi['Destination']

Age

In [None]:
combi['Age_group'] = pd.cut(x=combi['Age'], bins=[-1, 18, 40, 65, 100], labels=['child', 'young adult', 'middle age', 'pensioner'])
combi['Age_group']

In [None]:
sns.displot(combi['Age_group'])

In [None]:
age_count = combi['Age_group'].value_counts()
age_count

In [None]:
age_percent = age_count / len(combi)
age_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(age_percent.index)
plt.pie(age_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the age bands produced by pd.cut.
age_group_codes = {"young adult": 1, "child": 2, "middle age": 3, "pensioner": 4}
# Age_group is categorical: rename the categories instead of calling an
# in-place replace on a column selection, then cast to plain integers.
combi['Age_group'] = combi['Age_group'].cat.rename_categories(age_group_codes).astype(int)
combi['Age_group']

In [None]:
combi['Age_group'] = combi['Age_group'].astype(int)

VIP

In [None]:
# Treat passengers with no recorded VIP status as non-VIP.
# .loc writes into combi directly; combi[col][mask] = ... is chained indexing
# and may assign to a temporary copy.
combi.loc[combi['VIP'] == 'not listed', 'VIP'] = False

In [None]:
# Store the 0/1 encoding back on the frame; previously the replace() result was
# only displayed and then discarded, so the column stayed un-encoded for the
# cells that follow.
combi['VIP'] = combi['VIP'].astype(int)
combi['VIP']

In [None]:
# sns.distplot is deprecated in seaborn; use displot like the other
# distribution plots in this notebook.
sns.displot(combi['VIP'])

In [None]:
vip_count = combi['VIP'].value_counts()
vip_count

In [None]:
vip_percent = vip_count / len(combi)
vip_percent

In [None]:
combi['VIP'] = combi['VIP'].astype(int)
combi['VIP']

Room Service

In [None]:
sns.violinplot(combi['RoomService'])

In [None]:
rm_service_high = combi['RoomService'].max()
print(rm_service_high)

In [None]:
# Bucket RoomService spend into low / med / high. The top bin is open-ended
# (np.inf) so a value above the largest amount seen so far still lands in
# 'high' instead of silently becoming NaN.
combi['Room_Service_group'] = pd.cut(x=combi['RoomService'], bins=[-1, 2000, 8000, np.inf], labels=['low', 'med', 'high'])
combi['Room_Service_group']


In [None]:
sns.displot(combi['Room_Service_group'])

In [None]:
rm_service_count = combi['Room_Service_group'].value_counts()
rm_service_count

In [None]:
rm_service_percent = rm_service_count / len(combi)
rm_service_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(rm_service_percent.index)
plt.pie(rm_service_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the spend bands produced by pd.cut.
spend_level_codes = {"low": 1, "med": 2, "high": 3}
# The column is categorical: rename the categories instead of calling an
# in-place replace on a column selection. Cast to float (not int) so the column
# is numeric for standardisation while any unbinned (NaN) entry stays NaN.
combi['Room_Service_group'] = combi['Room_Service_group'].cat.rename_categories(spend_level_codes).astype(float)
combi['Room_Service_group']

Food Court

In [None]:
sns.violinplot(combi['FoodCourt'])

In [None]:
food_high = combi['FoodCourt'].max()
print(food_high)

In [None]:
# Bucket FoodCourt spend into low / med / high. The top bin is open-ended
# (np.inf) so a value above the largest amount seen so far still lands in
# 'high' instead of silently becoming NaN.
combi['Food_Court_group'] = pd.cut(x=combi['FoodCourt'], bins=[-1, 5000, 20000, np.inf], labels=['low', 'med', 'high'])
combi['Food_Court_group']

In [None]:
sns.displot(combi['Food_Court_group'])

In [None]:
fd_court_count = combi['Food_Court_group'].value_counts()
fd_court_count

In [None]:
fd_court_percent = fd_court_count / len(combi)
fd_court_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(fd_court_percent.index)
plt.pie(fd_court_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the spend bands produced by pd.cut.
spend_level_codes = {"low": 1, "med": 2, "high": 3}
# The column is categorical: rename the categories instead of calling an
# in-place replace on a column selection. Cast to float (not int) so the column
# is numeric for standardisation while any unbinned (NaN) entry stays NaN.
combi['Food_Court_group'] = combi['Food_Court_group'].cat.rename_categories(spend_level_codes).astype(float)
combi['Food_Court_group']

Shopping Mall

In [None]:
sns.violinplot(combi['ShoppingMall'])

In [None]:
shop_high = combi['ShoppingMall'].max()
print(shop_high)

In [None]:
# Bucket ShoppingMall spend into low / med / high. The top bin is open-ended
# (np.inf) so a value above the largest amount seen so far still lands in
# 'high' instead of silently becoming NaN.
combi['Shopping_group'] = pd.cut(x=combi['ShoppingMall'], bins=[-1, 2000, 15000, np.inf], labels=['low', 'med', 'high'])
combi['Shopping_group']

In [None]:
sns.displot(combi['Shopping_group'])

In [None]:
shopping_count = combi['Shopping_group'].value_counts()
shopping_count

In [None]:
shopping_percent = shopping_count / len(combi)
shopping_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(shopping_percent.index)
plt.pie(shopping_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the spend bands produced by pd.cut.
spend_level_codes = {"low": 1, "med": 2, "high": 3}
# The column is categorical: rename the categories instead of calling an
# in-place replace on a column selection. Cast to float (not int) so the column
# is numeric for standardisation while any unbinned (NaN) entry stays NaN.
combi['Shopping_group'] = combi['Shopping_group'].cat.rename_categories(spend_level_codes).astype(float)
combi['Shopping_group']

Spa

In [None]:
sns.violinplot(combi['Spa'])

In [None]:
spa_high = combi['Spa'].max()
print(spa_high)

In [None]:
# Bucket Spa spend into low / med / high. The top bin is open-ended (np.inf)
# so a value above the largest amount seen so far still lands in 'high'
# instead of silently becoming NaN.
combi['Spa_group'] = pd.cut(x=combi['Spa'], bins=[-1, 5000, 15000, np.inf], labels=['low', 'med', 'high'])
combi['Spa_group']

In [None]:
sns.displot(combi['Spa_group'])

In [None]:
spa_count = combi['Spa_group'].value_counts()
spa_count

In [None]:
spa_percent = spa_count / len(combi)
spa_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(spa_percent.index)
plt.pie(spa_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the spend bands produced by pd.cut.
spend_level_codes = {"low": 1, "med": 2, "high": 3}
# The column is categorical: rename the categories instead of calling an
# in-place replace on a column selection. Cast to float (not int) so the column
# is numeric for standardisation while any unbinned (NaN) entry stays NaN.
combi['Spa_group'] = combi['Spa_group'].cat.rename_categories(spend_level_codes).astype(float)
combi['Spa_group']

VR Deck

In [None]:
sns.violinplot(combi['VRDeck'])

In [None]:
vr_high = combi['VRDeck'].max()
print(vr_high)

In [None]:
# Bucket VRDeck spend into low / med / high. The top bin is open-ended (np.inf)
# so a value above the largest amount seen so far still lands in 'high'
# instead of silently becoming NaN.
combi['VR_group'] = pd.cut(x=combi['VRDeck'], bins=[-1, 5000, 15000, np.inf], labels=['low', 'med', 'high'])
combi['VR_group']

In [None]:
sns.displot(combi['VR_group'])

In [None]:
vr_count = combi['VR_group'].value_counts()
vr_count

In [None]:
vr_percent = vr_count / len(combi)
vr_percent

In [None]:
# Take the wedge labels from the Series index so they always match the order
# produced by value_counts(), instead of relying on a hand-written list.
mylabels = list(vr_percent.index)
plt.pie(vr_percent, labels=mylabels)
plt.show()

In [None]:
# Integer codes for the spend bands produced by pd.cut.
spend_level_codes = {"low": 1, "med": 2, "high": 3}
# The column is categorical: rename the categories instead of calling an
# in-place replace on a column selection. Cast to float (not int) so the column
# is numeric for standardisation while any unbinned (NaN) entry stays NaN.
combi['VR_group'] = combi['VR_group'].cat.rename_categories(spend_level_codes).astype(float)
combi['VR_group']

Assign features

In [None]:
features = ["HomePlanet", "CryoSleep", "Destination", "Age_group", "Room_Service_group", "Food_Court_group", "Shopping_group", "Spa_group", "VR_group"]


In [None]:
combi[features].shape

In [None]:
combi[features].isnull().sum()

In [None]:
combi[features].info()

Assign features

Standardise combi

In [None]:
# Z-score every feature column: subtract the column mean, divide by the column
# standard deviation.
# The statistics are computed with explicit column-wise DataFrame methods:
# np.mean(DataFrame) dispatches to DataFrame.mean(axis=None), which on
# pandas >= 2.0 collapses the whole frame to one scalar instead of one value
# per column. std() keeps the pandas default (sample std, ddof=1), which is
# what np.std(DataFrame) resolved to as well.
# The cast makes categorical / object encoded columns usable in arithmetic.
# NOTE(review): statistics are computed over train and test rows together.
feature_values = combi[features].astype(float)
combi[features] = (feature_values - feature_values.mean(axis=0)) / feature_values.std(axis=0)

Define X and y

In [None]:
# The first len(train1) rows of combi are the training rows; everything after
# them is the test set.
n_train_rows = len(train1)
feature_frame = combi[features]

y = target
X = feature_frame[:n_train_rows]
X_test = feature_frame[n_train_rows:]

Heatmap

In [None]:
# Pairwise correlation between the selected features, drawn as a heatmap.
feature_frame = combi[features]
cmap = feature_frame.corr()
sns.heatmap(cmap)


Split dataset for training and validating

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, keeping the class balance
# of y in both parts (stratified) and fixing the seed for repeatability.
split_options = dict(test_size=0.10, random_state=1, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Shapes of every split, for a quick sanity check.
X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape

Select model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Convert every pandas object to a float32 torch tensor
# (pandas -> numpy -> float32 -> tensor).
X_train = torch.from_numpy(X_train.to_numpy().astype(np.float32))
X_val = torch.from_numpy(X_val.to_numpy().astype(np.float32))
X_test = torch.from_numpy(X_test.to_numpy().astype(np.float32))

y_train = torch.from_numpy(y_train.to_numpy().astype(np.float32))
y_val = torch.from_numpy(y_val.to_numpy().astype(np.float32))

In [None]:
# Turn the target tensors into column vectors of shape (n, 1) so they match
# the shape of the model output.
n_train_targets = y_train.shape[0]
n_val_targets = y_val.shape[0]
y_train = y_train.view(n_train_targets, 1)
y_val = y_val.view(n_val_targets, 1)

In [None]:
class Logistic_Reg_model(torch.nn.Module):
    """Binary classifier: two stacked linear layers followed by a sigmoid.

    There is no activation between the two linear layers. The output of
    ``forward`` is a probability in (0, 1) for each input row.
    """

    def __init__(self, no_input_features):
        """Create the layers.

        Args:
            no_input_features: number of columns in each input row.
        """
        super().__init__()
        self.layer1 = torch.nn.Linear(no_input_features, 20)
        self.layer2 = torch.nn.Linear(20, 1)

    def forward(self, x):
        """Return the predicted probability for every row of ``x``."""
        hidden = self.layer1(x)
        logits = self.layer2(hidden)
        return torch.sigmoid(logits)

In [None]:
n_features = combi[features].shape[1]

model=Logistic_Reg_model(n_features)

In [None]:
criterion=torch.nn.BCELoss()
optimizer=torch.optim.SGD(model.parameters(),lr=0.01)

In [None]:
number_of_epochs = 500

# Full-batch gradient descent: every epoch is one forward/backward pass over
# the whole training set, followed by a parameter update.
for epoch in range(number_of_epochs):
    y_prediction = model(X_train)
    loss = criterion(y_prediction, y_train)

    loss.backward()
    optimizer.step()
    # Clear the gradients so the next epoch starts from zero.
    optimizer.zero_grad()

    # Report the loss every tenth epoch (epochs are numbered from 1 in the log).
    completed = epoch + 1
    if completed % 10 == 0:
        print('epoch:', completed, ',loss=', loss.item())

In [None]:
# Validation accuracy: the share of validation rows whose rounded prediction
# equals the label. Gradients are not needed here.
with torch.no_grad():
    y_pred = model(X_val)
    y_pred_class = y_pred.round()
    n_correct = (y_pred_class == y_val).sum()
    accuracy = n_correct / float(y_val.shape[0])
    print(accuracy.item())

Predict on X_test

In [None]:
with torch.no_grad():
 predictions=model(X_test)
 predictions_class=predictions.round()

In [None]:
# Round the probabilities to 0/1, then turn them into booleans
# (True where the rounded value is greater than 0).
rounded_predictions = predictions.round()
predictions = rounded_predictions > 0

predictions

Prepare and submit

In [None]:
# The model emits one column per row, i.e. a tensor of shape (n, 1). Flatten it
# to a plain 1-D boolean NumPy array before storing it: assigning a 2-D torch
# tensor to a DataFrame column depends on pandas' implicit conversion.
# The > 0.5 comparison is a no-op for already-boolean predictions and applies
# the usual threshold if they are still probabilities.
transported = predictions.detach().cpu().numpy().reshape(-1) > 0.5
assert len(transported) == len(submission), "number of predictions does not match the sample submission"

submission['Transported'] = transported
submission.to_csv('submission.csv', index=False)

# Read the file back to check what was actually written.
my_submission = pd.read_csv("submission.csv")
my_submission