# Spaceship titanic Kaggle

### Importing relevant packages

In [1]:
# Basic stuff
import os
import re
import numpy as np
import pandas as pd

# For data visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 8]
import seaborn as sns

# Data preprocessing 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# For creating models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# For assessing models with metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

***

### Importing data and data preprocessing

Checking the location of the files

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


Uploading data

In [3]:
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
submission_df = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

Marking the train and test data in a column:

In [4]:
train_df['train'] = 1
test_df['train'] = 0

Union the train and test sets

In [5]:
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

Quick snap of the dataframe

In [6]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,train
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,1
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
 14  train         12970 non-null  int64  
dtypes: float64(6), int64(1), object(8)
memory usage: 1.5+ MB


How many HomePlanets are there?

In [8]:
df.HomePlanet.value_counts()

Earth     6865
Europa    3133
Mars      2684
Name: HomePlanet, dtype: int64

How many different cabins are there?

In [9]:
df.Cabin.value_counts()

G/734/S     8
G/160/P     8
D/176/S     7
G/1476/S    7
B/201/P     7
           ..
E/317/P     1
F/1039/P    1
F/1038/P    1
C/158/P     1
G/1498/S    1
Name: Cabin, Length: 9825, dtype: int64

How many destinations?

In [10]:
df.Destination.value_counts()

TRAPPIST-1e      8871
55 Cancri e      2641
PSO J318.5-22    1184
Name: Destination, dtype: int64

What percentage of VIP/non VIPs have been transported?

In [11]:
df.groupby('VIP')['Transported'].mean()

VIP
False    0.506332
True     0.381910
Name: Transported, dtype: float64

### **Step 1**: Feature preprocessing

1. Retrieving cabin deck, number and side

In [12]:
df['CabinDeck'] = df['Cabin'].str.split('/').str[0]
df['CabinNum'] = df['Cabin'].str.split('/').str[1]
df['CabinSide'] = df['Cabin'].str.split('/').str[2]

2. Create full expenditure column

In [13]:
df['FullExpenditure'] = df['RoomService'] + \
                        df['FoodCourt'] + \
                        df['ShoppingMall'] + \
                        df['Spa'] + \
                        df['VRDeck']

### **Step 2**: Treat NaN values

In [14]:
df.isna().sum()

PassengerId           0
HomePlanet          288
CryoSleep           310
Cabin               299
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
train                 0
CabinDeck           299
CabinNum            299
CabinSide           299
FullExpenditure    1363
dtype: int64

**Categorical variables**: We will be treating NaNs as a categorical value, called "Missing".

In [15]:
categorical_cols = df.select_dtypes(include='object').columns.tolist()
# Except boolean variables
unwanted_bool_features = ['PassengerId', 'CryoSleep', 'VIP', 'Transported']
categorical_cols = [cat_elem for cat_elem in categorical_cols if cat_elem not in unwanted_bool_features]
print(categorical_cols)

for column in categorical_cols:
    df[column] = df[column].fillna('Missing')

['HomePlanet', 'Cabin', 'Destination', 'Name', 'CabinDeck', 'CabinNum', 'CabinSide']


**Numerical variables**: We will be using the mean of each variable. 

**Note**: We first need to slice train and test

In [16]:
train_df = df[df['train'] == 1].copy()
test_df = df[df['train'] == 0].copy()

Filling NaNs with mean from the **train** set

In [17]:
numeric_cols = train_df.select_dtypes(include=np.number).columns.tolist()
print(numeric_cols)

for column in numeric_cols:
    train_df[column] = train_df[column].fillna(train_df[column].mean())

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'train', 'FullExpenditure']


Filling NaNs with mean from the **test** set

In [18]:
numeric_cols = test_df.select_dtypes(include=np.number).columns.tolist()
print(numeric_cols)

for column in numeric_cols:
    test_df[column] = test_df[column].fillna(test_df[column].mean())

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'train', 'FullExpenditure']


Uniting both datasets again

In [19]:
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

**Boolean variables**: We will set them to 'False'

In [20]:
df.isna().sum()

PassengerId           0
HomePlanet            0
CryoSleep           310
Cabin                 0
Destination           0
Age                   0
VIP                 296
RoomService           0
FoodCourt             0
ShoppingMall          0
Spa                   0
VRDeck                0
Name                  0
Transported        4277
train                 0
CabinDeck             0
CabinNum              0
CabinSide             0
FullExpenditure       0
dtype: int64

In [21]:
df['CryoSleep'] = df[['CryoSleep']].fillna(False, axis=0)
df['VIP'] = df[['VIP']].fillna(False, axis=0)

Finally, we check that our dataset no longer have missing values. Certainly, the variable _Transported_ will still have NaNs.

In [22]:
df.isna().sum()

PassengerId           0
HomePlanet            0
CryoSleep             0
Cabin                 0
Destination           0
Age                   0
VIP                   0
RoomService           0
FoodCourt             0
ShoppingMall          0
Spa                   0
VRDeck                0
Name                  0
Transported        4277
train                 0
CabinDeck             0
CabinNum              0
CabinSide             0
FullExpenditure       0
dtype: int64

Before moving on into further feature processing, let's drop support (and other unnecessary) variables

In [23]:
df.drop(['PassengerId', 'Cabin', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'CabinNum'], axis=1, inplace=True)

In [24]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,train,CabinDeck,CabinSide,FullExpenditure
0,Europa,False,TRAPPIST-1e,39.0,False,False,1,B,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,True,1,F,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,False,1,A,S,10383.0
3,Europa,False,TRAPPIST-1e,33.0,False,False,1,A,S,5176.0
4,Earth,False,TRAPPIST-1e,16.0,False,True,1,F,S,1091.0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   HomePlanet       12970 non-null  object 
 1   CryoSleep        12970 non-null  bool   
 2   Destination      12970 non-null  object 
 3   Age              12970 non-null  float64
 4   VIP              12970 non-null  bool   
 5   Transported      8693 non-null   object 
 6   train            12970 non-null  int64  
 7   CabinDeck        12970 non-null  object 
 8   CabinSide        12970 non-null  object 
 9   FullExpenditure  12970 non-null  float64
dtypes: bool(2), float64(2), int64(1), object(5)
memory usage: 836.1+ KB


### **Step 3**: Label-encode the categorical variables

**Note**: We first need to slice train and test

In [26]:
y_train = df[df['train'] == 1]['Transported']
train_df = df[df['train'] == 1].drop(['train', 'Transported'], axis=1).copy()
test_df = df[df['train'] == 0].drop(['train', 'Transported'], axis=1).copy()

In [27]:
categorical_cols = train_df.select_dtypes(include='object').columns.tolist()
# Using label-encode
label_encoder = LabelEncoder()

for column in categorical_cols:
    #For the train set
    train_df[column]=label_encoder.fit_transform(train_df[column])
    #For the test set   
    test_df[column]=label_encoder.transform(test_df[column])
    

### **Step 4**: Feature normalization and standardization

#### Normalization of the variables

For the **train** set

In [28]:
scaler = MinMaxScaler()
df_columns = train_df.columns.tolist()
train_df_scaled = scaler.fit_transform(train_df)
train_df_scaled = pd.DataFrame(train_df_scaled)
train_df_scaled.columns = df_columns
train_df_scaled.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinSide,FullExpenditure
0,0.333333,0.0,1.0,0.493671,0.0,0.125,0.5,0.0
1,0.0,0.0,1.0,0.303797,0.0,0.625,1.0,0.020452
2,0.333333,0.0,1.0,0.734177,1.0,0.0,1.0,0.288521
3,0.333333,0.0,1.0,0.417722,0.0,0.0,1.0,0.14383
4,0.0,0.0,1.0,0.202532,0.0,0.625,1.0,0.030317


For the **test** set

In [29]:
df_columns = test_df.columns.tolist()
test_df_scaled = scaler.transform(test_df)
test_df_scaled = pd.DataFrame(test_df_scaled)
test_df_scaled.columns = df_columns
test_df_scaled.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinSide,FullExpenditure
0,0.0,1.0,1.0,0.341772,0.0,0.75,1.0,0.0
1,0.0,0.0,1.0,0.240506,0.0,0.625,1.0,0.078695
2,0.333333,1.0,0.0,0.392405,0.0,0.25,1.0,0.0
3,0.333333,0.0,1.0,0.481013,0.0,0.25,1.0,0.20613
4,0.0,0.0,1.0,0.253165,0.0,0.625,1.0,0.017923


#### Standardization of the variables

For the **train** set

In [30]:
standardizer = StandardScaler()
train_df = standardizer.fit_transform(train_df)
train_df = pd.DataFrame(train_df)
train_df.columns = df_columns
train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinSide,FullExpenditure
0,0.325245,-0.73277,0.622532,0.709437,-0.153063,-1.866494,-0.866174,-0.5514
1,-0.833031,-0.73277,0.622532,-0.336717,-0.153063,0.350474,0.975267,-0.27804
2,0.325245,-0.73277,0.622532,2.034566,6.533255,-2.420736,0.975267,3.304981
3,0.325245,-0.73277,0.622532,0.290975,-0.153063,-2.420736,0.975267,1.371034
4,-0.833031,-0.73277,0.622532,-0.894666,-0.153063,0.350474,0.975267,-0.146189


For the **test** set

In [31]:
test_df = standardizer.fit_transform(test_df)
test_df = pd.DataFrame(test_df)
test_df.columns = df_columns
test_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinSide,FullExpenditure
0,-0.838947,1.330443,0.606665,-0.118222,-0.132689,0.905491,0.982203,-0.539252
1,-0.838947,-0.75163,0.606665,-0.688601,-0.132689,0.333524,0.982203,0.520018
2,0.313696,1.330443,-1.908396,0.166968,-0.132689,-1.382374,0.982203,-0.539252
3,0.313696,-0.75163,0.606665,0.666051,-0.132689,-1.382374,0.982203,2.235347
4,-0.838947,-0.75163,0.606665,-0.617304,-0.132689,0.333524,0.982203,-0.297999


Add the target variable back to the train dataframe

In [32]:
train_df = pd.concat([train_df, y_train*1], axis=1)
train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinSide,FullExpenditure,Transported
0,0.325245,-0.73277,0.622532,0.709437,-0.153063,-1.866494,-0.866174,-0.5514,0
1,-0.833031,-0.73277,0.622532,-0.336717,-0.153063,0.350474,0.975267,-0.27804,1
2,0.325245,-0.73277,0.622532,2.034566,6.533255,-2.420736,0.975267,3.304981,0
3,0.325245,-0.73277,0.622532,0.290975,-0.153063,-2.420736,0.975267,1.371034,0
4,-0.833031,-0.73277,0.622532,-0.894666,-0.153063,0.350474,0.975267,-0.146189,1


### **Step 5**: Create feature and target differentiation

In [33]:
y_train = train_df.Transported.values.astype(int)
X_train = train_df.drop('Transported', axis=1).values
X_test = test_df.values

***

## Creating a model 

### XGBoost

In [34]:
xgb_model = XGBClassifier()

# Check AUC value with validation set
scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='roc_auc')
print("XGBoost has %0.2f AUC with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Fit the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

XGBoost has 0.80 AUC with a standard deviation of 0.01


### Logistic regression

In [35]:
lr_model = LogisticRegression()

# Check AUC value with validation set
scores = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='roc_auc')
print("Logistic Regression has %0.2f AUC with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Fit the model on the training data
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test)

Logistic Regression has 0.77 AUC with a standard deviation of 0.01


## Submission

In [36]:
submission_df['Transported'] = y_pred_xgb.astype('bool')

In [37]:
submission_df.to_csv('./sample_submission.csv', index=False)

In [38]:
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,True
