# Spaceship Titanic

Ref: https://www.kaggle.com/competitions/spaceship-titanic/

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Dataset Exploration
- show the dataset
- basic information
- data types
- isna
- class balance

## Training Set

In [None]:
# Read the competition's train and test splits (train first, then test).
df_train, df_test = (
    pd.read_csv(f'../input/spaceship-titanic/{split}.csv')
    for split in ('train', 'test')
)

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_train.describe(include=['O'])

## Test Set

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
df_test.describe(include=['O'])

## Summary
- Numeric data
    - Age
    - RoomService
    - FoodCourt
    - ShoppingMall
    - Spa
    - VRDeck
- Discrete Data
    - Name
    - PassengerId (GGGG-PP)
        - G stands for the group
        - P stands for the passenger's number within the group
- Categorical Data
    - HomePlanet
    - CryoSleep (T/F)
    - Cabin (deck/num/side)
        - side either P for Port or S for Starboard
    - Destination
    - VIP

#### Fields that need further decomposition
   - PassengerId
   - Cabin

## Not Null

#### Training Set

In [None]:
df_train.isna().sum()

In [None]:
df_test.isna().sum()

In [None]:
# Grand total of missing cells per split: per-column NaN counts, summed again.
print(f'total NA in training set:{df_train.isna().sum().sum()}')
print(f'total NA in test set:{df_test.isna().sum().sum()}')

## Analysis on class balance

In [None]:
# Number of training rows for each value of the Transported target.
transported_count = df_train['Transported'].value_counts()

In [None]:
# Horizontal bar chart of the class counts, each bar annotated with its value.
fig, ax = plt.subplots(figsize=(5, 5))
bars = ax.barh(
    y=transported_count.index.astype(str),
    width=transported_count.tolist(),
)
ax.bar_label(bars)

In [None]:
transported_count

## Correlation Matrix

In [None]:
# Correlate only the numeric and boolean columns. The raw frame still holds
# string columns here, and pandas >= 2.0 raises on them instead of silently
# skipping them as older versions did.
df_train.select_dtypes(include=['number', 'bool']).corr()

# Data preprocessing

In [None]:
def data_processing(df, mode):
    """Clean and encode a raw Spaceship Titanic frame for modelling.

    Steps, in order:
      1. drop the identifier-like columns (Name, PassengerId, Cabin);
      2. fill missing numeric values with the column mean and missing
         categorical values with the most frequent value;
      3. scale every numeric column by its maximum;
      4. encode the True/False columns as 0/1;
      5. one-hot encode HomePlanet and Destination.

    Every statistic (mean, maximum, most frequent value) is computed from
    ``df`` itself, so separate calls for the train and test splits each use
    their own statistics.

    Parameters:
        df: raw pandas DataFrame. It is not modified; a processed copy is
            returned.
        mode: 'train' also encodes the Transported target column; 'test'
            leaves any such column untouched.

    Returns:
        A new DataFrame with the processed columns.

    Raises:
        ValueError: if ``mode`` is neither 'train' nor 'test'.
        KeyError: if one of the expected columns is missing from ``df``.
    """
    if mode not in ('train', 'test'):
        raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

    numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    binary_cols = ['CryoSleep', 'VIP']
    multi_class_cols = ['HomePlanet', 'Destination']

    # drop() returns a new frame, so the caller's data stays intact and the
    # notebook cell can be re-run without a KeyError on already-removed columns.
    df = df.drop(columns=['Name', 'PassengerId', 'Cabin'])

    # Mean-impute, then scale by the column maximum.
    for col in numeric_cols:
        filled = df[col].fillna(df[col].mean())
        col_max = filled.max()
        # Dividing by a maximum of 0 would produce NaN/inf, so such a column
        # is left unscaled.
        df[col] = filled / col_max if col_max != 0 else filled

    # Fill categorical gaps with the most frequent value of the column.
    for col in binary_cols + multi_class_cols:
        df[col] = df[col].fillna(df[col].mode().iloc[0])

    # Explicit False -> 0 / True -> 1 mapping (columns are assumed to hold
    # True/False values). An encoder fitted per column numbers whatever
    # classes it happens to see, so a column containing only True would be
    # encoded as 0 and disagree with the other split.
    to_encode = binary_cols + (['Transported'] if mode == 'train' else [])
    for col in to_encode:
        df[col] = df[col].astype(bool).astype('int64')

    # One indicator column per category value.
    return pd.get_dummies(df, columns=multi_class_cols, prefix=multi_class_cols)

In [None]:
# Run the same preprocessing on both splits; only the 'train' call also
# encodes the Transported target.
df_train = data_processing(df_train, 'train')
df_test = data_processing(df_test, 'test')

In [None]:
df_train.isna().sum()

In [None]:
df_test.isna().sum()

In [None]:
df_train.info()

# KNN

In [None]:
# Separate the processed training frame into the target vector and the
# feature matrix (every column except Transported).
targets = np.array(df_train['Transported'])
features = np.array(df_train.drop(columns=['Transported']))

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same across runs.
x_train, x_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=0)

## Grid Search

In [None]:
# Search k = 1..30 for KNN with 10-fold cross-validated accuracy on the
# training split, then report the best k and its mean CV score.
KNN = KNeighborsClassifier()
k_range = list(range(1, 31))
params = {'n_neighbors': k_range}
grid = GridSearchCV(
    estimator=KNN,
    param_grid=params,
    scoring='accuracy',
    cv=10,
    return_train_score=False,
    verbose=1,
)

grid_search = grid.fit(x_train, y_train)
print(f"Best model:{grid_search.best_params_}")
accuracy = 100 * grid_search.best_score_
print(f"Accuracy for our training dataset with tuning is : {accuracy:.2f}%")

In [None]:
# Fit KNN with a fixed k of 23 and score it on the held-out validation split.
KNN = KNeighborsClassifier(n_neighbors=23)
y_pred = KNN.fit(x_train, y_train).predict(x_val)
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

# RandomForest

In [None]:
# Search tree count and depth for the random forest with 10-fold
# cross-validated accuracy, then report the best combination and its score.
rf = RandomForestClassifier()
RF_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [4, 6, 8, 10, 12],
}
grid = GridSearchCV(
    estimator=rf,
    param_grid=RF_grid,
    scoring='accuracy',
    cv=10,
    return_train_score=False,
    verbose=1,
)
grid_search = grid.fit(x_train, y_train)
print(f"Best model:{grid_search.best_params_}")
accuracy = 100 * grid_search.best_score_
print(f"Accuracy for our training dataset with tuning is : {accuracy:.2f}%")

In [None]:
# Fit a random forest with fixed settings (200 trees, depth 10) and score it
# on the held-out validation split.
rf = RandomForestClassifier(n_estimators=200, max_depth=10)
y_pred = rf.fit(x_train, y_train).predict(x_val)
print(f"Accuracy: {accuracy_score(y_val, y_pred)}")

# Test prediction

In [None]:
sample_submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
# Train and test were one-hot encoded independently, so force the test frame
# onto the exact training feature columns (same set, same order). An indicator
# column absent from the test split is filled with 0.
feature_columns = df_train.drop('Transported', axis=1).columns
features_test = np.array(df_test.reindex(columns=feature_columns, fill_value=0))
y_test_pred = rf.predict(features_test)

In [None]:
# Store the predictions as booleans in the submission's Transported column,
# then preview the first rows.
sample_submission = sample_submission.assign(Transported=y_test_pred.astype(bool))
sample_submission.head()

In [None]:
# Write the submission file to the working directory without the index column.
sample_submission.to_csv('./submission.csv', index=False)