# Introduction

This is a practice notebook for the Spaceship Titanic competition: it predicts which passengers were transported to an alternate dimension, using a RandomForestClassifier.



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the three competition CSVs: the training set, the test set, and the
# sample submission (its PassengerId column is reused for the final output).
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
        '../input/spaceship-titanic/sample_submission.csv',
    )
)

In [None]:
# Preview the first five rows of the training data.
df_train.head(n=5)

In [None]:
# A notebook cell only renders its last expression, so describe() is wrapped
# in display() to make the summary statistics visible; info() prints by itself.
display(df_train.describe())
df_train.info()

# Clean Data

In [None]:
# Number of missing values per column.
df_train.isna().sum()

In [None]:
# Replace missing values in the numeric columns with that column's median.
# numeric_only=True is required because the frame also holds text columns
# (e.g. Name, HomePlanet): pandas >= 2.0 raises a TypeError when asked for
# their median. Missing values in non-numeric columns are left untouched here.
df_train = df_train.fillna(df_train.median(numeric_only=True))
df_train.isnull().sum()

In [None]:
# Discard every row that still contains a missing value, then confirm none remain.
df_train = df_train.dropna(axis='index', how='any')
df_train.isna().sum()

* PassengerId - An ID for each passenger.
* HomePlanet - The planet the passenger is from.
* CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage.
* Cabin - The cabin number where the passenger is staying.
* Destination - The planet the passenger will be debarking to.
* VIP - Indicates whether the passenger has paid for special VIP service.
* Name - The first and last names of the passenger.

# Data Visualization

In [None]:
# Transported is a True/False flag, so sum() gives the number of passengers
# that were actually transported per home planet. count() would only report
# how many rows each planet has, which does not match the chart title.
df_train_HP_TP = df_train.groupby('HomePlanet')[['Transported']].sum().reset_index()
px.bar(df_train_HP_TP, x = 'HomePlanet', y = 'Transported',
       title = 'Number of Transported for each HomePlanet',height = 400, text_auto = True)


In [None]:
# Transported is a True/False flag, so sum() gives the number of passengers
# that were actually transported per destination. count() would only report
# how many rows each destination has, which does not match the chart title.
df_train_DS_TP = df_train.groupby('Destination')[['Transported']].sum().reset_index()
px.bar(df_train_DS_TP, x = 'Destination', y = 'Transported',
       title = 'Number of Transported for each Destination',height = 400, text_auto = True)

In [None]:
# Distribution of passenger ages.
px.histogram(data_frame=df_train, x='Age', title='Count of each Age')

In [None]:
# Sum the True/False Transported flag per age so the histogram shows how many
# passengers of each age were transported. count() would show how many
# passengers have that age, regardless of outcome.
df_train_AG_TP = df_train.groupby('Age')[['Transported']].sum().reset_index()
px.histogram(df_train_AG_TP, x = 'Age', y = 'Transported',
       title = 'Number of Transported for each Age',height = 400, text_auto = True)

In [None]:
# Count passengers per VIP flag. After reset_index() the rows are numbered
# 0 and 1; they are relabelled so the bars get readable names (this assumes the
# groups come out as False first, then True).
df_train_VIP = (
    df_train.groupby('VIP')['PassengerId']
    .count()
    .reset_index()
    .rename(index={0: 'Not VIP', 1: 'VIP'})
)
px.bar(
    df_train_VIP,
    x=df_train_VIP.index,
    y='PassengerId',
    title='Number of VIP',
    labels={'PassengerId': 'Count', 'index': 'VIP or Not'},
    height=400,
    text_auto=True,
)

In [None]:
# Passenger counts for each (Transported, VIP) combination. The positional row
# labels below assume all four combinations are present, in the order
# (False, False), (False, True), (True, False), (True, True).
vip_transported_counts = df_train.groupby(['Transported', 'VIP'])['PassengerId'].count()
df_train_VIP_TP = vip_transported_counts.reset_index().rename(
    index={
        0: 'Not VIP or Transported',
        1: 'VIP but Not Transported',
        2: 'Not VIP but Transported',
        3: 'VIP and Transported',
    }
)
px.bar(
    df_train_VIP_TP,
    x=df_train_VIP_TP.index,
    y='PassengerId',
    title='Number of Transported for VIP or not',
    labels={'index': "VIP or Transported?"},
)

In [None]:
# Mean RoomService value per home planet.
df_train_HP_RS = df_train.groupby('HomePlanet')[['RoomService']].mean().reset_index()
px.bar(
    df_train_HP_RS,
    x=df_train_HP_RS['HomePlanet'],
    y=df_train_HP_RS['RoomService'],
    title='Average Room Service for each Planet',
)

In [None]:
# Mean FoodCourt value per home planet.
df_train_HP_FC = df_train.groupby('HomePlanet')[['FoodCourt']].mean().reset_index()
px.bar(
    df_train_HP_FC,
    x=df_train_HP_FC['HomePlanet'],
    y=df_train_HP_FC['FoodCourt'],
    title='Average Food Court for each Planet',
)

In [None]:
# Mean ShoppingMall value per home planet.
df_train_HP_SM = df_train.groupby('HomePlanet')[['ShoppingMall']].mean().reset_index()
px.bar(
    df_train_HP_SM,
    x=df_train_HP_SM['HomePlanet'],
    y=df_train_HP_SM['ShoppingMall'],
    title='Average Shopping Mall for each Planet',
)

In [None]:
# Mean Spa value per home planet.
df_train_HP_SP = df_train.groupby('HomePlanet')[['Spa']].mean().reset_index()
px.bar(
    df_train_HP_SP,
    x=df_train_HP_SP['HomePlanet'],
    y=df_train_HP_SP['Spa'],
    title='Average SPA for each Planet',
)

In [None]:
# Mean VRDeck value per home planet.
df_train_HP_VR = df_train.groupby('HomePlanet')[['VRDeck']].mean().reset_index()
px.bar(
    df_train_HP_VR,
    x=df_train_HP_VR['HomePlanet'],
    y=df_train_HP_VR['VRDeck'],
    title='Average VR Deck for each Planet',
)

In [None]:
# Split Name into FirstName / LastName at the first space. n=1 caps the split
# so at most two columns come back: without it, a name containing extra spaces
# produces additional columns and the two-column assignment fails.
df_train[['FirstName', 'LastName']] = df_train['Name'].str.split(pat = ' ', n = 1, expand = True)
df_train.head()

In [None]:
# Sum the True/False Transported flag per first name so the bars show how many
# passengers with that name were transported. count() would only show how many
# passengers carry the name.
df_train_FN_TP = df_train.groupby('FirstName')[['Transported']].sum().reset_index()
px.bar(df_train_FN_TP, x = 'FirstName', y = 'Transported',
      title = 'Number of Transported by First Name')

In [None]:
# Sum the True/False Transported flag per last name so the bars show how many
# passengers with that name were transported. count() would only show how many
# passengers carry the name.
df_train_LN_TP = df_train.groupby('LastName')[['Transported']].sum().reset_index()
px.bar(df_train_LN_TP, x = 'LastName', y = 'Transported',
      title = 'Number of Transported by Last Name')

# Modeling

In [None]:
# Remove the identifier and the name-derived columns before modelling.
df_train = df_train.drop(columns=['Name', 'FirstName', 'LastName', 'PassengerId'])

In [None]:
# List the columns that are not numeric and therefore still need encoding.
df_train.select_dtypes(exclude=['number']).columns

In [None]:
def make_label_encoder(data):
    """Return a copy of ``data`` with object/bool columns replaced by integer codes.

    Each object or bool column is cast to ``str`` and its values are mapped to
    integers ``0..k-1`` following the sorted order of the distinct strings
    (the same numbering a label encoder fitted on that column produces).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The encoded categorical columns first, followed by the remaining
        columns unchanged, on the same index as ``data``.

    Notes
    -----
    Codes are derived only from the values present in ``data``. Two frames
    encoded separately share a numbering only if their columns contain the
    same set of values.
    """
    cat_col = data.select_dtypes(['object', 'bool']).columns

    encoded = {}
    for col in cat_col:
        # Cast a temporary Series rather than assigning back into `data`,
        # so the caller's frame is left untouched.
        codes, _ = pd.factorize(data[col].astype('str'), sort=True)
        encoded[col] = codes

    # Build the encoded frame on the caller's index. With a default RangeIndex,
    # concat(axis=1) would align on mismatching labels whenever rows were
    # dropped earlier, and pad both halves with NaN.
    transformed_df = pd.DataFrame(encoded, index=data.index)

    numeric_col = data.drop(columns=cat_col)
    return pd.concat([transformed_df, numeric_col], axis=1)

In [None]:
# Replace the training frame with its label-encoded version.
df_train = make_label_encoder(data=df_train)

In [None]:
# Missing values per column after encoding.
df_train.isna().sum()

In [None]:
# Drop any rows left with missing values after encoding and re-check the counts.
df_train = df_train.dropna(axis='index', how='any')
df_train.isna().sum()

In [None]:
# Fill any remaining gaps with the sentinel 888, then separate features and target.
df_train = df_train.fillna(value=888)
X = df_train.drop(columns=['Transported'])
y = df_train['Transported']

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [None]:
# Fit a random forest on the training split and predict the validation split.
# random_state fixes the forest's internal randomness so a re-run reproduces
# the same model and scores; 0 matches the seed used for train_test_split.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)
print(y_pred)

In [None]:
# Transported is a class label, so report classification accuracy (the share
# of validation rows predicted correctly) rather than a regression error metric.
(y_pred == y_valid).mean()

# For Test Data

In [None]:
# Remove the columns that were also removed from the training features.
df_test = df_test.drop(columns=['Name', 'PassengerId'])

In [None]:
# List the test columns that are not numeric.
df_test.select_dtypes(exclude=['number']).columns

In [None]:
# Missing values per column in the test data.
df_test.isna().sum()

In [None]:
# Impute the test set without dropping rows.
# df_train has already been label-encoded at this point, so df_train.median()
# would also return medians of integer category codes and write those numbers
# into the still-textual test columns. Instead:
#   * numeric columns are filled with the training median of the same column;
#   * every other column is filled with its own most frequent test value.
numeric_cols = df_test.select_dtypes(include='number').columns
categorical_cols = df_test.columns.difference(numeric_cols)
fill_values = {
    **df_train[numeric_cols].median().to_dict(),
    **{col: df_test[col].mode().iloc[0] for col in categorical_cols},
}
df_test = df_test.fillna(value=fill_values)
df_test.isnull().sum()

In [None]:
# Test rows must never be dropped: the predictions are later paired
# positionally with df_sub['PassengerId'], so every passenger needs a row.
# Verify that imputation left no gaps instead of silently discarding rows.
assert not df_test.isnull().any().any(), (
    'df_test still has missing values; impute them instead of dropping rows'
)
df_test.isnull().sum()

In [None]:
# Print the column dtypes and non-null counts of the test frame before encoding.
df_test.info()

In [None]:
# Label-encode the test frame.
# NOTE(review): make_label_encoder fits fresh encoders on whatever frame it is
# given, so these codes are derived from the test values alone; confirm they
# line up with the codes the model saw during training.
df_test = make_label_encoder(data=df_test)

In [None]:
# Missing values per column in the encoded test data.
df_test.isna().sum()

In [None]:
# Predict Transported for every row of the encoded test frame.
pred = clf.predict(X=df_test)

In [None]:
# Pair each PassengerId from the sample submission with its prediction.
submission = pd.DataFrame(
    data={'PassengerId': df_sub['PassengerId'], 'Transported': pred}
)

In [None]:
# Build the submission table, cast the predictions to booleans, write it to
# disk without the index column, and display it.
submission = pd.DataFrame(dict(PassengerId=df_sub["PassengerId"], Transported=pred))
submission = submission.astype({'Transported': 'bool'})
submission.to_csv('submission.csv', index=False)
submission