# <font color='navy'>Spaceship Titanic</font>
#### Author: Anton Khnykin - Data Engineer from Earth

##### **Special thanks for some ideas:**
SARDOR ABDIRAYIMOV - https://www.kaggle.com/code/sardorabdirayimov/best-way-of-dealing-with-missing-values-titanic-2/notebook<br>
OPAMUSORA - https://www.kaggle.com/code/opamusora/top-10-notebook/comments

### Contents
#### <a href='#step1'>Step 1. Import libraries and datasets</a>
#### <a href='#step2'>Step 2. Data preprocessing</a>
#### <a href='#step3'>Step 3. Make predictions</a>

### Step 1. Import libraries and datasets<span id='step1'></span>

In [None]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import missingno 

# Load the competition's train and test CSV files.
# NOTE: these are Kaggle-kernel input paths; adjust them when running elsewhere.
train_data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

### Step 2. Data preprocessing <span id='step2'></span>

In [None]:
# Count fully duplicated rows in the training set (0 means every row is unique)
train_data.duplicated().sum()

#### Note
**There are no duplicate rows in the training data.**

In [None]:
# Show column dtypes and non-null counts to see which columns contain missing values
train_data.info()

#### Note
**There are many missing values in the dataset. Only PassengerId and Transported have no gaps.**

In [None]:
# Preview the first five rows to see the raw column formats
display(train_data.head(5))

In [None]:
# Divide PassengerId, Cabin and Name into several feature columns
def update_dataset(df):
    """Derive cabin, group and name columns from the raw identifier columns.

    The frame is modified in place: the new columns are added and the original
    ``Cabin`` and ``Name`` columns are dropped.

    Added columns:
        Cabin_deck, Cabin_num, Cabin_side -- first three '/'-separated parts of ``Cabin``.
        PassengerGroup -- first four characters of ``PassengerId``.
        Name_name, Name_family -- ``Name`` split at its first space.

    Args:
        df: DataFrame with string columns ``PassengerId``, ``Cabin`` and ``Name``.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Index into the split lists instead of using expand=True: expand creates one
    # column per token, so a name with three words (or a column in which no row
    # contains the separator) would not match the fixed number of target columns
    # and the assignment would raise a ValueError.
    cabin_parts = df['Cabin'].str.split('/')
    for position, target in enumerate(['Cabin_deck', 'Cabin_num', 'Cabin_side']):
        df[target] = cabin_parts.str[position]

    df['PassengerGroup'] = df['PassengerId'].str[:4]

    # n=1 keeps everything after the first space together as the family name.
    name_parts = df['Name'].str.split(' ', n=1)
    df['Name_name'] = name_parts.str[0]
    df['Name_family'] = name_parts.str[1]

    df.drop(columns=['Cabin', 'Name'], inplace=True)
    return df
    
# Split the identifier columns in both datasets; update_dataset modifies each frame in place
for df in (train_data, test_data):
    update_dataset(df)

In [None]:
# Find the youngest age at which anyone used a service (or had VIP status).
# Passengers younger than that are assumed not to have used it, so their gaps become 0.

def update_by_age(df, column):
    """Fill missing ``column`` values with 0 for passengers below the minimum usage age.

    The minimum usage age is the smallest ``Age`` whose summed ``column`` value is
    greater than zero. Rows with a missing ``Age`` are never filled. The frame is
    modified in place.

    Args:
        df: DataFrame containing an ``Age`` column and ``column``.
        column: Name of the column whose gaps should be filled.

    Returns:
        The same DataFrame object that was passed in. If no age group has a
        positive total for ``column`` the frame is returned unchanged.
    """
    # groupby sorts its keys, so the first matching row holds the smallest age.
    totals_by_age = df[['Age', column]].groupby('Age').sum().reset_index()
    ages_with_usage = totals_by_age.loc[totals_by_age[column] > 0, 'Age']

    # Nothing to infer when nobody has a positive value; indexing the first
    # element of an empty result would raise an IndexError.
    if ages_with_usage.empty:
        return df

    min_age = ages_with_usage.iloc[0]
    younger = df['Age'] < min_age
    df.loc[younger, column] = df.loc[younger, column].fillna(0)
    return df

# Apply the age-based zero fill to VIP and every service column of both datasets
for df in (train_data, test_data):
    for column in ('VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
        update_by_age(df, column)

In [None]:
# A passenger in CryoSleep cannot use services, so their missing amounts are filled with zero
def update_by_cryo(df, column):
    """Fill missing ``column`` values with 0 on rows where ``CryoSleep`` is True.

    Rows where ``CryoSleep`` is False or missing are left untouched. The frame
    is modified in place and also returned.
    """
    asleep = df['CryoSleep'] == True
    filled = df.loc[asleep, column].fillna(0)
    df.loc[asleep, column] = filled
    return df
    
# Apply the CryoSleep rule (update_by_cryo) to every service column of both datasets
for df in [train_data, test_data]:
    for column in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        update_by_cryo(df, column)

In [None]:
# Plot the missing-value matrix of the training set to see which gaps remain
missingno.matrix(train_data, figsize=(15,5), fontsize=12, 
                 color=(0.5, 0.5, 0.5))
plt.show()

In [None]:
# Re-check the non-null counts per column after the fills applied so far
train_data.info()

In [None]:
# Fill the remaining gaps: the median for Age, a fixed default for the categorical
# and boolean columns, and zero for services (an unknown expense is treated as no expense)

def update_by_mean(df, column):
    """Fill the missing values of one column with a column-specific default.

    Despite the name, ``Age`` is filled with the column's median, not its mean.
    ``HomePlanet``, ``Destination``, ``CryoSleep`` and ``VIP`` get fixed defaults;
    any other column is filled with 0. The frame is modified in place.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to fill.

    Returns:
        The same DataFrame object that was passed in.
    """
    # NOTE(review): these defaults are hard-coded; confirm they are the intended
    # categories (e.g. the most frequent value of each column).
    constant_fills = {
        'HomePlanet': 'Earth',
        'Destination': '55 Cancri e',
        'CryoSleep': False,
        'VIP': False,
    }
    if column == 'Age':
        fill_value = df[column].median()
    else:
        fill_value = constant_fills.get(column, 0)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[column]: the chained in-place form is deprecated and does not update the
    # frame when pandas Copy-on-Write is enabled.
    df[column] = df[column].fillna(fill_value)
    return df
    
# Fill whatever is still missing in both datasets, one column at a time
for df in (train_data, test_data):
    for column in ('Age', 'RoomService', 'HomePlanet', 'Destination', 'VIP',
                   'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CryoSleep'):
        update_by_mean(df, column)

In [None]:
# Cast Age to a 32-bit integer in both datasets.
# This relies on Age having no missing values left (they were filled with the median
# in the previous step); a fractional value is truncated by the integer cast.
train_data['Age'] = train_data['Age'].astype('int32')
test_data['Age'] = test_data['Age'].astype('int32')

### Step 3. Make predictions<span id='step3'></span>

In [None]:
# Target: the Transported flag from the training set
y_train = train_data["Transported"]

# Feature matrices: get_dummies one-hot encodes the non-numeric columns (Destination)
features = ["Age", "Destination", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# get_dummies encodes each frame independently, so a category present in only one
# of them would give train and test different columns. Align the test matrix to the
# training columns: dummies missing from test become 0, unseen ones are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Standardise the features: the scaler is fitted on train only and reused for test
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

# Fit the model (fixed random_state keeps the run reproducible)
model = RandomForestClassifier(n_estimators=100, max_depth=6,
                               random_state=42)

model.fit(X_train_scaled, y_train)

# Predict the target for the test passengers
predictions = model.predict(X_test_scaled)

# Write the submission file: one row per test passenger
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': predictions})
output.to_csv('submission.csv', index=False)