# 프로젝트 개요
- 강의명 : 2022년 K-디지털 직업훈련(Training) 사업 - AI데이터플랫폼을 활용한 빅데이터 분석전문가 과정
- 교과목명 : 빅데이터 분석 및 시각화, AI개발 기초, 인공지능 프로그래밍
- 프로젝트 주제 : Spaceship Titanic 데이터를 활용한 탑승유무 분류모형 개발
- 프로젝트 마감일 : 2022년 4월 12일 화요일
- 수강생명 : 강지원

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Step 1. Library and Dataset

## Import Library

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os

print("Numpy ver.", np.__version__)
print("Pandas ver.", pd.__version__)
print("Matplotlib ver.", matplotlib.__version__)
print("Seaborn ver.", sns.__version__)

print(os.listdir('../input/spaceship-titanic/'))

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.pipeline import Pipeline
import time
import warnings
warnings.filterwarnings('ignore')

## Load Dataset

In [None]:
BASE_DIR = '../input/spaceship-titanic/'
train = pd.read_csv(BASE_DIR + "train.csv")
test = pd.read_csv(BASE_DIR + "test.csv")
sample_submission = pd.read_csv(BASE_DIR + "sample_submission.csv")
print("Train Data:",train.shape)
print("Test Data:",test.shape)
print("Sample Data:",sample_submission.shape)

In [None]:
train.info()

In [None]:
test.info()

In [None]:
train.head(3)

In [None]:
test.head(3)

#### There are a total of 13 features, of which 6 are numerical variables.<br/>The target, the column we're going to predict, is 'Transported', which is a bool type variable.

- For the detail descriptions, go to https://www.kaggle.com/competitions/spaceship-titanic/data

# Step 2. EDA

## Overview of Data

In [None]:
train.describe().T.style.background_gradient(cmap="Blues", axis=None,
                          vmin=0, vmax=100, high=0.5, text_color_threshold=0)\
                .bar(subset=["count"], color="lavender", vmin=8000, vmax=train.shape[0])\
                .bar(subset=["mean"], color="wheat", vmin=0)\
                .bar(subset=["std"], color="peachpuff", vmin=0)\
                .bar(subset=["max"], color="steelblue", vmin=0)

#### Except for Age, more than half of the values of each numeric variable are zero.
   - They are the amounts the passenger has been billed at each of the many amenities.
   - Preprocessing such as feature engineering and one-hot encoding is required.

In [None]:
train.describe(include=['O']).T.style.background_gradient()\
                .bar(subset=["count"], color="lavender", vmin=8000, vmax=train.shape[0])\
                .bar(subset=["unique"], color="sandybrown")

#### PassengerID, Cabin, and Name are text data, not categorical.
  - They don't seem to be important in themselves for model performance.
  - It is necessary to remove them or to be processed by feature engineering.

## Missing Value Distribution

In [None]:
train_miss = pd.DataFrame({
    'missing' : train.isnull().sum(),
    'ratio' : np.round(train.isnull().sum()/train.shape[0],4)*100
})
test_miss = pd.DataFrame({
    'missing' : test.isnull().sum(),
    'ratio' : np.round(test.isnull().sum()/test.shape[0],4)*100
})

In [None]:
fig, ax = plt.subplots(figsize=(10,5))

width = 0.35
x = np.arange(len(train.columns[:-1].to_list()))
x_ = sorted(test.columns.to_list())
y1 = train_miss['missing'][:-1].sort_index()
y1_ = train_miss['ratio'][:-1].sort_index()
y2 = test_miss['missing'].sort_index()
y2_ = test_miss['ratio'].sort_index()

bar1 = ax.bar(x-width/2, y1, width, label="train data", color="cornflowerblue")
bar2 = ax.bar(x+width/2, y2, width, label="test data", color="lightsalmon")
    
ax.set_xticks(x, x_, rotation=30)
ax.set_ylim(0,300)
ax.bar_label(bar1, padding=12, size=12)
ax.bar_label(bar2, padding=12, size=12)
ax.bar_label(bar1, labels=['(%.2f%%)' %y for y in y1_], padding=3, size=10)
ax.bar_label(bar2, labels=['(%.2f%%)' %y for y in y2_], padding=3, size=10)
ax.set_title("Missing Ratio", pad=10, size=20)
ax.legend(loc='best', fontsize=12)

fig.tight_layout()
plt.show()

#### The ratio of missing values by each feature is less than 3%.
- The numerical features will be replaced by the median value.
- The categorical features will be replaced by the most frequent value.

## Target Distribution

In [None]:
train_0 = train[train['Transported']==False]
train_1 = train[train['Transported']==True]

In [None]:
fig, ax = plt.subplots(figsize=(5,5))

labels = train["Transported"].value_counts().index
lst = train["Transported"].value_counts().to_list()
pie = ax.pie(lst, labels = labels, autopct='%.2f%%',
             textprops=dict(color="white", fontsize=15, weight="bold"),
             colors=["cornflowerblue", "lightsalmon"], shadow=True,
             wedgeprops=dict(width=0.75), startangle=45, explode=(0.025,0))

ax.set_title("Target Distribution", size=20)
ax.legend(title="Transported", title_fontsize=12, loc='best', fontsize=12)

plt.show()

## Categorical Feature Distribution 

In [None]:
train_cat = train.select_dtypes(include = 'object')
test_cat = test.select_dtypes(include = 'object')
col_cat = train_cat.columns.to_list()
print(col_cat)

In [None]:
def dist_cat(col_name):
    """Plot how one categorical column is distributed.

    The left panel overlays the per-category counts of the train and test
    data; the right panel shows the train counts split by target value.
    Reads the module-level frames ``train_cat``, ``test_cat``, ``train_1``
    and ``train_0``.

    Parameters
    ----------
    col_name : str
        Name of the categorical column to plot.

    Returns
    -------
    The return value of ``plt.show()``.
    """
    fig, ax = plt.subplots(1,2,figsize=(10,5))

    width = 0.35
    # Category order of the train data; every bar series follows this order.
    x = train_cat.loc[:,col_name].value_counts().index
    pos = np.arange(len(x))

    def aligned_counts(frame):
        # value_counts() sorts each frame by its own frequencies, so the
        # counts are re-ordered to the train category order (x). Without
        # this a bar can be drawn under the wrong category label.
        # Categories absent from `frame` get 0; categories that do not occur
        # in the train data are not plotted.
        counts = frame.loc[:,col_name].value_counts()
        return counts.reindex(x, fill_value=0).to_list()

    # Left panel: test bars are drawn on top of the train bars on purpose.
    bar1 = ax[0].bar(x, aligned_counts(train_cat),
                     width, label = "Train Data", color="cornflowerblue")
    bar2 = ax[0].bar(x, aligned_counts(test_cat),
                     width, label = "Test Data", color="lightsalmon")
    # Right panel: side-by-side bars for the two target values.
    bar3 = ax[1].bar(pos - width/2, aligned_counts(train_1),
                     width, label = "Target True", color="cornflowerblue")
    bar4 = ax[1].bar(pos + width/2, aligned_counts(train_0),
                     width, label = "Target False", color="lightsalmon")

    ax[0].bar_label(bar1, size=12)
    ax[0].bar_label(bar2, size=12, color="w")
    ax[1].bar_label(bar3, size=12)
    ax[1].bar_label(bar4, size=12)

    ax[0].set_title(f'{col_name} Distribution',pad=10, size=15)
    ax[1].set_title(f'{col_name} by Target (Train Data)',pad=10, size=15)
    ax[0].set_xticks(x, x, size=12)
    ax[1].set_xticks(pos, x, size=12)

    ax[0].legend(loc="best", fontsize=12)
    ax[1].legend(loc="best", fontsize=12)

    fig.tight_layout()
    return plt.show()

### --- HomePlanet ---

In [None]:
dist_cat('HomePlanet')

**3 categories (Earth, Europa, Mars)**  <br/> Earth account for more than half of both Train and Test.

- **Earth** : False in Transported is certainly high.
- **Europa** : True in Transported is certainly high.
- **Mars** : True in Transported is a little high, but there is not much difference.



### --- CryoSleep ---

In [None]:
dist_cat('CryoSleep')

**Boolean (False / True)**  <br/> False is nearly twice as many as True of both Train and Test.

- **False** : False in Transported is certainly high.
- **True** : True in Transported is more than three times as frequent as False.


### --- Destination ---

In [None]:
dist_cat('Destination')

**3 categories (TRAPPIST-1e, 55 Cancri e, PSO J318.5-22)** <br/> TRAPPIST-1e accounts for approximately 70 percent of both Train and Test.

- **TRAPPIST-1e** : False in Transported is a little high, but there is not much difference.
- **55 Cancri e** : True in Transported is certainly high.<br/>
- **PSO J318.5-22** : Little difference in distribution by target.


### --- VIP ---

In [None]:
dist_cat('VIP')

**Boolean (False / True)**  <br/> False accounts for almost all data

- Little difference in distribution for the target variable, whether VIP is True or False.

## Numeric Feature Distribution

In [None]:
numeric = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
train_num = train.select_dtypes(include = numeric)
test_num = test.select_dtypes(include = numeric)
col_num = train_num.columns.to_list()
col_num

In [None]:
fig, ax = plt.subplots(3,2,figsize=(10,10))

for i in range(3):
    for j in range(2):
        ax[i, j].hist(train_num.iloc[:,2*i+j].tolist(), 20)
        ax[i, j].set_title(col_num[2*i+j], pad=7, size=15)

fig.tight_layout()
plt.show()

#### Except Age, all numeric features have a very unbalanced distribution.
- They seem necessary to convert variables such as encoding, scaling, and standardization.

## Feature Correlation

In [None]:
train.iloc[:,:-1].corr().style.background_gradient(
                        cmap="RdBu", axis=None, vmin=-0.5, vmax=0.5)

#### There seems to be little correlation between features.
- Issues such as multicollinearity don't have to be considered.

# Step 3. Data Preprocessing

## Feature engineering

### PassengerId
Each Id takes the form ***gggg_pp*** where ***gggg*** indicates a group the passenger is travelling with.<br/> So it seems necessary to create a derived variable for the group.
- **PassengerGroup** : Id of the group to which the passenger belongs
- **SizeOfGroup** : Number of people in the group to which the passenger belongs

In [None]:
# function that creates PassengerGroup and SizeOfGroup derived from PassengerId
def split_PassengerId(df):
    """Add ``PassengerGroup`` and ``SizeOfGroup`` columns derived from ``PassengerId``.

    ``PassengerGroup`` is the integer in front of the first underscore of the
    id, or 0 when the id contains no underscore. ``SizeOfGroup`` is the number
    of distinct ``PassengerId`` values sharing that group.

    The frame is modified in place and also returned.
    """
    def group_of(raw_id):
        # Text before the first "_" is the group number; ids without "_" map to 0.
        prefix, separator, _ = str(raw_id).partition("_")
        return int(prefix) if separator else 0

    df['PassengerGroup'] = [group_of(value) for value in df['PassengerId']]
    members = df.groupby('PassengerGroup')['PassengerId']
    df['SizeOfGroup'] = members.transform('nunique')
    return df
train = split_PassengerId(train)
test = split_PassengerId(test)

In [None]:
# function that creates table of value count in train and test set
def val_count(col_name):
    """Return a two-row table with the value counts of ``col_name`` in train and test.

    Reads the module-level ``train`` and ``test`` frames. The rows carry a
    two-level index of (column name, "train" / "test").
    """
    per_dataset = [frame[col_name].value_counts() for frame in (train, test)]
    row_labels = [[col_name, col_name], ["train", "test"]]
    return pd.DataFrame(per_dataset, index=row_labels)

In [None]:
val_count('SizeOfGroup')

In [None]:
# Side-by-side bar chart of SizeOfGroup counts in the train and test data.
fig, ax = plt.subplots(figsize=(10,5))

width = 0.35
train_sizes = train['SizeOfGroup'].value_counts()
x = train_sizes.index
pos = np.arange(len(x))
# value_counts() orders each dataset by its own frequencies, so the test
# counts are re-aligned to the train order before plotting; otherwise a test
# bar can be drawn under the wrong group size. Sizes missing in test get 0.
test_sizes = test['SizeOfGroup'].value_counts().reindex(x, fill_value=0)

bar1 = ax.bar(pos - width/2,
                 train_sizes.to_list(),
                 width, label = "Train data", color="cornflowerblue")
bar2 = ax.bar(pos + width/2,
                 test_sizes.to_list(),
                 width, label = "Test data", color="lightsalmon")

ax.bar_label(bar1, size=12)
ax.bar_label(bar2, size=12)

ax.set_title('SizeOfGroup Distribution', pad=10, size=15)
ax.set_xticks(pos, x, size=12)
ax.legend(loc="best", fontsize=12)

fig.tight_layout()
plt.show()

- **SizeOfGroup** has values from 1 to 8,<br/>and its distribution shows that **more than half of them are on board alone**.
- Therefore, it would be good to add a nominal derived variable "***InGroup***",<br/> which distinguishes whether the passenger is on board **in group(True)** or **alone(False)**.

In [None]:
# function that creates InGroup derived from SizeOfGroup
def create_InGroup(df):
    """Add an ``InGroup`` column telling whether the passenger travels in a group.

    The value is the string "False" when ``SizeOfGroup`` equals 1 and the
    string "True" otherwise. The frame is modified in place and also returned.
    """
    df['InGroup'] = ["False" if size == 1 else "True"
                     for size in df['SizeOfGroup']]
    return df
train = create_InGroup(train)
test = create_InGroup(test)

In [None]:
val_count('InGroup')

### Cabin
Each value takes the form ***deck/num/side***, where side can be either P for Port or S for Starboard.<br/> So it seems necessary to create a derived variable for each of these components.
- **CabinDeck** : Deck type of cabin which the passenger was boarded
- **CabinNum** : Number of cabin which the passenger was boarded
- **CabinSide** : P for Port or S for Starboard

In [None]:
# function that creates CabinDeck, CabinNum, CabinSide derived from Cabin
def split_Cabin(df):
    """Add ``CabinDeck``, ``CabinNum`` and ``CabinSide`` columns derived from ``Cabin``.

    A cabin written as ``deck/num/side`` is split on "/" and its first three
    pieces are stored as strings. A value without "/" (for example a missing
    cabin) yields ``None`` for deck and side and the integer -1 for the number.

    The frame is modified in place and also returned.
    """
    placeholder = (None, -1, None)  # used when the cabin cannot be split
    parsed = []
    for raw in df['Cabin']:
        text = str(raw)
        if "/" not in text:
            parsed.append(placeholder)
            continue
        pieces = text.split("/")
        parsed.append((pieces[0], pieces[1], pieces[2]))

    for position, column in enumerate(('CabinDeck', 'CabinNum', 'CabinSide')):
        df[column] = [entry[position] for entry in parsed]
    return df
train = split_Cabin(train)
test = split_Cabin(test)

In [None]:
# CabinNum holds digit strings plus the integer placeholder -1 (see split_Cabin),
# so a plain conversion succeeds. errors='ignore' is deprecated in recent pandas
# and would silently leave the column as strings if a value could not be parsed;
# failing loudly is safer because later cells compare and plot it as a number.
train['CabinNum'] = pd.to_numeric(train['CabinNum'])
test['CabinNum'] = pd.to_numeric(test['CabinNum'])

In [None]:
val_count('CabinDeck')

- **CabinDeck** has 8 types (A to G, and T).<br/>Among them, **Type T** seems to mean top deck, which provides a better view than other decks.<br/>Maybe passengers on the top deck pay more, so Type T seems to be more valuable.
- But the number of type T is **very few**. I'm not going to make any more derivatives with CabinDeck.

In [None]:
val_count('CabinNum')

In [None]:
fig, ax = plt.subplots(figsize=(10,5))

# histogram except for -1
ax.hist(train[train['CabinNum']!=-1]['CabinNum'], 200,
        color="cornflowerblue", label="Train data")
ax.hist(test[test['CabinNum']!=-1]['CabinNum'], 200,
        color="lightsalmon", label="Test data")
ax.set_title("CabinNum Histogram", size=15)
ax.legend(loc="best", fontsize=12)

ax.vlines([300,600,900,1200,1500,1800], ymin=0, ymax=150, color="gray")

plt.show()

- Except for -1, which is the imputation of missing values,<br/> **CabinNum** has values greater than or equal to 0 and less than 2000.
- CabinNum seems to be separated into groups of 300 cabins.<br/> So, I will divide it into the groups and convert it to a **categorical variable**.

In [None]:
'''
# function that create CabinGroup derived from CabinNum
def create_CabinGroup(df):
    c_group = []
    for idx, row in df.iterrows():
        cn = row['CabinNum']
        if cn==-1:
            c_group.append(None)
        elif cn < 300:
            c_group.append("group1")
        elif cn < 600:
            c_group.append("group2")
        elif cn < 900:
            c_group.append("group3")
        elif cn < 1200:
            c_group.append("group4")
        elif cn < 1500:
            c_group.append("group5")
        elif cn < 1800:
            c_group.append("group6")
        else:
            c_group.append("group7")
    df['CabinGroup'] = c_group
    return df
train = create_CabinGroup(train)
test = create_CabinGroup(test)
'''

In [None]:
'''
val_count('CabinGroup')
'''

In [None]:
'''
fig, ax = plt.subplots(figsize=(10,5))

width = 0.35
x = train['CabinGroup'].value_counts().index
bar1 = ax.bar(np.arange(len(x)) - width/2,
                 train['CabinGroup'].value_counts().to_list(),
                 width, label = "Train data", color="cornflowerblue")
bar2 = ax.bar(np.arange(len(x)) + width/2,
                 test['CabinGroup'].value_counts().to_list(),
                 width, label = "Test data", color="lightsalmon")

ax.bar_label(bar1, size=12)
ax.bar_label(bar2, size=12)

ax.set_title('CabinGroup Distribution', pad=10, size=15)
ax.set_xticks(np.arange(len(x)), x, size=12)
ax.legend(loc="best", fontsize=12)

fig.tight_layout()
plt.show()
'''

In [None]:
val_count('CabinSide')

- **CabinSide** has 2 types (S: Starboard, P: Port), which is little difference in both train and test.

### Bill Distribution
- **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** are the amounts the passenger has been billed at each of the *Spaceship Titanic*'s many luxury amenities.
- So I will create a derived variable **TotalBill**, which is total of these variables.

In [None]:
col_bill = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train['TotalBill'] = train[col_bill].sum(axis=1)
test['TotalBill'] = test[col_bill].sum(axis=1)

pd.DataFrame({'train':train['TotalBill'], 'test':test['TotalBill']}).describe().T

In [None]:
fig, ax = plt.subplots(1,2,figsize=(12,4))

# TotalBill histogram
ax[0].hist(train['TotalBill'], 100,
        color="cornflowerblue", label="Train data")
ax[1].hist(test['TotalBill'], 100,
        color="lightsalmon", label="Test data")
ax[0].set_title("TotalBill in Train", size=15)
ax[1].set_title("TotalBill in Test", size=15)

plt.show()


- Still, it has a very unbalanced distribution and have a long right tail.
- Therefore, I will reduce the imbalance by applying **log conversion** to the data.

In [None]:
# Log-transform every bill column, including the derived TotalBill.
# The guard makes the cell safe to re-run: without it a second run appends
# 'TotalBill' to col_bill again and applies the log to already transformed data.
if 'TotalBill' not in col_bill:
    col_bill.append('TotalBill')
    for col in col_bill:
        # log(1 + x) keeps zero bills at exactly 0
        train[col] = np.log(1+train[col])
        test[col] = np.log(1+test[col])

In [None]:
# One panel per log-transformed bill column, train and test overlaid.
fig, ax = plt.subplots(3,2,figsize=(12,15))

for i in range(3):
    for j in range(2):
        col = col_bill[2*i+j]
        ax[i,j].hist(train[col], 50,
                color="cornflowerblue", label="Train data")
        ax[i,j].hist(test[col], 50,
                color="lightsalmon", label="Test data")
        # Each panel shows both datasets, so it gets a single title; the two
        # consecutive set_title calls used before left only "in Test" visible.
        ax[i,j].set_title(f'{col} in Train / Test', size=15)
        ax[i,j].set_ylim(0,300)
        ax[i,j].legend(loc='best', fontsize=12)
# The last panel gets a taller y-axis than the others.
ax[2,1].set_ylim(0,1300)
plt.show()

- Although it shows an unbalanced distribution even after log conversion,<br/> it is expected to help improve model performance even a little.

In [None]:
'''
# function that creates PaidForAmenity derived from TotalBill
def create_PaidForAmenity(df):
    paid_lst = []
    for idx, row in df.iterrows():
        tot = row['TotalBill']
        if tot==0:
            paid_lst.append("False")
        else:
            paid_lst.append("True")
    df['PaidForAmenity'] = paid_lst
    return df
train = create_PaidForAmenity(train)
test = create_PaidForAmenity(test)
'''

In [None]:
'''
val_count('PaidForAmenity')
'''

## Missing Values
- Prior to the imputation of missing values,<br/>it is recommended to find out **the pattern of missing values** based on the relationship between variables.

### HomePlanet and PassengerGroup

In [None]:
# Calculate the number of HomePlanet for each PassengerGroup in Train data
hp_pg_train = pd.crosstab(train['PassengerGroup'], train['HomePlanet'])

# Number of distinct home planets per group = non-zero cells in each row.
# The right-hand side is evaluated before the column is added, so the new
# column does not count itself.
hp_pg_train['count_planet'] = (hp_pg_train != 0).sum(axis=1)
hp_pg_train.head(20).T

In [None]:
# Calculate the number of HomePlanet for each PassengerGroup in Test data
hp_pg_test = pd.crosstab(test['PassengerGroup'], test['HomePlanet'])

# Number of distinct home planets per group = non-zero cells in each row.
# The right-hand side is evaluated before the column is added, so the new
# column does not count itself.
hp_pg_test['count_planet'] = (hp_pg_test != 0).sum(axis=1)
hp_pg_test.head(20).T

In [None]:
# Number of distinct home planets per passenger group, train (left) and test (right).
fig, ax = plt.subplots(1,2,figsize=(12,2))

ax[0].plot(hp_pg_train['count_planet'], color="cornflowerblue", label="train data")
ax[0].set_xlabel("PassengerGroup", size=12)
ax[0].set_ylabel("Count", size=12)
ax[0].set_yticks(range(3))
ax[0].set_title("HomePlanet Count for each Group", fontsize=15)

# The right panel is labelled "test data", so it must plot the test table;
# it previously re-plotted hp_pg_train.
ax[1].plot(hp_pg_test['count_planet'], color="lightsalmon", label="test data")
ax[1].set_xlabel("PassengerGroup", size=12)
ax[1].set_ylabel("Count", size=12)
ax[1].set_yticks(range(3))
ax[1].set_title("HomePlanet Count for each Group", fontsize=15)

ax[0].legend()
ax[1].legend()

plt.show()

- According to the graph above, everyone in the same PassengerGroup comes from the same HomePlanet.<br/>So, **Homeplanet Missing values can be filled by groups**.

In [None]:
'''
print("Before :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())

# Index with missing HomePlanet and not missing PassengerGroup
hp_pg_train_idx = train[train['HomePlanet'].isna()][(train[train['HomePlanet'].isna()]['PassengerGroup']).isin(hp_pg_train.index)].index
hp_pg_test_idx = test[test['HomePlanet'].isna()][(test[test['HomePlanet'].isna()]['PassengerGroup']).isin(hp_pg_test.index)].index

# Fill corresponding missing values in HomePlanet
train.loc[hp_pg_train_idx, 'HomePlanet'] = train.iloc[hp_pg_train_idx,:]['PassengerGroup'].map(lambda x:hp_pg_train.idxmax(axis=1)[x])
test.loc[hp_pg_test_idx, 'HomePlanet'] = test.iloc[hp_pg_test_idx,:]['PassengerGroup'].map(lambda x:hp_pg_test.idxmax(axis=1)[x])

print("After :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())
'''

### HomePlanet and CabinDeck

In [None]:
# Calculate the number of HomePlanet for each CabinDeck in Train data
hp_cd_train = pd.crosstab(train['CabinDeck'], train['HomePlanet'])

# Number of distinct home planets per deck = non-zero cells in each row.
# The right-hand side is evaluated before the column is added, so the new
# column does not count itself.
hp_cd_train['count_planet'] = (hp_cd_train != 0).sum(axis=1)
hp_cd_train.T

In [None]:
# Calculate the number of HomePlanet for each CabinDeck in Test data
hp_cd_test = pd.crosstab(test['CabinDeck'], test['HomePlanet'])

# Number of distinct home planets per deck = non-zero cells in each row.
# The right-hand side is evaluated before the column is added, so the new
# column does not count itself.
hp_cd_test['count_planet'] = (hp_cd_test != 0).sum(axis=1)
hp_cd_test.T

In [None]:
# Heatmap of Train data
fig, ax = plt.subplots(figsize=(12,5))
sns.heatmap(hp_cd_train.iloc[:,:3].T, annot=True, fmt='g', cmap="Blues")
plt.show()

In [None]:
# Heatmap of Test data
fig, ax = plt.subplots(figsize=(12,5))
sns.heatmap(hp_cd_test.iloc[:,:3].T, annot=True, fmt='g', cmap="Blues")
plt.show()

- CabinDeck **A, B, C, T** : From Europa
- CabinDeck **D, E, F** : From multiple planet
- CabinDeck **G** : From Earth

In [None]:
# Disabled cell (kept as a string literal, never executed).
# Deck G passengers come from Earth, so both the train and the test line
# assign 'Earth'; the train line previously assigned 'Europa'.
'''
print("Before :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())

# Missing HomePlanet and Deck A, B, C, T
train.loc[(train['HomePlanet'].isna()) & (train['CabinDeck'].isin(['A','B','C','T'])), 'HomePlanet']='Europa'
test.loc[(test['HomePlanet'].isna()) & (test['CabinDeck'].isin(['A','B','C','T'])), 'HomePlanet']='Europa'

# Missing HomePlanet and Deck G
train.loc[(train['HomePlanet'].isna()) & (train['CabinDeck']=='G'), 'HomePlanet']='Earth'
test.loc[(test['HomePlanet'].isna()) & (test['CabinDeck']=='G'), 'HomePlanet']='Earth'

print("After :", train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum())
'''

### Cabin and PassengerGroup

In [None]:
# Calculate the number of Cabin Features for each PassengerGroup in Train data
# Only passengers travelling in a group of two or more are relevant here.
train_grouped = train[train['SizeOfGroup']>1]
cd_pg_train = pd.crosstab(train_grouped['PassengerGroup'], train_grouped['CabinDeck'])
cn_pg_train = pd.crosstab(train_grouped['PassengerGroup'], train_grouped['CabinNum'])
cs_pg_train = pd.crosstab(train_grouped['PassengerGroup'], train_grouped['CabinSide'])

# Distinct values per group = non-zero cells in each row. Each table is
# counted on its own, so the result no longer depends on the three tables
# having the same number of rows in the same order.
# NOTE: CabinNum uses -1 for a missing cabin, and that placeholder is counted
# like any other cabin number.
cd_pg_train['count_deck'] = (cd_pg_train != 0).sum(axis=1)
cn_pg_train['count_num'] = (cn_pg_train != 0).sum(axis=1)
cs_pg_train['count_side'] = (cs_pg_train != 0).sum(axis=1)

print(cd_pg_train.head(10).T, "\n")
print(cn_pg_train.head(10).T, "\n")
print(cs_pg_train.head(10).T)

In [None]:
# Calculate the number of Cabin Features for each PassengerGroup in Test data
# Only passengers travelling in a group of two or more are relevant here.
test_grouped = test[test['SizeOfGroup']>1]
cd_pg_test = pd.crosstab(test_grouped['PassengerGroup'], test_grouped['CabinDeck'])
cn_pg_test = pd.crosstab(test_grouped['PassengerGroup'], test_grouped['CabinNum'])
cs_pg_test = pd.crosstab(test_grouped['PassengerGroup'], test_grouped['CabinSide'])

# CabinNum is never null (missing cabins are stored as -1), so its table can
# contain groups that the deck table lacks. Keep only the groups present in
# the deck table; this replaces the hard-coded drop of group 9223, which
# raises KeyError as soon as the data changes.
cn_pg_test = cn_pg_test[cn_pg_test.index.isin(cd_pg_test.index)]

# Distinct values per group = non-zero cells in each row, counted per table
# so that no positional alignment between the tables is required.
cd_pg_test['count_deck'] = (cd_pg_test != 0).sum(axis=1)
cn_pg_test['count_num'] = (cn_pg_test != 0).sum(axis=1)
cs_pg_test['count_side'] = (cs_pg_test != 0).sum(axis=1)

print(cd_pg_test.head(10).T, "\n")
print(cn_pg_test.head(10).T, "\n")
print(cs_pg_test.head(10).T)

In [None]:
fig, ax = plt.subplots(3,1,figsize=(10,10))

ax[0].plot(cd_pg_train['count_deck'], color="cornflowerblue", label="train data")
ax[0].plot(cd_pg_test['count_deck'], color="lightsalmon", label="test data")

ax[1].plot(cn_pg_train['count_num'], color="cornflowerblue", label="train data")
ax[1].plot(cn_pg_test['count_num'], color="lightsalmon", label="test data")

ax[2].plot(cs_pg_train['count_side'], color="cornflowerblue", label="train data")
ax[2].plot(cs_pg_test['count_side'], color="lightsalmon", label="test data")

for i in range(3):
    ax[i].set_xlabel("PassengerGroup", size=12)
    ax[i].set_ylabel("Count", size=12)
    ax[i].set_yticks(range(6))
    ax[i].legend(loc="upper right")

ax[0].set_title("CabinDeck Count for each Group", pad=7, fontsize=15)
ax[1].set_title("CabinNum Count for each Group", pad=7, fontsize=15)
ax[2].set_title("CabinSide Count for each Group", pad=7, fontsize=15)

fig.tight_layout()
plt.show()

- For CabinDeck and CabinNum,<br/>there is a fairly good correlation with PassengerGroup, but not perfect.
- Everyone in the same PassengerGroup is on the perfectly same CabinSide.<br/>So, **CabinSide Missing values can be filled by groups**.

In [None]:
'''
print("Before :", train['CabinSide'].isna().sum(), test['CabinSide'].isna().sum())

# Index with missing HomePlanet and not missing PassengerGroup
cs_pg_train_idx = train[train['CabinSide'].isna()][(train[train['CabinSide'].isna()]['PassengerGroup']).isin(cs_pg_train.index)].index
cs_pg_test_idx = test[test['CabinSide'].isna()][(test[test['CabinSide'].isna()]['PassengerGroup']).isin(cs_pg_test.index)].index

# Fill corresponding missing values in HomePlanet
train.loc[cs_pg_train_idx, 'CabinSide'] = train.iloc[cs_pg_train_idx,:]['PassengerGroup'].map(lambda x:cs_pg_train.idxmax(axis=1)[x])
test.loc[cs_pg_test_idx, 'CabinSide'] = test.iloc[cs_pg_test_idx,:]['PassengerGroup'].map(lambda x:cs_pg_test.idxmax(axis=1)[x])

print("After :", train['CabinSide'].isna().sum(), test['CabinSide'].isna().sum())
'''

### CryoSleep and TotalBill
- Passengers in a CryoSleep state cannot pay for anything.<br/>Therefore, it is necessary to check the correlation between **CryoSleep and TotalBill**.

In [None]:
# correlation between CryoSleep and TotalBill
train['NoSpend'] = (train['TotalBill']==0).astype(int)
test['NoSpend'] = (test['TotalBill']==0).astype(int)
cr_ns_train = pd.crosstab(train['CryoSleep'], train['NoSpend'])
cr_ns_test = pd.crosstab(test['CryoSleep'], test['NoSpend'])

cr_ns_train

In [None]:
cr_ns_test

- According to data above, everyone in a CryoSleep state doesn't pay for anything.<br/>Therefore, **Missing CryoSleep can be partly filled by NoSpend**

In [None]:
# Fill missing CryoSleep with the most frequent CryoSleep value among rows
# that share the same NoSpend flag, and report the missing counts before/after.
print("Before :", train['CryoSleep'].isna().sum(), test['CryoSleep'].isna().sum())

# Index labels of the rows whose CryoSleep is missing
cr_ns_train_idx = train.loc[train['CryoSleep'].isna(),'CryoSleep'].index
cr_ns_test_idx = test.loc[test['CryoSleep'].isna(),'CryoSleep'].index
# transform() returns a full-length Series in which each NaN is replaced by the
# mode of its NoSpend group; only the originally missing rows are written back.
train.loc[train['CryoSleep'].isna(),'CryoSleep'] = train.groupby(['NoSpend'])['CryoSleep'].transform(lambda x: x.fillna(pd.Series.mode(x)[0]))[cr_ns_train_idx]
test.loc[test['CryoSleep'].isna(),'CryoSleep'] = test.groupby(['NoSpend'])['CryoSleep'].transform(lambda x: x.fillna(pd.Series.mode(x)[0]))[cr_ns_test_idx]

print("After :", train['CryoSleep'].isna().sum(), test['CryoSleep'].isna().sum())

- In addition, every bill feature of CryoSleep passengers have to be filled 0

In [None]:
print("Before---")
for col in col_bill:
    print(f'{col} :', train[col].isna().sum(), test[col].isna().sum())
    train.loc[(train[col].isna()) & (train['CryoSleep']==True), col]=0
    test.loc[(test[col].isna()) & (test['CryoSleep']==True), col]=0
    
print("\nAfter---")
for col in col_bill:
    print(f'{col} :', train[col].isna().sum(), test[col].isna().sum())

In [None]:
# Missing-value count and percentage per column for both datasets.
# Each percentage is relative to its own dataset; the test percentage was
# previously divided by len(train), which understated it.
miss = pd.DataFrame({'num_miss_train':train.isna().sum(),
                     'pct_miss_train':np.round(train.isna().sum()/len(train)*100,2),
                     'num_miss_test':test.isna().sum(),
                     'pct_miss_test':np.round(test.isna().sum()/len(test)*100,2)})
miss.T

## Removal of unnecessary variables
- Remove variables that will not be used in the model.
     + PassengerId, Cabin, Name, PassengerGroup, SizeOfGroup, TotalBill, NoSpend

In [None]:
remove_cols = ['PassengerId', 'Cabin', 'Name',
               'PassengerGroup', 'SizeOfGroup', 'TotalBill', 'NoSpend']

print("Before Removal")
print("\tTrain data:", train.shape, "/ Test data:", test.shape)
train_data = train.drop(remove_cols, axis=1)
test_data = test.drop(remove_cols, axis=1)

print("After Removal")
print("\tTrain data:", train_data.shape, "/ Test data:", test_data.shape)

## Preprocessing Pipeline
### Imputing Missing Values
- The numerical features will be replaced by the median value.
- The categorical features will be replaced by the most frequent value.

### Numeric Feature Scaling
- Age does not require scaling.
- Robustscaler is better suited to reduce the impact of extreme values.

### Categorical Feature Encoding
- Ordinal Feature - Ordinal Encoding / Nominal Feature - Onehot Encoding
- The categorical features in this model are all nominal.

In [None]:
num_feature = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall',
               'Spa', 'VRDeck', 'CabinNum']
cat_feature = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP',
               'InGroup', 'CabinDeck', 'CabinSide']
# ord_feature = ['CabinGroup']

In [None]:
y = train_data['Transported'].copy().astype(int)
X = train_data.drop('Transported', axis=1).copy()
X_test = test_data.copy()

In [None]:
# Numeric columns: impute with the median, then scale with RobustScaler.
# The median matches the strategy stated in the notebook text and is less
# affected by the long right tail of the bill columns than the mean that was
# used here before.
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='median')),
       ('scaler', RobustScaler())
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode (binary columns become a single indicator column).
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='most_frequent')),
       ('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse=False))
])

'''
ordinal_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='most_frequent')),
       ('ordinal', OrdinalEncoder())
])
'''

preprocessor = ColumnTransformer(
   transformers=[
     ('num', numeric_transformer, num_feature),
     ('cat', categorical_transformer, cat_feature),
    # ('ord', ordinal_transformer, ord_feature)
   ], remainder='passthrough')

# NOTE(review): the preprocessor is fitted on all of X, before the
# train/validation split, so validation rows influence the imputation and
# scaling statistics.
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# Step 4. ML Model

## Data Split
#### Split train data and validation data

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42)
print("Train data\t:", X_train.shape, y_train.shape)
print("Validation data\t:", X_val.shape, y_val.shape)

## Model Selection
#### Apply grid search to select the best model and hyperparameters

In [None]:
# Candidate classifiers, keyed by display name.
# LogisticRegression uses the 'liblinear' solver because LR_grid searches both
# 'l1' and 'l2' penalties; the default 'lbfgs' solver does not support 'l1',
# so every 'l1' candidate would fail to fit during the grid search.
classifiers = {
    "KNN" : KNeighborsClassifier(),
    "LogisticRegression" : LogisticRegression(solver='liblinear', random_state=42),
    "RandomForest" : RandomForestClassifier(random_state=42),
    "LGBM" : LGBMClassifier(random_state=42)
}

# Hyperparameter grids, one per classifier.
KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

LR_grid = {'penalty': ['l1','l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150]}

RF_grid = {'n_estimators': [50, 100, 150, 200],
           'max_depth': [4, 6, 8, 10, 12]}

LGBM_grid = {'n_estimators': [50, 100, 150, 200],
             'max_depth': [4, 6, 8, 10, 12],
             'learning_rate': [0.05, 0.1, 0.15]}

# Same keys as `classifiers`, so a grid can be looked up by classifier name.
grid = {
    "KNN" : KNN_grid,
    "LogisticRegression" : LR_grid,
    "RandomForest" : RF_grid,
    "LGBM" : LGBM_grid
}

In [None]:
# Best hyperparameters per classifier name, filled in by the loop below.
clf_best_params = {}

# One row per classifier; metric columns start at zero and are filled in below.
scores = pd.DataFrame({
    'Classifer': list(classifiers),
    'Train accuracy': np.zeros(len(classifiers)),
    'Validation accuracy': np.zeros(len(classifiers)),
    'Training time': np.zeros(len(classifiers)),
})

for i, (key, classifier) in enumerate(classifiers.items()):
    start = time.time()

    # cv=None selects GridSearchCV's default cross-validation splitter.
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=-1, cv=None)
    clf.fit(X_train, y_train)

    # Row i of `scores` corresponds to the i-th classifier in insertion order.
    scores.loc[i, 'Train accuracy'] = clf.score(X_train, y_train)
    scores.loc[i, 'Validation accuracy'] = clf.score(X_val, y_val)
    clf_best_params[key] = clf.best_params_

    # Elapsed time covers the search plus both score() calls, in minutes.
    stop = time.time()
    scores.loc[i, 'Training time'] = np.round((stop - start) / 60, 2)

    print('Model:', key)
    print('Training time (mins):', scores.loc[i, 'Training time'])
    print('')

In [None]:
# Best hyperparameters found by the grid search, keyed by classifier name
clf_best_params

In [None]:
# Train/validation accuracy and training time (minutes) for each classifier
scores

#### **RandomForest** and **LGBM** are selected<br/> because they achieve the best validation accuracy.

# Step 5. Model Evaluation

## 10-fold Cross Validation

In [None]:
# Re-create the two selected models using the hyperparameters stored in
# clf_best_params, with a fixed seed.
models = {
    "RandomForest": RandomForestClassifier(
        random_state=42, **clf_best_params['RandomForest']),
    "LGBM": LGBMClassifier(
        random_state=42, **clf_best_params["LGBM"]),
}

In [None]:
FOLDS = 10

# Running sums of predicted positive-class probabilities over every fold of
# every model; the ensemble step later divides them by FOLDS * len(models).
val_preds = np.zeros(len(X_val))
test_preds = np.zeros(len(X_test))

# NOTE(review): the folds are drawn from the full X, which still contains the
# X_val rows produced by train_test_split. Metrics computed from val_preds are
# therefore measured on rows the models were partly trained on and should be
# read as optimistic.
for key, classifier in models.items():
    start = time.time()

    score = 0
    cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

    for train_idx, val_idx in cv.split(X, y):
        # Fold-local names: the X_train / y_train hold-out split created
        # earlier must not be overwritten by the last fold.
        # cv.split yields positions, so the Series y is indexed with .iloc
        # rather than by label.
        X_fold_train, X_valid = X[train_idx], X[val_idx]
        y_fold_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]

        # The same estimator object is refit on each fold.
        clf = classifier
        clf.fit(X_fold_train, y_fold_train)

        # Column 1 of predict_proba is the probability of the positive class.
        val_preds += clf.predict_proba(X_val)[:, 1]
        test_preds += clf.predict_proba(X_test)[:, 1]
        score += clf.score(X_valid, y_valid)

    # Mean out-of-fold accuracy for this model.
    score /= FOLDS
    stop = time.time()

    print('Model:', key)
    print('Average validation accuracy:', np.round(100*score,2))
    print('Training time (mins):', np.round((stop - start)/60,2))
    print('')

## Ensemble predictions

In [None]:
# Turn the accumulated probability sums into means over every (model, fold)
# fit, then round to obtain boolean class predictions.
_n_fits = FOLDS * len(models)
val_preds /= _n_fits
test_preds /= _n_fits
val_preds, test_preds = (
    np.round(mean_proba).astype(bool) for mean_proba in (val_preds, test_preds)
)

## Confusion Matrix and ROC curve
A confusion matrix tabulates the relationship between the predicted values and the actual values.<br/>
- **accuracy** : (TP+TN) / (TP+TN+FP+FN)<br/>
- **precision** : TP / (TP+ FP)<br/>
- **recall**(**sensitivity**) : TP / (TP+FN)<br/>
- **f1 score** : 2 * precision * recall / (precision + recall)

In [None]:
# Confusion matrix as a labelled table: rows are tagged "actual",
# columns are tagged "pred", each split into N / P.
_classes = ["N", "P"]
conf = pd.DataFrame(
    confusion_matrix(y_val, val_preds),
    index=pd.MultiIndex.from_product([["actual"], _classes]),
    columns=pd.MultiIndex.from_product([["pred"], _classes]),
)
conf

In [None]:
# Print accuracy, precision and recall of the ensemble on the validation split.
for _metric_label, _metric_fn in (
        ("accuracy =", accuracy_score),
        ("precision =", precision_score),
        ("recall =", recall_score)):
    print(_metric_label, _metric_fn(y_val, val_preds))

In [None]:
# NOTE(review): val_preds has already been rounded to booleans by the ensemble
# step, so the ROC curve and AUC here are computed from hard labels rather
# than from predicted probabilities.
fprs, tprs, thresholds = roc_curve(y_val, val_preds)
print("AUC score:", roc_auc_score(y_val, val_preds))

# Plot the diagonal reference line first, then the ROC curve itself.
fig, ax = plt.subplots()
for _xs, _ys, _legend in (([0,1], [0,1], 'str'), (fprs, tprs, 'roc')):
    ax.plot(_xs, _ys, label=_legend)
ax.legend()
ax.grid()
plt.show()

# Step 6. Submission

In [None]:
# Store the ensemble's boolean test-set predictions in the 'Transported' column
# (sample_submission is created outside this section of the notebook).
sample_submission['Transported'] = test_preds
# Preview the first rows before writing the file.
sample_submission.head()

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)

# ** Reference
- https://www.kaggle.com/code/odins0n/spaceship-titanic-eda-27-different-models
- https://www.kaggle.com/code/taranmarley/feature-engineering-eda-and-lightgbm
- https://www.kaggle.com/code/samuelcortinhas/spaceship-titanic-a-complete-guide

# ** Score List
- Submit 1 (22.04.04) : acc = **0.78840** (795/1150)
    > select LGBM
- Submit 2 (22.04.06) : acc = **0.79284** (697/1198)
    > apply Grid search and ensemble RandomForest, LGBM
- Submit 3 (22.04.07) : acc = **0.80336** (252/1198)
    > create derived variables(SizeOfGroup, CabinDeck, CabinNum, CabinSide)
- Submit 4 (22.04.09) : acc = **0.80593** (145/1270)
    > create derived variables(InGroup)
- Submit 5 (22.04.12) : acc = **0.80640** (141/1363)
    > impute missing values(CryoSleep, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)