In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Features Description
## 1. categorical
> * PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
> * HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
> * CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
> * Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
> * Destination - The planet the passenger will be debarking to.
> * VIP - Whether the passenger has paid for special VIP service during the voyage.
> * Name - The first and last names of the passenger.
> * Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## 2. continuous
> * Age - The age of the passenger.
> * RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

# Import libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV

# Load data

In [None]:
df_train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [None]:
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
df_train.describe(include='object')

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
df_test.info()

In [None]:
df_test.describe()

In [None]:
df_test.describe(include='object')

# Check null percentage

In [None]:
# Missing values per training column: absolute count and share of rows (in %).
# The table is built first and sorted once, so both columns are guaranteed to
# stay aligned and ordered by the number of missing values (largest first).
total = df_train.isnull().sum()
percent = total / len(df_train) * 100
(
    pd.DataFrame({'total': total, 'percent': percent})
    .sort_values(by='total', ascending=False)
)

In [None]:
# Missing values per test column: absolute count and share of rows (in %).
# The table is built first and sorted once, so both columns are guaranteed to
# stay aligned and ordered by the number of missing values (largest first).
total = df_test.isnull().sum()
percent = total / len(df_test) * 100
(
    pd.DataFrame({'total': total, 'percent': percent})
    .sort_values(by='total', ascending=False)
)

# EDA

## Transported (Target variable)

In [None]:
# Class balance of the target: each bar is labelled with its share of all
# training rows (in %), printed slightly above the bar.
total = float(len(df_train))
ax = sns.countplot(x='Transported', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    label = '{:1.2f}'.format((bar_height / total) * 100)
    ax.text(label_x, bar_height + 5, label, ha='center')
plt.show()

This distribution shows that:
- The dataset is balanced: group 1 (not transported) is 49.64%, and group 2 (transported) is 50.36%.

In [None]:
# Column groups, following the "Features Description" section above.
# The target column (Transported) is deliberately not part of either list.
category_cols = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
# Continuous features: age plus the five amenity billing amounts.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


## Univariate analysis

### Age

In [None]:
df_train['Age'].hist()

This histogram shows that:
- it is right-skewed
- Minimum and maximum age are 0 and 79

### RoomService

In [None]:
df_train['RoomService'].hist()

This histogram shows that:
- the values are concentrated at 0 (people tend not to spend money during the trip)
- fewer people spent money during the trip
- Minimum and maximum are 0 and 14,327

### FoodCourt

In [None]:
df_train['FoodCourt'].hist()

### ShoppingMall

In [None]:
df_train['ShoppingMall'].hist()

### Spa

In [None]:
df_train['Spa'].hist()

### VRDeck

In [None]:
df_train['VRDeck'].hist()

### HomePlanet

In [None]:
# Passengers per home planet; every bar carries its share of all training rows (in %).
total = len(df_train)
ax = sns.countplot(x="HomePlanet", data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    half_width = bar.get_width() / 2
    label = '{:.2f}'.format((bar_height / total) * 100)
    ax.text(bar.get_x() + half_width, bar_height + 5, label, ha='center')
    

This chart shows that:
- the HomePlanet attribute has 3 unique categories
- Earth has the largest number of observations
- Mars (20%) has the smallest number of observations, but it is close to Europa (24%)

### CryoSleep

In [None]:
# Passengers per CryoSleep value; every bar carries its share of all training rows (in %).
ax = sns.countplot(x='CryoSleep', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = (bar_height / len(df_train)) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

This distribution shows that:
- CryoSleep has two unique categories
- group 1 (False) has more samples than group 2 (True)
- the dataset is skewed toward group 1 (False), which holds 62% of the samples

### Destination

In [None]:
# Passengers per destination; every bar carries its share of all training rows (in %).
ax = sns.countplot(x='Destination', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

This chart shows that:
- Destination has three unique values
- the largest category (68%) is TRAPPIST-1e; the dataset is skewed toward it
- the smallest category (9%) is PSO J318.5-22, which is less than 10%
- the second-largest category is 55 Cancri e (20%), which is still 48 percentage points below the largest

### VIP

In [None]:
# Passengers per VIP flag; every bar carries its share of all training rows (in %).
ax = sns.countplot(x='VIP', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

This distribution shows that:
- VIP has two unique values
- the dataset is skewed toward group 1 (False), which holds 95% of the samples
- group 2 (True) has far fewer samples (2%)

The VIP feature will be removed because it is highly imbalanced, which may affect the model accuracy

## Bivariate analysis

### Age

In [None]:
sns.boxplot(x='Transported', y='Age', data=df_train)

This boxplot shows that:
- group 1 (False) and group 2 (True) are similar, e.g. in median, min, max and Q3
- group 2 (True) has a wider IQR than group 1 (False)

### RoomService

In [None]:
sns.boxplot(x='Transported', y='RoomService', data=df_train)

This boxplot shows that:
- the values are concentrated at 0 for both groups
- the range of group 1 (False) is longer than that of group 2 (True), and it has a large number of outliers

### FoodCourt

In [None]:
sns.boxplot(x="Transported", y='FoodCourt', data=df_train)

This boxplot shows that:
- the values are concentrated at 0 for both groups

### ShoppingMall

In [None]:
sns.boxplot(x='Transported', y='ShoppingMall', data=df_train)

- similar to FoodCourt

### Spa

In [None]:
sns.boxplot(x='Transported', y='Spa', data=df_train)

similar to RoomService

### VRDeck

In [None]:
sns.boxplot(x='Transported', y='VRDeck', data=df_train)

similar to RoomService

### HomePlanet

In [None]:
# Home planet split by the target; each bar is labelled with its share of all
# training rows (in %), not its share within the planet.
ax = sns.countplot(x='HomePlanet', hue='Transported', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

The plot above shows that:
- Earth has most of the samples: 30% in group 1 (False) and 22% in group 2 (True)
- Earth is the only category where group 1 (False) is larger than group 2 (True)
- the samples of group 1 (False) and group 2 (True) are almost balanced

### CryoSleep

In [None]:
# CryoSleep split by the target; each bar is labelled with its share of all
# training rows (in %).
ax = sns.countplot(x='CryoSleep', hue='Transported', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

This plot shows that:
- passengers with CryoSleep = False make up the highest percentage of group 1 (False)
- in group 2 (True), CryoSleep = True is more frequent than CryoSleep = False

### Destination

In [None]:
# Destination split by the target; each bar is labelled with its share of all
# training rows (in %).
ax = sns.countplot(x='Destination', hue='Transported', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

This plot shows that:
- TRAPPIST-1e has the highest percentages: 35% in group 1 (False) and 32% in group 2 (True)
- the samples of PSO J318.5-22 are balanced; both groups are at 4%

In [None]:
# VIP flag split by the target; each bar is labelled with its share of all
# training rows (in %).
ax = sns.countplot(x='VIP', hue='Transported', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

This plot shows that:
- there is a huge difference in how the percentages are distributed across the groups; most of the samples fall into group 1 (False)
- far fewer samples fall into group 2 (True)

## Multivariate relationships

In [None]:
corr = df_train[numerical_cols].corr()
sns.heatmap(data=corr,
           annot=True)

In [None]:
# Correlate only numeric/bool columns. At this point df_train still contains
# string columns (PassengerId, Cabin, Name, ...), and DataFrame.corr() no longer
# drops those silently on pandas >= 2.0 - it raises instead.
numeric_frame = df_train.select_dtypes(include=['number', 'bool'])
sns.heatmap(data=numeric_frame.corr(), annot=True)

- there is no strong correlation among the numerical features
- no strong positive or strong negative correlation present in any features

## Feature Engineering

 After a close look at the data:
 - the **Cabin** values can be broken down into different groups
 - RoomService, FoodCourt, ShoppingMall, Spa and VRDeck can be grouped into a single feature

In [None]:
# Cabin has the form deck/num/side; keep only the deck letter as `cabin_group`.
# The vectorised .str accessor leaves missing cabins as NaN, so no Python-level
# split() is ever called on a missing (float NaN) value.
df_train['cabin_group'] = df_train['Cabin'].str.split('/').str[0]
df_test['cabin_group'] = df_test['Cabin'].str.split('/').str[0]

# The raw Cabin string is not used after this point.
df_train = df_train.drop(columns='Cabin')
df_test = df_test.drop(columns='Cabin')

df_train.head()

In [None]:
df_train['cabin_group'].unique()

In [None]:
# Passengers per cabin deck; every bar carries its share of all training rows (in %).
ax = sns.countplot(x='cabin_group', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

- there are 8 unique values for this feature
- the largest number of samples is in F, followed by G
- the value T has the fewest samples (0.06%)

In [None]:
# Cabin deck split by the target; each bar is labelled with its share of all
# training rows (in %).
ax = sns.countplot(x='cabin_group', hue='Transported', data=df_train)
for bar in ax.patches:
    bar_height = bar.get_height()
    share = bar_height / len(df_train) * 100
    ax.text(bar.get_x() + bar.get_width() / 2,
            bar_height + 5,
            '{:.2f}'.format(share),
            ha='center')

- group 2 (True) is much larger than group 1 (False) in B and C
- the samples of group 1 and group 2 are balanced in A
- F has the most samples of group 1 (False) and G has the most samples of group 2 (True) among all values
- the share of samples in T is almost 0%

In [None]:
# Collapse the five amenity bills into one `total_billed` column per passenger
# (row-wise sum), then discard the individual bill columns.
billed_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

df_train = (
    df_train
    .assign(total_billed=df_train[billed_cols].sum(axis=1))
    .drop(columns=billed_cols)
)
df_test = (
    df_test
    .assign(total_billed=df_test[billed_cols].sum(axis=1))
    .drop(columns=billed_cols)
)

df_train.head()

In [None]:
df_train['total_billed'].hist()

In [None]:
# Compress the long right tail of total_billed with log(1 + x).
# np.log1p is applied to the whole column at once instead of calling a Python
# lambda per row. NOTE: this cell overwrites total_billed, so running it twice
# applies the transform twice.
df_train['total_billed'] = np.log1p(df_train['total_billed'])
df_test['total_billed'] = np.log1p(df_test['total_billed'])

df_train['total_billed'].hist()

In [None]:
sns.boxplot(y='total_billed', x='Transported', data=df_train)

# Data Cleaning

## Label Encoding

In [None]:
def encode_category(frame, column, mapping):
    """Impute and integer-encode one categorical column of ``frame``.

    Missing values are replaced with the most frequent value of that same
    frame's column, then every label is translated through ``mapping``.
    The result is assigned back to ``frame[column]`` (plain assignment rather
    than ``fillna(..., inplace=True)`` on a selected column, which does not
    update the parent frame when pandas Copy-on-Write is active).

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to modify; it must contain ``column``.
    column : str
        Name of the categorical column to encode.
    mapping : dict
        Label -> integer code. Labels absent from the mapping become NaN.
    """
    most_frequent = frame[column].mode()[0]
    frame[column] = frame[column].fillna(most_frequent).map(mapping)


home_planet_map = {'Europa': 1, 'Earth': 2, 'Mars': 3}
cryoSleep_map = {False: 0, True: 1}
destination_map = {'TRAPPIST-1e': 1, 'PSO J318.5-22': 2, '55 Cancri e': 3}
vip_map = {False: 0, True: 1}
# 'T' maps to the integer 8 (it was the string '8', which made the column mixed-type).
cabin_group_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

encoders = {
    'HomePlanet': home_planet_map,
    'CryoSleep': cryoSleep_map,
    'Destination': destination_map,
    'VIP': vip_map,
    'cabin_group': cabin_group_map,
}

# Same treatment for both frames; each frame is imputed with its own mode.
for frame in (df_train, df_test):
    for column, mapping in encoders.items():
        encode_category(frame, column, mapping)

# The target only exists in the training data and has no missing values to fill.
transported_map = {False: 0, True: 1}
df_train['Transported'] = df_train['Transported'].map(transported_map)


df_train.head()

In [None]:
df_train.dtypes

In [None]:
# Store the encoded deck column as int64 in both frames.
for frame in (df_train, df_test):
    frame['cabin_group'] = frame['cabin_group'].astype('int64')

In [None]:
df_train.dtypes

## Missing values

In [None]:
df_train.isnull().sum()

In [None]:
# Median-impute the remaining numeric columns. Each frame is filled with its own
# median, and the result is assigned back to the column:
# `df[col].fillna(..., inplace=True)` acts on a temporary selection and does not
# update the frame when pandas Copy-on-Write is active.
for frame in (df_train, df_test):
    for column in ('Age', 'total_billed'):
        frame[column] = frame[column].fillna(frame[column].median())

df_train.isnull().sum()

## Prepare validation data

In [None]:
# drop name and passengerId
df_train = df_train.drop(columns=['Name', 'PassengerId', 'VIP'], axis=1)

test_ids = df_test['PassengerId']
df_test = df_test.drop(columns=['Name', 'PassengerId', 'VIP'], axis=1)

y = df_train['Transported']
X = df_train.drop(columns='Transported')

X_train, X_valid, y_train, y_valid = train_test_split(X,y,
                                                     test_size=0.3,
                                                     random_state=42)

In [None]:
print('train shape:', X_train.shape)
print('valid shape:', X_valid.shape)

# Model Selection

## Base Model

In [None]:
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score

# Leaderboard of every model evaluated with model_scores(); newest row first.
df_model = pd.DataFrame(columns=['model', 'valid_score', 'train_score', 'precision', 'recall', 'f1'])

def model_scores(name, model):
    """Fit ``model`` on the training split, report its metrics and log them in ``df_model``.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_valid`` / ``y_valid``. A classification report and the
    train/validation ``score()`` values are printed, and one row is prepended
    to the global ``df_model`` frame.

    Metrics are stored as floats (not strings) so that
    ``df_model.sort_values(...)`` orders them numerically.

    Parameters
    ----------
    name : str
        Label used for the model in ``df_model``.
    model : estimator
        Unfitted estimator exposing ``fit``, ``predict`` and ``score``.
    """
    global df_model

    fitted = model.fit(X_train, y_train)
    prediction = fitted.predict(X_valid)

    valid_score = fitted.score(X_valid, y_valid)
    train_score = fitted.score(X_train, y_train)

    precision = precision_score(y_valid, prediction)
    recall = recall_score(y_valid, prediction)
    f1 = f1_score(y_valid, prediction)

    print(classification_report(y_valid, prediction))
    print("score on valid: " + str(valid_score))
    print("score on train: " + str(train_score))

    score_row = pd.DataFrame([[name, valid_score, train_score,
                               precision, recall, f1]], columns=df_model.columns)
    # Skip the concat for the very first row: concatenating with an empty frame
    # would needlessly degrade the numeric columns to object dtype.
    if df_model.empty:
        df_model = score_row
    else:
        df_model = pd.concat([score_row, df_model], ignore_index=True)


In [None]:
from sklearn.naive_bayes import MultinomialNB
model_scores('MultinomialNB', MultinomialNB())

In [None]:
from sklearn.linear_model import LogisticRegression
model_scores('LogisticRegression', LogisticRegression())

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model_scores('KNeighborsClassifier', KNeighborsClassifier())

In [None]:
from sklearn.svm import LinearSVC
model_scores('LinearSVC', LinearSVC())

In [None]:
from sklearn.svm import SVC
model_scores('SVC', SVC())

In [None]:
from sklearn.tree import DecisionTreeClassifier
model_scores('DecisionTreeClassifier', DecisionTreeClassifier())

In [None]:
from xgboost import XGBClassifier
model_scores('XGB', XGBClassifier(objective='binary:logistic'))

In [None]:
df_model.sort_values(by='valid_score', ascending=False, ignore_index=True)

In [None]:
df_model.sort_values(by='f1', ascending=False, ignore_index=True)

In [None]:
df_model.sort_values(by='train_score', ascending=False, ignore_index=True)

# Model Evaluation

## Hyperparameter tuning

In [None]:
# model = LinearSVC()
# param_grid = [
#     {
#         'penalty': ['l1', 'l2'],
#         'loss': ['hinge', 'squared_hinge'],
#         'C': [1,2,3,4,5],
#         'multi_class': ['ovr', 'crammer_singer']
#     }
# ]
# grid_search = GridSearchCV(estimator=model,
#                          param_grid=param_grid,
#                          scoring='accuracy',
#                          verbose=5,
#                          cv=10,
#                          return_train_score=True)
# grid_search.fit(X_train, y_train)

In [None]:
# grid_search.best_params_

In [None]:
# Linear SVM refit with a fixed set of hyperparameters (the grid search that was
# used to explore them is commented out above).
# NOTE(review): the scikit-learn LinearSVC documentation states that with
# multi_class='crammer_singer' the `loss`, `penalty` and `dual` options are
# ignored - confirm whether `loss` and `penalty` below have any effect.
svc_model = LinearSVC(C=1,
                 loss='squared_hinge',
                 multi_class='crammer_singer',
                 penalty='l1')

svc_model.fit(X_train, y_train)

In [None]:
print('train score:' + str(svc_model.score(X_train, y_train)))
print('valid score:' + str(svc_model.score(X_valid, y_valid)))
print('f1 score:', f1_score(y_valid, svc_model.predict(X_valid)))

In [None]:
# model = MultinomialNB()
# param_grid = {
#     "alpha": [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]
# }

# grid_search = GridSearchCV(estimator=model,
#                          param_grid=param_grid,
#                          scoring='accuracy',
#                           cv=10,
#                           return_train_score=True)
# grid_search.fit(X_train, y_train)

In [None]:
# grid_search.best_params_

In [None]:
# Multinomial naive Bayes with alpha=1e-05 (the smallest alpha listed in the
# commented-out grid above), fitted on the training split.
nb_model = MultinomialNB(alpha=1e-05)
nb_model.fit(X_train,y_train)
# score() on both splits to compare fit on seen vs. held-out data; F1 on the validation split only.
print('train score:', nb_model.score(X_train, y_train))
print('valid score:', nb_model.score(X_valid, y_valid))
print('f1 score:', f1_score(y_valid, nb_model.predict(X_valid)))

In [None]:
# model = DecisionTreeClassifier()
# param_grid = {
#     'criterion': ['gini', 'entropy', 'log_loss'],
#     'max_depth': [None, 1, 3, 5, 7],
#     'max_features': [None, 'auto', 'sqrt', 'log2']
# }
# grid_search = GridSearchCV(estimator=model,
#                          param_grid=param_grid,
#                          scoring='accuracy',
#                           cv=10)
# grid_search.fit(X_train, y_train)

In [None]:
# grid_search.best_params_

In [None]:
# Decision tree refit with fixed hyperparameters.
# max_features='log2' makes the tree consider a random subset of features at
# each split, so the seed is pinned (same value as the train/validation split)
# to make the fitted tree and its scores reproducible across runs.
tree_model = DecisionTreeClassifier(criterion='gini',
                                    max_depth=5,
                                    max_features='log2',
                                    random_state=42)
tree_model.fit(X_train, y_train)

In [None]:
print('train score:', tree_model.score(X_train, y_train))
print('valid score:', tree_model.score(X_valid, y_valid))
print('f1 score:', f1_score(y_valid, tree_model.predict(X_valid)))

In [None]:
# model = XGBClassifier(objective='binary:logistic')  # the grid below uses XGBoost parameters (gamma, min_child_weight, reg_lambda)
# param_grid = {
#     "max_depth": [3, 4, 7, 10, 25],
#     "gamma": [0.5, 1, 5, 10, 25],
#     "min_child_weight": [1, 3, 5, 10, 25],
#     "reg_lambda": [5, 10, 50, 100, 300],
# }

# grid_search = HalvingGridSearchCV(estimator=model,
#                          param_grid=param_grid,
#                          scoring='accuracy',
#                           cv=10)
# grid_search.fit(X_train, y_train)

In [None]:
# grid_search.best_params_

In [None]:
# XGBoost classifier with fixed hyperparameters; each value below also appears
# in the commented-out search grid above. This is the model used for the
# submission further down.
xgb_model = XGBClassifier(objective='binary:logistic',
                         gamma=5,
                         max_depth=10,
                         min_child_weight=10,
                         reg_lambda=10)
xgb_model.fit(X_train, y_train)

# score() on both splits to compare fit on seen vs. held-out data; F1 on the validation split only.
print('train score:', xgb_model.score(X_train, y_train))
print('valid score:', xgb_model.score(X_valid, y_valid))
print('f1 score:', f1_score(y_valid, xgb_model.predict(X_valid)))

# Submission

In [None]:
prediction = xgb_model.predict(df_test)

In [None]:
df_submit=pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

In [None]:
# `prediction` follows the row order of df_test, while df_submit comes from
# sample_submission.csv. The assignment below is purely positional, so verify
# that both list the passengers in the same order before writing predictions.
assert df_submit['PassengerId'].tolist() == test_ids.tolist(), \
    'sample_submission.csv rows are not aligned with test.csv'
df_submit['Transported'] = prediction
df_submit.head()

In [None]:
# The submission expects booleans. astype(bool) turns the 0/1 predictions into
# False/True and, unlike a one-shot dict mapping, leaves the column unchanged
# if this cell is executed again on already-boolean values.
df_submit['Transported'] = df_submit['Transported'].astype(bool)

In [None]:
sns.countplot(x='Transported', data=df_submit)

In [None]:
df_submit.to_csv('./submission.csv', index=False)