In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    if paths:  # directories without files print nothing
        print('\n'.join(paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training and test tables, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)

## 정보 확인

In [None]:
# Preview the first rows of the raw training table.
df_train.head()

In [None]:
# Preview the first rows of the raw test table.
df_test.head()

In [None]:
# Show the column dtypes of both tables, separated by a dashed rule.
print(
    df_train.dtypes,
    '---------------------------',
    df_test.dtypes,
    sep='\n',
)

## 결측치 다루기

In [None]:
# 1. Check missing values: number of nulls per column in the training table.
df_train.isnull().sum()

In [None]:
# Number of nulls per column in the test table.
df_test.isnull().sum()

In [None]:
# 2. First drop: remove the Name column from both tables.
df_train = df_train.drop('Name', axis=1)
df_test = df_test.drop('Name', axis=1)

In [None]:
# Preview the first 10 training rows after the Name column was dropped.
df_train.head(10)

In [None]:
# Preview the first 10 test rows after the Name column was dropped.
df_test.head(10)

In [None]:
# 3. Fill missing HomePlanet, Destination, CryoSleep and VIP values with the mode.
mode_list = ['Destination','HomePlanet','CryoSleep','VIP']

# Take each column's most frequent value from the training table only and reuse
# it for the test table, so both tables are imputed with the same value.
# mode() can return several rows when there are ties; row 0 is the first mode.
mode_values = df_train[mode_list].mode().iloc[0]

for i in mode_list:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form operates on a temporary selection, is deprecated,
    # and leaves the frame unchanged under pandas Copy-on-Write.
    # infer_objects() turns the filled True/False object columns into real bool
    # columns; it is a no-op for the string columns.
    df_train[i] = df_train[i].fillna(mode_values[i]).infer_objects()
    df_test[i] = df_test[i].fillna(mode_values[i]).infer_objects()

In [None]:
# Re-count nulls per training column after the categorical mode fill.
df_train.isnull().sum()

In [None]:
# Re-count nulls per test column after the categorical mode fill.
df_test.isnull().sum()

In [None]:
# 4. Fill missing Age values with the median.
# The median is taken from the training table and applied to both tables, so the
# test table is imputed with the same statistic the model is trained on.
age_median = df_train['Age'].median()

# Assign the result back: fillna(inplace=True) on df['Age'] acts on a temporary
# selection, is deprecated, and does nothing under pandas Copy-on-Write.
df_train['Age'] = df_train['Age'].fillna(age_median)
df_test['Age'] = df_test['Age'].fillna(age_median)

In [None]:
# Re-count nulls per training column after the Age fill.
df_train.isnull().sum()

In [None]:
# Re-count nulls per test column after the Age fill.
df_test.isnull().sum()

In [None]:
# 5. Fill missing values in the spending columns with 0.
service_list=['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# Fill all spending columns at once and assign the result back.
# fillna(inplace=True) on df[col] acts on a temporary selection, is deprecated,
# and does nothing under pandas Copy-on-Write.
df_train[service_list] = df_train[service_list].fillna(0)
df_test[service_list] = df_test[service_list].fillna(0)

In [None]:
# Re-count nulls per training column after the spending-column fill.
df_train.isnull().sum()

In [None]:
# Re-count nulls per test column after the spending-column fill.
df_test.isnull().sum()

In [None]:
# 6. Fill missing Cabin values with a placeholder.
# 'N/5000/N' keeps the deck/num/side shape, so the later split on '/' yields
# Deck 'N', Num '5000' and Side 'N' for passengers without a cabin.
# NOTE(review): Num 5000 falls into the last cabin-location bin (Num >= 1800)
# created further down, so missing cabins share that bin with real high numbers.
# Assign the result back: fillna(inplace=True) on df['Cabin'] acts on a temporary
# selection, is deprecated, and does nothing under pandas Copy-on-Write.
df_train['Cabin'] = df_train['Cabin'].fillna('N/5000/N')
df_test['Cabin'] = df_test['Cabin'].fillna('N/5000/N')

In [None]:
# Re-count nulls per training column after the Cabin fill.
df_train.isnull().sum()

In [None]:
# Re-count nulls per test column after the Cabin fill.
df_test.isnull().sum()

In [None]:
# Inspect the training column dtypes once all imputation steps have run.
df_train.dtypes

## 데이터 전처리

In [None]:
# 1. Cabin preprocessing: split the composite ID columns into their parts.
# PassengerId 'gggg_pp' -> Group_num, Id_num; Cabin 'deck/num/side' -> Deck, Num, Side.
for frame in (df_train, df_test):
    id_parts = frame['PassengerId'].str.split('_', expand = True)
    frame[['Group_num', 'Id_num']] = id_parts

    cabin_parts = frame['Cabin'].str.split('/', expand = True)
    frame[['Deck', 'Num', 'Side']] = cabin_parts

In [None]:
# Preview the training table with the new Group_num/Id_num/Deck/Num/Side columns.
df_train.head(3)

In [None]:
# Preview the test table with the new Group_num/Id_num/Deck/Num/Side columns.
df_test.head(3)

In [None]:
# 2. PassengerId preprocessing (Group_size labelling).
# For every row, look up how many passengers share its Group_num. The result is
# row-aligned with the table, so it stays correct even when rows are not sorted
# or contiguous by group. Expanding the sorted counts instead would only line
# up with the rows if the table were already ordered by Group_num.
Train_Group_numlist = list(df_train['Group_num'].value_counts().sort_index())
Train_Group_size = df_train['Group_num'].map(df_train['Group_num'].value_counts()).tolist()

Test_Group_numlist = list(df_test['Group_num'].value_counts().sort_index())
Test_Group_size = df_test['Group_num'].map(df_test['Group_num'].value_counts()).tolist()

In [None]:
# 3. Second drop: remove Id_num (the within-group part of PassengerId).
df_train = df_train.drop(labels='Id_num', axis='columns')
df_test = df_test.drop(labels='Id_num', axis='columns')

In [None]:
# Preview the training table after Id_num was dropped.
df_train.head(3)

In [None]:
# Preview the test table after Id_num was dropped.
df_test.head(3)

In [None]:
# 4. Add the Group_size column to df_train and df_test.
# Each list is assigned positionally, one value per row of its table.
for frame, group_sizes in ((df_train, Train_Group_size), (df_test, Test_Group_size)):
    frame['Group_size'] = group_sizes

In [None]:
# Preview the training table with the new Group_size column.
df_train.head(3)

In [None]:
# Preview the test table with the new Group_size column.
df_test.head(3)

In [None]:
# 5. Split the cabin number into location bins.
CabinNum_name = ['Cabin_loc1','Cabin_loc2','Cabin_loc3','Cabin_loc4','Cabin_loc5','Cabin_loc6','Cabin_loc7']


def _flag_cabin_location(frame):
    """Cast Num to int and add one boolean column per 300-wide cabin-number bin.

    Bins are Num < 300, [300, 600), ... [1500, 1800) and Num >= 1800, stored in
    Cabin_loc1 .. Cabin_loc7 respectively. The frame is modified in place.
    """
    frame['Num'] = frame['Num'].astype(int)
    cabin_number = frame['Num']
    last_position = len(CabinNum_name) - 1

    for position, column in enumerate(CabinNum_name):
        lower, upper = 300 * position, 300 * (position + 1)
        if position == 0:
            # First bin has no lower bound.
            frame[column] = (cabin_number < upper)
        elif position == last_position:
            # Last bin has no upper bound.
            frame[column] = (cabin_number >= lower)
        else:
            frame[column] = ((cabin_number >= lower) & (cabin_number < upper))


_flag_cabin_location(df_train)
_flag_cabin_location(df_test)

In [None]:
# Preview the training table with the Cabin_loc1..Cabin_loc7 flags.
df_train.head(3)

In [None]:
# Preview the test table with the Cabin_loc1..Cabin_loc7 flags.
df_test.head(3)

In [None]:
# Luc_exp: total spend per passenger across the five spending columns,
# accumulated left to right in the listed order.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (df_train, df_test):
    running_total = frame[spend_columns[0]]
    for spend_column in spend_columns[1:]:
        running_total = running_total + frame[spend_column]
    frame['Luc_exp'] = running_total

## one-hot encoding 

In [None]:
# 1. One-hot encode the categorical columns.
category_list = ['HomePlanet','Destination','Deck','Side']

train_hot_enc = pd.get_dummies(df_train[category_list])

# concat builds a new frame, so df_train itself is left untouched.
train_final = pd.concat([df_train, train_hot_enc], axis = 1)

# Encoding the test table on its own can produce a different set of dummy
# columns when a category value occurs in only one of the tables. Reindex to
# the training dummies: categories missing from test become all-zero columns,
# and categories unseen in training are dropped.
test_hot_enc = pd.get_dummies(df_test[category_list]).reindex(
    columns = train_hot_enc.columns, fill_value = 0
)

test_final = pd.concat([df_test, test_hot_enc], axis = 1)

In [None]:
# Preview the training table with the one-hot columns appended.
train_final.head(2)

In [None]:
# Preview the test table with the one-hot columns appended.
test_final.head(2)

In [None]:
# 3. Second drop: remove the identifier and raw categorical columns that have
# been replaced by engineered / one-hot features.
raw_columns = ['PassengerId', 'HomePlanet','Cabin', 'Destination','Group_num','Deck', 'Num', 'Side']
train_final2 = train_final.drop(columns = raw_columns)
X_test = test_final.drop(columns = raw_columns)

In [None]:
# 5. Separate the X (features) and Y (target) data.
target_column = 'Transported'
Y_train_data = train_final2[target_column].copy()
# VIP is excluded from the training features along with the target.
X_train_data = train_final2.drop([target_column, 'VIP'], axis = 1)

In [None]:
# Preview the first target values.
Y_train_data.head(3)

In [None]:
# Preview the first rows of the training feature matrix.
X_train_data.head(3)

In [None]:
# Count nulls per feature column before training.
X_train_data.isnull().sum()

In [None]:
# (rows, columns) of the training feature matrix.
X_train_data.shape

## 학습 - 모델 : Catboost

In [None]:
from catboost import CatBoostClassifier

# Training configuration: progress is logged every 5000 iterations and
# Accuracy is the reported metric.
# NOTE(review): 100000 iterations with no validation set or early stopping —
# confirm this is intentional, as it is slow and may overfit.
catboost_params = {
    'iterations': 100000,
    'eval_metric': 'Accuracy',
    'verbose': 5000,
}
model = CatBoostClassifier(**catboost_params)

# Fit on the full training feature matrix and target.
model.fit(X_train_data, Y_train_data)

In [None]:
# X_test was built without dropping VIP, while X_train_data excludes it, so the
# two frames do not share the same columns. Align X_test to the exact training
# columns and order before predicting; any training column missing from the
# test frame is added as all zeros.
X_test = X_test.reindex(columns = X_train_data.columns, fill_value = 0)

answer = model.predict(X_test)

# Write the predictions into the sample submission layout.
# Assumes sample_submission.csv lists passengers in the same order as test.csv.
submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
submission['Transported'] = answer
submission.to_csv('submission.csv', index = False)
submission.head(20)