## Machine Learning 프로젝트 수행을 위한 코드 구조화

In [186]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. 라이브러리, 데이터 불러오기

In [188]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import os
import random

# 모델들, 성능 평가
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 상관관계 분석, VIF : 다중공선성 제거
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : optuna를 사용하기 위함
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from functools import partial

# hyper-parameter tuning을 위한 라이브러리, optuna
import optuna

In [189]:
# 데이터
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/space_titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/space_titanic/test.csv')

## 2. EDA

- 데이터에서 찾아야 하는 기초적인 내용들을 확인합니다.


- class imbalance, target distribution, outlier, correlation을 확인합니다.

In [190]:
## 1. 결측치 체크
train[train.isnull().any(axis=1)] #boolean masking ## 결측치를 포함한 row : 2087개

## 2. dtype이 object인 column들 체크 (str)
cat_cols = train.columns[train.dtypes == 'object'] ##mode로
num_cols = train.columns[train.dtypes != 'object'] ##mean으로

## 3. target value 정보를 체크 ----> class imbalance 체크 (이진분류) 10:1 인 경우
train.Transported.value_counts()

True     4378
False    4315
Name: Transported, dtype: int64

In [191]:
train.Transported.value_counts()

True     4378
False    4315
Name: Transported, dtype: int64

In [192]:
train.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [193]:
# pd.pivot_table(data=train, index = 'VIP', values='Transported', aggfunc=['count','sum','mean'])
pd.pivot_table(data=train, index = ['HomePlanet','VIP'], values='Transported',aggfunc=['mean'])
# passengerId : group_num(4자리, usually family but not always)
# HomePlanet, Destination : categorical feature
# Cabin (side deck / otherwise)
# CryoSleep, VIP : Bool
# Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck : 쓴 돈
# Name : 이름
# Transported(y) : True / False

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Transported
HomePlanet,VIP,Unnamed: 2_level_2
Earth,False,0.424337
Europa,False,0.670072
Europa,True,0.48855
Mars,False,0.53418
Mars,True,0.15873


### 3. 전처리

#### 결측치 처리

In [194]:
# tmp1 = train['PassengerId'].apply(lambda x : x.split('_')[0])
# tmp2 = train['PassengerId'].apply(lambda x : x.split('_')[1])

# tmp_df = pd.DataFrame({'Group' : tmp1, 'num' : tmp2})

# def in_large_group(x):
#     if tmp_df.groupby('Group')['num'].count()[x.split('_')[0]] >= 4:
#         return 1
#     else:
#         return 0

# train['in_large_group'] = train['PassengerId'].apply(in_large_group)
# train

temp = train.PassengerId.apply(lambda x:x[:4]).value_counts()
large_group_num = temp[temp >= 4].index
train['in_large_group'] = train.PassengerId.apply(lambda x:x[:4]).isin(large_group_num) * 1
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,in_large_group
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,0


In [195]:
train[train['in_large_group'] == 1]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,in_large_group
21,0020_01,Earth,True,E/0/S,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Almary Brantuarez,False,1
22,0020_02,Earth,True,E/0/S,55 Cancri e,49.0,False,0.0,0.0,0.0,0.0,0.0,Glendy Brantuarez,False,1
23,0020_03,Earth,True,E/0/S,55 Cancri e,29.0,False,0.0,0.0,,0.0,0.0,Mollen Mcfaddennon,False,1
24,0020_04,Earth,False,E/0/S,TRAPPIST-1e,10.0,False,0.0,0.0,0.0,0.0,0.0,Breney Jacostanley,True,1
25,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,False,,0.0,0.0,0.0,0.0,Mael Brantuarez,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8647,9227_01,Earth,True,G/1498/P,TRAPPIST-1e,7.0,False,0.0,0.0,0.0,0.0,0.0,Glendy Hinglendez,False,1
8648,9227_02,Earth,True,G/1498/P,PSO J318.5-22,11.0,False,0.0,0.0,0.0,0.0,0.0,Jorgie Hinglendez,True,1
8649,9227_03,Earth,True,G/1498/P,PSO J318.5-22,1.0,False,0.0,0.0,0.0,0.0,0.0,Paulas Hinglendez,True,1
8650,9227_04,Earth,True,G/1498/P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,0.0,0.0,Robyny Hinglendez,True,1


In [196]:
# 결측치가 있는 column
# dtype이 object인 데이터들을 수치화(encoding)
## cat_cols에 대해서는 mode를 계산해서 채워주고, num_cols에 대해서는 mean을 계산해서 채워줌.
for v in cat_cols:
    train[v] = train[v].fillna(train[v].mode()[0]) # pandas의 최빈값은 여러 개 나올수 있어서 return이 Series임
for v in num_cols:
    train[v] = train[v].fillna(train[v].mean())

train.info()
# categorical feature encoding
train = pd.get_dummies(data = train, columns = ['HomePlanet', 'Destination'])
drop_cols = ['PassengerId','Cabin','Name']
train = train.drop(columns = drop_cols)
train # 8693 x 16

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     8693 non-null   object 
 1   HomePlanet      8693 non-null   object 
 2   CryoSleep       8693 non-null   bool   
 3   Cabin           8693 non-null   object 
 4   Destination     8693 non-null   object 
 5   Age             8693 non-null   float64
 6   VIP             8693 non-null   bool   
 7   RoomService     8693 non-null   float64
 8   FoodCourt       8693 non-null   float64
 9   ShoppingMall    8693 non-null   float64
 10  Spa             8693 non-null   float64
 11  VRDeck          8693 non-null   float64
 12  Name            8693 non-null   object 
 13  Transported     8693 non-null   bool   
 14  in_large_group  8693 non-null   int64  
dtypes: bool(3), float64(6), int64(1), object(5)
memory usage: 840.6+ KB


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,in_large_group,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,0,0,1,0,0,0,1
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,0,1,0,0,0,0,1
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0,0,1,0,0,0,1
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0,0,1,0,0,0,1
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,0,0,1,0,1,0,0
8689,True,18.0,False,0.0,0.0,0.0,0.0,0.0,False,0,1,0,0,0,1,0
8690,False,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,0,1,0,0,0,0,1
8691,False,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,0,0,1,0,1,0,0


### 4. 학습 데이터 분할

In [197]:
from sklearn.model_selection import train_test_split

X = train.drop(columns = 'Transported') #feature vector
y = train.Transported                   #target value

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state = 42)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(6954, 15) (6954,) (1739, 15) (1739,)


### 5. 학습 및 평가

In [None]:
model = RandomForestClassifier(n_estimators=200,
                               max_depth = 8,
                               min_samples_split=10,
                               min_samples_leaf=5,
                               max_features = 0.6,
                               random_state = 42,
                               n_jobs=-1)

In [None]:
print("\nFitting RandomForest...")
model.fit(X_train, y_train)


Fitting RandomForest...


In [None]:
evaluation_metric = accuracy_score

In [None]:
print("Prediction")
pred_train = model.predict(X_train)
pred_val = model.predict(X_val)

train_score = evaluation_metric(y_train, pred_train)
val_score = evaluation_metric(y_val, pred_val)

print("Train Score : %.4f" % train_score)
print("Validation Score : %.4f" % val_score)

Prediction
Train Score : 0.8155
Validation Score : 0.7826


### 6. Hyper-parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# 특정 parameter를 집어 넣는 상황에서 조합을 통해 만드는 것
param_grid = {
    'n_estimators':[50,100,200],
    'max_depth':[6,7,8],
    'min_samples_split':[5,10,25],
    'min_samples_leaf':[1,5,10],
    'max_features':[0.5,0.65,0.8]
} # 3 x 3 x 3 x 3 x 3 = 243개의 조합

gcv = GridSearchCV(estimator=RandomForestClassifier(random_state = 42), param_grid=param_grid, cv=5,
                  n_jobs=-1, verbose=2)

gcv.fit(X_train, y_train)
print("Best Estimator : ", gcv.best_estimator_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Estimator :  RandomForestClassifier(max_depth=8, max_features=0.5, min_samples_split=10,
                       n_estimators=200, random_state=42)


In [None]:
print("Prediction with Best Estimator")
gcv_pred_train = gcv.predict(X_train)
gcv_pred_val = gcv.predict(X_val)

gcv_train_score = evaluation_metric(y_train, gcv_pred_train)
gcv_val_score = evaluation_metric(y_val, gcv_pred_val)

print("Train ACC Score : %.4f" % gcv_train_score)
print("Validation ACC Score : %.4f" % gcv_val_score)

> optuna를 사용해봅시다 !

In [198]:
def optimizer(trial):
    # 조절할 hyper-parameter 조합을 적어줍니다. # bayesian optimazation
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth',5,10)
    min_samples_split = trial.suggest_categorical('min_samples_split', [2, 10, 25])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_float('max_features', 0.5, 0.8)

    # 원하는 모델을 지정합니다, optuna는 시간이 오래걸리기 때문에 저는 보통 RF로 일단 테스트를 해본 뒤에 LGBM을 사용합니다.
    model = RandomForestClassifier(n_estimators = n_estimators,
                                   max_depth = max_depth,
                                   min_samples_split = min_samples_split,
                                   min_samples_leaf = min_samples_leaf,
                                   max_features = max_features,
                                   random_state = 42)

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    score = evaluation_metric(y_val, preds)

    return score

In [199]:
study = optuna.create_study(direction="maximize") # 최소/최대 어느 방향의 최적값을 구할 건지.
study.optimize(optimizer, n_trials=50)

[I 2023-11-28 03:40:47,525] A new study created in memory with name: no-name-51b278d0-8bf5-4b1d-85ad-374c9b5061db
[I 2023-11-28 03:40:54,266] Trial 0 finished with value: 0.7809085681426107 and parameters: {'n_estimators': 184, 'max_depth': 8, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 0.6039400933649891}. Best is trial 0 with value: 0.7809085681426107.
[I 2023-11-28 03:40:57,543] Trial 1 finished with value: 0.78205865439908 and parameters: {'n_estimators': 114, 'max_depth': 9, 'min_samples_split': 25, 'min_samples_leaf': 3, 'max_features': 0.5711642162083779}. Best is trial 1 with value: 0.78205865439908.
[I 2023-11-28 03:41:00,432] Trial 2 finished with value: 0.780333525014376 and parameters: {'n_estimators': 172, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 0.6386383212013573}. Best is trial 1 with value: 0.78205865439908.
[I 2023-11-28 03:41:01,945] Trial 3 finished with value: 0.7791834387579069 and parameters: {'n_estimat

In [200]:
# optuna가 시도했던 모든 실험 관련 데이터
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.780909,2023-11-28 03:40:47.532287,2023-11-28 03:40:54.265746,0 days 00:00:06.733459,8,0.60394,10,10,184,COMPLETE
1,1,0.782059,2023-11-28 03:40:54.273334,2023-11-28 03:40:57.542892,0 days 00:00:03.269558,9,0.571164,3,25,114,COMPLETE
2,2,0.780334,2023-11-28 03:40:57.549389,2023-11-28 03:41:00.431615,0 days 00:00:02.882226,7,0.638638,5,10,172,COMPLETE
3,3,0.779183,2023-11-28 03:41:00.439034,2023-11-28 03:41:01.944566,0 days 00:00:01.505532,10,0.638932,2,10,59,COMPLETE
4,4,0.780909,2023-11-28 03:41:01.954742,2023-11-28 03:41:09.200678,0 days 00:00:07.245936,10,0.761205,1,10,192,COMPLETE
5,5,0.774008,2023-11-28 03:41:09.203639,2023-11-28 03:41:11.079924,0 days 00:00:01.876285,5,0.655303,8,25,174,COMPLETE
6,6,0.778033,2023-11-28 03:41:11.085529,2023-11-28 03:41:12.265997,0 days 00:00:01.180468,10,0.674857,3,10,66,COMPLETE
7,7,0.779758,2023-11-28 03:41:12.269007,2023-11-28 03:41:13.635056,0 days 00:00:01.366049,8,0.519533,10,25,131,COMPLETE
8,8,0.774008,2023-11-28 03:41:13.637582,2023-11-28 03:41:14.549606,0 days 00:00:00.912024,5,0.5349,6,2,136,COMPLETE
9,9,0.782634,2023-11-28 03:41:14.551645,2023-11-28 03:41:15.406508,0 days 00:00:00.854863,8,0.658595,1,2,86,COMPLETE


In [201]:
print("Best Score: %.4f" % study.best_value) # best score 출력
print("Best params: ", study.best_trial.params) # best score일 때의 하이퍼파라미터들

Best Score: 0.7872
Best params:  {'n_estimators': 86, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 0.5513242522853813}


In [202]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study)

In [203]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study)

In [204]:
# model Finalization
print("Validation ACC")
best_params = study.best_params
best_model = model = RandomForestClassifier(**best_params,
                                            random_state = 42)
best_model.fit(X_train, y_train)
print("Validation Score : %.3f" % evaluation_metric(y_val, best_model.predict(X_val)))

Validation ACC
Validation Score : 0.787


### 7. 테스트 및 제출 파일 생성

In [205]:
## X_test 만들기 -> traindata에 사용한 전처리 기법을 그대로 사용하기!
# X_train과 column 개수도 같고, column 순서도 같은 X_test를 만들기
X_test = test.copy()
category_cols = []
numeric_cols = []
for v in list(X_test.columns):
    if X_test[v].dtype == 'object':
        category_cols.append(v)
    else:
        numeric_cols.append(v)

temp = X_test.PassengerId.apply(lambda x:x[:4]).value_counts()
large_group_num = temp[temp >= 4].index
X_test['in_large_group'] = X_test.PassengerId.apply(lambda x:x[:4]).isin(large_group_num) * 1

for v in category_cols:
    X_test[v] = X_test[v].fillna(X_test[v].mode()[0])
for v in numeric_cols:
    X_test[v] = X_test[v].fillna(X_test[v].mean())


X_test = pd.get_dummies(data = X_test, columns = ['HomePlanet', 'Destination'])
drop_cols = ['PassengerId','Cabin','Name']
X_test = X_test.drop(columns = drop_cols)
X_test

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,in_large_group,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,True,27.000000,False,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,1
1,False,19.000000,False,0.0,9.0,0.0,2823.0,0.0,0,1,0,0,0,0,1
2,True,31.000000,False,0.0,0.0,0.0,0.0,0.0,0,0,1,0,1,0,0
3,False,38.000000,False,0.0,6652.0,0.0,181.0,585.0,0,0,1,0,0,0,1
4,False,20.000000,False,10.0,0.0,635.0,0.0,0.0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,True,34.000000,False,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,1
4273,False,42.000000,False,0.0,847.0,17.0,10.0,144.0,0,1,0,0,0,0,1
4274,True,28.658146,False,0.0,0.0,0.0,0.0,0.0,0,0,0,1,1,0,0
4275,False,28.658146,False,0.0,2680.0,0.0,0.0,523.0,0,0,1,0,0,0,1


In [206]:
# warning도 error도 안 나와야 제대로된 예측값이 나옴
preds = best_model.predict(X_test)
preds

array([ True, False,  True, ...,  True,  True,  True])

In [208]:
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/space_titanic/sample_submission.csv')
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [209]:
submission['Transported'] = preds
submission.to_csv("submission.csv", index=False)