### 모델 선택 배경

- 일단 Oversampling 한 결과가 하지 않은 것보다 Accuracy에서 확연한 우위(20% 가량)를 보였다
- 조금이라도 퍼센트가 높아진 

### Import

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

from imblearn.over_sampling import SMOTE

### 데이터 준비

In [2]:
data_df = pd.read_csv('../titanic.csv')

feature_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_col = 'Cabin'

In [3]:
cabin_cap = data_df[target_col].dropna().apply(lambda x : x[0])

t_deck_index = cabin_cap[cabin_cap == 'T'].index[0]

del cabin_cap[t_deck_index]

In [4]:
deck_X = data_df.loc[list(data_df['Cabin'].dropna().index)][feature_cols]

deck_X = np.array(deck_X.drop(t_deck_index))

In [5]:
deck_label_encoder = LabelEncoder()

deck_label_encoder.fit(list(Counter(cabin_cap).keys()))

LabelEncoder()

In [6]:
class_list = list(deck_label_encoder.classes_)
list.insert(class_list, 0, '0')
class_list.append('T')
deck_label_encoder.classes_ = np.array(class_list)

In [7]:
deck_data = deck_label_encoder.transform(cabin_cap)

### Train / Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(deck_X, deck_data, test_size=0.2, random_state=2, stratify=deck_data)

### Pipeline Preprocess

In [9]:
numeric_features = ['SibSp', 'Parch', 'Fare']
numeric_transformer = RobustScaler() # StandardScaler()

categorical_features = ['Pclass', 'Embarked']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore') 

preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

preprocessor_pipe.fit(pd.DataFrame(X_train, columns=feature_cols))

X_train_transformed = preprocessor_pipe.transform(pd.DataFrame(X_train, columns=feature_cols))
X_test_transformed = preprocessor_pipe.transform(pd.DataFrame(X_test, columns=feature_cols))

### Oversampling

In [11]:
smote = SMOTE(k_neighbors=1)

X_train_over, y_train_over = smote.fit_sample(X_train_transformed, y_train)

### 모델 객체 생성 & 학습 & 테스트

In [12]:
deck_model = KNeighborsClassifier(1).fit(X_train_over, y_train_over)
deck_model = deck_model.fit(X_train_transformed, y_train)

deck_model.score(X_test_transformed, y_test)

0.6829268292682927

### 예측

In [13]:
from tqdm import tqdm

deck_list = []

for row in tqdm(data_df.itertuples()):
    if(pd.isna(row.Cabin)):
        exec(f'tmp_X_data = np.array([row.{", row.".join(feature_cols)}])')
        tmp_X_data = pd.DataFrame([tmp_X_data], columns=feature_cols)
        tmp_X_data = preprocessor_pipe.transform(tmp_X_data)
        deck_list.append(deck_model.predict(tmp_X_data))
    else:
        deck_val = deck_label_encoder.transform([row.Cabin[0]])
        deck_list.append(deck_val)
        
        
# len(deck_list)

891it [00:02, 304.49it/s]


In [14]:
deck_df = pd.DataFrame(deck_list, columns=['deck'])
deck_df

Unnamed: 0,deck
0,3
1,3
2,6
3,3
4,6
...,...
886,4
887,2
888,6
889,3


In [15]:
deck_df.to_pickle('../deck_df.pkl')