In [None]:
import pandas as pd

In [None]:
# Directory that holds the competition CSV files.
data_path = '/kaggle/input/cat-in-the-dat-ii/'

# Read the three CSVs the same way, each indexed by its 'id' column.
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Show the .shape of the train and test frames side by side.
train.shape, test.shape

## 피처 엔지니어링 1 : 피처 맞춤 인코딩

### 데이터 합치기

In [None]:
# Stack train on top of test so the encoders below are fit on both,
# then remove the label column from the combined feature frame.
all_data = pd.concat([train, test]).drop(columns='target')
all_data

In [None]:
# Most frequent non-null value of every column, computed on the combined frame.
column_modes = {
    name: all_data[name].mode(dropna=True).iloc[0]
    for name in all_data.columns
}

# Replace the missing entries of each column with that column's mode.
all_data = all_data.fillna(value=column_modes)

결측값(NaN)은 간단하게 각 피처의 최빈값으로 대체함 (최빈값은 train과 test를 합친 `all_data` 기준으로 계산)

### 이진 피처 인코딩

In [None]:
# Letter-coded binary features and the 0.0 / 1.0 value assigned to each letter.
binary_maps = {
    'bin_3': {'F': 0.0, 'T': 1.0},
    'bin_4': {'N': 0.0, 'Y': 1.0},
}

for feature, mapping in binary_maps.items():
    all_data[feature] = all_data[feature].map(mapping)

In [None]:
# Display the frame to inspect the result of the bin_3 / bin_4 mapping.
all_data

### 순서형 피처 인코딩

- ord_0 피처는 이미 숫자로 구성돼 있어 인코딩하지 않아도 됨
- ord_1, ord_2 피처는 순서를 정해서 인코딩
- ord_3 ~ ord_5 피처는 알파벳 순서대로 인코딩

In [None]:
# Levels listed from lowest to highest; the list position becomes the code.
ord1_levels = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord2_levels = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']

ord1dict = {level: rank for rank, level in enumerate(ord1_levels)}
ord2dict = {level: rank for rank, level in enumerate(ord2_levels)}

# Swap each label for its integer rank.
for feature, mapping in (('ord_1', ord1dict), ('ord_2', ord2dict)):
    all_data[feature] = all_data[feature].map(mapping)

In [None]:
# Display the frame to inspect the result of the ord_1 / ord_2 mapping.
all_data

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# ord_3, ord_4 and ord_5 are encoded with an OrdinalEncoder left at its defaults.
ord_345 = [f'ord_{i}' for i in (3, 4, 5)]

ord_encoder = OrdinalEncoder()
ord_encoder.fit(all_data[ord_345])
all_data[ord_345] = ord_encoder.transform(all_data[ord_345])

# Print the category order the encoder learned for each feature.
for feature, categories in zip(ord_345, ord_encoder.categories_):
    print(feature, categories, sep='\n')

In [None]:
# Display the frame to inspect the result of the ord_3 .. ord_5 encoding.
all_data

### 명목형 피처 인코딩

In [None]:
# Column names of the ten nominal features: nom_0 .. nom_9.
nom_features = [f'nom_{i}' for i in range(10)]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the nominal columns; the result is kept as a separate matrix
# and is not written back into all_data.
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(all_data[nom_features])
encoded_nom_matrix = onehot_encoder.transform(all_data[nom_features])

encoded_nom_matrix

In [None]:
# The raw nominal columns are no longer needed in all_data once encoded.
all_data = all_data.drop(columns=nom_features)

### 날짜 피처 인코딩

In [None]:
import numpy as np

In [None]:
# Preview: 8 evenly spaced values from 0 to pi/2 (both endpoints included).
np.linspace(0, np.pi / 2, 8)

In [None]:
date_features = ['day', 'month']

# For each date feature, build a lookup {position: sin(angle)} over
# (number of distinct train values + 1) evenly spaced angles in [0, pi/2],
# then replace the column values through that lookup.
date_dicts = {}
for feature in date_features:
    angles = np.linspace(0, np.pi / 2, train[feature].nunique() + 1)
    date_dicts[feature] = {i: np.sin(k) for i, k in enumerate(angles)}
    all_data[feature] = all_data[feature].map(date_dicts[feature])

# Keep the individual lookups available under their usual names.
day_dict = date_dicts['day']
month_dict = date_dicts['month']
all_data

## 피처 엔지니어링 2 : 피처 스케일링

### 순서형 피처 스케일링

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# All six ordinal columns, ord_0 .. ord_5.
ord_features = [f'ord_{i}' for i in range(6)]

# Rescale them with a MinMaxScaler left at its default settings.
ord_scaler = MinMaxScaler()
all_data[ord_features] = ord_scaler.fit_transform(all_data[ord_features])

### 인코딩 및 스케일링된 피처 합치기

In [None]:
from scipy import sparse

In [None]:
# Convert the remaining all_data columns to CSR, then append the one-hot
# nominal matrix column-wise; the combined matrix is also CSR.
encoded_rest_matrix = sparse.csr_matrix(all_data)
all_data_sprs = sparse.hstack([encoded_rest_matrix, encoded_nom_matrix], format='csr')

In [None]:
# Display the repr of the combined sparse matrix.
all_data_sprs

### 데이터 나누기

In [None]:
# Rows were stacked as train followed by test, so the first num_train rows
# are the training rows and the remainder are the test rows.
num_train = train.shape[0]

X_train, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Labels come from the original train frame.
y = train['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the labelled rows for validation, stratified on the target.
# The training rows are read straight from all_data_sprs rather than from the
# X_train variable: this cell rebinds X_train, so feeding X_train back in would
# fail on a second run (already-reduced X_train against the full-length y).
X_train, X_valid, y_train, y_valid = train_test_split(
    all_data_sprs[:len(train)], y,
    test_size=0.1, stratify=y, random_state=42,
)

## 하이퍼파라미터 최적화

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [None]:
%%time

logistic_model = LogisticRegression()

lr_params = {'C': [0.1, 0.125, 0.2], 'max_iter': [800, 900, 1000],
             'solver': ['liblinear'], 'random_state': [42]}

gridsearch_logistic_model = GridSearchCV(estimator=logistic_model,
                                         param_grid=lr_params, 
                                         scoring='roc_auc',
                                         cv=5)

gridsearch_logistic_model.fit(X_train, y_train)

print('최적 하이퍼파라미터 :', gridsearch_logistic_model.best_params_)

## 모델 성능 검증

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Score the held-out validation rows with the best estimator from the search.
best_model = gridsearch_logistic_model.best_estimator_

# Column 1 of predict_proba is used as the score passed to roc_auc_score.
y_valid_preds = best_model.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, y_valid_preds)

print(f'검증 데이터 ROC AUC : {roc_auc:.4f}')

## 예측 및 결과 제출

In [None]:
# Predict the test rows with the best estimator found by the grid search.
final_model = gridsearch_logistic_model.best_estimator_
y_preds = final_model.predict_proba(X_test)[:, 1]

# Write the scores into the submission frame and save it as CSV.
# NOTE(review): the assignment is positional, so it presumably relies on
# sample_submission.csv listing ids in the same order as test.csv - confirm.
submission['target'] = y_preds
submission.to_csv('submission.csv')