In [1]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Suppress warnings for the rest of the session; no category or module
# filter is supplied, so the 'ignore' action is registered for all of them.
warnings.filterwarnings(action='ignore')
# Remove the cap on how many DataFrame columns pandas will display.
pd.options.display.max_columns = None

### 데이터 불러오기

In [2]:
# Directory that holds the input CSV files, relative to the working directory.
INPUT_PATH = './data/'
# Build the path to the training file first, then read it into a DataFrame.
train_csv_path = os.path.join(INPUT_PATH, 'train.csv')
train = pd.read_csv(train_csv_path)

### train, test 데이터셋 나누기

In [3]:
# Separate the label column ('price_range') from the feature columns.
y = train['price_range']
X = train.drop(columns=['price_range'])

# Seeded hold-out split, stratified on the label. No size arguments are
# passed, so train_test_split applies its default proportions.
split = train_test_split(X, y, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = split

# Cell output: shapes of the four resulting pieces.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1500, 20), (1500,), (500, 20), (500,))

* RandomForest

In [4]:
# Baseline random forest with default hyperparameters and a fixed seed.
model = RandomForestClassifier(n_jobs=-1, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Accuracy on the training and hold-out splits, then the per-class report.
print('train set acc:', model.score(X_train, y_train))
print('test set acc:', model.score(X_test, y_test))
print(classification_report(y_test, pred))

# 5-fold cross-validated ROC AUC on the full X, y, printed for the
# one-vs-one scorer first and the one-vs-rest scorer second.
for scoring in ('roc_auc_ovo', 'roc_auc_ovr'):
    score = cross_val_score(model, X, y, cv=5, scoring=scoring)
    print(score.mean())

train set acc: 1.0
test set acc: 0.88
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       125
           1       0.82      0.86      0.84       125
           2       0.84      0.82      0.83       125
           3       0.95      0.90      0.92       125

    accuracy                           0.88       500
   macro avg       0.88      0.88      0.88       500
weighted avg       0.88      0.88      0.88       500

0.9805316666666666
0.9805316666666666


* GradientBoosting

In [5]:
# Baseline gradient boosting classifier with default hyperparameters and a fixed seed.
model = GradientBoostingClassifier(random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Accuracy on the training and hold-out splits, then the per-class report.
print('train set acc:', model.score(X_train, y_train))
print('test set acc:', model.score(X_test, y_test))
print(classification_report(y_test, pred))

# 5-fold cross-validated ROC AUC on the full X, y, printed for the
# one-vs-one scorer first and the one-vs-rest scorer second.
for scoring in ('roc_auc_ovo', 'roc_auc_ovr'):
    score = cross_val_score(model, X, y, cv=5, scoring=scoring)
    print(score.mean())

train set acc: 1.0
test set acc: 0.912
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       125
           1       0.87      0.94      0.90       125
           2       0.86      0.88      0.87       125
           3       0.97      0.89      0.92       125

    accuracy                           0.91       500
   macro avg       0.91      0.91      0.91       500
weighted avg       0.91      0.91      0.91       500

0.986645
0.986645


* Logistic Regression
    * MinMaxScaler로 데이터 전처리

In [6]:
# Min-max scale the features: the scaler is fitted on the training split
# only and then applied unchanged to the hold-out split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression with default hyperparameters on the scaled features.
model = LogisticRegression(n_jobs=-1)
model.fit(X_train_scaled, y_train)
pred = model.predict(X_test_scaled)

# Accuracy on the training and hold-out splits, then the per-class report.
print('train set acc:', model.score(X_train_scaled, y_train))
print('test set acc:', model.score(X_test_scaled, y_test))
print(classification_report(y_test, pred))

# Chain scaler and model so cross-validation receives the raw X and the
# scaling step is part of the estimator being evaluated.
steps = [('scaler', scaler), ('model', model)]
pipe = Pipeline(steps)

# 5-fold cross-validated ROC AUC on the full X, y, printed for the
# one-vs-one scorer first and the one-vs-rest scorer second.
for scoring in ('roc_auc_ovo', 'roc_auc_ovr'):
    score = cross_val_score(pipe, X, y, cv=5, scoring=scoring)
    print(score.mean())

train set acc: 0.9406666666666667
test set acc: 0.928
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       125
           1       0.89      0.89      0.89       125
           2       0.92      0.89      0.90       125
           3       0.96      0.98      0.97       125

    accuracy                           0.93       500
   macro avg       0.93      0.93      0.93       500
weighted avg       0.93      0.93      0.93       500

0.9909116666666666
0.9909116666666666


* KNN
    * MinMaxScaler로 데이터 전처리

In [7]:
# Min-max scale the features: the scaler is fitted on the training split
# only and then applied unchanged to the hold-out split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# k-nearest-neighbours classifier with default hyperparameters on the scaled features.
model = KNeighborsClassifier(n_jobs=-1)
model.fit(X_train_scaled, y_train)
pred = model.predict(X_test_scaled)

# Accuracy on the training and hold-out splits, then the per-class report.
print('train set acc:', model.score(X_train_scaled, y_train))
print('test set acc:', model.score(X_test_scaled, y_test))
print(classification_report(y_test, pred))

# Chain scaler and model so cross-validation receives the raw X and the
# scaling step is part of the estimator being evaluated.
steps = [('scaler', scaler), ('model', model)]
pipe = Pipeline(steps)

# 5-fold cross-validated ROC AUC on the full X, y, printed for the
# one-vs-one scorer first and the one-vs-rest scorer second.
for scoring in ('roc_auc_ovo', 'roc_auc_ovr'):
    score = cross_val_score(pipe, X, y, cv=5, scoring=scoring)
    print(score.mean())



train set acc: 0.6293333333333333
test set acc: 0.412
              precision    recall  f1-score   support

           0       0.50      0.63      0.56       125
           1       0.28      0.33      0.30       125
           2       0.32      0.30      0.31       125
           3       0.61      0.39      0.48       125

    accuracy                           0.41       500
   macro avg       0.43      0.41      0.41       500
weighted avg       0.43      0.41      0.41       500

0.6630533333333333
0.6630533333333333


* 교차 검증 ROC AUC(0.9909)와 test set 정확도(0.928)가 모두 가장 높은 Logistic Regression 모델을 선택.

##### GridSearchCV
* solver
    * lbfgs
    * sag
* C
    * 0.01, 0.1, 1, 10, 100
* penalty
    * l2
    * none

In [8]:
# Seeded, label-stratified hold-out split; the grid search only sees the
# train+validation part, the test part is kept for the final check.
split = train_test_split(X, y, stratify=y, random_state=0)
X_trainval, X_test, y_trainval, y_test = split

# Scaling followed by logistic regression, tuned as a single pipeline.
scaler = MinMaxScaler()
model = LogisticRegression(n_jobs=-1)
pipe = Pipeline(steps=[('scaler', scaler), ('model', model)])

# Entries common to both sub-grids: the solvers to try and a fixed seed.
shared_grid = {
    'model__solver': ['lbfgs', 'sag'],
    'model__random_state': [0],
}
# Sub-grid 1: L2 penalty over several values of C.
# Sub-grid 2: no penalty, so C is not searched.
# NOTE(review): newer scikit-learn releases expect `None` rather than the
# string 'none' for an unpenalized model — confirm against the installed version.
param_grid = [
    {**shared_grid, 'model__C': [0.01, 0.1, 1, 10, 100], 'model__penalty': ['l2']},
    {**shared_grid, 'model__penalty': ['none']},
]

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# pipeline's own score method, while the rest of the notebook compares
# models by ROC AUC — confirm this is intended.
gridcv = GridSearchCV(pipe, param_grid=param_grid, cv=5)
gridcv.fit(X_trainval, y_trainval)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('model',
                                        LogisticRegression(n_jobs=-1))]),
             param_grid=[{'model__C': [0.01, 0.1, 1, 10, 100],
                          'model__penalty': ['l2'], 'model__random_state': [0],
                          'model__solver': ['lbfgs', 'sag']},
                         {'model__penalty': ['none'],
                          'model__random_state': [0],
                          'model__solver': ['lbfgs', 'sag']}])

* Best parameter
    * penalty: none
    * solver: sag

In [9]:
# Show the hyperparameter combination the grid search selected.
best_params = gridcv.best_params_
print('GridSearchCV best parameter:', best_params)

GridSearchCV best parameter: {'model__penalty': 'none', 'model__random_state': 0, 'model__solver': 'sag'}


In [10]:
# Class-probability predictions from the fitted grid search on the hold-out
# features, scored as one-vs-rest multiclass ROC AUC against the true labels.
pred = gridcv.predict_proba(X_test)
score = roc_auc_score(y_true=y_test, y_score=pred, multi_class='ovr')
message = f'ROC AUC score: {score}'
print(message)

ROC AUC score: 0.9984266666666667


##### 최종모델

In [11]:
# Final pipeline: min-max scaling followed by logistic regression configured
# with the hyperparameters printed by the grid search (no penalty, 'sag'
# solver, seed 0).
scaler = MinMaxScaler()
model = LogisticRegression(
    penalty='none',
    solver='sag',
    random_state=0,
    n_jobs=-1,
)
pipe = Pipeline(steps=[('scaler', scaler), ('model', model)])

# Mean 5-fold cross-validated one-vs-rest ROC AUC over the full X, y.
score = cross_val_score(pipe, X, y, scoring='roc_auc_ovr', cv=5)
print(score.mean())

0.9987766666666665
