In [1]:
! pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0


In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from math import sqrt
import os
import missingno as msno
import calendar
import datetime
from math import sqrt

#시각화
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

#preprocessing
from sklearn.metrics import mean_squared_error, mean_squared_log_error, make_scorer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.linear_model import Lasso,Ridge
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

#models
from sklearn.neighbors import KNeighborsRegressor

#최적화
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

# #Xai
# from functools import partial 
# import shap

In [3]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used later in this
# notebook point inside this mount.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### 데이터 준비

In [4]:
# Locations of the train/test CSV files on the mounted Drive.
test_path = "/content/drive/MyDrive/함께하조/기계학습과 딥러닝/data/kaggle_data/test_eda.csv"
train_path = "/content/drive/MyDrive/함께하조/기계학습과 딥러닝/data/kaggle_data/train_eda.csv"

# Read each file and discard its "Unnamed: 0" column.
test = pd.read_csv(test_path).drop(columns=["Unnamed: 0"])
train = pd.read_csv(train_path).drop(columns=["Unnamed: 0"])

In [5]:
# Categorical features: cast to the pandas "category" dtype in both frames.
cat_col = ["season", "Year","weather", "Day of week","Month","Day_info", "Hour"]
for col in cat_col:
    for frame in (train, test):
        frame[col] = frame[col].astype("category")

# Train / validation split on the "Day" column: rows with Day in 1..14 are used for
# fitting, every other row of `train` is held out.
days = list(range(1, 15))
is_fit_day = train["Day"].isin(days)
train_d = train.loc[is_fit_day]
test_d = train.loc[~is_fit_day]

# Target column and the columns excluded from the feature matrix.
target_col = "count"
drop_cols = ["datetime", "workingday", "holiday", "Day", "Year", "sin_hour", "cos_hour", target_col]

# Features / target for the fitting split, re-indexed from 0.
x_train = train_d.drop(columns=drop_cols).reset_index(drop=True)
y_train = train_d[target_col].reset_index(drop=True)

# Features / target for the held-out split (original index kept).
x_test = test_d.drop(columns=drop_cols)
y_test = test_d[target_col]

x_train.head()

Unnamed: 0,season,weather,temp,humidity,windspeed,Day of week,Month,Hour,Day_info
0,1,Good,9.84,81,0.0,Saturday,1,0,Weekend
1,1,Good,9.02,80,0.0,Saturday,1,1,Weekend
2,1,Good,9.02,80,0.0,Saturday,1,2,Weekend
3,1,Good,9.84,75,0.0,Saturday,1,3,Weekend
4,1,Good,9.84,75,0.0,Saturday,1,4,Weekend


In [6]:
# Columns to one-hot encode ("Year" is no longer listed here).
cat_col = ["season", "weather", "Day of week", "Month","Day_info", "Hour"]

# Encode the full frames and the split feature matrices with the same column list.
test, train = (pd.get_dummies(frame, columns=cat_col) for frame in (test, train))
x_test, x_train = (pd.get_dummies(frame, columns=cat_col) for frame in (x_test, x_train))

In [7]:
# (rows, columns) of the training feature matrix after one-hot encoding.
x_train.shape

(8026, 56)

In [8]:
def objective(trial):
    """Optuna objective: cross-validated error of a scaler -> optional PCA -> KNN pipeline.

    Reads the module-level ``x_train`` and ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample every hyperparameter below.

    Returns
    -------
    float
        Mean over 5 shuffled folds of the mean squared log error (lower is better).
        This is MSLE, not its square root, so it is not on the same scale as an RMSLE.
    """
    # Scaler: sampled by name, then instantiated.
    scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    if scalers == "minmax":
        scaler = MinMaxScaler()
    elif scalers == "standard":
        scaler = StandardScaler()
    else:
        scaler = RobustScaler()

    # Dimensionality reduction: either PCA or nothing.
    dim_red = trial.suggest_categorical("dim_red", ["PCA", None])
    if dim_red == "PCA":
        # The number of components is only sampled when PCA is selected.
        pca_n_components = trial.suggest_int("pca_n_components", 2, x_train.shape[1])
        dimen_red_algorithm = PCA(n_components=pca_n_components)
    else:
        # 'passthrough' makes the pipeline step a no-op.
        dimen_red_algorithm = 'passthrough'

    # Model hyperparameters.
    # `step` is passed by keyword: positional use is deprecated in newer Optuna releases.
    # Odd values 1, 3, ..., 19 (the upper bound is inclusive).
    knn_n_neighbors = trial.suggest_int("knn_n_neighbors", 1, 19, step=2)
    # NOTE(review): with KNeighborsRegressor's default p=2, 'minkowski' is the same
    # distance as 'euclidean', so two of the three choices look equivalent - confirm intent.
    knn_metric = trial.suggest_categorical("knn_metric", ['euclidean', 'manhattan', 'minkowski'])
    knn_weights = trial.suggest_categorical("knn_weights", ['uniform', 'distance'])

    estimator = KNeighborsRegressor(n_neighbors=knn_n_neighbors, metric=knn_metric, weights=knn_weights)

    # Pipeline: the scaler (and PCA, if any) is re-fitted inside every CV fold.
    pipeline = make_pipeline(scaler, dimen_red_algorithm, estimator)

    # Cross-validation; the scorer returns negated MSLE, so flip the sign before returning.
    score = cross_val_score(pipeline, x_train, y_train, scoring='neg_mean_squared_log_error', cv= KFold(n_splits=5, shuffle=True, random_state=42))
    return -score.mean()

# Optuna search: seeded TPE sampler, 100 trials, minimising the value returned by `objective`.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=100)

# Report the lowest objective value and the parameters of the trial that reached it.
best_trial = study.best_trial
print("Best Score:", best_trial.value)
print("Best trial:", best_trial.params)

[32m[I 2023-03-23 11:35:28,731][0m A new study created in memory with name: no-name-fcc2861f-77c7-4585-b2e0-ba93f67d4936[0m
[32m[I 2023-03-23 11:35:29,789][0m Trial 0 finished with value: 1.5048841073537147 and parameters: {'scalers': 'standard', 'dim_red': 'PCA', 'pca_n_components': 10, 'knn_n_neighbors': 1, 'knn_metric': 'euclidean', 'knn_weights': 'distance'}. Best is trial 0 with value: 1.5048841073537147.[0m
[32m[I 2023-03-23 11:35:30,556][0m Trial 1 finished with value: 0.9771145883065767 and parameters: {'scalers': 'minmax', 'dim_red': None, 'knn_n_neighbors': 11, 'knn_metric': 'minkowski', 'knn_weights': 'distance'}. Best is trial 1 with value: 0.9771145883065767.[0m
[32m[I 2023-03-23 11:35:36,279][0m Trial 2 finished with value: 1.1584277056001926 and parameters: {'scalers': 'robust', 'dim_red': None, 'knn_n_neighbors': 11, 'knn_metric': 'manhattan', 'knn_weights': 'distance'}. Best is trial 1 with value: 0.9771145883065767.[0m
[32m[I 2023-03-23 11:35:40,870][0m 

Best Score: 0.27636372347788785
Best trial: {'scalers': 'standard', 'dim_red': None, 'knn_n_neighbors': 3, 'knn_metric': 'manhattan', 'knn_weights': 'uniform'}


In [9]:
# Print the study's best objective value and the parameters of its best trial.
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

Best Score: 0.27636372347788785
Best trial: {'scalers': 'standard', 'dim_red': None, 'knn_n_neighbors': 3, 'knn_metric': 'manhattan', 'knn_weights': 'uniform'}


In [10]:
# Visualization: optimization history of the study
optuna.visualization.plot_optimization_history(study)

In [11]:
# Hyperparameter importances
optuna.visualization.plot_param_importances(study)

In [12]:
# Parameters of the study's best trial.
study.best_trial.params

{'scalers': 'standard',
 'dim_red': None,
 'knn_n_neighbors': 3,
 'knn_metric': 'manhattan',
 'knn_weights': 'uniform'}

In [13]:
# (rows, columns) of the training feature matrix.
x_train.shape

(8026, 56)

PCA 시각화(Hour)

In [None]:
# Scaling: statistics are fitted on the training split and applied to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# PCA for visualization: keep as many components as there are features.
pca = PCA(n_components=x_train_scaled.shape[1])
pca.fit(x_train_scaled)
x_train_pca = pca.transform(x_train_scaled)

# Plot data: component labels, per-component explained variance (%) and its running total (%).
n_components = len(pca.explained_variance_ratio_)
pc_name = [f"PC{i + 1}" for i in range(n_components)]
pc_ratio = np.round(pca.explained_variance_ratio_ * 100, 2)
# np.cumsum replaces re-summing the prefix for every component.
pca_ratio_sums = (np.cumsum(pca.explained_variance_ratio_) * 100).tolist()

# Bars: variance per component; line: cumulative variance.
x_positions = list(range(n_components))
fig = go.Figure()
fig.add_trace(go.Bar(x=x_positions, y=pc_ratio))
fig.add_trace(go.Scatter(x=x_positions, y=pca_ratio_sums, name="blue", mode="lines", marker_color="darkblue"))

fig.update_layout(title=dict({"text": "Pc & Variance"}))
# Show "PC1", "PC2", ... instead of the numeric positions on the x axis.
fig.update_layout(xaxis=dict({"tickvals": x_positions,
                              "ticktext": pc_name}))
fig.update_layout(yaxis=dict({"title": "Variance(%)"}))

fig.show()

PCA 시각화(cos, sin, Hour)

In [None]:
# Scaling: min-max statistics are fitted on the training split and applied to both splits.
# NOTE(review): the section heading mentions cos/sin hour features, but drop_cols (defined
# earlier in this notebook) removes sin_hour and cos_hour from x_train - confirm which
# feature set this cell was meant to use.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# PCA for visualization: first 12 components only.
pca = PCA(n_components=12)
pca.fit(x_train_scaled)
x_train_pca = pca.transform(x_train_scaled)

# Plot data: component labels, per-component explained variance (%) and its running total (%).
n_components = len(pca.explained_variance_ratio_)
pc_name = [f"PC{i + 1}" for i in range(n_components)]
pc_ratio = np.round(pca.explained_variance_ratio_ * 100, 2)
# np.cumsum replaces re-summing the prefix for every component.
pca_ratio_sums = (np.cumsum(pca.explained_variance_ratio_) * 100).tolist()

# Bars: variance per component; line: cumulative variance.
x_positions = list(range(n_components))
fig = go.Figure()
fig.add_trace(go.Bar(x=x_positions, y=pc_ratio))
fig.add_trace(go.Scatter(x=x_positions, y=pca_ratio_sums, name="blue", mode="lines", marker_color="darkblue"))

fig.update_layout(title=dict({"text": "Pc & Variance"}))
# Show "PC1", "PC2", ... instead of the numeric positions on the x axis.
fig.update_layout(xaxis=dict({"tickvals": x_positions,
                              "ticktext": pc_name}))
fig.update_layout(yaxis=dict({"title": "Variance(%)"}))

fig.show()

### x_test 결과

In [14]:
# Parameters of the study's best trial; the cells below hard-code these values.
study.best_trial.params

{'scalers': 'standard',
 'dim_red': None,
 'knn_n_neighbors': 3,
 'knn_metric': 'manhattan',
 'knn_weights': 'uniform'}

In [15]:
# Standardize the features: the scaler learns its statistics from the training split
# and the same fitted scaler is then applied to the held-out split.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [16]:
# KNN regressor with hard-coded hyperparameters (3 neighbours, manhattan, uniform weights),
# fitted on the scaled training split.
tuned_knn_params = {"n_neighbors": 3, "metric": "manhattan", "weights": "uniform"}
op_tuned = KNeighborsRegressor(**tuned_knn_params)
op_tuned.fit(x_train_scaled, y_train)

In [17]:
# Predict the target for the scaled held-out split with the fitted KNN model.
y_test_pred = op_tuned.predict(x_test_scaled)

In [18]:
# RMSLE on the held-out split, rounded to 3 decimals.
# Computed as sqrt(MSLE) instead of passing squared=False, which newer scikit-learn
# releases deprecate and then remove; the value is the same.
test_rmsle = np.round(np.sqrt(mean_squared_log_error(y_test, y_test_pred)), 3)
print(f"RMSLE: {test_rmsle}")

RMSLE: 0.469


In [19]:
# Scatter of predicted vs. actual values on the held-out split, with the y = x line
# drawn as the "real" reference and the RMSLE shown as an annotation.
predicted_points = go.Scatter(x=y_test, y=y_test_pred, mode="markers", name="predict",
                              text=[f"RMSLE {test_rmsle}"], textposition="top center")
reference_line = go.Scatter(x=y_test, y=y_test, mode="lines", name="real")

fig = go.Figure()
fig.add_trace(predicted_points)
fig.add_trace(reference_line)
fig.add_annotation(x=10, y=800, text=f"RMSLE = {test_rmsle}", showarrow=True)

# Fixed 600x600 canvas, title and axis labels in a single layout update.
fig.update_layout(
    autosize=False,
    width=600,
    height=600,
    title={"text": "Real & Prediction"},
    xaxis={"title": "Real"},
    yaxis={"title": "Predict"},
)
fig.show()

### 제출

In [20]:
# Reload the data from disk (the frames were one-hot encoded earlier in the notebook).
test_path = "/content/drive/MyDrive/함께하조/기계학습과 딥러닝/data/kaggle_data/test_eda.csv"
train_path = "/content/drive/MyDrive/함께하조/기계학습과 딥러닝/data/kaggle_data/train_eda.csv"

test = pd.read_csv(test_path).drop(columns=["Unnamed: 0"])
train = pd.read_csv(train_path).drop(columns=["Unnamed: 0"])

# Categorical features: cast to the pandas "category" dtype in both frames.
cat_col = ["season", "Year","weather", "Day of week","Month", "Hour","Day_info"]
for col in cat_col:
    for frame in (train, test):
        frame[col] = frame[col].astype("category")

# Target column and the columns excluded from the feature matrices.
target_col = "count"
drop_cols = ["datetime", "workingday", "holiday", "Day", "Year", "sin_hour", "cos_hour", target_col]

# This time the whole training file is used for fitting (no hold-out split).
X_train = train.drop(columns=drop_cols)
Y_train = train[target_col]
# The test frame is dropped with the same list minus the target column.
X_test = test.drop(columns=[name for name in drop_cols if name != target_col])

# Sanity check: dropping columns must not change the number of rows.
print(X_train.shape[0]==train.shape[0])
print(X_test.shape[0]==test.shape[0])
X_train.head()

True
True


Unnamed: 0,season,weather,temp,humidity,windspeed,Day of week,Month,Hour,Day_info
0,1,Good,9.84,81,0.0,Saturday,1,0,Weekend
1,1,Good,9.02,80,0.0,Saturday,1,1,Weekend
2,1,Good,9.02,80,0.0,Saturday,1,2,Weekend
3,1,Good,9.84,75,0.0,Saturday,1,3,Weekend
4,1,Good,9.84,75,0.0,Saturday,1,4,Weekend


In [None]:
# One-hot encoding (Hour included).
# The feature matrices are rebuilt from `train` / `test` under their own names, so this
# cell can be re-run and does not overwrite X_train / X_test, which the "pca" submission
# cell further down expects to still be un-encoded.
cat_col = ["season", "weather", "Day of week", "Month", "Hour","Day_info"]

X_train_hour = pd.get_dummies(train.drop(columns=drop_cols), columns=cat_col)
# errors="ignore": `test` has no target column until this cell adds one below.
X_test_hour = pd.get_dummies(test.drop(columns=drop_cols, errors="ignore"), columns=cat_col)
# train and test are encoded independently; force the test matrix onto the training
# columns (missing dummies become 0, dummies unseen in training are dropped).
X_test_hour = X_test_hour.reindex(columns=X_train_hour.columns, fill_value=0)

# Scaling: statistics are fitted on the training features only.
scaler = StandardScaler()
scaler.fit(X_train_hour)
X_train_scaled = scaler.transform(X_train_hour)
X_test_scaled = scaler.transform(X_test_hour)

# Model: KNN with hard-coded hyperparameters (3 neighbours, manhattan, uniform weights).
op_tuned = KNeighborsRegressor(n_neighbors=3, metric='manhattan', weights='uniform')
op_tuned.fit(X_train_scaled, Y_train)

# Submission file: predictions are stored in test["count"] and written with datetime.
Y_test_pred = op_tuned.predict(X_test_scaled)
test["count"] = Y_test_pred
submission = test[["datetime", "count"]]

# 0.5571 (figure noted by the original author - presumably the score of this submission; unverified)
submission.to_csv("KNN_Hour.csv", index=False)

pca 파일

In [None]:
# One-hot encoding ("Hour" is deliberately left out of the encoded columns here).
cat_col = ["season", "weather", "Day of week", "Month","Day_info"] #"Hour"

# Rebuild the feature matrices from `train` / `test` rather than re-encoding X_train /
# X_test: if the previous submission cell has already one-hot encoded them, encoding
# them again raises a KeyError because the listed columns are gone.
X_train = pd.get_dummies(train.drop(columns=drop_cols), columns=cat_col)
# errors="ignore": `test` only has a target column if a submission cell already added one.
X_test = pd.get_dummies(test.drop(columns=drop_cols, errors="ignore"), columns=cat_col)
# train and test are encoded independently; force the test matrix onto the training
# columns (missing dummies become 0, dummies unseen in training are dropped).
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Scaling: min-max statistics are fitted on the training features only.
# NOTE(review): "Hour" is still a category-dtype column at this point - confirm that the
# scaler receives it as the intended numeric value.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# With PCA: project both matrices onto the first 12 components fitted on the training data.
pca = PCA(n_components=12)
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Model
# NOTE(review): these hyperparameters (minkowski, distance weights) differ from the ones
# hard-coded in the other submission cell - confirm they come from a separate search.
op_tuned = KNeighborsRegressor(n_neighbors=3, metric='minkowski', weights='distance')
op_tuned.fit(X_train_pca, Y_train)

# Submission file: predictions are stored in test["count"] and written with datetime.
Y_test_pred = op_tuned.predict(X_test_pca)
test["count"] = Y_test_pred
submission = test[["datetime", "count"]]

# 0.5571 (NOTE(review): same figure as in the Hour submission cell - possibly copied; confirm)
submission.to_csv("KNN_cos_Hour.csv", index=False)

### 모델 해석력

In [21]:
# Odd values from 1 in steps of 2. np.arange excludes the stop value, so the displayed
# grid ends at 17 and does not include 19.
np.arange(1, 19, 2)

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17])