# 자동으로 데이터 전처리 및 모델 학습을 위한 pycaret 설치하기

- [pycaret](https://pycaret.org/)은 low-code로 machine learning을 구현할 수 있도록 도와줍니다

- pycaret을 사용하기 위해 먼저 설치를 진행해줍니다

In [None]:
# Install PyCaret into the notebook runtime (IPython shell escape that runs pip).
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.2.0-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.7/484.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting kaleido>=0.2.1 (from pycaret)
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib<=3.6,>=3.3.0 (from pycaret)
  Downloading matplotlib-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11

In [None]:
from pycaret.classification import setup, compare_models, plot_model, predict_model

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# 암호화폐 데이터 가져오기


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# by path. `google.colab` is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the KRW-BTC price CSV from the mounted Drive into `df`.
df = pd.read_csv("/content/drive/MyDrive/coin_data/days/KRW-BTC.csv")

# 오늘 대비 내일의 종가 변화율 구하기
- 기존에 존재하던 `change_rate` 열의 의미는 전날 대비 오늘의 종가 변화율입니다
- 이 `change_rate` 열을 하나씩 앞으로 당겨주면 오늘 대비 내일의 종가 변화율을 의미하게 됩니다
- `shift(-1)`을 이용하여 앞으로 하나씩 당겨줄 수 있습니다
- 그럼 가장 마지막 행의 `change_rate` 값은 비게(NaN) 되니 해당 행은 제거해주도록 합니다

In [None]:
# Move every change_rate value up one row so that each row carries the
# change rate of the FOLLOWING row (the value to be predicted).
df['change_rate'] = df["change_rate"].shift(-1)
# The last row has no following row (NaN after the shift), so drop it.
# `.copy()` makes `df` an independent frame instead of a slice of the
# original, so later column assignments do not raise SettingWithCopyWarning.
df = df.iloc[:-1].copy()
df

Unnamed: 0,market,datetime,open,high,low,close,trade_price,trade_volume,change_rate
0,KRW-BTC,2017-09-25T09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,5.602146e+08,132.484755,-0.000231
1,KRW-BTC,2017-09-26T09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,9.950724e+07,22.788340,0.077760
2,KRW-BTC,2017-09-27T09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,1.448276e+08,32.269662,-0.015246
3,KRW-BTC,2017-09-28T09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,3.721860e+08,80.588243,0.015482
4,KRW-BTC,2017-09-29T09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,2.724558e+08,59.352373,0.051106
...,...,...,...,...,...,...,...,...,...
2304,KRW-BTC,2024-01-16T09:00:00,58416000.0,59577000.0,58169000.0,59196000.0,2.208476e+11,3748.967419,-0.007095
2305,KRW-BTC,2024-01-17T09:00:00,59218000.0,59300000.0,58269000.0,58776000.0,1.666847e+11,2837.334490,-0.025385
2306,KRW-BTC,2024-01-18T09:00:00,58776000.0,58900000.0,56754000.0,57284000.0,2.472746e+11,4268.883840,0.007943
2307,KRW-BTC,2024-01-19T09:00:00,57281000.0,58180000.0,55935000.0,57739000.0,3.199353e+11,5634.881771,-0.003845


# change_rate를 기반으로 한 급 상승 및 급 하락 라벨링하기

In [None]:
def assign_bull(x, threshold=0.10):
    """Label a change rate as a sharp rise.

    Args:
        x: Fractional change rate (e.g. 0.05 for +5%).
        threshold: Value that ``x`` must strictly exceed to count as a
            sharp rise. Defaults to 0.10.

    Returns:
        1 if ``x`` is strictly greater than ``threshold``, otherwise 0.
        NaN compares as False and is therefore labeled 0.
    """
    return 1 if x > threshold else 0

def assign_bear(x, threshold=-0.10):
    """Label a change rate as a sharp drop.

    Args:
        x: Fractional change rate (e.g. -0.05 for -5%).
        threshold: Value that ``x`` must be strictly below to count as a
            sharp drop. Defaults to -0.10.

    Returns:
        1 if ``x`` is strictly less than ``threshold``, otherwise 0.
        NaN compares as False and is therefore labeled 0.
    """
    return 1 if x < threshold else 0

# Derive the two binary targets from the same change_rate column:
# 1 marks a sharp rise (bull) or a sharp drop (bear), 0 anything else.
df['bull'] = df['change_rate'].map(assign_bull)
df['bear'] = df['change_rate'].map(assign_bear)

# 급 상승 및 급 하락 일 수 확인해보기

In [None]:
# The labels are 0/1, so the column sums are the number of rows flagged
# as sharp rises and sharp drops.
df[["bull", "bear"]].sum()

bull    50
bear    24
dtype: int64

# 보조지표 추가하기

In [None]:
def get_ma_7(df, column):
    """Return the 7-row trailing moving average of ``df[column]``.

    The first six entries of the result are NaN because a full window of
    seven rows is not yet available there.
    """
    return df[column].rolling(7).mean()

In [None]:
# 7-row moving average of the closing price.
df["close_ma_7"] = get_ma_7(df, "close")
# `change_rate` was shifted so each row holds the NEXT row's change, which is
# also what the bull/bear labels are derived from. Averaging it as-is would
# leak the label into the features, so shift it back by one row first: each
# average then only covers changes already realised at that row.
# Side effect: the first valid value appears one row later than close_ma_7's.
# NOTE(review): one NaN row therefore survives a later `.iloc[6:]`; PyCaret's
# default numeric imputation is expected to fill it - confirm, or drop it.
df["change_rate_ma_7"] = get_ma_7(df[["change_rate"]].shift(1), "change_rate")

# 인덱스, 문제 데이터, 정답 데이터 나눠주기

In [None]:
# The first two columns identify a row and are kept aside as an index frame.
index_df = df.iloc[:, :2]

# Features: everything except the identifier columns, the raw target value
# and the two labels derived from it.
non_feature_columns = [*index_df.columns, "change_rate", "bull", "bear"]
X_df = df.drop(columns=non_feature_columns)

# Targets, each kept as a one-column DataFrame.
Y_bull_df = df[["bull"]]
Y_bear_df = df[["bear"]]

In [None]:
# Apply log(1 + x) in place to every feature column except the last one.
# The last column is change_rate_ma_7 (the most recently added feature).
# NOTE(review): presumably skipped because it can be negative - confirm.
X_df.iloc[:, :-1] = np.log1p(X_df.iloc[:, :-1])

In [None]:
# Join the features with each target column-wise, then skip the first six
# rows: a 7-row rolling window has no value there, so close_ma_7 is NaN.
XY_bull_df = pd.concat([X_df, Y_bull_df], axis=1).iloc[6:]
XY_bear_df = pd.concat([X_df, Y_bear_df], axis=1).iloc[6:]

# 급 상승 예측 여러 모델의 성능 비교


In [None]:
test_size = 30


def get_model(df, target, test_size):
    """Compare PyCaret classifiers on all rows except the trailing hold-out.

    Args:
        df: Frame containing the feature columns and the target column.
        target: Name of the target column in ``df``.
        test_size: Number of trailing rows to exclude from training so they
            can be used for prediction afterwards.

    Returns:
        The result of ``compare_models(sort="F1")``, i.e. the candidate
        ranked best by F1.
    """
    # `df.iloc[:-0]` would select nothing, so treat 0 as "keep every row".
    train_df = df.iloc[:-test_size] if test_size else df
    setup(
        train_df,
        target=target,
        train_size=0.9,
        # `fix_imbalance_method` is ignored unless `fix_imbalance` is True,
        # so the resampling has to be switched on explicitly.
        fix_imbalance=True,
        fix_imbalance_method="SMOTE",
        verbose=False,
    )
    return compare_models(sort="F1")


In [None]:
# Compare classifiers for the sharp-rise ("bull") target and keep the model
# returned by get_model.
bull_model = get_model(XY_bull_df, "bull", test_size)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9809,0.9772,0.3833,0.3533,0.3367,0.3278,0.3427,0.032
qda,Quadratic Discriminant Analysis,0.9702,0.9131,0.3833,0.1393,0.2003,0.1879,0.2159,0.054
ada,Ada Boost Classifier,0.9858,0.6868,0.1167,0.22,0.1286,0.1241,0.1426,0.282
nb,Naive Bayes,0.9692,0.7551,0.2,0.0911,0.1178,0.1049,0.1168,0.033
dt,Decision Tree Classifier,0.979,0.5365,0.0833,0.0533,0.0619,0.0522,0.0549,0.04
gbc,Gradient Boosting Classifier,0.9829,0.8522,0.05,0.0333,0.04,0.0333,0.0339,0.522
lr,Logistic Regression,0.9883,0.7576,0.0,0.0,0.0,0.0,0.0,0.045
knn,K Neighbors Classifier,0.9883,0.5226,0.0,0.0,0.0,0.0,0.0,0.049
svm,SVM - Linear Kernel,0.9883,0.0,0.0,0.0,0.0,0.0,0.0,0.032
ridge,Ridge Classifier,0.9883,0.0,0.0,0.0,0.0,0.0,0.0,0.031


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

# 실제 급 상승 여부와 예측값 비교해보기

In [None]:
# Score the trailing `test_size` rows (excluded from training) with the bull
# model, so that `predictions` is defined before it is displayed and refers
# to the bull target rather than a stale value.
predictions = predict_model(bull_model, data=XY_bull_df[-test_size:])
predictions

Unnamed: 0,open,high,low,close,trade_price,trade_volume,close_ma_7,change_rate_ma_7,bull,prediction_label,prediction_score
2279,17.901134,17.916815,17.89464,17.902025,26.378387,8.474575,17.883272,0.003334,0,0,1.0
2280,17.902025,17.904842,17.893047,17.897276,25.657301,7.75838,17.886482,0.003346,0,0,1.0
2281,17.897259,17.903166,17.875954,17.875988,26.11697,8.224253,17.889626,-0.000133,0,0,1.0
2282,17.875988,17.887251,17.866669,17.876677,26.329557,8.452806,17.889412,-0.001918,0,0,1.0
2283,17.876677,17.878365,17.847075,17.859123,26.32473,8.462996,17.887434,-0.003413,0,0,1.0
2284,17.859123,17.879602,17.847075,17.874573,26.148258,8.285726,17.883944,-0.00622,0,0,1.0
2285,17.874573,17.883989,17.853285,17.85693,26.148027,8.283953,17.87764,-0.007495,0,0,1.0
2286,17.856876,17.863811,17.840862,17.848866,26.212027,8.360369,17.870031,-0.00634,0,0,1.0
2287,17.848866,17.861191,17.842665,17.852209,25.591852,7.741848,17.863543,-0.002301,0,0,1.0
2288,17.852209,17.866199,17.848902,17.859386,25.602421,7.744736,17.861158,0.002088,0,0,1.0


# 하락 예측 여러 모델들의 성능 비교

In [None]:
# Compare classifiers for the sharp-drop ("bear") target and keep the model
# returned by get_model.
bear_model = get_model(XY_bear_df, "bear", test_size)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9844,0.9661,0.5167,0.4386,0.4424,0.4353,0.4523,0.032
dt,Decision Tree Classifier,0.9834,0.596,0.2,0.2833,0.2233,0.2153,0.2242,0.038
gbc,Gradient Boosting Classifier,0.9853,0.9336,0.2,0.25,0.2167,0.21,0.214,0.734
qda,Quadratic Discriminant Analysis,0.978,0.8766,0.3333,0.1708,0.2133,0.2038,0.2212,0.03
nb,Naive Bayes,0.9746,0.883,0.3,0.1652,0.203,0.1927,0.2059,0.028
ada,Ada Boost Classifier,0.9868,0.725,0.15,0.25,0.1833,0.1787,0.1868,0.4
xgboost,Extreme Gradient Boosting,0.9878,0.9315,0.05,0.1,0.0667,0.0641,0.068,0.151
svm,SVM - Linear Kernel,0.9824,0.0,0.05,0.0062,0.0111,0.0095,0.0156,0.029
lr,Logistic Regression,0.9892,0.8059,0.0,0.0,0.0,0.0,0.0,0.047
knn,K Neighbors Classifier,0.9892,0.5815,0.0,0.0,0.0,0.0,0.0,0.051


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

# 학습된 하락 예측 모델로 예측해보기

In [None]:
# Predict on the trailing `test_size` rows, which get_model kept out of training.
holdout_bear_df = XY_bear_df.iloc[-test_size:]
predictions = predict_model(bear_model, data=holdout_bear_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,1.0,0,0.0,0.0,0.0,,0.0


# 실제 하락 여부와 예측값 비교해보기

In [None]:
# Display the bear-model predictions next to the actual `bear` labels.
predictions

Unnamed: 0,open,high,low,close,trade_price,trade_volume,close_ma_7,change_rate_ma_7,bear,prediction_label,prediction_score
2279,17.901134,17.916815,17.89464,17.902025,26.378387,8.474575,17.883272,0.003334,0,0,1.0
2280,17.902025,17.904842,17.893047,17.897276,25.657301,7.75838,17.886482,0.003346,0,0,1.0
2281,17.897259,17.903166,17.875954,17.875988,26.11697,8.224253,17.889626,-0.000133,0,0,1.0
2282,17.875988,17.887251,17.866669,17.876677,26.329557,8.452806,17.889412,-0.001918,0,0,1.0
2283,17.876677,17.878365,17.847075,17.859123,26.32473,8.462996,17.887434,-0.003413,0,0,1.0
2284,17.859123,17.879602,17.847075,17.874573,26.148258,8.285726,17.883944,-0.00622,0,0,1.0
2285,17.874573,17.883989,17.853285,17.85693,26.148027,8.283953,17.87764,-0.007495,0,0,1.0
2286,17.856876,17.863811,17.840862,17.848866,26.212027,8.360369,17.870031,-0.00634,0,0,1.0
2287,17.848866,17.861191,17.842665,17.852209,25.591852,7.741848,17.863543,-0.002301,0,0,1.0
2288,17.852209,17.866199,17.848902,17.859386,25.602421,7.744736,17.861158,0.002088,0,0,1.0
