# 0. Import Libraries


In [None]:
# Install the yfinance package from within the notebook (IPython shell escape).
!pip install yfinance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

import yfinance as yf
from pandas_datareader import data as pdr
# Point pandas' internal `pd.core.common.is_list_like` at the public
# `pd.api.types.is_list_like`.
# NOTE(review): presumably a compatibility shim for pandas_datareader — confirm
# it is still required with the installed versions.
pd.core.common.is_list_like = pd.api.types.is_list_like
# NOTE(review): presumably makes pdr.get_data_yahoo() fetch through yfinance.
# Newer yfinance releases reportedly deprecate/remove pdr_override() — verify
# against the installed version.
yf.pdr_override()

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 1. Get Data

### 1-1. Get data from Yahoo Finance

In [None]:
# Fetch SPY price history for the requested date window and display it.
SPY_df = pdr.get_data_yahoo('SPY', start='2012-01-01', end='2021-07-31')
SPY_df

In [None]:
# Check for missing values (info() lists the non-null count of every column).
SPY_df.info()

In [None]:
# Show summary statistics of the downloaded columns.
SPY_df.describe()

### 1-2. Feature & Target data

In [None]:
# Work on a copy so the feature columns added below do not modify SPY_df.
tmp_df = SPY_df.copy()
print(tmp_df.shape)
# Preview the first rows.
tmp_df.head()

In [None]:
# Keep only the OHLC price columns. They are selected by name rather than by
# position so the result does not depend on the column order of the download.
# .copy() makes tmp_df an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
tmp_df = tmp_df[['Open', 'High', 'Low', 'Close']].copy()

In [None]:
# Feature 'OC': open minus close (negative when the close is above the open).
tmp_df['OC'] = tmp_df['Open'].sub(tmp_df['Close'])

In [None]:
# Feature 'HL': high minus low.
tmp_df['HL'] = tmp_df['High'].sub(tmp_df['Low'])

In [None]:
# Feature engineering: the model inputs are the two derived columns.
x_val = tmp_df.loc[:, ['OC', 'HL']]
x_val

In [None]:
# Boolean Series: True where the following row's Close is above the following
# row's Open (shift(-1) moves each value up by one row).
tmp_df['Close'].shift(-1) > tmp_df['Open'].shift(-1)

In [None]:
# np.where(condition, value_if_true, value_if_false)
# Target: +1 when the following row closes above its open, otherwise -1.
# NOTE: the last row has no following row, so both shifted values are NaN,
# the comparison is False, and that row is labelled -1.
next_close = tmp_df['Close'].shift(-1)
next_open = tmp_df['Open'].shift(-1)
y_val = np.where(next_close > next_open, 1, -1)
y_val

### 1-3. Visualize Feature Data

In [None]:
# Plot limits: the observed range of each feature padded by 0.5 on both sides.
x_min = x_val['OC'].min() - 0.5
x_max = x_val['OC'].max() + 0.5
y_min = x_val['HL'].min() - 0.5
y_max = x_val['HL'].max() + 0.5

In [None]:
# Scatter the two features against each other, coloured by the target label.
fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(x_val['OC'], x_val['HL'], c=y_val, cmap='cool')
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)

ax.set_title('Features Distribution', fontsize=15)
ax.set_xlabel('Open-Close', fontsize=15)
ax.set_ylabel('High-Low', fontsize=15)
ax.grid()
plt.show()

### 1-4. Standardization

#### 1-4-1. 표준화(Standardization)
* 평균 == 0 & 표준편차 == 1



In [None]:
# Create the scaler used to standardize the feature columns.
standardizer = StandardScaler()

In [None]:
# Fit the scaler on every row of x_val and transform those same rows, so the
# scaling statistics are computed over the whole date range.
x_val_standardized = standardizer.fit_transform(x_val)
x_val_standardized

#### 1-4-2. 정규화(Normalization)
* 모든 데이터를 0~1 범위로 스케일링 (Min-Max Scaling)




In [None]:
# Sanity check on the scaled features: the overall mean should round to 0 and
# the overall standard deviation to 1.
overall_mean = x_val_standardized.mean()
overall_std = x_val_standardized.std()
print(round(overall_mean))
print(round(overall_std))

# 2. Split into Train & Test data

In [None]:
# Split the RAW features first. shuffle=False keeps the original row order, so
# the last 30% of the rows form the test set.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_val, y_val,
    test_size=0.3, random_state=42, shuffle=False
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Fitting on the full dataset before splitting would leak the test
# rows' mean/variance into the training features.
split_scaler = StandardScaler()
x_train = split_scaler.fit_transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)

# 3. Train Model Object

### 3-1. Training

In [None]:
# Sweep n_neighbors from 1 to 14 and record train/test accuracy for each value.
train_acc, test_acc = [], []

for k in tqdm(range(1, 15)):
    clf = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    clf.fit(x_train, y_train)

    # Accuracy on the data the model was fit on.
    train_acc.append(accuracy_score(y_train, clf.predict(x_train)))

    # Accuracy on the held-out rows.
    pred = clf.predict(x_test)
    test_acc.append(accuracy_score(y_test, pred))

In [None]:
# Plot train and test accuracy against the number of neighbours.
neighbor_counts = range(1, 15)
fig, ax = plt.subplots(figsize=(10, 8))

ax.plot(neighbor_counts, train_acc, label='Train')
ax.plot(neighbor_counts, test_acc, label='Test')
ax.set_xlabel('n_neighbors')
ax.set_ylabel('Accuracy')
ax.set_xticks(range(0, 15))
ax.legend()
ax.grid()
plt.show()


### 3-2. Model Performance

In [None]:
# Fit a 5-nearest-neighbours classifier and print its accuracy on the test rows.
clf = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
clf.fit(x_train, y_train)
test_pred = clf.predict(x_test)
print(accuracy_score(y_test, test_pred))