In [None]:
import os
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, auc, f1_score, accuracy_score

from sklearn.ensemble import RandomForestClassifier

## 1. Data Preprocessing

In [None]:
# Folder that holds the input CSV (a mounted Google Drive path).
base_path = '/content/drive/MyDrive/Colab Notebooks'

# Read the ETF table, parsing the 'Dates' column as datetimes on load.
ETF_df = pd.read_csv(
    os.path.join(base_path, 'ETFs_mains.csv'),
    parse_dates=['Dates'],
)
print(ETF_df.shape)
ETF_df.head()

- GLD: Gold shares
- FXY: Japanese Yen Foreign Exchange
- T102Y: 10-Year T-Note minus 2-Year T-bill rate
- TED: LIBOR minus T-bill rate
- USO: United States Oil Fund
- UUP: US Dollar Index
- VIX: the Chicago Board Option Exchange's CBOE Volatility Index for S&P 500
- VMO: Municipal Opportunity Trust

### 1-1 Calculate MA & VMA & RSI

In [None]:
# Preview only (result is displayed, not stored): 45-row rolling mean of
# CLOSE_SPY. min_periods=45 requires a full window, so the first 44 rows are NaN.
ETF_df['CLOSE_SPY'].rolling(45, min_periods=45).mean()

In [None]:
def moving_average(df, n):
    """Return a new DataFrame with an ``MA_<n>`` column joined onto ``df``.

    The column is the rolling mean of ``CLOSE_SPY`` over ``n`` rows. A full
    window is required (``min_periods=n``), so the first ``n - 1`` rows are NaN.
    The input frame is not modified.
    """
    column_name = 'MA_' + str(n)
    rolling_mean = df['CLOSE_SPY'].rolling(n, min_periods=n).mean()
    return df.join(rolling_mean.rename(column_name))
def volume_moving_average(df, n):
    """Return a new DataFrame with a ``VMA_<n>`` column joined onto ``df``.

    The column is the rolling mean of ``VOLUME`` over ``n`` rows. A full
    window is required (``min_periods=n``), so the first ``n - 1`` rows are NaN.
    The input frame is not modified.
    """
    column_name = 'VMA_' + str(n)
    rolling_mean = df['VOLUME'].rolling(n, min_periods=n).mean()
    return df.join(rolling_mean.rename(column_name))

In [None]:
def relative_strength_index(df, n):
    """Return a new DataFrame with an ``RSI_<n>`` column joined onto ``df``.

    For every row after the first:

    * the upward move is ``HIGH[t] - HIGH[t-1]`` (rise of the high),
    * the downward move is ``LOW[t-1] - LOW[t]`` (fall of the low).

    Only the larger of the two moves is kept, and only when it is positive;
    otherwise 0 is recorded. The first row has no predecessor and records 0
    for both. Each series is then smoothed with an exponentially weighted
    mean (``span=n``, ``min_periods=n``), which puts more weight on the most
    recent rows, and the indicator is ``up / (up + down)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``HIGH`` and ``LOW`` columns and have a unique index.
    n : int
        Span (and minimum number of observations) of the exponential mean.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the ``RSI_<n>`` column. The first ``n - 1`` rows are NaN,
        as are rows where both smoothed series are 0 (0 / 0).

    Notes
    -----
    The previous implementation computed the downward move as
    ``LOW[i] - LOW[i]``, which is always 0, so the indicator could only ever
    be 1.0 or NaN. It also walked integer labels ``0..df.index[-1]``; this
    version is vectorised and aligns on whatever index ``df`` carries.

    NOTE(review): the indicator is built from HIGH/LOW moves rather than
    close-to-close changes - confirm this is the intended definition.
    """
    up_move = df['HIGH'].diff()
    down_move = -df['LOW'].diff()  # LOW[t-1] - LOW[t]

    # Comparisons against NaN are False, so the first row (and any row with
    # missing prices) falls through to 0, matching the original loop.
    up_kept = up_move.where((up_move > down_move) & (up_move > 0), 0.0)
    down_kept = down_move.where((down_move > up_move) & (down_move > 0), 0.0)

    pos_di = up_kept.ewm(span=n, min_periods=n).mean()
    neg_di = down_kept.ewm(span=n, min_periods=n).mean()

    rsi = (pos_di / (pos_di + neg_di)).rename('RSI_' + str(n))
    return df.join(rsi)




In [None]:
# Append the three technical indicators in sequence: 45-row price moving
# average, 45-row volume moving average, then the 14-span RSI column.
ETF_df = (
    ETF_df
    .pipe(moving_average, 45)
    .pipe(volume_moving_average, 45)
    .pipe(relative_strength_index, 14)
)



In [None]:
# Discard rows containing any NaN (e.g. the indicator warm-up rows), then
# index the frame by the 'Dates' column.
ETF_df = ETF_df.dropna().set_index('Dates')
print(ETF_df.shape)
ETF_df.head()

### 1-2 Calculate RoR & UP/DOWN

In [None]:
# Rate of return: row-over-row percentage change of CLOSE_SPY.
# The first row has no predecessor, so its RoR is NaN.
ETF_df['RoR'] = ETF_df['CLOSE_SPY'].pct_change()

In [None]:
# Direction label: 1 when the return is positive, 0 otherwise (a NaN return
# also maps to 0 because `NaN > 0` is False).
# 0/1 is used instead of -1/1 because recent XGBoost releases require class
# labels to be 0..n_classes-1 and reject [-1, 1] when fitting XGBClassifier.
ETF_df['target'] = ETF_df['RoR'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# Move every label up one row so each row carries the direction of the NEXT
# row; the features of a row are therefore paired with the following move.
# The last row has no successor and becomes NaN.
ETF_df['target'] = ETF_df['target'].shift(-1)

In [None]:
# Drop rows with any NaN: the first row (no RoR) and the last row (no target
# after the shift).
ETF_df = ETF_df.dropna()

In [None]:
# Number of rows per label value, to check class balance.
ETF_df['target'].value_counts()

In [None]:
# Share of rows labelled 1, printed as a percentage with two decimals.
is_up = ETF_df['target'] == 1
up_count = ETF_df.loc[is_up, 'target'].count()
total_count = ETF_df['target'].count()
print('{0:.2f}%'.format(up_count/total_count*100))

In [None]:
# Cast the label column to 64-bit integers.
ETF_df['target'] = ETF_df['target'].astype(np.int64)

In [None]:
# Labels: the next-row direction column.
y_val = ETF_df['target']
# Features: every remaining column except the label, the raw price/volume
# columns and the return used to build the label.
x_val = ETF_df.drop(
    columns=['target', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_SPY', 'RoR']
)

In [None]:
y_val # ground-truth labels

In [None]:
# First rows of the feature matrix.
x_val.head()

## 2. Model Training

### 2-1. Split into Train & Test dataset

In [None]:
# Chronological 70/30 split: shuffle=False keeps row order, so the test set
# is the final 30% of the rows.
x_train, x_test, y_train, y_test = train_test_split(
    x_val,
    y_val,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

In [None]:
# Print the fraction of each label value in the train and the test split.
for header, labels in (('---Train dataset---', y_train), ('---Test dataset---', y_test)):
    print(header)
    print(labels.value_counts()/labels.count())

### 2-2. Training Model object

In [None]:
# Gradient-boosted tree classifier.
# `n_estimators` was previously misspelled `n_esimator`, so the intended 400
# boosting rounds were never applied and the library default was used.
model = XGBClassifier(
    n_estimators=400,
    learning_rate=.1,
    max_depth=6,
    n_jobs=-1,
    # NOTE(review): nthread looks redundant with n_jobs - confirm and drop.
    nthread=-1,
    min_child_weight=1,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8
)

In [None]:
#Parent Node -> Child Node1 & Child Node2 [Entropy / Gini] max(Information gain) min(loss)
# The larger gamma is, the more overfitting is reduced, but at the cost of more underfitting?

In [None]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

### 2-3. Model Performance

In [None]:
# Score on the TRAINING split (data the model has already seen), so this is
# not a held-out estimate.
model.score(x_train, y_train)

In [None]:
def get_confusion_matrix(y_test, pred):
    """Print accuracy, precision, recall and F1 for ``pred`` against ``y_test``.

    Despite the name, no confusion matrix is computed; the function prints a
    header, the four scores to four decimals, and finally the harmonic mean
    of precision and recall recomputed by hand. Nothing is returned.
    """
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [metric(y_test, pred) for metric in metric_fns]
    precision, recall = scores[1], scores[2]

    print('---confusion matrix---')
    print('accuracy:{0:.4f}\nprecision:{1:.4f}\nrecall:{2:.4f}\nF1:{3:.4f}'.format(*scores))
    print('Harmonic mean', 2*((precision*recall)/ (precision+recall)))

In [None]:
# Report the metrics for the fitted model's predictions on the test split.
get_confusion_matrix(y_test, model.predict(x_test))

In [None]:
# ROC curve points on the test split, scored with the predicted probability
# in column 1 of predict_proba; the thresholds (third value) are discarded.
fpr, tpr, _ = roc_curve(y_test, model.predict_proba(x_test)[:, 1])

In [None]:
# Area under the ROC curve from the (fpr, tpr) points.
roc_auc = auc(fpr, tpr)

In [None]:
# ROC curve of the model against the diagonal reference line.
fig, ax = plt.subplots(figsize=(10, 8))

ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %.2f)'%roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=15)
ax.set_ylabel('True Positive Rate', fontsize=15)
ax.set_title('ROC Curve', fontsize=15)
ax.legend(loc='lower right')

plt.show()

### 2-4. Feature Importance

In [None]:
# Per-feature importance scores of the fitted model.
model.feature_importances_

In [None]:
# One importance chart per importance definition; each chart is titled with
# the importance type it shows.
for importance_kind in ('weight', 'total_gain', 'total_cover'):
    plot_importance(
        model,
        importance_type=importance_kind,
        title=importance_kind,
        grid=True,
    )
plt.show()

## 3. Grid Search

In [None]:
# Hyper-parameter grid for the random-forest search.
# The keys must match RandomForestClassifier argument names exactly: the
# previous 'min_sample_leaf' / 'min_sample_split' (missing the "s") are not
# valid parameters and make the grid search fail with an invalid-parameter
# error.
params = {
    'bootstrap':[True],
    'n_estimators':range(10, 100, 10),
    'max_depth':[4, 6, 8, 10, 12],
    'min_samples_leaf':[2, 3, 4, 5],
    'min_samples_split':[2, 4 ,6, 8, 10],
    'max_features':[4]
}

In [None]:
# Keep the splitter object rather than the generator from `.split(x_train)`.
# A generator is consumed by the first fit, so re-running the search would
# see no folds. The splitter produces the same expanding-window folds and can
# be reused.
cv = TimeSeriesSplit(n_splits=5)
cv

In [None]:
# Exhaustive search over `params` with the folds defined by `cv`.
# random_state pins the forest's randomness so repeated runs select the same
# best parameters (42 matches the seed used for the train/test split).
clf_model = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=cv, n_jobs=-1, verbose=100)

In [None]:
# Display the configured (not yet fitted) search object.
clf_model

In [None]:
# Run the grid search on the training split.
clf_model.fit(x_train, y_train)

In [None]:
# Best score found by the search.
clf_model.best_score_

In [None]:
# Parameter combination that achieved the best score.
clf_model.best_params_

In [None]:
# Report the metrics for the grid-search model's predictions on the test split.
get_confusion_matrix(y_test, clf_model.predict(x_test))