# creating features for currency price prediction 
![](https://miro.medium.com/max/2560/0*R5pC0bAlYxH_nTlF.jpg)

In [None]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import collections
import seaborn as sns
from xgboost import DMatrix
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.metrics import cohen_kappa_score
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from scipy import signal
import matplotlib.pyplot as plt
import random
import numpy as np
from scipy.stats import zscore
from statistics import mean, mode
# チャート用設定
# setting for chart
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Initialise plotly's offline notebook mode for this notebook session.
init_notebook_mode(connected=True)
# Build a layout with a transparent paper background and a near-white plot
# background, wrap it in an empty figure, and convert that figure's styling
# into a reusable template.
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
# Register the template under 'my_template' and make it the default.
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

## データインポート/import

## ready features

### フラグの作成 / make flag
ゴールデンクロスの簡易版です  
simple version of golden cross  

In [None]:
def _sma_flag_distance(flags, term):
    """Count rows elapsed since the most recent True entry in ``flags``.

    A flagged row gets 1, the following rows get 2, 3, ... up to ``term``,
    and every other row gets 0. A new flag restarts the count at 1.
    Returns a float array of the same length as ``flags``.
    """
    distances = np.zeros(len(flags))
    counter = 0
    for i, hit in enumerate(flags):
        if counter > term:
            # The counting window after the last flag has expired.
            counter = 0
        elif counter > 0:
            distances[i] = counter
            counter += 1
        if hit:
            # A flag on this row always wins over a running count.
            distances[i] = 1
            counter = 2
    return distances


def sma_and_flag(df):
    """Add simple-moving-average features and SMA-crossing counters.

    This is a simplified golden-cross style feature. For each window ``n`` in
    (5, 10, 15, 30) the following columns are added to ``df``:

    * ``SMA_n`` / ``SMA_n_std``: rolling mean / standard deviation of
      ``Close`` over ``n`` rows, shifted down one row so that a row only uses
      earlier closes.
    * ``down_flag_distance_n``: 1 on a row where ``Close`` goes from
      at-or-below ``SMA_n`` (previous row) to above it (this row), then
      2, 3, ... ``n`` on the following rows, 0 elsewhere.
    * ``up_flag_distance_n``: the same counter for ``Close`` going from above
      ``SMA_n`` to at-or-below it.

    The up/down column naming is kept exactly as it was, because downstream
    files depend on these column names.

    Rows are processed in positional order. The columns are added to ``df``
    in place and the same frame is returned; no temporary flag columns are
    left behind.

    Args:
        df: DataFrame with a numeric ``Close`` column.

    Returns:
        ``df`` with the feature columns appended (all SMA columns first, then
        the distance columns, window by window).
    """
    windows = ['5', '10', '15', '30']
    close = df['Close']
    n_rows = len(df)
    crossings = {}
    for num in windows:
        rolling = close.rolling(int(num))
        # Moving average and moving standard deviation (previous rows only).
        sma = rolling.mean().shift()
        df['SMA_' + num] = sma
        df['SMA_' + num + '_std'] = rolling.std().shift()
        # Both comparisons are False while the SMA is still NaN, so warm-up
        # rows can never produce a crossing.
        above = (sma < close).to_numpy()
        not_above = (sma >= close).to_numpy()
        crossed_up = np.zeros(n_rows, dtype=bool)
        crossed_down = np.zeros(n_rows, dtype=bool)
        crossed_up[1:] = not_above[:-1] & above[1:]
        crossed_down[1:] = above[:-1] & not_above[1:]
        crossings[num] = (crossed_up, crossed_down)
    # Distance from the most recent crossing, limited to `num` rows.
    for num in windows:
        crossed_up, crossed_down = crossings[num]
        df["up_flag_distance_" + num] = _sma_flag_distance(crossed_down, int(num))
        df["down_flag_distance_" + num] = _sma_flag_distance(crossed_up, int(num))
    return df

### RSI / 相対力指数
相対的に上昇方向なのか下降方向なのかを示す指数です。  
An index that shows whether recent price moves are relatively stronger upward or downward.

In [None]:
def relative_strength_idx(df, close_column = "Close", n=14):
    """Compute a simple-moving-average RSI of ``df[close_column]``.

    The first row is dropped because it has no previous value to difference
    against, so the returned Series starts at the second row of ``df``.

    Args:
        df: DataFrame holding the price column.
        close_column: name of the price column.
        n: rolling window length for the average gain / average loss.

    Returns:
        Series of RSI values, NaN until ``n`` differences are available.
    """
    changes = df[close_column].diff().iloc[1:]
    # Split each move into its gain part and its loss part (as a magnitude).
    gains = changes.mask(changes < 0, 0)
    losses = changes.mask(changes > 0, 0).abs()
    avg_gain = gains.rolling(n).mean()
    avg_loss = losses.rolling(n).mean()
    relative_strength = avg_gain / avg_loss
    return 100.0 - (100.0 / (1.0 + relative_strength))

### wavelet analysis / ウェーブレット解析
Wavelet coefficient represents periodic nature of time-series data.  
ウェーブレット係数は時系列データの周期的な性質を表しています。

In [None]:
def _ricker(points, a):
    """Return a Ricker ("Mexican hat") wavelet with ``points`` samples and width ``a``."""
    amplitude = 2 / (np.sqrt(3 * a) * (np.pi ** 0.25))
    wsq = a ** 2
    vec = np.arange(0, points) - (points - 1.0) / 2
    xsq = vec ** 2
    return amplitude * (1 - xsq / wsq) * np.exp(-xsq / (2 * wsq))


def wt( data ):
    """Continuous wavelet transform of ``data`` with a Ricker wavelet.

    Widths 1..99 are used, so the result has 99 rows; row ``k`` holds the
    coefficients for width ``k + 1``.

    ``scipy.signal.cwt`` and ``scipy.signal.ricker`` are not available in
    recent SciPy releases. When SciPy still provides them they are used
    directly; otherwise the same computation (one 'same'-mode convolution
    per width with a wavelet of at most ``10 * width`` samples) is done here.

    Args:
        data: 1-D sequence of numbers (list, ndarray or pandas Series).

    Returns:
        ndarray of shape ``(99, len(data))``.
    """
    widths = np.arange(1, 100)
    if hasattr(signal, 'cwt') and hasattr(signal, 'ricker'):
        return signal.cwt( data, signal.ricker, widths )

    samples = np.asarray(data)
    output = np.empty((len(widths), len(samples)), dtype=np.float64)
    for row, width in enumerate(widths):
        # The wavelet is never longer than the signal itself.
        n_points = min(10 * width, len(samples))
        wavelet = _ricker(n_points, width)[::-1]
        output[row] = signal.convolve(samples, wavelet, mode='same')
    return output

### make objective variable / 目的変数の作成
Make the objective variable (the change label) and assign a training weight based on it.  
目的変数を作成します。また、変化率に基づいて学習の重み付けを行います。

#### weight
A large change rate brings a large profit (or loss).  
So we must focus on large changes,  
and we put a heavier weight on them.  

大きな変化は大きな利益や損失をもたらすため、間違えた時のリスクが大きいです。  
よって、大きな変化ほど予測性能があがるように重み付けをします。

In [None]:
def change_rate(df, quantile_end=2345):
    """Add the objective variable and its training weight.

    ``Change_rate`` is the percentage change of ``Close`` divided by
    ``SMA_30_std``. It is bucketed into five labels using the 20/40/60/80 %
    quantiles of ``Change_rate``, computed on rows up to and including index
    label ``quantile_end``:

    ==========================  ============  ======
    Change_rate                 Change_label  weight
    ==========================  ============  ======
    above the 80 % quantile          2          1.5
    60 % .. 80 % (inclusive)         1          1.2
    strictly between 40 %, 60 %      0          1
    20 % .. 40 % (inclusive)        -1          1.2
    below the 20 % quantile         -2          1.5
    ==========================  ============  ======

    Large moves get the larger weights. Rows whose ``Change_rate`` is NaN
    keep NaN in both ``Change_label`` and ``weight``.

    Args:
        df: DataFrame with ``Close`` and ``SMA_30_std`` columns. The three new
            columns are added to it in place.
        quantile_end: last index label (inclusive) of the rows used to
            compute the quantile thresholds.

    Returns:
        A copy of ``df`` with a fresh 0..n-1 index.
    """
    df['Change_rate'] = df['Close'].pct_change() / df['SMA_30_std']
    thresholds = df.loc[:quantile_end, 'Change_rate'].quantile([0.2, 0.4, 0.6, 0.8])
    q20, q40, q60, q80 = thresholds.to_numpy()

    rate = df['Change_rate'].to_numpy()
    # np.select takes the first matching condition. The buckets are listed
    # from the outermost positive one inwards so that, when two quantiles
    # coincide, a boundary value resolves the same way as sequential
    # overwriting assignments in the order 0, -1, -2, 1, 2 would.
    conditions = [
        rate > q80,
        (rate >= q60) & (rate <= q80),
        rate < q20,
        (rate >= q20) & (rate <= q40),
        (rate > q40) & (rate < q60),
    ]
    df['Change_label'] = np.select(conditions, [2, 1, -2, -1, 0], default=np.nan)
    df['weight'] = np.select(conditions, [1.5, 1.2, 1.5, 1.2, 1], default=np.nan)
    return df.reset_index(drop=True)

### MACD, MACD signalフラグ
- up signal 
macdがsignalを下から上に抜いた時
yesterday macd < macd signal && today macd > macd signal
- down signal 
macdがsignalを上から下に抜いた時
yesterday macd > macd signal && today macd < macd signal

In [None]:
# Column pairs whose crossings are turned into "rows since crossing" features.
pairs = [['MACD', 'MACD_signal'], ['SMA_25', 'SMA_75'], ['SMA_25', 'SMA_200'], ['SMA_75', 'SMA_200']]
def flag_features(df):
    """Add MACD / long-SMA columns and their crossing-distance features.

    Adds ``MACD`` (EMA 12 minus EMA 26 of ``Close``), ``MACD_signal`` (EMA 9
    of ``MACD``) and ``SMA_25`` / ``SMA_75`` / ``SMA_200`` (rolling means of
    ``Close`` shifted by one row). For every pair in ``pairs`` it then calls
    ``make_flag`` twice, once per crossing direction, with a term of 20 rows.

    The temporary ``yesterday_*`` columns that ``make_flag`` reads are
    removed before returning.

    Args:
        df: DataFrame with a numeric ``Close`` column; new columns are added
            to it in place.

    Returns:
        DataFrame with the new feature columns and without the
        ``yesterday_*`` helper columns.
    """
    close = df['Close']
    ema_12 = close.ewm(span=12, min_periods=12).mean()
    ema_26 = close.ewm(span=26, min_periods=26).mean()
    df['MACD'] = ema_12 - ema_26
    df['MACD_signal'] = df['MACD'].ewm(span=9, min_periods=9).mean()
    for window in (25, 75, 200):
        df['SMA_' + str(window)] = close.rolling(window).mean().shift()

    # Previous-row values, needed by make_flag to detect a crossing.
    yesterday_columns = []
    for column in ['MACD', 'MACD_signal', 'SMA_25', 'SMA_75', 'SMA_200']:
        df["yesterday_" + column] = df[column].shift()
        yesterday_columns.append("yesterday_" + column)

    for pair in pairs:
        df = make_flag(df, pair[0], pair[1], True, 20)
        df = make_flag(df, pair[1], pair[0], False, 20)
    return df.drop(columns=yesterday_columns)

def make_flag(df, up_column, down_column, up, term):
    """Add a "rows since ``up_column`` crossed above ``down_column``" feature.

    A crossing happens on a row where ``down_column`` was greater than
    ``up_column`` on the previous row (read from the ``yesterday_*`` columns)
    and is smaller than ``up_column`` on this row. The new column holds 1 on
    a crossing row, then 2, 3, ... ``term`` on the following rows, and 0
    everywhere else. A new crossing restarts the count at 1.

    Rows are processed in positional order. The column is added to ``df`` in
    place and the same frame is returned; no temporary flag column is
    created.

    Args:
        df: DataFrame containing ``up_column``, ``down_column`` and the
            matching ``yesterday_<name>`` columns.
        up_column: column that ends up above after the crossing.
        down_column: column that ends up below after the crossing.
        up: only selects the ``up`` / ``down`` suffix of the output column
            name.
        term: largest distance that is still recorded.

    Returns:
        ``df`` with the column
        ``flag_distance_up_is_<up_column>_down_is_<down_column>_price_<direction>``.
    """
    direction = "up" if up else "down"
    count_column = "flag_distance_up_is_" + up_column + '_down_is_' + down_column + '_price_' + direction

    # Comparisons against NaN are False, so warm-up rows never count as a crossing.
    was_below = df['yesterday_' + down_column] > df['yesterday_' + up_column]
    is_above = df[down_column] < df[up_column]
    crossed = (was_below & is_above).to_numpy()

    counts = np.zeros(len(crossed))
    flag_distance = 0
    for i, hit in enumerate(crossed):
        if hit:
            counts[i] = 1
            flag_distance = 2
        elif flag_distance > term:
            # The counting window after the last crossing has expired.
            flag_distance = 0
        elif flag_distance > 0:
            counts[i] = flag_distance
            flag_distance += 1
    df[count_column] = counts
    return df

### ready simple features / 単純な説明変数の作成
Features such as MACD and EMA are very easy to create.
This function builds those simple features.

簡単に作成できる特徴量はここで作成しています。

In [None]:
def ready_features(df, train = True):
    """Build every feature column for one currency.

    Steps, in order: cast the price columns to float, add the exponential
    moving average features, the MACD / SMA crossing features
    (``flag_features``), RSI, the short SMA features (``sma_and_flag``), the
    objective variable (``change_rate``), the same kind of features for each
    entry of the module-level ``index_funds`` list, and finally wavelet
    coefficients of ``Close`` and ``RSI``.

    Args:
        df: DataFrame with a ``Close`` column and one ``<fund>_Close`` column
            per entry of ``index_funds``.
        train: unused; kept so existing calls keep working.

    Returns:
        DataFrame with all feature columns added.
    """
    # Rows of the wavelet output that are kept as features. Row k of wt()'s
    # result is used directly as the feature "<column>_wave_<k>".
    fund_wave_rows = ['1', '2', '3', '5', '8', '10', '12','15', '30']
    own_wave_rows = fund_wave_rows + ['45']

    # Normalise column types.
#     df['Date'] = pd.to_datetime(df['Date'])
    df["Close"] = df['Close'].astype(float)
    for index_fund in index_funds:
        df[index_fund+"_Close"] = df[index_fund+"_Close"].astype(float)
    # Exponential moving average (shifted so a row only uses earlier closes).
    # NOTE: 9 is passed as the first positional argument of ewm().
    # Explanation of the exponential moving average (Japanese):
    # https://media-kojirokousi.com/exponential-moving-average/#:~:text=%E6%8C%87%E6%95%B0%E5%B9%B3%E6%BB%91%E7%A7%BB%E5%8B%95%E5%B9%B3%E5%9D%87%E7%B7%9A(EMA)%E3%81%AF%E3%80%81%E5%BE%93%E6%9D%A5%E3%81%AE,EMA%E3%81%A8%E5%91%BC%E3%81%B0%E3%82%8C%E3%81%BE%E3%81%99%E3%80%82
    df['EMA_9'] = df['Close'].ewm(9).mean().shift()
    df['EMA_9_std'] = df['Close'].ewm(9).std().shift()
    # MACD
    # Explanation of MACD (Japanese):
    # https://www.sevendata.co.jp/shihyou/technical/macd.html
    df = flag_features(df)
    # relative_strength_idx() returns no value for the first row, so the NaNs
    # must be filled AFTER the column has been aligned to df. Filling before
    # the assignment leaves a NaN in row 0, which then spreads through the
    # wavelet convolution below.
    df['RSI'] = relative_strength_idx(df)
    df['RSI'] = df['RSI'].fillna(0)
    df = sma_and_flag(df)
    df = change_rate(df)
    # Create the same kind of features from the other index-fund data.
    for index_fund in index_funds:
        close_column = index_fund+"_Close"
        rsi_column = index_fund +"_RSI"
        df[index_fund + '_EMA_9'] = df[close_column].ewm(9).mean().shift()
        df[index_fund + '_EMA_9_std'] = df[close_column].ewm(9).std().shift()
        df[rsi_column] = relative_strength_idx(df, close_column)
        df[rsi_column] = df[rsi_column].fillna(0)
        column_wave = wt(df[close_column])
        for num in fund_wave_rows:
            df[close_column + '_wave_' + num] = column_wave[int(num)]
        rsi_wave = wt(df[rsi_column])
        for num in fund_wave_rows:
            df[rsi_column + '_wave_' + num] = rsi_wave[int(num)]

    # Wavelet analysis of the currency's own Close and RSI.
    for column in ['Close', 'RSI']:
        column_wave = wt(df[column])
        for num in own_wave_rows:
            df[column + '_wave_' + num] = column_wave[int(num)]
    return df

In [None]:
def ready_data_for_train(df_for_ready, train = True):
    """Align each row's features with the NEXT row's target.

    The goal is to predict the next row's movement from the current row's
    features, so ``Change_label``, ``Change_rate`` and ``weight`` are shifted
    up by one row. Missing labels (including the last row, which has no
    following row) become 0 and the label column is cast to int.

    Args:
        df_for_ready: feature frame containing the three target columns; it
            is not modified.
        train: unused.

    Returns:
        A shifted copy of ``df_for_ready``.
    """
    shifted = df_for_ready.copy()
    for target in ('Change_label', 'Change_rate', 'weight'):
        shifted[target] = shifted[target].shift(-1)
#     if train:
#         shifted = shifted[:-1]      # Because of shifting
    labels = shifted["Change_label"].fillna(0)
    shifted["Change_label"] = labels.astype('int')
    return shifted

In [None]:
# Load the currency history. The file's own header line is read as the first
# data row (because explicit `names` are given) and is removed with drop(0).
currency_hises = pd.read_csv("/kaggle/input/new-his-data/his_data.csv", names=("Date", "JPY", "NZD", "AUD", "EUR", "GBP", "HKD", "BRL", "DKK", "INR", "CAD", "CHF")).drop(0)
currency_hises['Date'] = currency_hises['Date'].astype('string')
# currency_hises['Date'] = currency_hises["Date"].str[:4] + "-" + currency_hises["Date"].str[4:6] + "-" + currency_hises["Date"].str[6:8]
# index_funds = ['N225', 'GSPC', 'EZU', 'GSPTSE', 'GDAXI', 'FCHI', 'EWQ', 'EWG', 'EWC', 'GCF', 'CLF']
# Index funds to join onto the currency history; currently none.
index_funds = []
for index_fund in index_funds:
    index_df = pd.read_csv("/kaggle/input/stock-price-datas/" + index_fund + ".csv", names=("Date", "Open", "High", "Low", index_fund+"_Close", "Adj Close", "Volume")).drop(0)
    # Only the close price of each fund is kept.
    index_df = index_df.drop(columns=["Open", "High", "Low", "Adj Close", "Volume"])
    currency_hises = pd.merge(currency_hises, index_df, on="Date", how='left')
    # Every fund except N225 is shifted down one row.
    if (index_fund != 'N225'):
        currency_hises[index_fund+"_Close"] = currency_hises[index_fund+"_Close"].shift()
currency_hises = currency_hises[1:]      # Because of shifting close price
currency_hises = currency_hises.reset_index(drop=True)

In [None]:
currencies = ['JPY', 'NZD', 'AUD', 'EUR', 'GBP', 'HKD', 'BRL', 'DKK', 'INR', 'CAD', 'CHF']
# currencies = ['JPY']
# For every currency: standardise its price series, attach the index-fund
# closes, build the features and targets, and write "<currency>.csv".
for currency in tqdm(currencies):
    standardized_close = zscore(currency_hises[currency].astype('float'))
    df_for_currency = pd.DataFrame({
        "Date": currency_hises["Date"],
        "Close": standardized_close,
    })
    for index_fund in index_funds:
        fund_close = index_fund + "_Close"
        df_for_currency[fund_close] = currency_hises[fund_close]
    features = ready_features(df_for_currency)
    df_for_currency = ready_data_for_train(features)
    df_for_currency.to_csv(currency + ".csv")

## 一行一行追加する場合

In [None]:
# currencies = ['JPY', 'NZD', 'AUD', 'EUR', 'GBP', 'HKD', 'BRL', 'DKK', 'INR', 'CAD', 'CHF']
# currencies = ['JPY']
# currency_hises = pd.read_csv("/kaggle/input/group3data/his_data.csv", names=("Date", "JPY", "NZD", "AUD", "EUR", "GBP", "HKD", "BRL", "DKK", "INR", "CAD", "CHF")).drop(0)
# new_record = currency_hises[-1:]
# currency_hises = currency_hises[:-1]
# currency_hises = currency_hises.append(new_record)
# currency_hises['Date'] = currency_hises['Date'].astype('string')
# currency_hises['Date'] = currency_hises["Date"].str[:4] + "-" + currency_hises["Date"].str[4:6] + "-" + currency_hises["Date"].str[6:8]
# # currency_hises.to_csv("his_data.csv")
# for currency in tqdm(currencies):
#     df_for_currency = pd.DataFrame(columns = ['Date', 'Close'])
#     df_for_currency["Date"] = currency_hises["Date"]
#     df_for_currency["Close"] = currency_hises[currency]
#     df_for_currency["Close"] = zscore(df_for_currency['Close'].astype('float'))
#     df_for_currency = df_for_currency[-201:].reset_index().drop('index', 1)
#     df_for_currency = ready_features(df_for_currency)
#     df_for_currency = ready_data_for_train(df_for_currency, False)
#     old_data = pd.read_csv('/kaggle/input/train-feature/' + currency + ".csv", index_col=0)
#     old_data = old_data.append(df_for_currency[-1:]).reset_index().drop('index', 1)
#     old_data.to_csv(currency + "2.csv")