# パッケージのインポート（Import Packages）

In [None]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
from lightgbm import LGBMRegressor
import optuna.integration.lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import warnings
warnings.filterwarnings("ignore")

# データ読込（Load Data）

In [None]:
# Load the daily price records; "Date" is parsed to datetime so that the
# date-based checks and features below can use it directly.
prices = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv",
    parse_dates=["Date"],
)

# EDA

In [None]:
# Show the loaded price records as this cell's output.
prices

列の意味（column information）<br>
(1) RowId: 日付_銘柄。ユニークID（Date_SecuritiesCode, Unique ID of price records）<br>
(2) Date：日付（Trade date）<br>
(3) SecuritiesCode: 銘柄コード。2000種類存在。（Local securities code; 2,000 distinct codes）<br>
(4) Open:始値（First traded price on a day）<br>
(5) High:高値（Highest traded price on a day）<br>
(6) Low :安値（Lowest traded price on a day）<br>
(7) Close:終値（Last traded price on a day）<br>
(8) Volume:出来高（Number of traded stocks on a day）<br>
(9) AdjustmentFactor：分割／併合による株価の変動。終値から次の日の始まりまでの間に補正される。（Change in stock price due to a split/reverse split. Correction from the closing price to the beginning of the next day.）<br>
(10) ExpectedDividend:予想配当金額。（Projected dividend amount）<br>
(11) SupervisionFlag: 監理・整理銘柄フラグ。上場廃止リスク高い。（Flag for supervised and delisted stocks. High risk of delisting.）<br>
(12) Target:目標変数。1日後と2日後の終値の差から得られる収益率（Target variable: the rate of return between the closing prices one and two trading days ahead）<br>

## 欠損値の確認（Check missing values）

In [None]:
# Count the missing values in each column.
prices.isna().sum()

### 欠損値の内訳（Breakdown of missing values）

In [None]:
# Check whether "Open", "High", "Low" and "Close" are missing exactly on the
# rows where "Volume" is 0 (one True/False line is printed per price column).
no_trades = prices["Volume"] == 0
for price_column in ["Open", "High", "Low", "Close"]:
    print((prices[price_column].isnull() == no_trades).all())
# "Volume" is the number of shares traded on the day, so Volume == 0 means the
# security did not trade. If every line above prints True, the four prices are
# null exactly on those no-trade days.

In [None]:
# Look at where "ExpectedDividend" is present: count the non-missing rows per date.
prices.loc[prices["ExpectedDividend"].notnull(), "Date"].value_counts()
# Values exist only on certain days; presumably they are filled in around
# the settlement (closing) period.

In [None]:
# Count rows per securities code, smallest first, to spot codes with missing dates.
prices["SecuritiesCode"].value_counts().sort_values()
# Observed: 1413 is missing 21 days, 8806 is missing 20 days, 4699 is missing 1 day.

In [None]:
# Find which dates are missing: lay "Volume" out as a Date x SecuritiesCode
# table and show the last 30 dates for the codes of interest.
(
    prices
    .pivot(index="Date", columns="SecuritiesCode", values="Volume")
    .loc[:, [1413, 8806, 4699, 1375]]
    .tail(30)
)
# Observed: 1413 is missing 2022-04-25 to 2022-05-27 (21 days in total).
# Observed: 8806 is missing 2022-04-26 to 2022-05-27 (20 days in total).
# Observed: 4699 is missing 2022-05-27 (1 day in total).

# 特徴量生成・選択（Feature generation and selection）

In [None]:
def MA(series, window=25):
    """Return the simple moving average of ``series``.

    Args:
        series: pandas Series of numeric values.
        window: Number of trailing rows in each averaging window.

    Returns:
        Series aligned with ``series``. Because ``min_periods=1`` is used,
        the first rows are averaged over however many values are available
        instead of being NaN.
    """
    trailing = series.rolling(window=window, min_periods=1)
    return trailing.mean()

def DMA(series, window=25):
    """Return the relative deviation of ``series`` from its moving average.

    Args:
        series: pandas Series of numeric values.
        window: Window length passed on to ``MA``.

    Returns:
        Series of ``series / MA(series, window) - 1``; 0 means the value sits
        exactly on its moving average.
    """
    moving_average = MA(series, window)
    return series / moving_average - 1

def divergence(series, window=25):
    """Return the rolling z-score of ``series``.

    Each value is centred on the rolling mean and divided by the rolling
    standard deviation, both taken over the trailing ``window`` rows with
    ``min_periods=1``.

    Args:
        series: pandas Series of numeric values.
        window: Number of trailing rows in each window.

    Returns:
        Series aligned with ``series``. The first row is NaN because the
        standard deviation of a single observation is undefined.
    """
    trailing = series.rolling(window, min_periods=1)
    centred = series - trailing.mean()
    return centred / trailing.std()

def rsi(series, n=14):
    """Return an RSI-style ratio on a 0-1 scale.

    For each trailing window of ``n`` row-to-row changes, the sum of the
    positive changes is divided by the sum of the absolute changes.

    Args:
        series: pandas Series of numeric values.
        n: Number of consecutive changes per window.

    Returns:
        Series aligned with ``series``; NaN until ``n`` changes are available.
        A window with no movement at all divides zero by zero.
    """
    def _gain_share(changes):
        gains = changes[changes > 0].sum()
        total_move = changes.abs().sum()
        return gains / total_move

    return series.diff().rolling(n).apply(_gain_share)

def stochastic(series, k=14, n=3, m=3):
    """Return the stochastic oscillator lines computed from ``series`` alone.

    Args:
        series: pandas Series of numeric values.
        k: Look-back window for the rolling minimum and maximum.
        n: Window of the mean that smooths %K into FAST-%D.
        m: Window of the mean that smooths FAST-%D into SLOW-%D.

    Returns:
        DataFrame indexed like ``series`` with columns "%K", "FAST-%D" and
        "SLOW-%D". %K is the position of the value inside its rolling
        min-max range (0 at the minimum, 1 at the maximum).
    """
    lowest = series.rolling(k).min()
    highest = series.rolling(k).max()
    percent_k = (series - lowest) / (highest - lowest)
    fast_d = percent_k.rolling(n).mean()
    slow_d = fast_d.rolling(m).mean()
    lines = {"%K": percent_k, "FAST-%D": fast_d, "SLOW-%D": slow_d}
    return pd.DataFrame(lines, index=series.index)

def psy(series, n=14):
    """Return the psychological line on a 0-1 scale.

    For each trailing window of ``n`` row-to-row changes, this is the share
    of changes that are non-negative (an unchanged value counts as "up").

    Args:
        series: pandas Series of numeric values.
        n: Number of consecutive changes per window.

    Returns:
        Series aligned with ``series``; NaN until ``n`` changes are available.
    """
    def _up_share(changes):
        return (changes >= 0).mean()

    return series.diff().rolling(n).apply(_up_share)

def ICH(series):
    """Return Ichimoku-style lines computed from a single price series.

    Every line is derived from ``series`` itself. The earlier version read
    ``d.Close_adj`` from an unrelated global for two of the lines, which
    fails on a fresh kernel and otherwise uses the wrong data.

    Args:
        series: pandas Series of numeric values, ordered oldest to newest.

    Returns:
        Tuple ``(conv, base, pre1, pre2, lagg)`` of Series aligned with
        ``series``:
            conv: midpoint of the rolling 9-row max and min.
            base: midpoint of the rolling 26-row max and min.
            pre1: mean of ``conv`` and ``base``, shifted 25 rows later.
            pre2: midpoint of the rolling 52-row max and min, shifted 25 rows later.
            lagg: ``series`` shifted 25 rows later.
        All shifts move past values onto later rows, so no row sees data
        from rows that come after it.
    """
    def _midpoint(window):
        # (max + min) / 2 over the trailing window; NaN until the window is full.
        trailing = series.rolling(window)
        return (trailing.max() + trailing.min()) / 2

    conv = _midpoint(9)
    base = _midpoint(26)
    pre1 = ((conv + base) / 2).shift(25)
    pre2 = _midpoint(52).shift(25)
    lagg = series.shift(25)
    return conv, base, pre1, pre2, lagg

def roc(series, window=14):
    """Return the rate of change of ``series`` over ``window`` rows.

    Args:
        series: pandas Series of numeric values.
        window: Number of rows to look back.

    Returns:
        Series of ``series / series.shift(window) - 1``; the first ``window``
        rows are NaN because they have no earlier value to compare with.
    """
    earlier = series.shift(window)
    return series / earlier - 1

class FeatureBase():
    """Base class for feature generators; subclasses override ``create_feature``."""

    def create_feature(self, d):
        """Build features from the frame ``d``.

        Args:
            d: Input data for the feature; what it must contain is defined
                by each subclass.

        Raises:
            NotImplementedError: Always, because the base class has no
                feature of its own. An ``assert False`` would be stripped
                when Python runs with ``-O``, so an explicit raise is used.
        """
        raise NotImplementedError(
            "{} must implement create_feature".format(type(self).__name__)
        )
        
class MAFeature(FeatureBase):
    """Moving-average feature: two moving averages and their crossover signal."""

    def create_feature(self, d):
        """Build the moving-average features from the "Close_adj" column of ``d``."""
        close_adj = d["Close_adj"]
        return self._create_feature(close_adj)

    def _create_feature(self, series, window1=5, window2=25):
        """Compute both moving averages and the crossover signal.

        Args:
            series: pandas Series of prices.
            window1: Window of the first moving average ("MA1").
            window2: Window of the second moving average ("MA2").

        Returns:
            DataFrame with columns "MA1", "MA2" and "MA_Cross". "MA_Cross" is
            1 on a row where MA1 - MA2 is positive after being negative on
            the previous row, -1 for the opposite change, and 0 otherwise.
            A previous-row difference of exactly 0 is not counted as a cross
            because both comparisons are strict.
        """
        short_ma = MA(series, window1).rename("MA1")
        long_ma = MA(series, window2).rename("MA2")
        spread = short_ma - long_ma
        # The first row has no previous value; NaN compares False both ways,
        # so it can never be flagged as a cross.
        previous = spread.shift()
        crossed_up = (spread > 0) & (previous < 0)
        crossed_down = (spread < 0) & (previous > 0)
        signal = np.where(crossed_up, 1, np.where(crossed_down, -1, 0))
        cross = pd.Series(signal, index=series.index, name="MA_Cross")
        return pd.concat([short_ma, long_ma, cross], axis=1)

In [None]:
def holiday(d):
    """Flag rows that fall just before or just after a non-trading gap.

    Args:
        d: DataFrame with a datetime "Date" column and an integer "weekday"
            column, with rows in date order.

    Returns:
        DataFrame indexed like ``d`` with two boolean columns:
            before_holiday: the next row is not the following calendar day,
                or "weekday" is 4.
            after_holiday: the previous row is not the preceding calendar
                day, or "weekday" is 0.
        The last row is always "before" and the first row always "after",
        because the missing neighbour compares as different.
    """
    one_day = datetime.timedelta(days=1)
    dates = d["Date"]
    gap_ahead = dates != dates.shift(-1) - one_day
    gap_behind = dates != dates.shift(1) + one_day
    flags = {
        "before_holiday": gap_ahead | (d["weekday"] == 4),
        "after_holiday": gap_behind | (d["weekday"] == 0),
    }
    return pd.DataFrame(flags, index=d.index)
def make_features(df):
    """Build the per-security feature frame from raw daily price records.

    Args:
        df: DataFrame containing at least "Date" (datetime), "SecuritiesCode",
            "Open", "Close", "AdjustmentFactor" and "Volume". The input is
            not modified; a copy of these columns is used.

    Returns:
        DataFrame with those six columns plus "weekday", "before_holiday",
        "after_holiday", "Volume_ratio", "Close_adj", "MA1", "MA2",
        "MA_Cross", "Diff", "Diff_MA1", "Diff_MA2", "MA_Cross_lag_1",
        "MA_Cross_lag_2", "DivMA", "Div", "Rsi", "%K", "FAST-%D" and "SLOW-%D".

    NOTE(review): all rolling/shift features follow row order within each
    SecuritiesCode, so this presumably expects rows sorted by Date per code
    and a unique row index for the joins to align -- confirm against callers.
    """
    df = df[[
        "Date","SecuritiesCode","Open","Close","AdjustmentFactor",
        "Volume"
    ]].copy()
    # Day of week from pandas' dt.weekday; `holiday` tests it against 4 and 0.
    df["weekday"] = df["Date"].dt.weekday
    # Per-security flags for rows next to a gap in the trading calendar.
    df = df.join(df.groupby("SecuritiesCode").apply(holiday))
    # Volume relative to its per-security rolling mean over up to 15 rows.
    df["Volume_ratio"] = df["Volume"]/df.groupby("SecuritiesCode")["Volume"].rolling(window=15, min_periods=1).mean().reset_index("SecuritiesCode",drop=True)
    # Close divided by the cumulative product of the adjustment factors of all
    # earlier rows (the factor of a row first affects the following row).
    df["Close_adj"] = df.groupby("SecuritiesCode").apply(lambda d:d["Close"]/d["AdjustmentFactor"].cumprod().shift().fillna(1)).reset_index("SecuritiesCode",drop=True)
    # Moving averages of the adjusted close and their crossover signal.
    df[["MA1", "MA2", "MA_Cross"]] = df.groupby("SecuritiesCode").apply(lambda d: MAFeature()._create_feature(d.Close_adj))
    # Open-to-close move, scaled by the mean of Open and Close.
    df["Diff"] = (df["Close"] - df["Open"]) / df[["Close","Open"]].mean(axis=1)
    df["Diff_MA1"] = df["Close_adj"] - df["MA1"]
    df["Diff_MA2"] = df["Close_adj"] - df["MA2"]
    # Crossover signal of the previous one and two rows of the same security.
    for i in range(1, 3):
        df["MA_Cross_lag_{:}".format(i)] = df.groupby("SecuritiesCode")["MA_Cross"].shift(i)

    # Indicators of the adjusted close, each computed per security.
    df["DivMA"] = df.groupby("SecuritiesCode")["Close_adj"].apply(DMA)
    df["Div"] = df.groupby("SecuritiesCode")["Close_adj"].apply(divergence)
    df["Rsi"] = df.groupby("SecuritiesCode")["Close_adj"].apply(rsi)
    # `stochastic` returns three columns ("%K", "FAST-%D", "SLOW-%D"), hence the join.
    df = df.join(df.groupby("SecuritiesCode")["Close_adj"].apply(stochastic))
    
    ################## Add new features below this line #######################
    
    ################## Add new features above this line #######################
    return df

In [None]:
# Feature columns (all produced by make_features) that are passed to the
# models, in the order the models see them.
columns = [
    "Diff",
    "Close_adj",
    "Volume_ratio",
    "before_holiday",
    "after_holiday",
    "Diff_MA1",
    "Diff_MA2",
    "MA_Cross",
    "MA_Cross_lag_1",
    "MA_Cross_lag_2",
    "DivMA",
    "Div",
    "Rsi",
    "%K",
    "FAST-%D",
    "SLOW-%D",
]

# モデル生成（Build model）

In [None]:
# Read the raw price records, build the features, and attach the raw "Target"
# column (aligned on the row index) as the training label.
raw_prices = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv",
    parse_dates=["Date"],
)
df = make_features(raw_prices).join(raw_prices["Target"])

In [None]:
def train_model(X, y):
    """Fit and return a LightGBM regressor with fixed hyper-parameters.

    Args:
        X: Feature matrix (rows are samples).
        y: Target values aligned with the rows of ``X``.

    Returns:
        The fitted ``LGBMRegressor``. ``random_state=0`` is fixed so that
        repeated fits use the same seed.
    """
    hyperparams = dict(
        boosting_type="dart",
        num_leaves=31,
        max_depth=12,
        learning_rate=0.02,
        n_estimators=1000,
        random_state=0,
    )
    regressor = LGBMRegressor(**hyperparams)
    regressor.fit(X, y)
    return regressor

In [None]:
%%time
models = {}
for code, d in df.groupby("SecuritiesCode"):
    d = d[~d.Target.isnull()]
    X = d[columns]
    y = d.Target
    model = train_model(X, y)
    models[code] = model
    #print(code, model.score(X,y))

# 精度評価（Evaluation）

In [None]:
# df.iloc[test_index]
# from sklearn.model_selection import KFold
# kf = KFold(n_splits=12, shuffle=False)
# for train_index, test_index in kf.split(df):
#   display(df.iloc[test_index])

# 予測（Predict）

In [None]:
# jpx_tokyo_market_prediction is already imported in the import cell at the top.
# NOTE(review): the competition API presumably allows make_env() only once per
# kernel session -- restart the kernel before re-running this cell.
env = jpx_tokyo_market_prediction.make_env()   # initialise the environment
iter_test = env.iter_test()    # iterator that loops over the test files

In [None]:
# Price history that grows by one batch per test day. Features are rebuilt from
# the whole history each time because the rolling indicators need past rows.
data = df.copy()
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    prices["Date"] = pd.to_datetime(prices["Date"])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # keep="last" lets a re-delivered (SecuritiesCode, Date) row replace the old one.
    data = (
        pd.concat([data, prices])
        .drop_duplicates(["SecuritiesCode", "Date"], keep="last")
        .sort_values(["SecuritiesCode", "Date"])
        .reset_index(drop=True)
    )
    # `data` stays the raw history; the features live in their own frame.
    features = make_features(data)

    # reset_index() keeps each row's original label in an "index" column so the
    # predictions can be mapped back after the merge.
    # NOTE(review): assumes sample_prediction has an unnamed, unique index.
    d = sample_prediction[["Date", "SecuritiesCode"]].reset_index()
    d["Date"] = pd.to_datetime(d["Date"])
    d = d.merge(features, on=["Date", "SecuritiesCode"])
    d["Pred"] = np.nan
    for code, _d in d.groupby("SecuritiesCode"):
        model = models.get(code)
        if model is not None:  # codes without a trained model keep NaN
            d.loc[_d.index, "Pred"] = model.predict(_d[columns])  # predict

    # Rank 0 is the highest prediction. Rows without a prediction (no feature
    # row or no model) are NaN and sort last, so every requested row gets a
    # unique integer rank whatever the batch size (previously fixed at 2000).
    pred = d.set_index("index")["Pred"].reindex(sample_prediction.index)
    order = pred.sort_values(ascending=False, na_position="last").index
    sample_prediction["Rank"] = pd.Series(np.arange(len(order)), index=order)

    submission = sample_prediction[["Date", "SecuritiesCode", "Rank"]]
    env.predict(submission)  # submit the predictions for this day