# EDA for stock_price_spec

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# Raise pandas' display limits to 500 rows and 500 columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Use seaborn's "darkgrid" style for the plots in this notebook.
sns.set_style("darkgrid")

In [None]:
# Data import: read the train, example-test and supplemental price files.
PATH = "../input/jpx-tokyo-stock-exchange-prediction"


def _with_date_parts(prices):
    """Return a new frame with "Date" parsed to datetime plus year/month/day columns.

    Args:
        prices (pd.DataFrame): frame with a "Date" column that pd.to_datetime can parse.
    Returns:
        (pd.DataFrame): same rows; "Date" replaced by its parsed form and the
        integer columns "year", "month", "day" appended in that order.
    """
    dates = pd.to_datetime(prices["Date"])
    return prices.assign(
        Date=dates,
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
    )


stock_prices_train = pd.read_csv(f"{PATH}/train_files/stock_prices.csv")
stock_prices_test = pd.read_csv(f"{PATH}/example_test_files/stock_prices.csv")
stock_prices_sup = pd.read_csv(f"{PATH}/supplemental_files/stock_prices.csv")

# Append the supplemental rows to the training rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it each source file keeps its own
# 0-based labels and the combined index contains duplicates.
stock_prices_train = pd.concat([stock_prices_train, stock_prices_sup], ignore_index=True)

# Same date handling for both frames (previously duplicated line by line).
stock_prices_train = _with_date_parts(stock_prices_train)
stock_prices_test = _with_date_parts(stock_prices_test)


display(stock_prices_train.tail())
display(stock_prices_test.tail())

In [None]:
# Number of distinct "SecuritiesCode" values in the train frame, then in the test frame
for price_table in (stock_prices_train, stock_prices_test):
    print(price_table["SecuritiesCode"].nunique())

In [None]:
# Summary statistics (DataFrame.describe) for the combined training frame
stock_prices_train.describe()

In [None]:
# Total number of rows, followed by the number of missing values in each column
row_total = len(stock_prices_train)
missing_per_column = stock_prices_train.isna().sum()
print(row_total)
print(missing_per_column)

In [None]:
# Number of rows whose Volume is exactly 0
zero_volume_rows = stock_prices_train[stock_prices_train["Volume"] == 0]
print(len(zero_volume_rows))

In [None]:
# Rows with a missing Open value, then how many such rows fall on each Date (largest first)
open_is_missing = stock_prices_train["Open"].isnull()
ncount = stock_prices_train[open_is_missing]
missing_rows_per_date = ncount.groupby("Date")["SecuritiesCode"].count()
missing_rows_per_date.sort_values(ascending=False)

In [None]:
# Count the null-Open rows dated 2020-10-01.
# Author's note: JPX suffered a system failure on 2020-10-01; see the report below.
# https://www.jpx.co.jp/english/corporate/research-study/system-failure/b5b4pj000003rlat-att/wg2_english.pdf

len(ncount[ncount["Date"]=="2020-10-01"])

In [None]:
# Per-year breakdown of the training rows and of the null-Open rows.
# NOTE(review): only 2017-2021 are broken out here. A later comment in this notebook
# mentions dates up to 2022-02-28, so rows outside these years may exist - confirm
# whether leaving them out is intended.
_YEARS = (2017, 2018, 2019, 2020, 2021)


def _rows_in_year(frame, year):
    """Return the rows of `frame` whose "year" column equals `year`."""
    return frame[frame["year"] == year]


def _rows_per_code(frame):
    """Count the RowId values per SecuritiesCode as a (SecuritiesCode, count) table."""
    per_code = frame.groupby("SecuritiesCode")["RowId"].count().reset_index()
    per_code.columns = ["SecuritiesCode", "count"]
    return per_code


def _largest_counts(per_code, rows=5):
    """Return the first `rows` rows of `per_code` after sorting by "count", largest first."""
    ordered = per_code.sort_values(["count"], ascending=False)
    return ordered.reset_index(drop=True).head(rows)


# Training rows split by year
(
    stock_prices_train_2017,
    stock_prices_train_2018,
    stock_prices_train_2019,
    stock_prices_train_2020,
    stock_prices_train_2021,
) = (_rows_in_year(stock_prices_train, year) for year in _YEARS)

# Distinct dates present in each year ("open days")
(
    opendates_2017,
    opendates_2018,
    opendates_2019,
    opendates_2020,
    opendates_2021,
) = (
    yearly_rows["Date"].unique()
    for yearly_rows in (
        stock_prices_train_2017,
        stock_prices_train_2018,
        stock_prices_train_2019,
        stock_prices_train_2020,
        stock_prices_train_2021,
    )
)

for yearly_dates in (opendates_2017, opendates_2018, opendates_2019, opendates_2020, opendates_2021):
    print(len(yearly_dates))

# Null-Open rows, leaving out the rows dated 2020-10-01
ncount2 = ncount[ncount["Date"] != "2020-10-01"]

# The same rows split by year
(
    ncount2_2017,
    ncount2_2018,
    ncount2_2019,
    ncount2_2020,
    ncount2_2021,
) = (_rows_in_year(ncount2, year) for year in _YEARS)

# Null-Open row counts per SecuritiesCode: overall, then per year
ncount3 = _rows_per_code(ncount2)
(
    ncount3_2017,
    ncount3_2018,
    ncount3_2019,
    ncount3_2020,
    ncount3_2021,
) = (
    _rows_per_code(yearly_nulls)
    for yearly_nulls in (ncount2_2017, ncount2_2018, ncount2_2019, ncount2_2020, ncount2_2021)
)

# Securities with the most null-Open rows: top 10 overall, top 5 for each year
display(_largest_counts(ncount3, 10))
for yearly_counts in (ncount3_2017, ncount3_2018, ncount3_2019, ncount3_2020, ncount3_2021):
    display(_largest_counts(yearly_counts))


In [None]:
# Inspect SecuritiesCode 9733: first rows with a null Open, then first rows with an Open value.
# Author's observation: a null record occurs on days when the stock does not trade.
is_code_9733 = stock_prices_train["SecuritiesCode"] == 9733
open_missing_all = stock_prices_train["Open"].isnull()
display(stock_prices_train[is_code_9733 & open_missing_all].head())
display(stock_prices_train[is_code_9733 & ~open_missing_all].head())


In [None]:
# Inspect SecuritiesCode 3540 within the 2017 rows.
# Author's note: the stock started to be listed on 15 Dec 2017.
rows_2017_3540 = stock_prices_train_2017[stock_prices_train_2017["SecuritiesCode"] == 3540]
open_missing_3540 = rows_2017_3540["Open"].isnull()
# All rows that have an Open value, then the first rows that do not
display(rows_2017_3540[~open_missing_3540])
display(rows_2017_3540[open_missing_3540].head())

In [None]:
# Inspect SecuritiesCode 4628 within the 2017 rows: null-Open rows first, then rows with an Open value
rows_2017_4628 = stock_prices_train_2017[stock_prices_train_2017["SecuritiesCode"] == 4628]
open_missing_4628 = rows_2017_4628["Open"].isnull()
display(rows_2017_4628[open_missing_4628].head())
display(rows_2017_4628[~open_missing_4628].head())


In [None]:
# Inspect SecuritiesCode 9977 within the 2017 rows: null-Open rows (first 5), then rows with an Open value (first 10)
rows_2017_9977 = stock_prices_train_2017[stock_prices_train_2017["SecuritiesCode"] == 9977]
open_missing_9977 = rows_2017_9977["Open"].isnull()
display(rows_2017_9977[open_missing_9977].head())
display(rows_2017_9977[~open_missing_9977].head(10))


In [None]:
# Rows whose Target is missing
target_is_missing = stock_prices_train["Target"].isnull()
stock_prices_train[target_is_missing]

In [None]:
# Author's observation: a null Target means the security was not listed.
# Show the first rows of three example securities.
for example_code in (4382, 4056, 2987):
    display(stock_prices_train[stock_prices_train["SecuritiesCode"] == example_code].head())

In [None]:
# Re-derive Target for SecuritiesCode 9977 (2017 rows) to compare it with the provided Target.
# Author's observation: Target is calculated from the value with the most recent Close date listed.
# .copy() makes this an independent frame, so the column assignments below do not
# write into a filtered slice of stock_prices_train_2017 (SettingWithCopyWarning).
stock_prices_train_2017_9977 = stock_prices_train_2017[stock_prices_train_2017["SecuritiesCode"]==9977].copy()
# shift(-1) / shift(-2) take Close from one / two rows later in the frame,
# despite the "prev" in the column names.
stock_prices_train_2017_9977["Close_prev1"] = stock_prices_train_2017_9977["Close"].shift(-1)
stock_prices_train_2017_9977["Close_prev2"] = stock_prices_train_2017_9977["Close"].shift(-2)
# Target_own = (Close two rows later - Close one row later) / Close one row later
stock_prices_train_2017_9977["Target_own"] = (stock_prices_train_2017_9977["Close_prev2"] - stock_prices_train_2017_9977["Close_prev1"]) / stock_prices_train_2017_9977["Close_prev1"]
stock_prices_train_2017_9977.head(100)

In [None]:
# Count the null High / Low / Close values among the rows whose Open is null.
# `ncount` was built by selecting the null-Open rows, so Open needs no separate count.
# .isnull().sum() counts the True flags; len(series.isnull()) would only return the
# number of rows in `ncount`, whatever the values are.
for price_column in ("High", "Low", "Close"):
    print(ncount[price_column].isnull().sum())

In [None]:
# Rows whose AdjustmentFactor is not equal to 1: preview first, then the total count
adjusted_rows = stock_prices_train[~(stock_prices_train["AdjustmentFactor"] == 1)]
display(adjusted_rows.head())
print(len(adjusted_rows))

In [None]:
# Split case (author's example): first 20 rows for SecuritiesCode 6861
stock_prices_train[stock_prices_train["SecuritiesCode"]==6861].head(20)

In [None]:
# Split case (author's example, SecuritiesCode 6861): re-derive Target from Close.
# .copy() makes `split` an independent frame, so the column assignments below do not
# write into a filtered slice of stock_prices_train (SettingWithCopyWarning).
split = stock_prices_train[stock_prices_train["SecuritiesCode"]==6861].copy()

# shift(-1) / shift(-2) take Close from one / two rows later in the frame,
# despite the "prev" in the column names.
split["Close_prev1"] = split["Close"].shift(-1)
split["Close_prev2"] = split["Close"].shift(-2)
# Target_own = (Close two rows later - Close one row later) / Close one row later
split["Target_own"] = (split["Close_prev2"] - split["Close_prev1"]) / split["Close_prev1"]
split.head(100)

In [None]:
# Reverse-split case (author's example): first 20 rows for SecuritiesCode 8057
stock_prices_train[stock_prices_train["SecuritiesCode"]==8057].head(20)

In [None]:
# Rows that carry an ExpectedDividend value, largest dividend first
has_expected_dividend = stock_prices_train["ExpectedDividend"].notnull()
stock_prices_train[has_expected_dividend].sort_values("ExpectedDividend", ascending=False)

In [None]:
# Author's note: we need to deal stocks 2 business days before ex-dividend date to get dividend.
# Close of SecuritiesCode 7974 by day of month, March 2021.
in_2021 = stock_prices_train["year"] == 2021
stock_prices_train_2021 = stock_prices_train[in_2021]

is_7974_in_2021 = stock_prices_train_2021["SecuritiesCode"] == 7974
stock_prices_train_2021_7974 = stock_prices_train_2021[is_7974_in_2021]

in_march = stock_prices_train_2021_7974["month"] == 3
stock_prices_train_2021_7974_3 = stock_prices_train_2021_7974[in_march]
display(stock_prices_train_2021_7974_3)

fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(x="day", y="Close", data=stock_prices_train_2021_7974_3, ax=ax)
plt.show()

In [None]:
# Author's note: we need to deal stocks 2 business days before ex-dividend date to get dividend.
# Close of SecuritiesCode 7974 by day of month, September 2021.
in_september_2021 = stock_prices_train_2021_7974["month"] == 9
stock_prices_train_2021_7974_9 = stock_prices_train_2021_7974[in_september_2021]
display(stock_prices_train_2021_7974_9)

fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(x="day", y="Close", data=stock_prices_train_2021_7974_9, ax=ax)
plt.show()

In [None]:
# Author's note: we need to deal stocks 2 business days before ex-dividend date to get dividend.
# First 20 September rows (any year) of SecuritiesCode 7974, Close by day of month.
is_code_7974 = stock_prices_train["SecuritiesCode"] == 7974
is_september = stock_prices_train["month"] == 9
stock_prices_train_7974_9 = stock_prices_train[is_code_7974 & is_september].head(20)
display(stock_prices_train_7974_9)

fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(x="day", y="Close", data=stock_prices_train_7974_9, ax=ax)
plt.show()

In [None]:
# Close of SecuritiesCode 7974 by day of month for 2021, one line per month
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(x="day", y="Close", data=stock_prices_train_2021_7974, hue="month", ax=ax)
plt.show()

In [None]:
# Test data overview: frame shape, number of distinct securities, and the distinct dates
test_security_codes = stock_prices_test["SecuritiesCode"].unique()
test_dates = stock_prices_test["Date"].unique()
print(stock_prices_test.shape)
print(len(test_security_codes))
print(test_dates)

In [None]:
# Number of missing values in each column of the test frame
stock_prices_test.isna().sum()

In [None]:
# Test rows whose Open is missing
stock_prices_test[stock_prices_test["Open"].isnull()]

In [None]:
# Build the competition environment and obtain its test-data iterator.
# NOTE(review): jpx_tokyo_market_prediction is presumably only importable inside the
# Kaggle competition environment - confirm before running elsewhere.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# The API will deliver six dataframes in this specific order:
# Only the first batch is taken (the loop breaks immediately) so its frames stay
# bound to the names below for inspection.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    break

In [None]:
# Show the sample_prediction frame taken from the first iter_test batch
sample_prediction

In [None]:
# sharpe
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio (mean / standard deviation) of the daily spread returns.

    Each trading day is scored on its own from that day's 'Rank' and 'Target'
    columns. Days are identified by the "Date" column when `df` has one, so the
    same calendar day in two different years stays two separate days. Frames
    without "Date" fall back to grouping by ("month", "day"), which is only
    correct when the data does not repeat a month/day pair across years.

    Args:
        df (pd.DataFrame): predicted results; needs 'Rank', 'Target' and either
            "Date" or both "month" and "day"
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    Raises:
        AssertionError: if, within any single day, 'Rank' does not start at 0
            or its maximum is not (number of rows that day) - 1.
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results for one day
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        # Linearly decreasing weights: toprank_weight_ratio for the first position down to 1 for the last.
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        # Weighted Target of the `portfolio_size` lowest Rank values (bought) ...
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        # ... minus the weighted Target of the `portfolio_size` highest Rank values (shorted).
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    # Group by the full date whenever possible: ("month", "day") alone would merge
    # e.g. 2021-01-04 and 2022-01-04 into one "day" and trip the Rank assertions.
    day_keys = "Date" if "Date" in df.columns else ["month", "day"]
    buf = df.groupby(day_keys).apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [None]:
# Last rows of the combined training frame
stock_prices_train.tail()

In [None]:
# All rows dated 2021-12-06
stock_prices_train[stock_prices_train["Date"]=="2021-12-06"]

In [None]:
# The point is how much potential return actually exists and how much of it is
# being captured; the absolute value of the score alone does not tell us that.
# Build a frame ranked by the true Target so that the score of a perfect ranking can be computed.
sp_train = stock_prices_train[stock_prices_train["Date"]>="2021-12-06"]
sp_train = sp_train.fillna({'ExpectedDividend': 0})
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated in recent pandas.
# NOTE(review): the fill follows the frame's row order, so a gap can be filled from
# the preceding row even when that row belongs to a different security - confirm
# this is intended.
sp_train = sp_train.ffill()

# Rank 0 = largest Target within each Date; method="first" breaks ties by row order.
sp_train["Rank"] = sp_train.groupby("Date")["Target"].rank(ascending=False, method="first") - 1
sp_train["Rank"] = sp_train["Rank"].astype("int")
sp_train

In [None]:
# Score sp_train (ranked by the true Target) with calc_spread_return_sharpe.
# Values recorded by the author from earlier runs:
# 2021-12-06 to 2022-02-28 5.4347049198950685
# only 2 days 9.131384568610958
score = calc_spread_return_sharpe(sp_train, portfolio_size= 200, toprank_weight_ratio= 2)
score