# Init

In [None]:
import os
from pathlib import Path
from decimal import ROUND_HALF_UP, Decimal

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [None]:
# Draw finance chart
!pip install mplfinance
import mplfinance as mpf

In [None]:
# I/O Func
def _round_half_up_to_tenth(value):
    """Round ``value`` to one decimal place with ROUND_HALF_UP.

    ``Decimal`` is used because ``round``/``numpy.round`` round halves to
    even (10.25 -> 10.2), whereas this rounds halves up (10.25 -> 10.3).
    NaN passes through as NaN.
    """
    return float(
        Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
    )


def _adjust_by_cumulative_factor(price, key, divide):
    """Shared implementation of ``adjusting_price`` / ``adjusting_volume``.

    Args:
        price (pd.DataFrame): rows with "SecuritiesCode", "Date",
            "AdjustmentFactor" and the ``key`` column.
        key (str): name of the column to adjust.
        divide (bool): ``True`` divides ``key`` by the cumulative factor,
            ``False`` multiplies ``key`` by it.
    Returns:
        pd.DataFrame: a new frame sorted by SecuritiesCode and Date with a
        fresh 0..n-1 index and the two generated columns appended.
    """
    factor_col = f"CumulativeAdjustmentFactor{key}"
    adjusted_col = f"Adjusted{key}"

    # sort_values/reset_index return a new frame, so the caller's frame is
    # never modified, and the unique index makes the alignments below safe.
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # Cumulative product of AdjustmentFactor from the newest row back to the
    # oldest, separately per security. Assignment re-aligns on the index.
    newest_first = price.iloc[::-1]
    price[factor_col] = (
        newest_first.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )

    if divide:
        raw = price[key] / price[factor_col]
    else:
        raw = price[factor_col] * price[key]
    adjusted = raw.map(_round_half_up_to_tenth)

    # A value of exactly 0 is treated as missing and replaced by the previous
    # value of the same security (never by another security's value).
    adjusted = adjusted.mask(adjusted == 0)
    price[adjusted_col] = adjusted.groupby(price["SecuritiesCode"]).ffill()
    return price


def adjusting_price(price, key: str):
    """[Adjusting a price column by the cumulative AdjustmentFactor]
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
        key (str)             : price column to adjust, e.g. "Close"
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated
            CumulativeAdjustmentFactor{key} and Adjusted{key}
            (= key * cumulative factor, rounded half-up to 0.1,
            zeros forward-filled per SecuritiesCode)
    """
    return _adjust_by_cumulative_factor(price, key, divide=False)


def adjusting_volume(price, key = "Volume"):
    """[Adjusting a volume column by the cumulative AdjustmentFactor]
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
        key (str)             : volume column to adjust
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated
            CumulativeAdjustmentFactor{key} and Adjusted{key}
            (= key / cumulative factor, rounded half-up to 0.1,
            zeros forward-filled per SecuritiesCode)
    """
    return _adjust_by_cumulative_factor(price, key, divide=True)

def read_prices(dir_name: str, securities_code: int = None,
                base_dir: str = '../input/jpx-tokyo-stock-exchange-prediction'):
    """Load ``stock_prices.csv`` from one sub-directory of the data set.

    Important: the data of 2020/10/1 is missing because of a system failure
    at JPX, see:
    https://www.jpx.co.jp/corporate/news/news-releases/0060/20201019-01.html

    Args:
        dir_name (str): sub-directory of ``base_dir`` that holds
            ``stock_prices.csv`` (e.g. "train_files").
        securities_code (int, optional): keep only this SecuritiesCode;
            ``None`` keeps every code.
        base_dir (str): root directory of the data set.
    Returns:
        pd.DataFrame: price rows with a datetime "Date" column; rows whose
        "Open" is missing are dropped.
    """
    df = pd.read_csv(Path(base_dir) / dir_name / 'stock_prices.csv')
    # Assign the column directly: ``df.loc[:, "Date"] = ...`` writes into the
    # existing object column on pandas >= 2 and would leave dtype ``object``.
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")
    df = df[df['Open'].notna()]
    # ``is not None`` so that a falsy code such as 0 is still used as a filter.
    if securities_code is not None:
        df = df[df["SecuritiesCode"] == securities_code]
    return df

def read_stock_list(securities_code: int = None, only_universe: bool = True,
                    base_dir: str = '../input/jpx-tokyo-stock-exchange-prediction'):
    """Load ``stock_list.csv`` and optionally filter it.

    Args:
        securities_code (int, optional): keep only this SecuritiesCode;
            ``None`` keeps every code.
        only_universe (bool): when true, keep only rows whose "Universe0"
            flag is true.
        base_dir (str): directory that contains ``stock_list.csv``.
    Returns:
        pd.DataFrame: stock list with "EffectiveDate" parsed from
        ``YYYYMMDD`` into datetimes.
    """
    df = pd.read_csv(Path(base_dir) / 'stock_list.csv')
    # Assign the column directly: ``df.loc[:, col] = ...`` tries to keep the
    # existing column dtype on pandas >= 2 instead of switching to datetime.
    df["EffectiveDate"] = pd.to_datetime(df["EffectiveDate"], format="%Y%m%d")
    if only_universe:
        df = df[df['Universe0']]
    # ``is not None`` so that a falsy code such as 0 is still used as a filter.
    if securities_code is not None:
        df = df[df["SecuritiesCode"] == securities_code]
    return df

def read_train_data_by_price(securities_code: int = None, with_supplemental: bool = True):
    """Build the training frame: prices left-joined with the stock list.

    The price data set is the base; stock-list columns are attached to each
    price row by a left join on "SecuritiesCode". Adjusted Close/Open/High/Low
    and Volume columns are generated afterwards.

    Args:
        securities_code (int, optional): restrict both inputs to this code.
        with_supplemental (bool): also append the rows from
            "supplemental_files" after those from "train_files".
    Returns:
        pd.DataFrame: merged price rows with the generated adjusted columns.
    """
    # The stock list is the same for both price files, so read it only once.
    # Its 'Close' column (if present) is renamed up front so the price
    # 'Close' keeps its plain name after the merge.
    stock_list = read_stock_list(securities_code=securities_code).rename(
        columns={'Close': 'Close_x'}
    )

    def load_merged(dir_name):
        prices = read_prices(dir_name=dir_name, securities_code=securities_code)
        return prices.merge(stock_list, on='SecuritiesCode', how="left")

    # original training period
    df = load_merged("train_files")

    # supplemental period
    if with_supplemental:
        df = pd.concat([df, load_merged("supplemental_files")]).reset_index(drop=True)

    for price_key in ("Close", "Open", "High", "Low"):
        df = adjusting_price(df, price_key)
    df = adjusting_volume(df)
    return df

def write_df(df, filename, base_dir='/kaggle/working'):
    """Write ``df`` to ``<base_dir>/<filename>.csv`` without the index.

    Args:
        df (pd.DataFrame): frame to save.
        filename (str): file name without the ``.csv`` extension.
        base_dir (str): output directory; must already exist.
    """
    df.to_csv(Path(base_dir) / f'{filename}.csv', index=False)

In [None]:
# Draw Func
import seaborn as sns
def draw_pie(df, target):
    """Show a pie chart of how often each value of ``df[target]`` occurs.

    The legend lists every value with its percentage, largest count first.
    """
    counts = df[target].value_counts()
    shares = counts / counts.sum() * 100
    legend_text = [
        f"{sec} {ratio:.2f}%" for sec, ratio in zip(counts.index, shares)
    ]

    fig, ax = plt.subplots(figsize=(10, 5))
    wedges, _ = ax.pie(
        counts.values,
        colors=sns.color_palette("pastel"),
        shadow=True,
        startangle=90,
    )

    # Order the legend entries by count, descending.
    ranked = sorted(
        zip(wedges, legend_text, counts.values),
        key=lambda entry: entry[2],
        reverse=True,
    )
    wedges, legend_text, _ = zip(*ranked)

    ax.legend(wedges, legend_text, bbox_to_anchor=(-0.1, 1.), fontsize=8)
    ax.set_title(f"Ratio of {target}")
    plt.show()
    
def draw_candlestick_chart(df, securities_code: int, datetime_index_name: str = "Date", open_name: str = "Open", high_name: str = "High", low_name: str = "Low", close_name: str = "Close", volume_name: str = "Volume"):
    """Plot a daily and a weekly candlestick chart for one security.

    The rows of ``securities_code`` are selected, the given columns are
    renamed to Open/High/Low/Close/Volume and ``datetime_index_name``
    becomes the index before plotting with ``mpf.plot``.

    Args:
        df (pd.DataFrame): frame with "SecuritiesCode" and the named columns.
        securities_code (int): security to draw.
        datetime_index_name (str): column used as the index; presumably must
            hold datetimes, since it is resampled and passed to mplfinance.
        open_name, high_name, low_name, close_name, volume_name (str):
            source columns for the respective OHLCV fields.
    """
    source_columns = [datetime_index_name, open_name, high_name, low_name, close_name, volume_name]
    # Filter first so that only this security's rows are ever copied.
    ohlcv = (
        df.loc[df["SecuritiesCode"] == securities_code, source_columns]
        .rename(columns={
            open_name: 'Open',
            high_name: 'High',
            low_name: 'Low',
            close_name: 'Close',
            volume_name: 'Volume',
        })
        .set_index(datetime_index_name)
    )

    plot_options = dict(type='candle', figratio=(12,4), volume=True, mav=(5, 25), style='yahoo')

    # daily
    mpf.plot(ohlcv, **plot_options)

    # weekly: bins run from one Monday (inclusive) to the next (exclusive)
    # and are labelled with their starting Monday
    weekly_rules = {'Open': 'first',
                    'High': 'max',
                    'Low': 'min',
                    'Close': 'last',
                    'Volume': 'sum'}
    weekly = ohlcv.resample('W-MON', closed='left', label='left').agg(weekly_rules)
    mpf.plot(weekly, **plot_options)

In [None]:
# SecuritiesCode used as the example security for the candlestick charts below.
TOYOTA = 7203

# Stock List

In [None]:
# Load the stock list with read_stock_list()'s default arguments and display it.
stock_list = read_stock_list()
stock_list

In [None]:
# Number of distinct values in each stock-list column.
stock_list.nunique()

In [None]:
# Column dtypes of the stock list.
print(stock_list.dtypes)

In [None]:
# Summary statistics of the EffectiveDate column.
stock_list["EffectiveDate"].describe()

In [None]:
# Share of listed stocks per 33SectorName value.
draw_pie(stock_list, '33SectorName')

In [None]:
# Share of listed stocks per 17SectorName value.
draw_pie(stock_list, '17SectorName')

In [None]:
# Share of listed stocks per NewIndexSeriesSize value.
draw_pie(stock_list, 'NewIndexSeriesSize')

In [None]:
# Share of listed stocks per NewMarketSegment value.
draw_pie(stock_list, 'NewMarketSegment')

# Prices

In [None]:
# Build the price table with read_train_data_by_price()'s default arguments
# (all securities, supplemental files included) and display it.
train_prices = read_train_data_by_price()
train_prices

In [None]:
# Column dtypes of the price table.
print(train_prices.dtypes)

In [None]:
# Number of distinct values in each price-table column.
train_prices.nunique()

In [None]:
# Summary statistics of the Date column.
train_prices["Date"].describe()

In [None]:
# Daily and weekly candlestick charts for TOYOTA from the unadjusted OHLCV columns.
draw_candlestick_chart(df = train_prices, securities_code=TOYOTA)

In [None]:
# Daily and weekly candlestick charts for TOYOTA from the Adjusted* columns.
draw_candlestick_chart(df = train_prices, securities_code=TOYOTA, open_name = "AdjustedOpen", high_name = "AdjustedHigh", low_name  = "AdjustedLow", close_name = "AdjustedClose", volume_name = "AdjustedVolume")