### Visualizing and analyzing stocks

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool, Manager, cpu_count
from IPython.display import display_html
sns.set_context("notebook")

import warnings
warnings.filterwarnings("ignore")

In [None]:
def prep_prices(price):
    """Return a split-adjusted copy of a stock price table.

    For every ``SecuritiesCode`` the rows are ordered newest-first and the
    cumulative product of ``AdjustmentFactor`` (stored as ``CumAdjust``) is
    applied: Open/High/Low/Close are multiplied by it and rounded half-up to
    one decimal, Volume is divided by it. Remaining gaps are forward- then
    backward-filled within each security, and missing ``Target`` /
    ``ExpectedDividend`` values become 0.

    Parameters
    ----------
    price : pandas.DataFrame
        Must provide the columns RowId, Date, SecuritiesCode, Open, High,
        Low, Close, Volume, AdjustmentFactor, ExpectedDividend and Target.
        The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        Adjusted rows sorted by ``RowId`` with an extra ``CumAdjust`` column.
    """
    from decimal import ROUND_HALF_UP, Decimal

    pcols = ["Open", "High", "Low", "Close"]

    def qround(x):
        # Going through str() rounds the printed decimal value half-up
        # instead of the binary float (built-in round() is half-even).
        return float(Decimal(str(x)).quantize(Decimal("0.1"), rounding=ROUND_HALF_UP))

    def adjust_prices(df):
        # Newest row first so the cumulative product accumulates backwards
        # in time.
        df = df.sort_values("Date", ascending=False)
        df["CumAdjust"] = df["AdjustmentFactor"].cumprod()

        # Plain column assignment (not .loc[:, col]) lets integer price
        # columns become float after scaling.
        for p in pcols:
            df[p] = (df["CumAdjust"] * df[p]).apply(qround)
        df["Volume"] = df["Volume"] / df["CumAdjust"]

        # Rows are date-descending here, so ffill takes the value from the
        # next later date and bfill covers what is still missing.
        df = df.ffill().bfill()

        # Only reached for a security whose Target is missing on every row.
        df["Target"] = df["Target"].fillna(0)
        return df

    # Work on a copy and assign the filled column back: a chained
    # ``price.ExpectedDividend.fillna(0, inplace=True)`` either mutates the
    # caller's frame or silently does nothing under copy-on-write.
    price = price.copy()
    price["ExpectedDividend"] = price["ExpectedDividend"].fillna(0)
    price = price.sort_values(["SecuritiesCode", "Date"])

    # Iterating the groups keeps SecuritiesCode as a regular column on every
    # pandas version (groupby.apply handles the grouping column differently
    # across releases).
    adjusted = [adjust_prices(group) for _, group in price.groupby("SecuritiesCode")]
    if not adjusted:
        # Nothing to adjust; keep the output schema consistent.
        return price.assign(CumAdjust=pd.Series(dtype="float64"))

    price = pd.concat(adjusted).reset_index(drop=True)
    return price.sort_values("RowId")

In [None]:
%%time
# Root folder of the competition files.
path = "../input/jpx-tokyo-stock-exchange-prediction/"

# Training prices; Date is parsed to datetime so it can be compared and sorted.
df_train = pd.read_csv(f"{path}train_files/stock_prices.csv", parse_dates=["Date"])
df_train = df_train[df_train.Date>"2020-10-02"] # per the author: from here on Targets are non-null and 2000 securities are present
df_train = prep_prices(df_train)

# The supplemental price file plays the role of the "test" period in the
# charts and comparisons later in this notebook.
df_test = pd.read_csv(f"{path}supplemental_files/stock_prices.csv", parse_dates=["Date"])
df_test = prep_prices(df_test)

In [None]:
# Wide table of (adjusted) closing prices: one row per Date, one column per
# SecuritiesCode.
df = df_train.pivot(index='Date', columns='SecuritiesCode', values='Close')

In [None]:
def absHighPass(df, absThresh):
    """Keep only the labels that take part in at least one strong pair.

    A pair of distinct columns ``(r, c)``, with ``r`` before ``c`` in column
    order, is strong when ``abs(df.loc[r, c]) >= absThresh``. The diagonal is
    never considered and NaN entries never qualify.

    Parameters
    ----------
    df : pandas.DataFrame
        Square matrix (e.g. a correlation matrix) whose index contains its
        column labels.
    absThresh : float
        Minimum absolute value for a pair to count.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted, on both axes, to the sorted labels found in a
        strong pair; empty when no pair qualifies.
    """
    labels = df.columns
    strength = np.abs(df.loc[labels, labels].to_numpy(dtype=float))

    # The strict upper triangle visits each unordered pair once and skips the
    # diagonal, matching itertools.combinations over the columns without a
    # Python-level loop of scalar lookups.
    hits = np.triu(strength >= absThresh, k=1)
    keep = hits.any(axis=0) | hits.any(axis=1)

    passed = sorted(labels[keep])
    return df.loc[passed, passed]

# Pairwise correlation between the closing-price columns (one per security).
corr = df.corr()

### Search for Highly Correlated Stocks

In [None]:
# Keep only securities that have at least one partner with |corr| >= 0.978.
mat = absHighPass(corr,0.978)
# Non-zero mask cells are hidden: this blanks the upper triangle and the
# diagonal so every pair is annotated once.
mask = np.triu(np.ones_like(mat))
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(mat, annot=True, mask=mask, cmap="viridis")
plt.show()

### Pair with high correlation

In [None]:
# Regression joint plot of the closing prices of securities 9101 and 9104.
# x / y / data are passed by keyword: newer seaborn releases no longer accept
# them positionally, while keywords work on every version.
sns.jointplot(x=9101, y=9104, data=df, kind='reg', color='seagreen', height=10);

In [None]:
# Pairwise comparison of five hand-picked securities; dates where any of them
# has no closing price are dropped first.
fig = sns.PairGrid(df[[9101, 9104, 9107, 9110, 6532]].dropna())
# Upper triangle: raw scatter plots.
fig.map_upper(plt.scatter, color='blue')
# Lower triangle: bivariate kernel density estimates.
fig.map_lower(sns.kdeplot, cmap='cool_d')
# Diagonal: per-security distribution with 30 bins.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
fig.map_diag(sns.distplot, bins=30);

### Stock Charts 

In [None]:
# Security examined by the single-stock charts and forecasts below.
Code = 9101 # Select Security Code

In [None]:
# Columns kept for the training-period view of the selected security.
cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'SecuritiesCode', 'Target']
# Date-indexed history of the selected security: STOCK from the training
# file, TEST (all columns) from the supplemental file.
STOCK = df_train[df_train.SecuritiesCode==Code][cols].set_index("Date")
TEST = df_test[df_test.SecuritiesCode==Code].set_index("Date")
display_html(STOCK)

In [None]:
# Two stacked panels on a 4x4 grid: closing price on the top three rows,
# traded volume on the bottom row. Train and test periods share the axes and
# the test period is drawn in magenta.
plt.figure(figsize=(15,7))
top = plt.subplot2grid((4,4), (0, 0), rowspan=3, colspan=4)
bottom = plt.subplot2grid((4,4), (3,0), rowspan=1, colspan=4)
top.plot(STOCK.index, STOCK.Close, label="Train set")
top.plot(TEST.index,TEST.Close, color="magenta", label="Test set")
bottom.bar(STOCK.index, STOCK.Volume)
bottom.bar(TEST.index, TEST.Volume, color="magenta")
top.legend(bbox_to_anchor=(1.01, 1., 0.11, 0.), loc='upper right', borderaxespad=0.)
 
# set the labels
# The top panel hides its x tick labels; the dashed red line marks the first
# test date on both panels.
top.axes.xaxis.set_ticklabels([])
top.axvline(TEST.index[0], color='red', linestyle='--')
bottom.axvline(TEST.index[0], color='red', linestyle='--')
top.set_title(Code)
top.grid(True)
top.set_ylabel('Closing Price')
bottom.set_ylabel('Volume')
bottom.grid(True);

In [None]:
# Distribution of the selected security's closing price over the training
# period (50 bins, missing values dropped).
plt.figure(figsize=(15,7))
plt.title("Plot a Histogram of the Daily Closing Price - TRAIN set")
sns.distplot(STOCK['Close'].dropna(), bins=50, color='blue');

In [None]:
# Same distribution plot for the test (supplemental) period.
plt.figure(figsize=(15,7))
plt.title("Plot a Histogram of the Daily Closing Price - TEST set")
sns.distplot(TEST['Close'].dropna(), bins=50, color='magenta');

### Forecasting stock prices

In [None]:
# Candlestick chart of the last `d` training days with polynomial trend lines
# (degrees 1..g) fitted on those days and extended `e` days into the test
# period, where they are compared with the actual test closes.
d = 60 # days to plot
e = 15  # days to extrapolate
g = 2  # max degree of regression

# Candle geometry and colours: wide bar for the body, thin bar for the wicks;
# green when Close >= Open, red otherwise.
width = .8
width2 = .1
col1 = 'green'
col2 = 'red'

LDAYS = STOCK.tail(d)
up = LDAYS[LDAYS.Close>=LDAYS.Open]
down = LDAYS[LDAYS.Close<LDAYS.Open]
# First `e` rows of the test period.
fut = TEST[TEST.index<=TEST.head(e).index[-1]]

# Date axis covering the plotted training days followed by the extrapolated
# test days.
xd = (LDAYS.index).append(TEST.head(e).index)

# Fit one polynomial per degree against the day number 0..d-1.
x = [x for x in range(d)]
p = {}
for i in range(g):
    z = np.polyfit(x, LDAYS.Close, i+1)
    p[i] = np.poly1d(z)
# Day numbers for evaluation now include the `e` extrapolated days.
x = np.array(range(d+e))

plt.figure(figsize=(15,7))
top = plt.subplot2grid((4,4), (0, 0), rowspan=3, colspan=4)
top.set_title(f"CandleStick Chart of last {d} days with {e} extrapolated days by polynomials of degrees until {g}")
top.axes.xaxis.set_ticklabels([])
for i in range(g):
    top.plot(xd, p[i](x), linewidth=1, label=f'Degree {i+1}')
top.plot(fut.index,fut.Close,  marker="o", markersize=5, color="magenta", linewidth=0, label="Test Close")
# Rising days: body from Open to Close, upper wick Close->High, lower wick
# Open->Low.
top.bar(up.index,up.Close-up.Open,width,bottom=up.Open,color=col1)
top.bar(up.index,up.High-up.Close,width2,bottom=up.Close,color=col1)
top.bar(up.index,up.Low-up.Open,width2,bottom=up.Open,color=col1)
# Falling days: body from Open down to Close, upper wick Open->High, lower
# wick Close->Low.
top.bar(down.index,down.Close-down.Open,width,bottom=down.Open,color=col2)
top.bar(down.index,down.High-down.Open,width2,bottom=down.Open,color=col2)
top.bar(down.index,down.Low-down.Close,width2,bottom=down.Close,color=col2)
top.axvline(TEST.index[0], color='red', linestyle='--')
top.legend(bbox_to_anchor=(1.01, 1., 0.11, 0.), loc='upper right', borderaxespad=0.)
top.grid(True)

# Volume panel: training volume padded with zeros over the extrapolated days,
# then the actual test volume drawn in magenta.
bottom = plt.subplot2grid((4,4), (3,0), rowspan=1, colspan=4)
bottom.bar(xd, np.append(LDAYS['Volume'].values,np.zeros(e))) 
bottom.bar(fut.index, fut.Volume, color="magenta") 
bottom.axvline(TEST.index[0], color='red', linestyle='--')

bottom.grid(True)
plt.show();

In [None]:
# One-step-ahead forecast from a rolling straight-line fit: for each day a
# line is fitted to the previous `w` closes and evaluated one step further.
d = 120 # days to plot
w = 3 # windows size

LDAYS = STOCK.tail(d)
# Train and test history stacked so the rolling windows run across the
# boundary between the two periods.
fut = pd.concat([STOCK,TEST])
xd = (LDAYS.index).append(TEST.index)

# closed="left" excludes the current row from its own window, so each value
# is built only from earlier closes.
fut['Cl_lr'] = fut.Close.rolling(window=w,closed="left").apply(lambda y: 
                   np.poly1d(np.polyfit(np.array(range(w)),y,1))(w),raw=True)

# Same idea with a fixed two-point window; not plotted here but used as a
# regressor by the Prophet cells further down.
fut['Cl2'] = fut.Close.rolling(window=2,closed="left").apply(lambda y: 
                   np.poly1d(np.polyfit([0,1],y,1))(2),raw=True)

# Correlation between actual and predicted close over the plotted range.
rho = np.corrcoef(fut.Close[-len(xd):],fut.Cl_lr[-len(xd):])

plt.figure(figsize=(15,7))
plt.plot(LDAYS.index, LDAYS.Close, label="Train set")
plt.plot(TEST.index,TEST.Close, color="magenta", label="Test set")
plt.plot(xd,fut.Cl_lr[-len(xd):], color="black", label="Rolling LR")
plt.axvline(TEST.index[0], color='red', linestyle='--')
plt.title(f"Rolling Liner Regression with {w}-days window - Pearson Correlation = {rho[0,1]:.3f}")
plt.legend(bbox_to_anchor=(1.01, 1., 0.11, 0.), loc='upper right', borderaxespad=0.)
plt.show();

In [None]:
# Scatter of actual test closes against the rolling-regression predictions,
# with a dashed y = x reference line for a perfect prediction.
# Generate diagonal line to plot.
fig, ax = plt.subplots(figsize=(8,8))
d_x = np.linspace(start=TEST.Close.min() - 1, stop=TEST.Close.max() + 1, num=100)
# The last len(TEST) rows of `fut` are the test period.
sns.regplot(x=TEST.Close, y=fut.Cl_lr[-len(TEST):], color='magenta', label='test', ax=ax)
sns.lineplot(x=d_x, y=d_x, dashes={'linestyle': ''}, color='blue', ax=ax)
# NOTE(review): assumes ax.lines[1] is the diagonal added by lineplot — confirm.
ax.lines[1].set_linestyle('--')
ax.set(title=f'Test Data vs Predictions - Corr = {np.corrcoef(TEST.Close,fut.Cl_lr[-len(TEST):])[0,1]:.3f}');

### Prophet Forecasting

In [None]:
# NOTE(review): newer releases of this library are published as `prophet`
# rather than `fbprophet` — confirm which one the environment provides.
from fbprophet import Prophet
from fbprophet.make_holidays import make_holidays_df

# Japanese public holidays for the years listed below.
year_list = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
holidays = make_holidays_df(year_list=year_list, country='JP')

# Training slice of `fut`: rows w .. len(STOCK)-1. The first w rows are
# skipped because the rolling features have no value there.
ph_df = fut[["Close","Cl_lr",'Cl2']][w:len(STOCK)].reset_index()
# Prophet expects the timestamp in `ds` and the value to forecast in `y`.
ph_df.rename(columns={'Close': 'y', 'Date': 'ds'}, inplace=True)

m = Prophet(holidays=holidays,
            daily_seasonality=False,
            changepoint_prior_scale=0.015)
# Custom seasonality with a 20.5-day period, labelled 'monthly'.
m.add_seasonality(name='monthly', period=20.5, fourier_order=5)
# The two rolling extrapolations are supplied as extra regressors.
m.add_regressor('Cl_lr')
m.add_regressor('Cl2')
m.fit(ph_df)

### Compare Predicted and Actual Future Close

In [None]:
# Show (rows, columns) of the test-period frame for the selected security.
TEST.shape

In [None]:
# Predict Prices
# Extend the training dates by 175 calendar days, then keep Monday-Friday.
future_prices = m.make_future_dataframe(periods=175, freq='d')
future_prices = future_prices[future_prices.ds.dt.dayofweek < 5]
# Attach the regressor values by date; dates without them are dropped by the
# dropna() below.
future_prices = future_prices.set_index("ds").join(fut[['Cl_lr','Cl2']][w:], how='left')
future_prices = future_prices.reset_index().dropna()
forecast = m.predict(future_prices)
# Line the forecast (with its lower/upper bounds) up against the actual test
# closes by date.
result = TEST[['Close']].join(forecast[['ds','yhat','yhat_lower','yhat_upper']].set_index("ds"),how='left')
display_html(result)

In [None]:
# Scatter of actual test closes against the Prophet forecast, with a dashed
# y = x reference line.
# Generate diagonal line to plot.
rho = np.corrcoef(result.Close,result.yhat)
fig, ax = plt.subplots(figsize=(8,8))
d_x = np.linspace(start=TEST.Close.min() - 1, stop=TEST.Close.max() + 1, num=175)
sns.regplot(x=result.Close, y=result.yhat, color='magenta', label='test', ax=ax)
sns.lineplot(x=d_x, y=d_x, dashes={'linestyle': ''}, color='blue', ax=ax)
# NOTE(review): assumes ax.lines[1] is the diagonal added by lineplot — confirm.
ax.lines[1].set_linestyle('--')
ax.legend(loc='upper left')
ax.set(title=f'Test Data vs Predictions - Corr = {np.corrcoef(result.Close,result.yhat)[0,1]:.3f}');

In [None]:
# Prophet's own forecast plot, overlaid with the actual test closes (magenta
# dots) and a dashed red line at the first test date.
fig = m.plot(forecast)
plt.title(f"{Code} Stock Price Forecast", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Close Price", fontsize=12)
plt.axvline(TEST.index[0], color='red', linestyle='--')
plt.plot(TEST.index, TEST.Close,  marker="o", markersize=3, color="magenta", linewidth=0, label="Test Close")
plt.show()

In [None]:
# Plot the individual components of the fitted Prophet model's forecast.
fig2 = m.plot_components(forecast)
plt.show()

### Rolling Training and Prediction

In [None]:
# Utilities 

def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """Weighted long-short spread of ``Target`` for a single day.

    Rows are ordered by ``Rank``. The first ``portfolio_size`` rows form the
    long leg and the last ``portfolio_size`` rows the short leg. Weights run
    linearly from ``toprank_weight_ratio`` down to 1, heaviest on the best
    rank for the long leg and on the worst rank for the short leg.

    Parameters
    ----------
    df : pandas.DataFrame
        One day's rows with ``Rank`` and ``Target`` columns.
    portfolio_size : int
        Number of securities in each leg.
    toprank_weight_ratio : float
        Weight of the most extreme rank relative to the least extreme one.

    Returns
    -------
    float
        Weighted long return minus weighted short return, each divided by
        the mean weight.
    """
    leg_weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    mean_weight = leg_weights.mean()
    ranked_targets = df.sort_values(by='Rank')['Target']

    long_leg = ranked_targets[:portfolio_size].mul(leg_weights).sum() / mean_weight
    # Weights are reversed so the worst-ranked security carries the most weight.
    short_leg = ranked_targets[-portfolio_size:].mul(leg_weights[::-1]).sum() / mean_weight
    return long_leg - short_leg

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
    """Sharpe ratio of the daily long-short spread returns.

    When the smallest day has fewer than ``2 * portfolio_size`` targets, the
    portfolio size is reduced to half of that day's count. If that leaves
    less than one security per leg, nothing is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``Date``, ``Rank`` and ``Target`` columns.
    portfolio_size : int, default 200
        Requested number of securities per leg.
    toprank_weight_ratio : float, default 2
        Passed through to ``calc_spread_return_per_day``.

    Returns
    -------
    tuple
        ``(sharpe_ratio, daily_spreads)``, or ``(0, None)`` when the days
        are too small to build a portfolio.
    """
    by_date = df.groupby('Date')
    smallest_day = by_date["Target"].count().min()

    must_shrink = smallest_day < 2 * portfolio_size
    if must_shrink:
        portfolio_size = smallest_day // 2
    if must_shrink and portfolio_size < 1:
        return 0, None

    daily_spreads = by_date.apply(
        calc_spread_return_per_day, portfolio_size, toprank_weight_ratio
    )
    return daily_spreads.mean() / daily_spreads.std(), daily_spreads

def add_rank(df, col_name="pred"):
    """Add a zero-based, per-date ``Rank`` column to ``df`` in place.

    Within each ``Date`` the highest value of ``col_name`` gets rank 0; ties
    are broken by order of appearance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and ``col_name``; it is modified in place.
    col_name : str, default "pred"
        Column holding the score to rank.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with an integer ``Rank`` column.
    """
    scores_by_date = df.groupby("Date")[col_name]
    # rank() is 1-based; subtract 1 so the best score gets rank 0.
    df["Rank"] = scores_by_date.rank(method="first", ascending=False).sub(1)
    df["Rank"] = df["Rank"].astype("int")
    return df

In [None]:
def add_feat(df):
    """Add one-step-ahead linear extrapolations of ``Close`` to ``df``.

    ``C3`` fits a straight line through the three previous closes and
    evaluates it one step further; ``C2`` does the same with the two
    previous closes. The current row is never part of its own window, so the
    leading rows that lack a full window are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Close`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``C3`` and ``C2`` columns added.
    """
    def _next_on_line(n_points):
        # Build the fit for a fixed window length: a line through positions
        # 0..n_points-1, evaluated at position n_points.
        positions = list(range(n_points))

        def _extrapolate(window):
            return np.polyval(np.polyfit(positions, window, 1), n_points)

        return _extrapolate

    for column, n_points in (("C3", 3), ("C2", 2)):
        previous_closes = df.Close.rolling(window=n_points, closed="left")
        df[column] = previous_closes.apply(_next_on_line(n_points), raw=True)
    return df

def run_prophet(tr):
    """Fit a Prophet model on ``tr`` and return the predicted relative change.

    The model uses the module-level ``holidays`` table, has daily, weekly
    and yearly seasonality switched off, and takes ``C3`` and ``C2`` as
    extra regressors. It is fitted on all of ``tr`` and then asked to
    predict the last two rows.

    Parameters
    ----------
    tr : pandas.DataFrame
        Training frame passed to Prophet; must contain the ``C3`` and ``C2``
        regressor columns.
        NOTE(review): Prophet presumably also needs ``ds``/``y`` columns
        here — confirm against the caller.

    Returns
    -------
    float
        Second predicted value divided by the first, minus 1.
    """
    model = Prophet(
        holidays=holidays,
        daily_seasonality=False,
        yearly_seasonality=False,
        weekly_seasonality=False,
        changepoint_prior_scale=0.01,
    )
    for regressor in ("C3", "C2"):
        model.add_regressor(regressor)
    model.fit(tr)

    last_two = model.predict(tr[-2:])
    yhat = last_two.yhat
    return yhat[1] / yhat[0] - 1

def run_reg(tr):
    """Relative change between the last two blended extrapolations.

    Each of the last two rows is blended as ``0.6 * C2 + 0.4 * C3``.

    Parameters
    ----------
    tr : pandas.DataFrame
        Must contain ``C2`` and ``C3`` columns and at least two rows.

    Returns
    -------
    float
        Blended value of the last row divided by that of the row before it,
        minus 1.
    """
    c2_tail = tr.C2[-2:]
    c3_tail = tr.C3[-2:]
    blended = (0.6 * c2_tail + 0.4 * c3_tail).tolist()
    return blended[1] / blended[0] - 1

def proc_cod(cod, tr):
    """Predict the relative close change two business days ahead for one code.

    The security's ``Date``/``Close`` history is extended twice. Each pass
    appends the business day(s) following the last date, with a close
    obtained by extending the straight line through the last two closes.
    The rolling features from ``add_feat`` are then computed, the first six
    rows are discarded, and ``run_reg`` turns the two synthetic rows into a
    relative change.

    Parameters
    ----------
    cod : int
        ``SecuritiesCode`` to process.
    tr : pandas.DataFrame
        Price history with ``SecuritiesCode``, ``Date`` and ``Close``
        columns. Assumes rows for ``cod`` are in chronological order —
        TODO confirm at the call site.

    Returns
    -------
    float
        Value returned by ``run_reg`` for the extended history.
    """
    tr = tr[tr.SecuritiesCode == cod][["Date", "Close"]]
    steps = [0, 1]
    for _ in steps:
        last_date = pd.Timestamp(tr.Date.iloc[-1])
        # Same dates as the former date_range(..., periods=2,
        # closed='right'): the start date itself is excluded. Filtering
        # explicitly avoids the `closed` argument, which newer pandas
        # removed.
        candidates = pd.date_range(start=last_date, periods=2, freq="B")
        new_dates = candidates[candidates > last_date]
        next_close = np.poly1d(np.polyfit(steps, tr.Close.iloc[-2:], 1))(2)
        extension = pd.DataFrame({"Date": new_dates, "Close": next_close})
        # DataFrame.append no longer exists in pandas 2; pd.concat works on
        # every version.
        tr = pd.concat([tr, extension], ignore_index=True)
    tr = add_feat(tr)
    tr = tr[6:]
    return run_reg(tr)

In [None]:
%%time
# Competition-provided module that serves the test data and collects the
# predictions.
import jpx_tokyo_market_prediction

env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Latest predicted relative change per SecuritiesCode; overwritten every batch.
trgts = {}
# Each iteration yields a 6-tuple; only the first item (prices) and the last
# (the submission template) are used.
for (prices, _, _, _, _, sample_prediction) in iter_test:
    cods = prices.SecuritiesCode.unique()
    # Grow the history with the new batch.
    # NOTE(review): `prices` is appended without going through prep_prices,
    # unlike the rows already in df_train — confirm this is intended.
    df_train = pd.concat([df_train, prices])
    df_train = df_train.sort_values(["SecuritiesCode", "Date"])
    # NOTE(review): this fill runs over the whole frame, so a gap in the
    # first rows of one security is filled from the previous security's last
    # row — confirm.
    df_train.ffill(inplace=True)
    for cod in tqdm(cods):
        trgts[cod] = proc_cod(cod, df_train)
    # Rows of the current batch date, with Target replaced by the prediction.
    tr = df_train[df_train.Date==prices.Date.iat[0]].copy()
    tr.Target=tr["SecuritiesCode"].map(trgts) 
    tr = add_rank(tr, "Target")
    # Target now holds the predictions themselves, so this printed value is
    # the spread of predicted (not realised) returns.
    score = calc_spread_return_per_day(tr,200,2)
    print(f"Score: {score}")
    # Copy the ranks into the submission template and submit the batch.
    pred = tr.set_index("SecuritiesCode")["Rank"]
    sample_prediction['Rank'] = sample_prediction["SecuritiesCode"].map(pred)
    env.predict(sample_prediction)