In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.figure_factory as ff
import decimal
from decimal import *

# Initialise Plotly's notebook mode before any figure is shown.
init_notebook_mode(connected=True)

# `temp` is passed as `template=` to every figure's update_layout call below;
# `colors` is the qualitative palette the plots index into.
_base_font = dict(family="Franklin Gothic", size=12)
temp = dict(layout=go.Layout(font=_base_font, width=800))
colors = px.colors.qualitative.Plotly

%matplotlib inline

Loading Dataset

In [None]:
# Daily price rows (Date parsed to datetime) and the per-security metadata table.
prices_csv = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
stock_list_csv = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv'

stock_prices = pd.read_csv(prices_csv, parse_dates=['Date'])
stock_list = pd.read_csv(stock_list_csv)
stock_prices.head()

In [None]:
# Number of securities per 'Section/Products' value, smallest count first.
section_counts = stock_list['Section/Products'].value_counts(ascending=True)
x = list(section_counts.index)
y = list(section_counts.values)

bar_marker = dict(color=colors[0], opacity=0.7,
                  line=dict(width=1, color=colors[0]))
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, marker=bar_marker))
fig.update_layout(template=temp, title='Section Value Counts', xaxis_title='Section')
fig.show()

In [None]:
# Number of securities per 'NewMarketSegment' value, smallest count first.
segment_counts = stock_list['NewMarketSegment'].value_counts(ascending=True)
x = list(segment_counts.index)
y = list(segment_counts.values)

bar_marker = dict(color=colors[0], opacity=0.7,
                  line=dict(width=1, color=colors[0]))
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=y, marker=bar_marker))
fig.update_layout(template=temp, title='New Market Value Counts', xaxis_title='Market')
fig.show()

In [None]:
# Histogram of the non-missing Target values, shown in percent with
# 1%-wide bins between -40% and 40%.
x_hist = stock_prices['Target'].dropna()

hist_marker = dict(color=colors[0], opacity=0.7,
                   line=dict(width=1, color=colors[0]))
fig = go.Figure()
fig.add_trace(go.Histogram(x=x_hist*100,
                           marker=hist_marker,
                           xbins=dict(start=-40, end=40, size=1)))
fig.update_layout(template=temp, title='Target Distribution',
                  xaxis=dict(title='Stock Return', ticksuffix='%'),
                  height=450)
fig.show()

In [None]:
train = stock_prices

train_date = train.Date.unique()

# Market-wide daily means of Target (in %), Close and Volume.
by_date = train.groupby('Date')
returns = by_date['Target'].mean().mul(100).rename('Average Return')
close_avg = by_date['Close'].mean().rename('Closing Price')
vol_avg = by_date['Volume'].mean().rename('Volume')

fig = make_subplots(rows=3, cols=1,
                    shared_xaxes=True)
for i, series in enumerate([returns, close_avg, vol_avg]):
    # Plot each series against its own (date-sorted) index rather than against
    # Date.unique(), whose order follows the raw rows: the two only coincide
    # when the input happens to be sorted by date.
    fig.add_trace(go.Scatter(x=series.index, y=series, mode='lines',
                             name=series.name, marker_color=colors[i]), row=i+1, col=1)

# Quick zoom presets, attached to the top subplot only.
range_buttons = [
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(count=2, label="2y", step="year", stepmode="backward"),
    dict(step="all"),
]
fig.update_xaxes(rangeslider_visible=False,
                 rangeselector=dict(buttons=range_buttons),
                 row=1, col=1)
fig.update_layout(template=temp, title='JPX Market Average Stock Return, Closing Price, and Shares Traded',
                  hovermode='x unified', height=700,
                  yaxis1=dict(title='Stock Return', ticksuffix='%'),
                  yaxis2_title='Closing Price', yaxis3_title='Shares Traded',
                  showlegend=False)
fig.show()

In [None]:
# Normalise sector and company names to "Capitalised lower-case" form.
# The .str accessor leaves missing values as NaN instead of raising on them.
stock_list['SectorName'] = stock_list['17SectorName'].str.rstrip().str.lower().str.capitalize()
stock_list['Name'] = stock_list['Name'].str.rstrip().str.lower().str.capitalize()
train_df = train.merge(stock_list[['SecuritiesCode','Name','SectorName']], on='SecuritiesCode', how='left')
train_df['Year'] = train_df['Date'].dt.year

# Mean Target (in %) per sector and calendar year: one column per year,
# newest year first, rows ordered by the newest year's return.
df = train_df.groupby(['SectorName', 'Year'])['Target'].mean().mul(100).unstack('Year')
df = df[sorted(df.columns, reverse=True)]
df.columns = ["Avg_return_{}".format(year) for year in df.columns]
df = df.sort_values(by=df.columns[0])

# One subplot per year present in the data (previously hard-coded to 5 columns
# and to the 2021 column for sorting).
fig = make_subplots(rows=1, cols=len(df.columns), shared_yaxes=True)
for i, col in enumerate(df.columns):
    x = df[col]
    mask = x<=0
    # Non-positive returns are drawn in red, positive ones in green.
    for selection, bar_color in ((mask, 'red'), (~mask, 'green')):
        fig.add_trace(go.Bar(x=x[selection], y=df.index[selection], orientation='h',
                             text=x[selection], texttemplate='%{text:.2f}%', textposition='auto',
                             hovertemplate='Average Return in %{y} Stocks = %{x:.4f}%',
                             marker=dict(color=bar_color, opacity=0.7), name=col[-4:]),
                      row=1, col=i+1)
    fig.update_xaxes(range=(x.min()-.15, x.max()+.15), title='{} Returns'.format(col[-4:]),
                     showticklabels=False, row=1, col=i+1)
fig.update_layout(template=temp, title='Yearly Average Stock Returns by Sector',
                  hovermode='closest', margin=dict(l=250, r=50),
                  height=600, width=1000, showlegend=False)
fig.show()

In [None]:
# 18 evenly spaced hues; each sector's box takes the hue at its position.
pal = ['hsl(' + str(hue) + ',50%' + ',50%)' for hue in np.linspace(0, 360, 18)]

fig = go.Figure()
# Walk the sectors in the reverse of the row order of `df`.
for i, sector in enumerate(df.index[::-1]):
    in_sector = train_df['SectorName'] == sector
    y_data = train_df[in_sector]['Target']
    box = go.Box(y=y_data*100, marker_color=pal[i], name=sector, showlegend=False)
    fig.add_trace(box)
fig.update_layout(template=temp, title='Target Distribution by Sector',
                  yaxis=dict(title='Stock Return', ticksuffix='%'),
                  margin=dict(b=150), height=750, width=900)
fig.show()

**Key Observations**
In 2021, nearly all industries saw a positive return on average, with the highest in Energy Resources at about 0.13% overall, while in 2018, all sectors saw a negative return except for Electric Power & Gas.
While most sectors have returns between -1% and 1%, there are quite a few outliers across all industries, with some returns as high as 62% in Commercial & Wholesale Trade and others as low as -31% in the IT & Services sector. The graph below shows the stock price movements within each sector.

In [None]:
# Candlestick chart of the daily mean Open/High/Low/Close for the whole market
# ('All') and for every sector, switchable through a dropdown.
ohlc_cols = ['Open', 'High', 'Low', 'Close']
sectors = train_df.SectorName.unique().tolist()
sectors.insert(0, 'All')
buttons = []

fig = go.Figure()
# Iterate over the sectors actually present instead of a hard-coded range(18).
for i, sector in enumerate(sectors):
    # Position 0 is the synthetic 'All' entry; every other entry is one sector.
    rows = train_df if i == 0 else train_df[train_df.SectorName == sector]
    ohlc = rows.groupby('Date')[ohlc_cols].mean()

    # x comes from the aggregated index so dates and prices always line up.
    fig.add_trace(go.Candlestick(x=ohlc.index, open=ohlc['Open'], high=ohlc['High'],
                                 low=ohlc['Low'], close=ohlc['Close'], name=sector,
                                 visible=(i == 0)))

    # Each dropdown entry shows exactly one trace: the one just added.
    visibility = [False]*len(sectors)
    visibility[i] = True
    buttons.append(dict(label=sector,
                        method="update",
                        args=[{"visible": visibility}]))

fig.update_xaxes(rangeslider_visible=True,
                 rangeselector=dict(
                     buttons=list([
                         dict(count=3, label="3m", step="month", stepmode="backward"),
                         dict(count=6, label="6m", step="month", stepmode="backward"),
                         dict(step="all")]), xanchor='left', yanchor='bottom', y=1.16, x=.01))
fig.update_layout(template=temp, title='Stock Price Movements by Sector',
                  hovermode='x unified', showlegend=False, width=1000,
                  updatemenus=[dict(active=0, type="dropdown",
                                    buttons=buttons, xanchor='left',
                                    yanchor='bottom', y=1.01, x=.01)],
                  yaxis=dict(title='Stock Price'))
fig.show()

In the candlestick charts above, the boxes represent the daily spread between the open and close prices and the lines represent the spread between the low and high prices. The color of the boxes indicates whether the close price was greater or lower than the open price, with green indicating a higher closing price on that day and red indicating a lower closing price. In late August, the market saw a consecutive 14-day period where the close price was greater than the open price.

In [None]:
def _top_bottom_returns(avg_returns, label, n=7):
    """Return the n highest then n lowest entries of `avg_returns` as a frame.

    Args:
        avg_returns (pd.Series): average return per stock, indexed by 'Name'.
        label: value written to the 'Sector' column of every returned row.
        n (int): number of stocks taken from each end.
    Returns:
        pd.DataFrame: columns Name, Return, Sector; highest returns first,
        followed by the lowest returns ordered from least to most negative.
    """
    highest = avg_returns.nlargest(n).rename("Return")
    lowest = avg_returns.nsmallest(n)[::-1].rename("Return")
    frame = pd.concat([highest, lowest], axis=0).reset_index()
    frame['Sector'] = label
    return frame


# Collect one frame for the whole market plus one per sector and concatenate
# once. DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# inside a loop copies it on every iteration.
frames = [_top_bottom_returns(train_df.groupby('Name')['Target'].mean().mul(100), 'All')]
for sector_name in train_df.SectorName.unique():
    sector_returns = (train_df[train_df.SectorName == sector_name]
                      .groupby('Name')['Target'].mean().mul(100))
    frames.append(_top_bottom_returns(sector_returns, sector_name))
stock = pd.concat(frames, ignore_index=True)

sector_labels = stock.Sector.unique()
fig = go.Figure()
buttons = []
for i, sector in enumerate(sector_labels):

    picks = stock[stock.Sector == sector]
    x = picks['Name']
    y = picks['Return']
    mask = y > 0
    # Two traces per sector: positive returns (green) then the rest (red).
    for selection, bar_color in ((mask, 'green'), (~mask, 'red')):
        fig.add_trace(go.Bar(x=x[selection], y=y[selection], text=y[selection],
                             texttemplate='%{text:.2f}%',
                             textposition='auto',
                             name=sector, visible=(i == 0),
                             hovertemplate='%{x} average return: %{y:.3f}%',
                             marker=dict(color=bar_color, opacity=0.7)))

    # The dropdown entry for this sector shows only its own pair of traces.
    visibility = [False]*2*len(sector_labels)
    visibility[i*2], visibility[i*2+1] = True, True
    buttons.append(dict(label=sector,
                        method="update",
                        args=[{"visible": visibility}]))

fig.update_layout(title='Stocks with Highest and Lowest Returns by Sector',
                  template=temp, yaxis=dict(title='Average Return', ticksuffix='%'),
                  updatemenus=[dict(active=0, type="dropdown",
                                    buttons=buttons, xanchor='left',
                                    yanchor='bottom', y=1.01, x=.01)],
                  margin=dict(b=150), showlegend=False, height=700, width=900)
fig.show()

In [None]:
# Show the stock_list rows whose (normalised) Name is one of these five stocks.
highest_performing_stocks = ['Enechange ltd.', 'Gmo financial gate,inc.', 'Macbee planet,inc.', 'Cellsource co.,ltd.', 'Nextone inc.']
stock_list[stock_list.Name.isin(highest_performing_stocks)]

In [None]:
# Closing prices of five selected securities, pivoted to one column per stock.
selected_codes = [4051, 4169, 4880, 7094, 7095]
stocks = train_df[train_df.SecuritiesCode.isin(selected_codes)]
df_pivot = stocks.pivot_table(index='Date', columns='Name', values='Close').reset_index()

# One 'coolwarm' colour per row of the pivot (i.e. per date).
pal = ['rgb' + str(rgb) for rgb in sns.color_palette("coolwarm", len(df_pivot))]

# Column 0 is Date; the scatter matrix is built from the price columns only.
close_by_stock = df_pivot.iloc[:, 1:]
fig = ff.create_scatterplotmatrix(close_by_stock, diag='histogram', name='')
marker_style = dict(color=pal, opacity=0.9, line_color='white', line_width=.5)
fig.update_traces(marker=marker_style)
fig.update_layout(template=temp, title='Scatterplots of Highest Performing Stocks',
                  height=1000, width=1000, showlegend=False)
fig.show()

In [None]:
# Per-security correlation between Target and Close. The column is selected by
# label so the result does not depend on how unstack() orders the columns.
corr = train_df.groupby('SecuritiesCode')[['Target','Close']].corr().unstack()[('Target', 'Close')]
stocks = corr.nlargest(10).rename("Return").reset_index()

# Attach names through a de-duplicated code -> name lookup, rather than joining
# the ten rows against every price row and dropping duplicates afterwards.
code_names = train_df[['Name','SecuritiesCode']].drop_duplicates()
stocks = stocks.merge(code_names, on='SecuritiesCode')

pal = sns.color_palette("magma_r", 14).as_hex()
rgb = ['rgba'+str(matplotlib.colors.to_rgba(i, 0.7)) for i in pal]

fig = go.Figure()
fig.add_trace(go.Bar(x=stocks.Name, y=stocks.Return, text=stocks.Return,
                     texttemplate='%{text:.2f}', name='', width=0.8,
                     textposition='outside', marker=dict(color=rgb, line=dict(color=pal, width=1)),
                     hovertemplate='Correlation of %{x} with target = %{y:.3f}'))
fig.update_layout(template=temp, title='Most Correlated Stocks with Target Variable',
                  yaxis=dict(title='Correlation', showticklabels=False),
                  xaxis=dict(title='Stock', tickangle=45), margin=dict(b=100),
                  width=800, height=500)
fig.show()

In [None]:
df_pivot = train_df.pivot_table(index='Date', columns='SectorName', values='Close').reset_index()
# Correlate the sector columns only. Older pandas silently ignored the datetime
# 'Date' column in corr(); pandas >= 2.0 no longer drops non-numeric columns by
# default, so it is removed explicitly here.
corr = df_pivot.drop(columns='Date').corr().round(2)

# Strictly-lower triangle as a ragged list: row r keeps its first r entries
# (the pairs not yet covered by an earlier row), and row 0 is skipped.
c = [corr.iloc[r, :r].tolist() for r in range(1, len(corr))]

cor = c[::-1]
x = corr.index.tolist()[:-1]
y = corr.columns.tolist()[1:][::-1]
fig = ff.create_annotated_heatmap(z=cor, x=x, y=y,
                                  hovertemplate='Correlation between %{x} and %{y} stocks = %{z}',
                                  colorscale='viridis', name='')
fig.update_layout(template=temp, title='Stock Correlation between Sectors',
                  margin=dict(l=250, t=270), height=800, width=900,
                  yaxis=dict(showgrid=False, autorange='reversed'),
                  xaxis=dict(showgrid=False))
fig.show()

# Feature Engineering

In [None]:
def adjust_price(price):
    """
    Add CumulativeAdjustmentFactor and AdjustedClose columns to a price table.

    For every SecuritiesCode the AdjustmentFactor is accumulated from the newest
    date backwards, multiplied with Close and rounded half-up to one decimal.
    Adjusted closes equal to 0 are treated as missing and forward-filled within
    the same security.

    Args:
        price (pd.DataFrame)  : stock prices with Date, SecuritiesCode, Close
            and AdjustmentFactor columns. The input frame is not modified.
    Returns:
        price DataFrame (pd.DataFrame): copy sorted by SecuritiesCode and Date,
            with a fresh 0..n-1 index, Date as datetime and the two new columns.
    """
    # Work on a copy: assigning through .loc on the argument would change the
    # caller's frame as a side effect.
    price = price.copy()
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    # Newest row first within each security, so that the grouped cumulative
    # product accumulates adjustment factors backwards in time.
    price = price.sort_values(["SecuritiesCode", "Date"], ascending=[True, False])
    price["CumulativeAdjustmentFactor"] = (
        price.groupby("SecuritiesCode")["AdjustmentFactor"].cumprod()
    )
    # Round via Decimal to get half-up rounding at one decimal place.
    price["AdjustedClose"] = (
        price["CumulativeAdjustmentFactor"] * price["Close"]
    ).map(lambda x: float(
        Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
    ))

    # Back to chronological order, then fill 0 prices from the previous value
    # of the same security.
    price = price.sort_values(["SecuritiesCode", "Date"])
    price.loc[price["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
    price["AdjustedClose"] = price.groupby("SecuritiesCode")["AdjustedClose"].ffill()
    return price.reset_index(drop=True)

# Drop ExpectedDividend and zero-fill the remaining missing values before
# computing adjusted prices. errors='ignore' keeps this cell re-runnable: on a
# second run `train` no longer has the column and a plain drop would raise.
train = train.drop(columns='ExpectedDividend', errors='ignore').fillna(0)
prices = adjust_price(train)
prices.head()

In [None]:
def create_features(df, col='AdjustedClose', periods=(5, 10, 20, 30, 50)):
    """
    Add per-security return, moving-average and volatility features.

    For every period p three columns are added, each computed separately within
    each SecuritiesCode:
      * Return_{p}Day     - percentage change of `col` over p rows
      * MovingAvg_{p}Day  - rolling mean of `col` over p rows
      * Volatility_{p}Day - rolling standard deviation over p rows of the
                            row-to-row log change of `col`

    Results are aligned on the frame's index, so rows do not need to be sorted
    or grouped by SecuritiesCode beforehand.

    Args:
        df (pd.DataFrame): must contain SecuritiesCode and `col`; not modified.
        col (str): price column the features are derived from.
        periods (iterable of int): window lengths, in rows.
    Returns:
        pd.DataFrame: copy of `df` with the feature columns appended.
    """
    df = df.copy()
    codes = df["SecuritiesCode"]
    price_by_code = df[col].groupby(codes)
    # Log change between consecutive rows of the same security, grouped again
    # so that the rolling window below never spans two securities.
    log_change_by_code = np.log(df[col]).groupby(codes).diff().groupby(codes)

    for period in periods:
        df["Return_{}Day".format(period)] = price_by_code.pct_change(period)
        # transform() returns values indexed like df, unlike `.values` on a
        # grouped rolling result, which is ordered group by group.
        df["MovingAvg_{}Day".format(period)] = price_by_code.transform(
            lambda s: s.rolling(window=period).mean())
        df["Volatility_{}Day".format(period)] = log_change_by_code.transform(
            lambda s: s.rolling(period).std())
    return df

def FE(stock_price_df):
    """
    Add single-row price features and weekday indicators to a price table.

    The columns are written onto `stock_price_df` itself (the frame is modified
    in place) and the same object is returned. Date is converted to datetime.
    Ratios divided by High - Low are undefined on rows where High equals Low.

    Args:
        stock_price_df (pd.DataFrame): needs Date, Open, High, Low, Close, Volume.
    Returns:
        pd.DataFrame: the input frame with the added columns.
    """
    frame = stock_price_df
    open_, high, low, close = frame['Open'], frame['High'], frame['Low'], frame['Close']
    hl_range = high - low

    frame['BOP'] = (close - open_) / hl_range
    frame['wp'] = (open_ + high + low) / 3
    frame['TR'] = high - low
    frame['OC'] = open_ * close
    frame['HL'] = high * low
    frame['logC'] = np.log(close + 1)

    # Row-wise dispersion statistics over the four prices.
    ohlc = frame[['Open', 'Close', 'High', 'Low']]
    frame['OHLCstd'] = ohlc.std(axis=1)
    frame['OHLCskew'] = ohlc.skew(axis=1)
    frame['OHLCkur'] = ohlc.kurtosis(axis=1)

    # Position of Close / Open inside the day's range, centred on 0.
    frame['Cpos'] = (close - low) / hl_range - 0.5
    frame['bsforce'] = frame['Cpos'] * frame['Volume']
    frame['Opos'] = (open_ - low) / hl_range - 0.5

    # weekday is 1 for Monday ... 5 for Friday, plus one 0/1 column per day.
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame['weekday'] = frame['Date'].dt.weekday + 1
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
    for day_number, day_name in enumerate(day_names, start=1):
        frame[day_name] = np.where(frame['weekday'] == day_number, 1, 0)
    return frame

# Row-level features first (FE writes them onto `prices`), then the rolling
# per-security features, then drop the columns listed below.
unused_columns = ['RowId','SupervisionFlag','AdjustmentFactor','CumulativeAdjustmentFactor','Close']
price_features = create_features(df=FE(prices))
price_features = price_features.drop(unused_columns, axis=1)

In [None]:
# Sector-level view of the engineered features from 2020-12-29 onwards, with a
# dropdown that switches between sectors.
price_names = (price_features
               .merge(stock_list[['SecuritiesCode','Name','SectorName']], on='SecuritiesCode')
               .set_index('Date'))
price_names = price_names[price_names.index>='2020-12-29'].fillna(0)

features = ['MovingAvg', 'Return', 'Volatility']
names = ['Average', 'Period', 'Volatility']
dash = ['solid', 'dash', 'longdash', 'dashdot', 'longdashdot']
sector_names = price_names.SectorName.unique()

fig = make_subplots(rows=2, cols=2,
                    shared_xaxes=True,
                    vertical_spacing=0.1,
                    subplot_titles=('Adjusted Close Moving Average',
                                    'Stock Return', 'Stock Volatility'))

# Half-open [start, stop) range of trace positions added for each sector. The
# dropdown is built from these counts instead of assuming a fixed number of
# traces per sector, so each button toggles exactly its own sector's traces.
trace_ranges = []
n_traces = 0
for i, sector in enumerate(sector_names):

    sector_df = price_names[price_names.SectorName==sector]
    daily = sector_df.groupby(sector_df.index)
    first_trace = n_traces

    for j, (feature, name) in enumerate(zip(features, names)):
        # The first two features share row 1 and include the raw adjusted close
        # (period 0); the third goes on row 2. A local palette name is used so
        # the notebook-wide `colors` is left untouched.
        if j < 2:
            row, periods, palette = 1, [0, 10, 30, 50], px.colors.qualitative.Vivid
        else:
            row, periods, palette = 2, [10, 30, 50], px.colors.qualitative.Bold[1:]
        col = 1 if j % 2 == 0 else 2

        for k, period in enumerate(periods):
            if k == 0 and j < 2:
                plot_data = daily['AdjustedClose'].mean().rename('Adjusted Close')
            else:
                plot_data = daily['{}_{}Day'.format(feature, period)].mean()
                if j >= 2:
                    # Third feature is shown in percent.
                    plot_data = plot_data.mul(100)
                plot_data = plot_data.rename('{}-day {}'.format(period, name))
            fig.add_trace(go.Scatter(x=plot_data.index, y=plot_data, mode='lines',
                                     name=plot_data.name, marker_color=palette[k+1],
                                     line=dict(width=2, dash=(dash[k] if j<2 else 'solid')),
                                     showlegend=(j == 0 or j == 2), legendgroup=str(row),
                                     visible=(i == 0)), row=row, col=col)
            n_traces += 1

    trace_ranges.append((first_trace, n_traces))

buttons = []
for sector, (start, stop) in zip(sector_names, trace_ranges):
    visibility = [start <= position < stop for position in range(n_traces)]
    buttons.append(dict(label=sector,
                        method="update",
                        args=[{"visible": visibility}]))

fig.update_layout(title='Stock Price Moving Average, Return,<br>and Volatility by Sector',
                  template=temp, yaxis3_ticksuffix='%', yaxis4_ticksuffix='%',
                  legend_title_text='Period', legend_tracegroupgap=250,
                  updatemenus=[dict(active=0, type="dropdown",
                                    buttons=buttons, xanchor='left',
                                    yanchor='bottom', y=1.105, x=.01)],
                  hovermode='x unified', height=800, width=1200, margin=dict(t=150))
fig.show()