In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import os
import matplotlib.pyplot as plt
from pprint import pprint
import seaborn as sns
from matplotlib import cm
from IPython.core.display import display, HTML
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import jpx_tokyo_market_prediction
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(15.7,6)})
import warnings
warnings.filterwarnings("ignore")
def getadvance(x):
    """Flag an advance: return 1 when ``x`` is strictly positive, otherwise 0.

    Zero, negative values and NaN (for which ``x > 0`` is false) all give 0.
    """
    return 1 if x > 0 else 0

def get_month(dt):
    """Return the zero-padded month of ``dt`` as a two-character string.

    ``dt`` only needs to provide ``strftime`` (e.g. ``datetime.date``).
    """
    return dt.strftime("%m")

def RSI(series, period):
    """Compute the Relative Strength Index of ``series``.

    The step-to-step changes are split into gains and (positive) losses.
    For each of the two, the value at position ``period - 1`` is replaced by
    the plain mean of the first ``period`` values, everything before it is
    dropped, and the remainder is smoothed with an exponentially weighted
    mean (``com=period - 1``, ``adjust=False``).

    Returns a Series of ``100 - 100 / (1 + avg_gain / avg_loss)`` whose index
    is the tail of the index of ``series.diff().dropna()``.
    """
    change = series.diff().dropna()
    zeros = change * 0

    # Gains keep the positive changes, losses keep the negated negative ones;
    # every other position holds the corresponding entry of ``zeros``.
    gains = change.where(change > 0, zeros)
    losses = (-change).where(change < 0, zeros)

    def _seed_and_trim(moves):
        # Seed the smoothing with the simple average of the first `period`
        # values, then discard the entries that precede the seed.
        moves[moves.index[period - 1]] = np.mean(moves[:period])
        return moves.drop(moves.index[:(period - 1)])

    avg_gain = _seed_and_trim(gains).ewm(com=period - 1, adjust=False).mean()
    avg_loss = _seed_and_trim(losses).ewm(com=period - 1, adjust=False).mean()
    rs = avg_gain / avg_loss
    return 100 - 100 / (1 + rs)

def rsi_class(x):
    """Bucket an RSI value into ``"hi"`` (> 70), ``"med"`` (> 50) or ``"low"``.

    Values of exactly 50 or below, and NaN, fall into ``"low"``.
    """
    if x > 70:
        return "hi"
    if x > 50:
        return "med"
    return "low"
#os.listdir('../input/jpx-tokyo-stock-exchange-prediction/train_files/')
def display_dataframe(df, title = ""):
    """Render ``df`` as a hand-built HTML table in the notebook output.

    A bold header row is produced from the column names, followed by one
    table row per DataFrame row (the index is not shown). When ``title`` is
    non-empty it is emitted as an ``<h2>`` heading above the table. Values
    are interpolated as-is, without HTML escaping.
    """
    pieces = [f'<h2>{title}</h2><table><tr>' if title != "" else '<table><tr>']
    pieces.extend(
        f'<td style="text-align: left; vertical-align: middle; font-size:1.2em;"><b>{col}</b></td>'
        for col in df.columns.values
    )
    pieces.append('</tr>')
    for row in df.itertuples():
        # row[0] is the index; only the column values become cells.
        cells = ''.join(
            f'<td style="text-align: left; vertical-align: middle; font-size:1.1em;">{v}</td>'
            for v in row[1:]
        )
        pieces.append('<tr>' + cells + '</tr>')
    pieces.append('</table>')
    display(HTML(''.join(pieces)))
    
def prep_prices(price, test = False):
    """Return split-adjusted prices for every security in ``price``.

    Per security, rows are ordered newest-first and the running product of
    ``AdjustmentFactor`` (stored as ``CumAdjust``) is applied: ``Open``,
    ``High``, ``Low`` and ``Close`` are multiplied by it and rounded half-up
    to one decimal, ``Volume`` is divided by it. Remaining gaps are forward-
    then backward-filled while the rows are still in newest-first order.

    Parameters
    ----------
    price : DataFrame with at least ``RowId``, ``Date``, ``SecuritiesCode``,
        ``Open``, ``High``, ``Low``, ``Close``, ``Volume``,
        ``AdjustmentFactor`` and ``ExpectedDividend`` (plus ``Target`` when
        ``test`` is false). The argument itself is not modified.
    test : when false, any ``Target`` still missing after the fill is set
        to 0; when true, ``Target`` is not touched.

    Returns
    -------
    A new DataFrame sorted by ``RowId`` with the extra ``CumAdjust`` column
    (an empty input is returned without it).
    """
    from decimal import ROUND_HALF_UP, Decimal

    pcols = ["Open", "High", "Low", "Close"]
    one_decimal = Decimal('0.1')

    def qround(x):
        # Go through str() so the Decimal sees the printed value of the float.
        return float(Decimal(str(x)).quantize(one_decimal, rounding=ROUND_HALF_UP))

    def adjust_prices(df):
        # sort_values returns a new frame, so the assignments below never
        # write into the caller's data.
        df = df.sort_values("Date", ascending=False)
        df["CumAdjust"] = df["AdjustmentFactor"].cumprod()

        # generate adjusted prices
        for p in pcols:
            df[p] = (df["CumAdjust"] * df[p]).apply(qround)
        df["Volume"] = df["Volume"] / df["CumAdjust"]
        df = df.ffill().bfill()

        # Targets that are still missing (whole group empty) become 0.
        if not test:
            df["Target"] = df["Target"].fillna(0)
        return df

    price = price.sort_values(["SecuritiesCode", "Date"])
    # Plain assignment instead of a chained ``inplace`` fillna, which does
    # nothing under pandas Copy-on-Write.
    price["ExpectedDividend"] = price["ExpectedDividend"].fillna(0)

    # Iterate the groups explicitly: unlike ``groupby.apply`` this always
    # keeps the ``SecuritiesCode`` column in each piece.
    adjusted = [adjust_prices(group) for _, group in price.groupby("SecuritiesCode")]
    if not adjusted:
        return price
    price = pd.concat(adjusted).reset_index(drop=True)
    return price.sort_values("RowId")


def getdata(path='../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'):
    """Load the stock price CSV and return it with adjusted prices.

    Parameters
    ----------
    path : location of the CSV to read; defaults to the competition's
        training ``stock_prices.csv``.

    Returns
    -------
    The output of ``prep_prices`` with ``Date`` converted to datetime and an
    extra integer column ``dtint`` holding the date as ``YYYYMMDD``.
    """
    df = pd.read_csv(path, parse_dates=True)
    df = prep_prices(df)
    df['Date'] = pd.to_datetime(df['Date'], format = "%Y-%m-%d")
    df['dtint'] = df['Date'].dt.strftime("%Y%m%d").astype(int)
    return df

Hello there, welcome to the analysis of Japan Stock Exchange data. I have tried to identify features that can be useful while making a model to predict prices. Hope you like this.

**Most Recent Update**: Bollinger Bands analysis added.

## About the Data

In the training files, 12 columns (variables) are provided where Target column is the one which needs to be the output column. Other important columns are "Open", "High", "Low" and "Close", where "Close" or Closing Price is an important variable.
Volume is another important column which is analyzed in the notebook and we are also going to examine the effect of RSI on stock price movement. Data is provided for 2000 securities and 2.33 million observations are provided.

**Adjusting Prices**

The `AdjustmentFactor` variable is important because prices sometimes drop sharply as the result of a stock split or switch. For this reason the prices need to be adjusted; this adjustment is built into the `getdata` function.

In [None]:
# Load the adjusted price data once; the cells below all read from `df`.
df = getdata()

# Summary rows shown in the overview table. The last two values are fixed
# strings, not computed from `df`.
lst_data = [
    {"Item": "No. of Securities", "Value": len(df['SecuritiesCode'].unique())},
    {"Item": "No. of Observations (in mils.)", "Value": (df.shape[0]/1000000)},
    {"Item": "No. of variables (columns)", "Value": len(df.columns)},
    {"Item": "Data availability for", "Value": "1202 Days"},
    {"Item": "No. of Securities with full availability", "Value": "1865"},
]
df_display = pd.DataFrame(data = lst_data)
display_dataframe(df_display, "Basic Statistics of Stock Data")

> Following is the distribution of stock with availability of data for number of days.

In [None]:
# Number of dated rows available for each security.
df_counts = df.groupby(['SecuritiesCode']).agg(count = ("Date", "count")).reset_index()
df_counts.columns = ["Securities", "Count"]

# value_counts() puts the distinct day counts in the index and the number of
# securities sharing each count in the values, so name them in that order
# (the previous labels were swapped).
df_summ = (
    df_counts['Count']
    .value_counts()
    .rename_axis("Data available in days")
    .reset_index(name="Number of securities")
)
df_summ = df_summ[["Number of securities", "Data available in days"]]
display(HTML(df_summ.head(5).to_html(index = False)))

## Exploring `Target` variable

- There appears to be a good distribution of Target variable, where we have outliers at both ends.
- Distribution of Target variable appears to be normal, with mean close to zero.

In [None]:
# Scatter every Target value in ascending order to expose both tails.
sns.set(rc={'figure.figsize':(14.7,6)})
sns.set_style("whitegrid")
sorted_targets = np.sort(df['Target'].values)
plt.figure(figsize=(8,6))
plt.scatter(range(len(sorted_targets)), sorted_targets)
plt.xlabel('index', fontsize=12)
plt.ylabel('Target', fontsize=12)
plt.show()

In [None]:
# Histogram of Target restricted to values strictly between the 1st and
# 99th percentiles.
sns.set(rc={'figure.figsize':(16.7,8)})
sns.set_style("whitegrid")
llimit, ulimit = np.percentile(df.Target.values, [1, 99])
inside_limits = (df['Target'] > llimit) & (df['Target'] < ulimit)
df_plot = df[inside_limits]
sns.distplot(df_plot.Target.values, bins=10, kde=False);

## Basic Exploration for five securities

Looking at Close values of some of the stocks. It is observed that sometimes all the stocks fall together but quantum of downside varies. But it is evident that with time, all stocks behave differently.

In [None]:
# Basic exploration: Close over time for five hand-picked securities.
seclist = [1301, 1332, 1333, 1376, 1377]
df_c = df.copy()
df_stocks = df_c.loc[df_c['SecuritiesCode'].isin(seclist)]
sns.set_style("whitegrid")
ax = sns.lineplot(data=df_stocks, x='Date', y='Close', hue='SecuritiesCode');
# Re-apply the current tick label texts as fixed labels.
# NOTE(review): before the figure is drawn these texts are usually empty,
# so this presumably blanks the date labels — confirm that is intended.
labels = [tick.get_text() for tick in ax.get_xticklabels()]
ax.set_xticklabels(labels)
plt.legend(loc='upper left')
plt.xlabel("time");
plt.show()

## Advance to Decline Ratio

The advance-to-decline ratio is defined as the number of periods in which a stock advanced divided by the number of periods in which it declined. It is examined here to identify whether there is a trend, for example whether stocks generally tend to decline or advance in a particular month.

**Observation**: There does not appear to be any evidence that stock advancement is tied to a particular month.

In [None]:
# Advance-to-decline ratio per calendar month, one bar chart per security.
sns.set(rc={'figure.figsize':(14.7,3)})
sns.set_style("whitegrid")
seclist = [1301, 1332, 1333, 1376, 1377]
for SECURITY in seclist:
    # Filtering already yields a new frame, so no copy of the full data set
    # is needed on every iteration.
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index()
    df_stock = df_stock.sort_values(by = "Date", ascending = False)
    # Rows are newest-first, so shift(-1) fetches the previous day's close.
    df_stock['pClose'] = df_stock['Close'].shift(-1)
    df_stock['delta'] = df_stock['Close'] - df_stock['pClose']
    df_stock['advance'] = (df_stock['delta'] > 0).astype(int)
    # Count declines explicitly: unchanged days and the oldest row (whose
    # delta is NaN) are neither advances nor declines.
    df_stock['decline'] = (df_stock['delta'] < 0).astype(int)
    df_stock['Date'] = pd.to_datetime(df_stock['Date'], format = "%Y-%m-%d")
    df_stock['Month'] = df_stock['Date'].dt.strftime("%m")
    df_stats = df_stock.groupby(["Month"]).agg(
        advances = ("advance", "sum"),
        declines = ("decline", "sum"),
        total = ("advance", "count"),
    ).reset_index()
    # A month without any decline has no defined ratio; leave it as NaN
    # instead of dividing by zero.
    df_stats['advance_to_decline'] = df_stats['advances'] / df_stats['declines'].replace(0, np.nan)
    plt.title(f"Examining advance to decline ratio for:{SECURITY}")
    ax  = sns.barplot(x="Month", y="advance_to_decline", data=df_stats, palette="Blues_d")
    plt.show()

## Exploring and finding features

- The next step is to explore features which could be helpful in predictive modelling. Here, we are looking at two variables, Volume and RSI. While the Volume is provided, RSI is a metric which has been found useful while studying movement of stock prices and is calculated.
- Here, we see that stock could move either way (up or down) when volumes are high, this indicates heavy buying and heavy selling. This indicates that with high volumes, movement of stock is imminent, but the direction is not known.
- On the other hand, an increasing RSI represents bullishness in a stock, while a decreasing RSI appears to push the prices down.

In [None]:
# Close, Volume and 14-period RSI of one security in three stacked panels.
sns.set_style("whitegrid")
SECURITY = 1301
df_stock = (
    df[df['SecuritiesCode'] == SECURITY]
    .reset_index(drop = True)
    .sort_values(by = "Date")
    .reset_index()
)
df_stock['rsi'] = RSI(df_stock['Close'], 14)
df_stock['rsicat'] = list(map(rsi_class, df_stock['rsi']))
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (12,8))
fig.subplots_adjust(hspace=0.5)
for panel, column in ((ax1, 'Close'), (ax2, 'Volume'), (ax3, 'rsi')):
    panel.plot(df_stock.index, df_stock[column]);
    # Re-apply the panel's current tick label texts as fixed labels.
    labels = [item.get_text() for item in panel.get_xticklabels()]
    panel.set_xticklabels(labels)
plt.suptitle(f"Comparing Price Close, Volume and RSI for security:{SECURITY}")
plt.tight_layout()

In [None]:
# Close, Volume and 14-period RSI of one security in three stacked panels.
sns.set_style("whitegrid")
SECURITY = 1332
df_stock = (
    df[df['SecuritiesCode'] == SECURITY]
    .reset_index(drop = True)
    .sort_values(by = "Date")
    .reset_index()
)
df_stock['rsi'] = RSI(df_stock['Close'], 14)
df_stock['rsicat'] = list(map(rsi_class, df_stock['rsi']))
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (12,8))
fig.subplots_adjust(hspace=0.5)
for panel, column in ((ax1, 'Close'), (ax2, 'Volume'), (ax3, 'rsi')):
    panel.plot(df_stock.index, df_stock[column]);
    # Re-apply the panel's current tick label texts as fixed labels.
    labels = [item.get_text() for item in panel.get_xticklabels()]
    panel.set_xticklabels(labels)
plt.suptitle(f"Comparing Price Close, Volume and RSI for security:{SECURITY}")
plt.tight_layout()

## Closer look at RSI

Since RSI appears to correlate better to Stock Prices, here is another way of analyzing this. The RSI values are categorized in Low, Medium and High category. Looking at the RSI categories, it appears that when RSI is in medium range, the price movement is most favourable. RSI in higher range is sometimes an indication of impending correction. 

This looks promising and a strategy can be developed to remain invested in a stock when RSI values are in medium range. Or to be precise - **Do not buy a stock when RSI is in Low category.**

**Note**: This phenomenon is quite pronounced for Security code 1377.

In [None]:
# Close price per security, coloured by the RSI bucket of each day.
sns.set(rc={'figure.figsize':(14.7,4)})
sns.set_style("whitegrid")
seclist = [1301, 1332, 1333, 1376, 1377]
for SECURITY in seclist:
    selected = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = selected.sort_values(by = "Date").reset_index()
    df_stock['rsi'] = RSI(df_stock['Close'], 14)
    df_stock['rsicat'] = [rsi_class(value) for value in df_stock['rsi']]
    plt.title(f"Examining RSI on movement of Price for:{SECURITY}")
    ax = sns.scatterplot(x = df_stock.index, y = df_stock["Close"], hue = df_stock["rsicat"]);
    plt.show()

## RSI distribution

A look at how RSI is distributed can also be helpful to understand this variable. This is heartening to see that RSI distribution is similar to the securities which are examined. This appears to be a variable which can be useful in model creation.

In [None]:
# Box plot of the 14-period RSI for each of the selected securities.
# sns.set() is called before set_style(): called the other way round,
# sns.set() resets the theme and the whitegrid style is lost.
sns.set(rc={'figure.figsize':(14.7,4)})
sns.set_style("whitegrid")
seclist = [1301, 1332, 1333, 1376, 1377]
frames = []
for SECURITY in seclist:
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = df_stock.sort_values(by = "Date").reset_index()
    df_stock['rsi'] = RSI( df_stock['Close'], 14 )
    df_stock['rsicat'] = list(map(rsi_class, df_stock['rsi']))
    frames.append(df_stock)

# DataFrame.append was removed in pandas 2.0; collect the pieces and
# concatenate them once instead.
df_all = pd.concat(frames)

sns.boxplot(x="SecuritiesCode", y="rsi", data=df_all)
plt.ylabel('RSI', fontsize=12)
plt.xlabel('Security Code', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Distribution of RSI", fontsize=15)
plt.show()

## Exploring more of RSI

Since RSI appears to be a good indicator, trying to see if this can help in predicting price increase. Here I am trying to see which RSI period helps in predicting the Target variable. This can be a feature for building models. This is done using calculating correlation between `Target` variable and RSI for various RSI periods between 5 to 20.

The results show that RSI period 5 has got best correlation followed by 19. This will be used as a feature for Machine Learning models.

**Note: The code that generates this statistic is commented out to save processing time. Interested readers can uncomment it.**

## Return Analysis

Return analysis shows how much a unit investment is worth. This is a useful feature because it tells investors how much their investment is worth today.

This helps in comparing the returns on a Normalized scale, since stock prices of various stocks vary, it is difficult to compare them. Calculating a return index gives a more holistic comparison of various securities.

In [None]:
def get_return_df(SECURITY):
    """Build the cumulative return index of one security from the global ``df``.

    The rows of ``SECURITY`` are ordered by ``Date``; the return index is the
    running product of ``1 + Close.pct_change()`` with the first row set to 1.

    Returns a DataFrame with the columns ``Date``, ``return_index`` and
    ``SecuritiesCode``.
    """
    rows = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    rows = rows.sort_values(by = "Date").reset_index()
    growth = 1 + rows['Close'].pct_change()
    ret_index = growth.cumprod()
    # The first day has no previous close, so anchor the index at 1 there.
    ret_index.loc[0] = 1
    rows['return_index'] = ret_index
    return rows[['Date', 'return_index', 'SecuritiesCode']]
seclist = [1301, 1332, 1333, 1376, 1377]

# DataFrame.append was removed in pandas 2.0; build all pieces, then
# concatenate once.
df_stocks = pd.concat([get_return_df(SECURITY) for SECURITY in seclist])

# The seaborn style sheets were renamed in newer Matplotlib releases; use
# whichever whitegrid name this installation provides.
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
     if name in plt.style.available),
    "default",
)

with plt.style.context(whitegrid_style):
    groups = df_stocks.groupby('SecuritiesCode')
    for name, group in groups:
        plt.plot(group.index
                 , group.return_index
                 , marker='*'
                 , linestyle='-'
                 , markersize=1
                 , label=name)

    plt.title("Comparing returns of Securities")
    plt.legend();

## Analyzing SMA (Simple Moving Averages) and EMA (Exponential Moving Averages)

SMA and EMA are well-known indicators for tracking prices and making decisions based on them. These methods help in identifying trends in stock prices. As the names suggest, an SMA is just the plain average over a period, whereas an EMA attaches weights to the calculation and is more sensitive to recent price movements.

In [None]:
# Close together with its 20/50/100-day simple moving averages.
sns.set(rc={'figure.figsize':(14.7,4)})
sns.set_style("whitegrid")
seclist = [1301, 1332, 1333, 1376, 1377]
for SECURITY in seclist:
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = df_stock.sort_values(by = "Date").reset_index()
    # Keep the result of set_index so the x-axis shows dates; the previous
    # code discarded it and plotted against a plain row counter.
    df_s = df_stock[['Date', 'Close']].set_index('Date')
    for window in (20, 50, 100):
        df_s[f'{window}D-SMA'] = df_s['Close'].rolling(window=window).mean()
    df_s.plot(title = "SMA analysis for Security Code:" + str(SECURITY));

In [None]:
# Close together with its 20/50/100-day exponential moving averages.
sns.set(rc={'figure.figsize':(14.7,4)})
sns.set_style("whitegrid")
seclist = [1301, 1332, 1333, 1376, 1377]
for SECURITY in seclist:
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = df_stock.sort_values(by = "Date").reset_index()
    # Keep the result of set_index so the x-axis shows dates; the previous
    # code discarded it and plotted against a plain row counter.
    df_s = df_stock[['Date', 'Close']].set_index('Date')
    for span in (20, 50, 100):
        df_s[f'{span}D-EMA'] = df_s['Close'].ewm(span=span, adjust=False).mean()
    df_s.plot(title = "EMA analysis for Security Code:" + str(SECURITY));

We have tried to see the impact 20, 50 and 100 days of SMA and EMA on Stock Prices. Some interesting observations are here.

- 50D EMA looks most promising.
- Convergence of various EMAs and SMAs are interesting points. When a stock closes above converging EMAs/SMAs, more often than not prices move upwards. Similar phenomenon is observed when they close below.

## Bollinger Bands

- Bollinger Bands are another useful feature when designing strategies.
- They are an indicator of volatility.
- These can be used to draw a support curve and a resistance curve.
- Share values remain within the curve most of the time (likely 95%).
- This is done based on Moving Average (SMA) for 20 sessions.
- We see that this behaviour is confirmed by the stocks chosen below. Hitting the support or the resistance curve often indicates that a trend reversal is coming.

In [None]:
# Bollinger-style bands: 20-session mean of Close plus/minus 1.96 rolling
# standard deviations.
sns.set(rc={'figure.figsize':(14.7,4)})
sns.set_style("whitegrid")
seclist = [1301, 1332, 1333, 1376, 1377]
for SECURITY in seclist:
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = df_stock.sort_values(by = "Date").reset_index()
    # Keep the result of set_index so the x-axis shows dates; the previous
    # code discarded it and plotted against a plain row counter.
    df_s = df_stock[['Date', 'Close']].set_index('Date')
    rolling_close = df_s['Close'].rolling(window=20)
    band_width = 1.96 * rolling_close.std()
    df_s['Middle Band'] = rolling_close.mean()
    df_s['Upper Band'] = df_s['Middle Band'] + band_width
    df_s['Lower Band'] = df_s['Middle Band'] - band_width
    df_s.plot(title = "Bollinger Bands for Security Code:" + str(SECURITY));

## Comparing daily returns of few securities (Volatility)

It is interesting to see how daily returns of various stocks fare with each other. Distributions of some stocks daily returns are plotted below. 

-  Max and Min range of daily returns are around -30 to +30%.
-  One stock (1435) is more volatile than the others.
- It can also be seen that stocks that rise more also tend to fall more.

In [None]:
# Distribution of one-day returns for the first 20 securities in `df`.
securities = list(df['SecuritiesCode'].unique())[0:20]
# Work on an explicit copy so the assignment below does not write into a
# slice of `df`.
df_secs = df[df['SecuritiesCode'].isin(securities)].copy()
df_secs['Date'] = pd.to_datetime(df_secs['Date'], format = "%Y-%m-%d")

frames = []
for SECURITY in securities:
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = df_stock.sort_values(by = "Date", ascending = False).reset_index()
    # Rows are newest-first, so shift(-1) fetches the previous day's close.
    df_stock['pClose'] = df_stock['Close'].shift(-1)
    df_stock['1DayReturn'] =  (df_stock['Close'] / df_stock['pClose']) - 1
    frames.append(df_stock)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
df_final = pd.concat(frames) if frames else pd.DataFrame()

# sns.set() first, then the style: the reverse order resets the theme and
# drops the whitegrid style.
sns.set(rc={'figure.figsize':(14.7,4)})
sns.set_style("whitegrid")
sns.boxplot(x="SecuritiesCode", y="1DayReturn", data=df_final)
plt.ylabel('1 Day Return', fontsize=12)
plt.xlabel('Security Code', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Distribution of Single Day Gain/Loss", fontsize=15)
plt.show()

## Examining VWAP

VWAP is an important indicator used in trading setups. This is calculated by adding up the value traded for every transaction (price multiplied by the number of shares traded) and then dividing by the total shares traded.

We are looking to examine the impact of VWAP on Stock Price movement. Since this is being done to evaluate the various indicators as features, here I am looking to see the impact of previous day's VWAP on current day's target. The results are encouraging. 

In [None]:
# Previous day's VWAP return plotted against the current day's Target.
seclist = [1301, 1332, 1333, 1376, 1377]

# The seaborn style sheets were renamed in newer Matplotlib releases; use
# whichever whitegrid name this installation provides.
whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
     if name in plt.style.available),
    "default",
)

for SECURITY in seclist:
    df_stock = df[df['SecuritiesCode'] == SECURITY].reset_index(drop = True)
    df_stock = df_stock.sort_values(by = "Date").reset_index(drop = True)
    df_stock['average'] = (df_stock['High'] + df_stock['Low'] + df_stock['Close'])/3
    # NOTE(review): multiplying and dividing by the same day's Volume leaves
    # `average` unchanged (NaN when Volume is 0) — confirm whether a
    # multi-day volume weighting was intended.
    df_stock['vwap'] = (df_stock['average'] * df_stock['Volume'])/ df_stock['Volume']
    df_stock['vwap_pct_ret'] = df_stock['vwap'].pct_change()
    df_stock['close_pct_ret'] = df_stock['Close'].pct_change()
    # Rows are oldest-first, so shift(1) carries yesterday's value forward;
    # this replaces sorting newest-first, shifting by -1 and sorting back.
    df_stock['pvwap'] = df_stock['vwap_pct_ret'].shift(1)

    with plt.style.context(whitegrid_style):
        plt.plot( df_stock.index
                 , df_stock["Target"]
                 , marker='*'
                 , linestyle='-'
                 , markersize=1
                 , label="Target")
        plt.plot( df_stock.index
                 , df_stock["pvwap"]
                 , marker='*'
                 , linestyle='-'
                 , markersize=1
                 , label="pvwap")

        plt.title("Examining vwap's impact pn Target for:" +  str(SECURITY))
        plt.legend();
        plt.show()

This concludes the basic analysis of Volume, RSI and Moving Averages on stock prices. More features will be examined in due course. Stay tuned!

## Building Features

After identifying the features the next stage is to build the features. This is done in the following section.

In [None]:
def get_model_blueprint():
    """Return a new, unfitted ``LGBMRegressor`` created with ``device_type='cpu'``."""
    # An earlier configuration, kept for reference only (not used):
    #     LGBMRegressor(objective="rmse", metric="rmse", learning_rate=0.005,
    #                   n_estimators=50000, device="cpu", random_state=999,
    #                   extra_trees=True)
    return LGBMRegressor(device_type = 'cpu')

def get_model_data(df, SECURITY_CODE):
    """Build the per-security feature table used for modelling.

    Parameters
    ----------
    df : price data containing at least ``Open``, ``High``, ``Low``,
        ``Close``, ``Volume``, ``Target``, ``typ``, ``RowId``, ``Date``,
        ``SecuritiesCode`` and ``dtint``. It is not modified.
    SECURITY_CODE : value of ``SecuritiesCode`` to select.

    Returns
    -------
    A DataFrame ordered by ``Date`` (oldest first) with the price columns,
    the derived features (``average``, ``vwap``, ``rsi``, ``pvwap``, the
    20/50/100 EMAs, ``rsicat`` as a category, ``dayofweek``) and the
    bookkeeping columns ``Target``, ``typ``, ``RowId``, ``Date``,
    ``SecuritiesCode`` and ``dtint``.

    When ``typ`` holds two distinct values (train and test rows mixed) only
    the most recent ``number of test rows + 125`` rows are kept before the
    features are computed.
    """
    # Boolean filtering already returns a new frame, so the full input does
    # not need to be copied for every security.
    df_stock = df[df['SecuritiesCode'] == SECURITY_CODE]

    test_length = -1
    # The closing parenthesis used to sit after ``== 2``, which measured the
    # length of a boolean array instead of comparing the unique count.
    if len(df_stock['typ'].unique()) == 2:  # train and test rows are mixed
        test_length = len(df_stock[df_stock['typ'] == 'test'])

    # Newest-first order is only needed to keep the most recent rows.
    df_stock = df_stock.sort_values(by = "Date", ascending = False).reset_index(drop = True)
    if test_length > 0:
        df_stock = df_stock[0: test_length + 125]

    # All time-dependent features are computed oldest-first so that every
    # row only uses data up to its own date; this matches the exploration
    # cells (the EMAs and VWAP returns were previously computed on
    # newest-first rows).
    df_stock = df_stock.sort_values(by = "Date").reset_index(drop = True)
    df_stock['average'] = (df_stock['High'] + df_stock['Low'] + df_stock['Close'])/3
    # NOTE(review): this reduces to Close (NaN when Volume is 0), while the
    # exploration cell weights `average` — confirm which one is intended.
    df_stock['vwap'] = (df_stock['Close'] * df_stock['Volume'])/ df_stock['Volume']
    df_stock['vwap_pct_ret'] = df_stock['vwap'].pct_change()
    df_stock['pvwap'] = df_stock['vwap_pct_ret'].shift(1)  # previous day's value
    for span in (20, 50, 100):
        df_stock[f'{span}D-EMA'] = df_stock['Close'].ewm(span=span, adjust=False).mean()
    df_stock['rsi'] = RSI( df_stock['Close'], 5 )
    df_stock['rsicat'] = pd.Series(list(map(rsi_class, df_stock['rsi'])), dtype="category")
    df_stock['dt'] = pd.to_datetime(df_stock['Date'], format  = "%Y-%m-%d")
    df_stock['dayofweek'] = df_stock['dt'].dt.dayofweek

    df_model = df_stock[['Open', 'High', 'Low', 'Close'
                        , 'average', 'vwap', 'rsi', 'pvwap'
                        , '20D-EMA' , '50D-EMA', '100D-EMA'
                        , 'rsicat', 'dayofweek', 'Target'
                        , 'typ', 'RowId', 'Date', 'SecuritiesCode', 'dtint']]
    return df_model

## This is it
Stay tuned for more to come!

## Building Models - CURRENTLY STOPPED

Here, machine learning models are being built for all the securities. These models are preserved in a dictionary and later on they are utilized for making predictions.

In [None]:
# df_data = getdata()
# df_data = df_data.assign(typ = 'train') 
# # Build models
# dict_models = {}
# seclist = df_data['SecuritiesCode'].unique()
# #seclist = [1301, 1332, 1333, 1376, 1377]
# for SECURITY in (seclist):
#     df_model = get_model_data(df_data, SECURITY)
#     df_model = df_model.dropna().reset_index(drop=True)
#     del df_model['typ']
#     del df_model['RowId']
#     del df_model['Date']
#     del df_model['SecuritiesCode']
#     train_size = round(len(df_model) * .9)
#     test_size = len(df_model) - train_size
#     X_train = df_model[:train_size]
#     y_train = df_model['Target'][:train_size]
#     X_valid = df_model[train_size:]
#     y_valid = df_model['Target'][train_size:]
#     del X_train['Target']
#     del X_valid['Target']
#     model = get_model_blueprint()
#     model.fit(
#             X_train, y_train,
#             eval_set=[(X_valid, y_valid)],
#             early_stopping_rounds=50,
#             verbose=1000
#         )
#     valid_preds = model.predict(X_valid)
#     valid_score = np.sqrt(mean_squared_error(y_valid, valid_preds))
#     print(SECURITY, valid_score)
#     dict_models[SECURITY] = model

## Now is the time to Submit - CURRENTLY STOPPED

This is the last section. But please stay tuned as I will be adding more features in due course of time.

In [None]:
# env = jpx_tokyo_market_prediction.make_env()
# iter_test = env.iter_test()
# df_all = df_data.copy()
# df_all = df_all.assign(typ = 'train') 
# for i, (prices, options, financials, trades, secondary_prices, sample_prediction) in enumerate(tqdm(iter_test)):
#     sample_prediction.assign(Prediction = None, inplace = True)
#     df_all = df_data.copy()
#     prices = prep_prices(prices, True)
#     prices['Date'] = pd.to_datetime(prices['Date'], format = "%Y-%m-%d")
#     prices['dtint'] = prices['Date'].dt.strftime("%Y%m%d").astype(int)
#     df_all = df_all.append(prices)
#     df_all[['typ']] = df_all[['typ']].fillna(value= 'test')
#     #seclist = [1301, 1332] 
#     for SECURITY in seclist:
#         df_model = get_model_data(df_all, SECURITY)
#         df_test = df_model[df_model['typ'] == 'test']
#         index    = df_test.index.values[0]
#         rowid    = df_test['RowId'].values[0]
#         date     = df_test['Date'].values[0]
#         seccode  = df_test['SecuritiesCode'].values[0]
#         del df_test['RowId']
#         del df_test['typ']
#         del df_test['Date']
#         del df_test['Target']
#         del df_test['SecuritiesCode']
#         pred = dict_models[SECURITY].predict(df_test)
#         df_sample = sample_prediction.copy()
#         df_sample['Date'] = pd.to_datetime(df_sample['Date'], format = "%Y-%m-%d")
#         df_sample = df_sample[df_sample['Date'] == date]
#         df_sample = df_sample[df_sample['SecuritiesCode'] == seccode]
#         index = df_sample.index.values[0]
#         sample_prediction.at[index, 'Prediction'] = pred[0]
#     sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
#     sample_prediction.Rank = np.arange(0,2000)
#     sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
#     sample_prediction.drop(["Prediction"],axis=1)
#     submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
#     #display(submission)
#     print("Submitting:", i)
#     env.predict(submission)