In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Creating Features from stock_list and price

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Merging stock_list and stock price

In [None]:
# Read the training and supplemental price files, then stack them row-wise.
# The original row labels of each file are kept (no ignore_index).
df_TrainStockPrices, df_SuppStockPrices = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv",
        "/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv",
    )
)
df_Price = pd.concat([df_TrainStockPrices, df_SuppStockPrices])

In [None]:
df_Stocklist = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
# rename() returns a new frame, so df_Stocklist itself is left untouched.
# 'Close' is renamed so it cannot clash with the price table's own 'Close'.
_stock_list = df_Stocklist.rename(columns={'Close': 'Close_x'})
# Left join: every price row is kept and enriched with its stock-list attributes.
base_df = pd.merge(df_Price, _stock_list, how="left", on='SecuritiesCode')

In [None]:
# Preview the first rows of the merged price + stock-list frame.
base_df.head()

In [None]:
# List the column names available after the merge.
base_df.columns

### Forward-fill and backward-fill to get rid of NaN

In [None]:
# Forward-fill first, then backward-fill whatever is still missing at the top
# of the frame. DataFrame.ffill()/bfill() replace the deprecated
# fillna(method=...) spelling and match the .ffill() call used later on.
# NOTE(review): the fill runs over the whole frame in row order, so values can
# be carried across different securities — confirm this is intended.
base_df = base_df.ffill().bfill()

In [None]:
# Per-column count of missing values still present in base_df.
base_df.isna().sum()

### Set RowId as an index  and convert Date object to integer (DateInt)

In [None]:
# Index by RowId, parse Date into datetimes, and add DateInt holding the same
# date as a YYYYMMDD integer.
base_df = (
    base_df
    .set_index('RowId')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(DateInt=lambda frame: frame['Date'].dt.strftime("%Y%m%d").astype(int))
)

### Dropping useless columns

In [None]:
# Remove the two columns that are not used as features.
base_df = base_df.drop(columns=["Universe0", "Name"])

# Creating Features

* Convert the object-typed columns of the dataframe to integer or float
* Decide whether each categorical column is ordinal or nominal; for nominal columns, use `pd.get_dummies`

In [None]:
# Print dtypes and non-null counts for every column of base_df.
base_df.info()

In [None]:
# Work on an independent copy of the object-dtype columns only.
obj_df = base_df.select_dtypes(include='object').copy()
obj_df.head(5)

### Getting features from "NewIndexSeriesSize" and "NewIndexSeriesSizeCode"

In [None]:
# In both size columns "-" is treated as missing and replaced by the previous
# row's value. Series.ffill() replaces the deprecated fillna(method='ffill').
# NOTE(review): the fill follows row order, so a value may be inherited from a
# different security — confirm this is intended.
for _size_col in ("NewIndexSeriesSize", "NewIndexSeriesSizeCode"):
    obj_df[_size_col] = obj_df[_size_col].replace("-", np.nan).ffill()

In [None]:
# Distinct size labels left in the column after the cleaning step.
obj_df["NewIndexSeriesSize"].unique()

I used the following JPX factsheet as the reference for creating the "NewIndexSeriesSize" features:
https://www.jpx.co.jp/english/markets/indices/line-up/files/e_fac_12_size.pdf

In [None]:
# Four identical copies of the size label, one per horizon column; each copy
# is turned into a number in the next step.
_size_feature_names = [
    "NewIndexSeriesSize_1month",
    "NewIndexSeriesSize_3month",
    "NewIndexSeriesSize_6month",
    "NewIndexSeriesSize_12month",
]
NewIndexSeriesSize_df = pd.DataFrame(
    {feature_name: obj_df["NewIndexSeriesSize"] for feature_name in _size_feature_names}
)

In [None]:
# Numeric value substituted for each TOPIX size label, per horizon column.
# NOTE(review): presumably these figures come from the JPX size-index factsheet
# referenced in this notebook — confirm the source and the unit.
_SIZE_LABEL_VALUES = {
    "NewIndexSeriesSize_1month": {"TOPIX Small 2": 8.74, "TOPIX Mid400": 7.1, "TOPIX Small 1": 8.15, "TOPIX Large70": 4.89, "TOPIX Core30": 4.48},
    "NewIndexSeriesSize_3month": {"TOPIX Small 2": 9.54, "TOPIX Mid400": 10.23, "TOPIX Small 1": 9.02, "TOPIX Large70": 8.95, "TOPIX Core30": 8.71},
    "NewIndexSeriesSize_6month": {"TOPIX Small 2": 15.39, "TOPIX Mid400": 19.71, "TOPIX Small 1": 12.73, "TOPIX Large70": 23.12, "TOPIX Core30": 24.47},
    "NewIndexSeriesSize_12month": {"TOPIX Small 2": 40.09, "TOPIX Mid400": 44.09, "TOPIX Small 1": 37.04, "TOPIX Large70": 41.56, "TOPIX Core30": 42.2},
}

# Assign the result back instead of calling replace(..., inplace=True) on
# df[col]: that chained in-place form acts on a temporary Series and does not
# update the frame when pandas Copy-on-Write is active.
for _feature_name, _label_to_value in _SIZE_LABEL_VALUES.items():
    _replaced = NewIndexSeriesSize_df[_feature_name].replace(_label_to_value)
    # Force a float column: newer pandas no longer downcasts the object result
    # of replace(), and the model needs numeric dtypes. An unmapped label
    # fails here instead of later inside the model fit.
    NewIndexSeriesSize_df[_feature_name] = _replaced.astype(float)

In [None]:
# Swap the raw size label for its four numeric horizon columns.
_obj_without_label = obj_df.drop(columns=['NewIndexSeriesSize'])
obj_df = pd.concat([_obj_without_label, NewIndexSeriesSize_df], axis=1)

In [None]:
# Preview obj_df with the numeric size columns in place.
obj_df.head()

### Convert objects of dataframe to integer and float

In [None]:
# Cast the three code columns to integers in a single astype call.
obj_df = obj_df.astype(
    {code_col: int for code_col in ('33SectorCode', '17SectorCode', 'NewIndexSeriesSizeCode')}
)

### Getting dummies for nominal columns

In [None]:
# Columns expanded into one indicator column per distinct value.
_nominal_columns = [
    "Section/Products",
    "NewMarketSegment",
    "33SectorCode",
    "33SectorName",
    "17SectorCode",
    "17SectorName",
]
obj_df = pd.get_dummies(obj_df, columns=_nominal_columns)

### Concat stock price and stock_list again

In [None]:
# Index the raw prices by RowId so they align with obj_df (indexed the same
# way), then place the two side by side.
# NOTE: this cell runs once per kernel; after it, RowId is no longer a column
# of df_Price, so running it again raises a KeyError.
df_Price = df_Price.set_index(keys="RowId")
base_df = pd.concat(objs=[df_Price, obj_df], axis="columns")

In [None]:
# Expand SupervisionFlag into indicator columns.
base_df = pd.get_dummies(data=base_df, columns=["SupervisionFlag"])

In [None]:
# Missing ExpectedDividend values become 1; every other gap takes the value
# of the previous row.
base_df['ExpectedDividend'] = base_df['ExpectedDividend'].fillna(1)
base_df = base_df.ffill()
# Total number of missing cells still left in the frame.
base_df.isna().sum().sum()

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return.

    Args:
        df (pd.DataFrame): predictions with 'Date', 'Rank' and 'Target' columns
        portfolio_size (int): number of equities bought and number sold each day
        toprank_weight_ratio (float): weight of the top-ranked stock relative to
            the last stock taken into the portfolio (whose weight is 1)
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Spread return of a single day.

        Args:
            df (pd.DataFrame): that day's rows with 'Rank' and 'Target'
            portfolio_size (int): number of equities on each side
            toprank_weight_ratio (float): weight of the first pick relative to the last
        Returns:
            (float): weighted return of the long side minus that of the short side
        """
        day_ranks = df['Rank']
        # Ranks must start at 0 and end at (number of rows - 1).
        assert day_ranks.min() == 0
        assert day_ranks.max() == len(day_ranks) - 1

        # Linearly decreasing weights: toprank_weight_ratio for the first pick, 1 for the last.
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)

        def _weighted_leg(ascending):
            # ascending=True takes the best-ranked stocks, False the worst-ranked.
            picks = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (picks * weights).sum() / weights.mean()

        return _weighted_leg(True) - _weighted_leg(False)

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

In [None]:
import warnings, gc
import numpy as np 
import pandas as pd
import matplotlib.colors
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error
from lightgbm import LGBMRegressor
from decimal import ROUND_HALF_UP, Decimal
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff

In [None]:
# Store Date as a YYYYMMDD integer.
# The values are cast to str before parsing so the cell is safe to re-run: on
# a second run Date already holds ints such as 20170104, which to_datetime
# would otherwise read as epoch nanoseconds, silently turning every date into
# 19700101.
base_df['Date'] = (
    pd.to_datetime(base_df['Date'].astype(str))
    .dt.strftime("%Y%m%d")
    .astype(int)
)

In [None]:
# Number of columns currently in base_df.
base_df.shape[1]

In [None]:
import re

# Strip every character other than letters, digits and underscore from the
# column names.
_disallowed_chars = re.compile('[^A-Za-z0-9_]+')
base_df = base_df.rename(columns=lambda col_name: _disallowed_chars.sub('', col_name))

In [None]:
# LightGBM 4.0 removed the `verbose` argument of fit(); the logging callback
# (available since 3.3) is the supported way to print metrics every N rounds.
from lightgbm import log_evaluation

# Time-ordered 10-fold split; `gap` rows are skipped between the end of each
# training window and the start of its validation window.
ts_fold = TimeSeriesSplit(n_splits=10, gap=10000)
prices = base_df.sort_values(['Date', 'SecuritiesCode'])
y = prices['Target'].to_numpy()
X = prices.drop(['Target'], axis=1)

# Identifier columns are kept in X for date bookkeeping but never fed to the model.
id_cols = ['Date', 'SecuritiesCode']
feature_cols = X.columns[~X.columns.isin(id_cols)]

# Hyper-parameters are the same for every fold, so they are defined once.
params = {'n_estimators': 500,
          'num_leaves' : 100,
          'learning_rate': 0.1,
          'colsample_bytree': 0.9,
          'subsample': 0.8,
          'reg_alpha': 0.4,
          'metric': 'mae',
          'random_state': 21}

# One row per feature; one "Importance_Fold<k>" column is added per fold.
feat_importance = pd.DataFrame(index=feature_cols)
sharpe_ratio = []

for fold, (train_idx, val_idx) in enumerate(ts_fold.split(X, y)):

    print("\n========================== Fold {} ==========================".format(fold+1))
    X_train, y_train = X.iloc[train_idx, :], y[train_idx]
    X_valid, y_val = X.iloc[val_idx, :], y[val_idx]

    print("Train Date range: {} to {}".format(X_train.Date.min(), X_train.Date.max()))
    print("Valid Date range: {} to {}".format(X_valid.Date.min(), X_valid.Date.max()))

    # Select the feature columns into new frames rather than dropping in place
    # on an iloc slice of X.
    X_train = X_train[feature_cols]
    X_val = X_valid[feature_cols]
    # The split is made by row count, so the first and last validation dates
    # may be incomplete; they are left out of the Sharpe calculation.
    val_dates = X_valid.Date.unique()[1:-1]
    print("\nTrain Shape: {} {}, Valid Shape: {} {}".format(X_train.shape, y_train.shape, X_val.shape, y_val.shape))

    gbm = LGBMRegressor(**params).fit(X_train, y_train,
                                      eval_set=[(X_train, y_train), (X_val, y_val)],
                                      eval_metric=['mae', 'mse'],
                                      callbacks=[log_evaluation(period=300)])
    y_pred = gbm.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    feat_importance["Importance_Fold"+str(fold)] = gbm.feature_importances_

    # Reuse the predictions and targets already in hand for the scored dates
    # instead of re-predicting per date and re-filtering `prices`; the three
    # arrays below share one row order, so they stay aligned.
    scored_rows = X_valid.Date.isin(val_dates).to_numpy()
    df = pd.DataFrame({'Date': X_valid.Date.to_numpy()[scored_rows],
                       'pred': y_pred[scored_rows],
                       'Target': y_val[scored_rows]})
    # Rank 0 is the highest prediction of the day; ties keep row order.
    df['Rank'] = (df.groupby('Date')['pred'].rank(method="first", ascending=False) - 1).astype(int)
    sharpe = calc_spread_return_sharpe(df)
    sharpe_ratio.append(sharpe)
    print("Valid Sharpe: {}, RMSE: {}, MAE: {}".format(sharpe, rmse, mae))

    # Release the per-fold frames before the next, larger, training window.
    del X_train, y_train, X_val, y_val
    gc.collect()

print("\nAverage cross-validation Sharpe Ratio: {:.4f}, standard deviation = {:.2f}.".format(np.mean(sharpe_ratio), np.std(sharpe_ratio)))

**We have 125 features from stock price and stock_list. You should identify useful features**

In [None]:
# Average only the per-fold columns so that re-running this cell does not fold
# a previously computed 'avg' column back into the mean.
_fold_columns = [c for c in feat_importance.columns if str(c).startswith("Importance_Fold")]
feat_importance['avg'] = feat_importance[_fold_columns].mean(axis=1)
feat_importance = feat_importance.sort_values(by='avg', ascending=True)

# One colour per feature. The palette is sized from the data (plus the two
# entries that are skipped) instead of a hard-coded 124, which only fits a
# frame of at most 122 features.
_n_features = len(feat_importance)
pal = sns.color_palette("plasma_r", _n_features + 2).as_hex()[2:]
_colors = pal[::-1]

temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))
fig = go.Figure()
# Iterate over the values directly: feat_importance['avg'][i] is positional
# access on a label-indexed Series, which newer pandas no longer supports.
for i, avg_importance in enumerate(feat_importance['avg']):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=avg_importance,
                       line_color=_colors[i], opacity=0.7, line_width=4))
fig.add_trace(go.Scatter(x=feat_importance['avg'], y=feat_importance.index, mode='markers',
                         marker_color=_colors, marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.0f}<extra></extra>'))
fig.update_layout(template=temp, title='Overall Feature Importance',
                  xaxis=dict(title='Average Importance', zeroline=False),
                  yaxis_showgrid=False, margin=dict(l=120, t=80),
                  height=500, width=500)
fig.show()

In [None]:
# feat_importance is sorted by 'avg' in ascending order, so this shows the
# entries from position 75 onward, listed from the largest average downwards.
feat_importance['avg'].iloc[75:][::-1]

* **Average cross-validation Sharpe Ratio: 0.1601, standard deviation = 0.07.**