In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import jpx_tokyo_market_prediction

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from xgboost import plot_importance

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Score ranked predictions by their daily long/short spread return.

    Args:
        df (pd.DataFrame): predicted results with 'Date', 'Rank' and 'Target' columns
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): mean of the daily spread returns divided by their standard deviation;
            the plain mean when df holds fewer than two distinct dates.
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Spread return of a single day's rows.

        Args:
            df (pd.DataFrame): predicted results for one date
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): weighted return of the top-ranked names minus that of the bottom-ranked names
        """
        ranks = df['Rank']
        # Ranks must span 0 .. n-1 for the day
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Linearly decaying weights: the first pick gets toprank_weight_ratio, the last gets 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        scale = weights.mean()

        def leg(ascending):
            # First `portfolio_size` targets after ordering by Rank in the requested direction
            picked = df.sort_values(by='Rank', ascending=ascending)['Target'][:portfolio_size]
            return (picked * weights).sum() / scale

        return leg(True) - leg(False)

    daily_spread = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    if len(df.Date.unique()) >= 2:
        return daily_spread.mean() / daily_spread.std()
    # A single day has no standard deviation to divide by
    return daily_spread.mean()
    

In [None]:
def get_avg(_id_):
    """
    Fetch the entry of the notebook-level `average` table stored under one label.

    Args:
        _id_: index label to look up (the notebook passes SecuritiesCode values).
    Returns:
        Whatever `average.loc[_id_]` yields; since `average` is built as a DataFrame,
        that is the matching row rather than a bare number.
    """
    avg_row = average.loc[_id_]
    return avg_row


def prepareData(df, avg_by_code=None):
    """
    Add the derived model columns to a price frame and one-hot encode its categorical columns.

    Args:
        df (pd.DataFrame): rows carrying 'Date' ('%Y-%m-%d' text), 'SecuritiesCode' and the six
            categorical columns listed in the get_dummies call. The frame itself is left untouched.
        avg_by_code (pd.Series | pd.DataFrame | None): mean Target indexed by SecuritiesCode.
            A DataFrame must hold the means in a 'Target' column. When omitted, the
            notebook-level `average` table is used.
    Returns:
        (pd.DataFrame): a new frame with 'Date_type' (parsed Date), 'Avg' (mean Target of the
            row's security) and dummy columns in place of the categorical ones.
    """
    lookup = average if avg_by_code is None else avg_by_code
    if isinstance(lookup, pd.DataFrame):
        lookup = lookup['Target']

    enriched = df.assign(
        Date_type=pd.to_datetime(df['Date'], format = '%Y-%m-%d'),
        # One vectorised lookup instead of a Python call per row; a code without a
        # stored mean becomes NaN rather than aborting the whole run with a KeyError.
        Avg=df['SecuritiesCode'].map(lookup),
    )
    return pd.get_dummies(enriched, columns = ['Section/Products', 'Section', 'NewMarketSegment', '33SectorName', '17SectorName', 'NewIndexSeriesSize'])
    

In [None]:
# Root folder of the competition dataset; the CSV reads below build their paths from it.
f_dir = '../input/jpx-tokyo-stock-exchange-prediction'

## Stock Prices

In [None]:
# Load the supplemental stock_prices file and preview three random rows.
prices_csv = f'{f_dir}/supplemental_files/stock_prices.csv'
prices = pd.read_csv(prices_csv)
prices.sample(n=3)

In [None]:
# Mean Target per security over the loaded price file; read later through get_avg / prepareData.
average = pd.DataFrame(prices.groupby("SecuritiesCode").Target.mean())
# Lag the Target within each security, in date order. A plain shift(1) over the whole
# frame gives every row the Target of the row above it, whichever security that row
# belongs to. The result is aligned back to `prices` by index; the first date of each
# security has no predecessor and stays NaN.
prices['Target_lag1'] = (
    prices.sort_values('Date', kind='stable')
    .groupby('SecuritiesCode')['Target']
    .shift(1)
)

## Stock Information

In [None]:
# Load the stock list (one table of per-security attributes) and preview three random rows.
stock_list_csv = f'{f_dir}/stock_list.csv'
stock_info = pd.read_csv(stock_list_csv)
stock_info.sample(n=3)

In [None]:
# EffectiveDate is cast to text and re-cut as <first 4>-<next 2>-<rest>
# (presumably YYYYMMDD -> YYYY-MM-DD; confirm against the CSV).
# The .str accessor does this for the whole column at once and passes missing
# values through, where the row-by-row lambda would raise on them.
stock_info = (
    stock_info
    .astype({'EffectiveDate' : 'string'})
    .assign(
        Date=lambda frame: frame['EffectiveDate'].str[:4]
        + '-' + frame['EffectiveDate'].str[4:6]
        + '-' + frame['EffectiveDate'].str[6:]
    )
)
stock_info.head(2)

In [None]:
# Remove domestic-non-domestic: keep only the text before the first '(' of Section/Products.
# Column-wise .str.split replaces the per-row lambda; a single-character pattern is
# treated as a literal, and missing values stay missing instead of raising.
stock_info['Section'] = stock_info['Section/Products'].str.split('(').str[0]

In [None]:
# Relative Market Cap: each security's share (in percent) of its 33-sector total.
# Drop the derived columns first so the cell can be re-run: merging a second time would
# otherwise produce total_market_cap_x / total_market_cap_y and break the ratio below.
stock_info = stock_info.drop(columns=['total_market_cap', 'MarketCapPct'], errors='ignore')
market_cap = stock_info.groupby(['33SectorName']).agg(total_market_cap = ('MarketCapitalization', 'sum')).reset_index()
stock_info = pd.merge(stock_info, market_cap, on = '33SectorName', how = 'inner')
stock_info['MarketCapPct'] = stock_info['MarketCapitalization'] / stock_info['total_market_cap'] * 100

In [None]:
# Spot-check the rows stored for security 1301.
stock_info.loc[stock_info['SecuritiesCode'] == 1301]

## Model Data

In [None]:
# Attach the per-security attributes to every price row; price rows without a match are kept.
info_cols = ['SecuritiesCode', 'MarketCapPct', 'Section/Products', 'Section',
             'NewMarketSegment', '33SectorName', '17SectorName', 'NewIndexSeriesSize']
prices_info = prices.merge(stock_info.filter(info_cols), on = ['SecuritiesCode'], how = 'left')
prices_info.sample(n=5)

In [None]:
# Adds Date_type / Avg and one-hot encodes the categorical columns.
# NOTE: the encoding consumes those categorical columns, so re-running this cell on the
# already-encoded frame fails; re-run the merge cell above first.
prices_info = prepareData(prices_info)

In [None]:
# Preview two random rows of the encoded frame.
prices_info.sample(2)

In [None]:
# GBM for target:
# three numeric features first, then every one-hot column, grouped prefix by prefix
# (the order is kept stable because the model sees columns in this order).
dummy_prefixes = ['Section', 'NewMarketSegment', '33SectorName', '17SectorName', 'NewIndexSeriesSize']
feat_cols = ['Avg', 'Target_lag1', 'MarketCapPct'] + [
    col
    for prefix in dummy_prefixes
    for col in prices_info.columns
    if col.startswith(prefix)
]
gbm_cols = feat_cols + ['Target', 'Date']

# Hold out the last two distinct Date values (in row order) as the test period.
test_rows = prices_info.Date.unique()[-2:]
gbm_dat = prices_info.filter(gbm_cols)
is_test = gbm_dat.Date.isin(test_rows)
gbm_test = gbm_dat[is_test]
gbm_train = gbm_dat[~gbm_dat.index.isin(gbm_test.index)]

X_train = gbm_train.filter(feat_cols)
y_train = gbm_train.Target
X_test = gbm_test.filter(feat_cols)
y_test = gbm_test.Target


## Model

In [None]:
# eval_metric and early_stopping_rounds are estimator settings: XGBoost deprecated passing
# them to fit() in 1.6 and removed that form in 2.0, where the old call raises TypeError.
# Configuring them here needs xgboost >= 1.6.
model = XGBRegressor(
    max_depth=5,
    n_estimators=400,
    min_child_weight=0.5,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.01,
    seed=42,
    eval_metric="rmse",
    early_stopping_rounds=20)

# validation_0 = training rows, validation_1 = held-out rows; with several eval sets
# XGBoost uses the last one to decide when to stop early.
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

In [None]:
# Learning curves: RMSE per boosting round for the two eval sets given to fit().
results = model.evals_result()
for eval_key, curve_label in (('validation_0', 'train'), ('validation_1', 'test')):
    plt.plot(results[eval_key]['rmse'], label=curve_label)
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
def plot_features(booster, figsize):
    """
    Draw the feature-importance chart of `booster` on a fresh figure.

    Args:
        booster: fitted model handed straight to xgboost's plot_importance.
        figsize: (width, height) passed to plt.subplots.
    Returns:
        Whatever plot_importance returns for the newly created axes.
    """
    _figure, axis = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=axis)


plot_features(model, (10, 14))

In [None]:
# Score the most recent held-out day: rank by the model's prediction, evaluate on the
# realised Target. Target must stay the true value; overwriting it with the prediction
# makes the metric score the predictions against themselves.
latest_day = gbm_test[gbm_test.Date == gbm_test.Date.unique()[-1]].copy()  # copy: do not write into a slice of gbm_test
latest_day['Prediction'] = model.predict(latest_day.filter(feat_cols))
latest_day = latest_day.sort_values(by = "Prediction", ascending=False)
# Rank 0 = highest prediction; sized from the data instead of assuming 2000 rows.
latest_day['Rank'] = np.arange(len(latest_day))
# With a single date the function returns that day's spread return (no std to divide by).
calc_spread_return_sharpe(latest_day)

## Submit Predictions

In [None]:
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Per-security attributes joined onto every day's prices; built once because it does
# not change between iterations.
stock_features = stock_info.filter(['SecuritiesCode', 'MarketCapPct', 'Section/Products', 'Section',
                                    'NewMarketSegment', '33SectorName', '17SectorName', 'NewIndexSeriesSize'])
# Previous iteration's predictions keyed by SecuritiesCode; None until one day has been predicted.
df_prev = None

# The day's frame is called `day_prices` so the loop does not overwrite the notebook-level `prices`.
for day_prices, options, financials, trades, secondary_prices, sample_prediction in iter_test:
    cur_date = day_prices.Date.iloc[0]
    check = prices_info[prices_info.Date == cur_date]
    if len(check.index) > 0: # cur_date in prices_info file
        df_lagged = check.filter(['Date', 'SecuritiesCode', 'Target_lag1'])
        day_prices = pd.merge(day_prices, df_lagged, on = ['SecuritiesCode', 'Date'], how = 'left')
    elif df_prev is None: # no earlier prediction to fall back on
        day_prices['Target_lag1'] = 0
    else: # use the previous iteration's predictions as the lag feature
        # Left join: a security without a previous prediction keeps its row (lag = NaN),
        # so the number of rows still matches sample_prediction.
        day_prices = pd.merge(day_prices, df_prev, on = ['SecuritiesCode'], how = 'left')

    day_prices = pd.merge(day_prices, stock_features, on = ['SecuritiesCode'], how = 'left')
    day_prices = prepareData(day_prices)
    # Dummy columns seen in training but absent from today's data are created as all zeros.
    missing_cols = [x for x in feat_cols if x not in day_prices.columns]
    for m in missing_cols:
        day_prices[m] = 0
    X_eval = day_prices.filter(feat_cols)

    # Attach predictions by SecuritiesCode rather than by row position, so a differing
    # row order or a duplicated row cannot shift predictions onto the wrong security.
    pred_by_code = pd.Series(model.predict(X_eval), index=day_prices['SecuritiesCode'].to_numpy())
    pred_by_code = pred_by_code[~pred_by_code.index.duplicated(keep='first')]
    sample_prediction['Prediction'] = sample_prediction['SecuritiesCode'].map(pred_by_code)

    # Rank 0 = highest prediction; sized from the data instead of assuming 2000 rows.
    sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)

    # Keep SecuritiesCode so the next iteration can join on it; 'Date' is left out because
    # it would collide with the next day's own Date column in the merge.
    df_prev = sample_prediction[['SecuritiesCode', 'Prediction']].rename(columns={'Prediction': 'Target_lag1'})