# Definition

In [None]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")

def init_stock_prices(prices, columns):
    """Prepare a raw stock-price table for modelling.

    The frame is re-indexed by ``RowId``, its ``Date`` column is parsed to
    datetimes, and an integer ``DateInt`` column (``YYYYMMDD``) is derived
    from it.

    Args:
        prices: DataFrame containing at least ``RowId`` and ``Date`` columns.
        columns: Column labels to keep in the result, in the desired order.

    Returns:
        A DataFrame indexed by ``RowId`` holding only ``columns``.
    """
    indexed = prices.set_index('RowId')
    indexed = indexed.assign(Date=pd.to_datetime(indexed.Date))
    # Integer form of the date, e.g. 2021-12-03 -> 20211203.
    date_as_int = indexed['Date'].dt.strftime("%Y%m%d").astype(int)
    indexed = indexed.assign(DateInt=date_as_int)
    return indexed[columns]

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Compute the Sharpe ratio of daily long/short spread returns.

    For every ``Date`` group, the ``portfolio_size`` best-ranked rows are
    treated as purchases and the ``portfolio_size`` worst-ranked rows as
    shorts. Each side is a weighted sum of ``Target`` (weights fall linearly
    from ``toprank_weight_ratio`` to 1) divided by the mean weight; the daily
    spread is purchase minus short.

    Args:
        df: Frame that can be grouped by ``'Date'`` and has ``Rank`` and
            ``Target`` columns. Within each day ``Rank`` must have minimum 0
            and maximum ``len(day) - 1`` (checked with ``assert``).
        portfolio_size: Number of rows taken from each end of the ranking.
        toprank_weight_ratio: Weight given to the first row on each side.

    Returns:
        Mean of the daily spreads divided by their standard deviation.
    """
    def _daily_spread(day_df, n_stocks, top_weight):
        ranks = day_df['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        weights = np.linspace(start=top_weight, stop=1, num=n_stocks)
        mean_weight = weights.mean()

        best_first = day_df.sort_values(by='Rank')
        worst_first = day_df.sort_values(by='Rank', ascending=False)

        # Series * ndarray multiplies positionally, so the heaviest weight
        # lands on the most extreme rank of each side.
        long_leg = (best_first['Target'][:n_stocks] * weights).sum() / mean_weight
        short_leg = (worst_first['Target'][:n_stocks] * weights).sum() / mean_weight
        return long_leg - short_leg

    daily_spreads = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()

def add_rank(df):
    """Return ``df`` ordered by descending ``Pred`` with a ``Rank`` column.

    ``Rank`` runs from 0 (largest ``Pred``) to ``len(df) - 1`` (smallest).
    The rank is written to the sorted frame that is returned.
    """
    ordered = df.sort_values(by="Pred", ascending=False)
    ordered['Rank'] = np.arange(len(ordered))
    return ordered

# Feature columns selected (in this order) to build the model inputs.
fit_columns = [
    "DateInt",
    "SecuritiesCode",
    "High",
    "Open",
    "Close",
    "Low",
    "Volume",
]

# Columns selected from the price tables when they are loaded.
prices_columns = [
    'Date',
    'DateInt',
    'SecuritiesCode',
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
]

# Load data & Add feature

In [None]:
%%time

def generate_xy():
    """Load the price CSVs and split them into train and test sets by date.

    Both the main and the supplemental ``stock_prices.csv`` files are read,
    prepared with ``init_stock_prices`` (keeping ``prices_columns`` plus
    ``Target``), concatenated, and rows containing any missing value are
    dropped.

    Returns:
        A tuple ``(X, y, test_X, test_y, prices_test)`` where

        * ``X`` / ``y`` are the ``fit_columns`` features and ``Target`` for
          rows with ``Date <= 2021-12-03``;
        * ``test_X`` / ``test_y`` are the same for rows with ``Date`` between
          2021-12-06 and 2022-02-28 inclusive;
        * ``prices_test`` is the full test slice with every loaded column.
    """
    wanted_columns = prices_columns + ['Target']
    csv_paths = (
        "../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv",
        "../input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv",
    )
    frames = [init_stock_prices(pd.read_csv(path), wanted_columns) for path in csv_paths]
    all_prices = pd.concat(frames).dropna(axis=0)

    # Date-based split: everything up to the cut-off trains, the later window tests.
    train_part = all_prices.query('Date <= "2021-12-03"')
    test_part = all_prices.query('"2021-12-06" <= Date <= "2022-02-28"')

    return (
        train_part[fit_columns],
        train_part['Target'],
        test_part[fit_columns],
        test_part['Target'],
        test_part,
    )

# Build the train/test splits once; prices_test keeps every loaded column of
# the test window (including Date) for use alongside the predictions.
train_X, train_y, test_X, test_y, prices_test = generate_xy()

# Fit

In [None]:
%%time

# LightGBM regressor with fixed hyperparameters.
# NOTE(review): random_seed is not a named scikit-learn-API argument; it is
# presumably forwarded to LightGBM as an alias of its seed parameter - confirm.
model_o = lgb.LGBMRegressor(
    learning_rate=0.6818202991034834,
    max_bin=95,
    n_estimators=655,
    num_leaves=1263,
    random_seed=0,
)
# Train on the date-sliced training features and targets.
model_o.fit(train_X, train_y)

# Prediction

In [None]:
%%time

# Start from an explicit copy of the identifying columns so the new columns
# are written to an independent frame, not to a selection of prices_test
# (avoids pandas' chained-assignment / SettingWithCopy ambiguity).
sample_prediction = prices_test[['Date', 'SecuritiesCode']].copy()
sample_prediction['Pred'] = model_o.predict(test_X[fit_columns])
# test_y is a Series, so this assignment is aligned on the row index.
sample_prediction['Target'] = test_y

# Rank the predictions within each Date (0 = highest Pred) via add_rank, then
# make Date the index of the ranked frame.
df_ranked = sample_prediction.groupby('Date').apply(add_rank).set_index('Date')
df_ranked

# Evaluation

In [None]:
# Score the ranked predictions with the spread-return Sharpe metric, using the
# function's default portfolio size and top-rank weight ratio.
a = calc_spread_return_sharpe(df_ranked)
print(f"Score: {a:.3f}")  # shown with three decimal places