In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install ta

In [None]:
# import required package
import ta
from decimal import ROUND_HALF_UP, Decimal
from tqdm import tqdm


In [None]:
# Root folder of the competition dataset
base_dir = "../input/jpx-tokyo-stock-exchange-prediction"
# The dataset ships three variants of stock_price.csv;
# this notebook reads the one stored in the train_files folder.
train_files_dir = "/".join([base_dir, "train_files"])

In [None]:
import pickle
import pandas as pd
def save_pickle_file(path,data):
    """
    Serialize an object to disk with pickle (protocol 4).

    Args:
        path (str)  : destination file path; an existing file is overwritten
        data (object)  : any picklable object
    """
    # The context manager closes the handle even when pickle.dump raises
    # (e.g. on an unpicklable object), which the manual open/close did not.
    with open(path, 'wb') as pkl_file:
        pickle.dump(data, pkl_file, protocol = 4)
def load_pickle_file(path):
    """
    Load an object previously written with pickle.

    Args:
        path (str)  : path of the pickle file to read
    Returns:
        (object): the unpickled object
    """
    # SECURITY: unpickling can execute arbitrary code; only load files
    # produced by this notebook or another trusted source.
    # The context manager closes the handle even when pickle.load raises.
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

## 1. Adjust price

In [None]:
def adjust_price(price):
    """
    Add CumulativeAdjustmentFactor and AdjustedClose to a stock price table.

    The input frame is not modified; a new frame is returned.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price. Columns read
            here: Date ("%Y-%m-%d"), SecuritiesCode, Close, AdjustmentFactor.
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose,
            sorted by SecuritiesCode then Date and indexed by Date (datetime).
    """
    def _round_half_up(value):
        # Round to one decimal place with ROUND_HALF_UP; NaN passes through as NaN
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    # Copy first so the caller's frame keeps its original Date column, and
    # replace the column outright: `price.loc[:, "Date"] = ...` writes into the
    # existing object-dtype column on recent pandas and never yields datetime64.
    price = price.copy()
    price["Date"] = pd.to_datetime(price["Date"], format="%Y-%m-%d")

    # A fresh unique index lets the per-group results below align row by row
    price = price.sort_values(["SecuritiesCode", "Date"]).reset_index(drop=True)

    # generate CumulativeAdjustmentFactor: cumulative product of AdjustmentFactor
    # from the latest date backwards, computed separately for each SecuritiesCode
    newest_first = price.iloc[::-1]
    price["CumulativeAdjustmentFactor"] = (
        newest_first.groupby("SecuritiesCode", sort=False)["AdjustmentFactor"].cumprod()
    )

    # generate AdjustedClose
    price["AdjustedClose"] = (
        price["CumulativeAdjustmentFactor"] * price["Close"]
    ).map(_round_half_up)

    # to fill AdjustedClose, replace 0 into np.nan
    price.loc[price["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
    # forward fill AdjustedClose within each SecuritiesCode only, so one
    # security's price never leaks into the next one
    price["AdjustedClose"] = price.groupby("SecuritiesCode")["AdjustedClose"].ffill()

    return price.set_index("Date")

In [None]:
# Read the daily stock prices from the train_files folder and preview them
stock_prices_csv = f"{train_files_dir}/stock_prices.csv"
df_price = pd.read_csv(stock_prices_csv)
df_price.head()


In [None]:
# generate AdjustedClose
# NOTE: this cell is not re-runnable on its own output. adjust_price() returns
# the frame with Date moved into the index, so a second call fails when it
# looks up the Date column. Re-run the CSV loading cell first.
df_price = adjust_price(df_price)
df_price.head(5)


In [None]:
# Summarize column dtypes and non-null counts after the adjustment step
df_price.info()

## 2. Generate TA features

In [None]:
from ta import add_all_ta_features
from ta.utils import dropna
from sklearn.model_selection import TimeSeriesSplit

In [None]:
def get_features_for_predict(price, code):
    """
    Build the technical-indicator feature table for a single security.

    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price; must contain
            SecuritiesCode, Open, High, Low, Volume and AdjustedClose columns
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame): rows of `code` plus the columns added
            by ta.add_all_ta_features, with NaN and +/-inf replaced by 0 and
            the AdjustedClose column removed
    """
    adjusted_col = "AdjustedClose"

    # Work on a private copy of this security's rows so `price` is not modified
    is_requested_code = price["SecuritiesCode"] == code
    per_code = price.loc[is_requested_code].copy()

    # Append every indicator provided by ta, using AdjustedClose as the close price.
    # To add only specific indicators instead, see
    # https://github.com/bukosabino/ta/blob/master/examples_to_use/bollinger_band_features_example.py
    with_indicators = ta.add_all_ta_features(
        per_code, "Open", "High", "Low", adjusted_col, "Volume", fillna=False
    )

    # Zero out NaN first, then +/-inf. This applies to every column of the
    # frame, not only the generated indicators.
    cleaned = with_indicators.fillna(0).replace([np.inf, -np.inf], 0)

    # AdjustedClose was only needed as indicator input
    return cleaned.drop([adjusted_col], axis=1)

In [None]:
# Collect every distinct SecuritiesCode in ascending order
codes = sorted(pd.unique(df_price["SecuritiesCode"]))
len(codes)

In [None]:
# Build one feature table per security, then stack them into a single frame
buff = [get_features_for_predict(df_price, code) for code in tqdm(codes)]
feature = pd.concat(buff)

In [None]:
# Persist the generated features so later sessions can skip the computation
save_pickle_file(path="/kaggle/working/feature.pkl", data=feature)
#feature = load_pickle_file("/kaggle/working/feature.pkl")

In [None]:
# from IPython.display import FileLink
# %cd /kaggle/working
# FileLink("feature.pkl")



## 3. Train

In [None]:
# Rows per index value; value_counts() orders by count descending, so
# tail(500) lists the 500 index values with the fewest rows.
feature.index.value_counts().tail(500)

In [None]:
# Show the full column list, then preview the first rows
print(feature.columns)
feature.head()


In [None]:
# Walk-forward cross-validation over the generated features.
# `gap` is the number of samples (rows, not days) dropped between the end of
# each training window and the start of its validation window.
# NOTE(review): confirm 10000 rows is the intended embargo.
ts_fold = TimeSeriesSplit(n_splits=5, gap=10000)

# NOTE(review): get_features_for_predict() already replaces NaN with 0, so this
# dropna() presumably removes nothing — verify that is intended.
prices = feature.dropna()
if 'Date' not in prices.columns:
    # adjust_price() ends with set_index("Date"), so Date arrives here as the
    # index. The fold loop reads it as a column (X_train.Date), which would
    # raise AttributeError; restore it as a regular column.
    prices = prices.reset_index()
prices = prices.sort_values(['Date', 'SecuritiesCode'])
y = prices['Target'].to_numpy()
X = prices.drop(['Target'], axis=1)
feat_importance = pd.DataFrame()
sharpe_ratio = []

for fold, (train_idx, val_idx) in enumerate(ts_fold.split(X, y)):

    print("\n========================== Fold {} ==========================".format(fold+1))
    X_train, y_train = X.iloc[train_idx,:], y[train_idx]
    X_valid, y_val = X.iloc[val_idx,:], y[val_idx]

    print("Train Date range: {} to {}".format(X_train.Date.min(),X_train.Date.max()))
    print("Valid Date range: {} to {}".format(X_valid.Date.min(),X_valid.Date.max()))

#     X_train.drop(['Date','SecuritiesCode'], axis=1, inplace=True)
#     X_val=X_valid[X_valid.columns[~X_valid.columns.isin(['Date','SecuritiesCode'])]]
#     val_dates=X_valid.Date.unique()[1:-1]
#     print("\nTrain Shape: {} {}, Valid Shape: {} {}".format(X_train.shape, y_train.shape, X_val.shape, y_val.shape))

#     params = {'n_estimators': 500,
#               'num_leaves' : 100,
#               'learning_rate': 0.1,
#               'colsample_bytree': 0.9,
#               'subsample': 0.8,
#               'reg_alpha': 0.4,
#               'metric': 'mae',
#               'random_state': 21}

#     gbm = LGBMRegressor(**params).fit(X_train, y_train, 
#                                       eval_set=[(X_train, y_train), (X_val, y_val)],
#                                       verbose=300, 
#                                       eval_metric=['mae','mse'])
#     y_pred = gbm.predict(X_val)
#     rmse = np.sqrt(mean_squared_error(y_val, y_pred))
#     mae = mean_absolute_error(y_val, y_pred)
#     feat_importance["Importance_Fold"+str(fold)]=gbm.feature_importances_
#     feat_importance.set_index(X_train.columns, inplace=True)

#     rank=[]
#     X_val_df=X_valid[X_valid.Date.isin(val_dates)]
#     for i in X_val_df.Date.unique():
#         temp_df = X_val_df[X_val_df.Date == i].drop(['Date','SecuritiesCode'],axis=1)
#         temp_df["pred"] = gbm.predict(temp_df)
#         temp_df["Rank"] = (temp_df["pred"].rank(method="first", ascending=False)-1).astype(int)
#         rank.append(temp_df["Rank"].values)

#     stock_rank=pd.Series([x for y in rank for x in y], name="Rank")
#     df=pd.concat([X_val_df.reset_index(drop=True),stock_rank,
#                   prices[prices.Date.isin(val_dates)]['Target'].reset_index(drop=True)], axis=1)
#     sharpe=calc_spread_return_sharpe(df)
#     sharpe_ratio.append(sharpe)
#     print("Valid Sharpe: {}, RMSE: {}, MAE: {}".format(sharpe,rmse,mae))

#     del X_train, y_train,  X_val, y_val
#     gc.collect()

# Nothing is appended to sharpe_ratio while the training code above stays
# commented out; np.mean/np.std of an empty list would print nan with a
# RuntimeWarning, so only report the summary when there is something to report.
if sharpe_ratio:
    print("\nAverage cross-validation Sharpe Ratio: {:.4f}, standard deviation = {:.2f}.".format(np.mean(sharpe_ratio),np.std(sharpe_ratio)))
else:
    print("\nNo Sharpe ratios were collected; the per-fold training code is commented out.")




In [None]:
#prices[:10][['RowId','SecuritiesCode','Target']]

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Sharpe ratio of the daily long/short spread return implied by a ranking.

    Args:
        df (pd.DataFrame): predicted results; the Date, Rank and Target columns are read
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio (mean of the daily spread returns divided by their standard deviation)
    """
    def _calc_spread_return_per_day(day_df, size, weight_ratio):
        """
        Args:
            day_df (pd.DataFrame): predicted results for one Date
            size (int): # of equities to buy/sell
            weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        # Ranks must start at 0 and end at (number of rows - 1)
        ranks = day_df['Rank']
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Linearly decreasing weights, from weight_ratio down to 1
        weights = np.linspace(start=weight_ratio, stop=1, num=size)
        weight_norm = weights.mean()

        # Long leg: lowest Rank values first; short leg: highest Rank values first
        top_targets = day_df.sort_values(by='Rank')['Target'][:size]
        bottom_targets = day_df.sort_values(by='Rank', ascending=False)['Target'][:size]

        long_return = (top_targets * weights).sum() / weight_norm
        short_return = (bottom_targets * weights).sum() / weight_norm
        return long_return - short_return

    daily_spread = df.groupby('Date').apply(
        lambda day_df: _calc_spread_return_per_day(day_df, portfolio_size, toprank_weight_ratio)
    )
    return daily_spread.mean() / daily_spread.std()