## Feature Preprocessing Phase

In [None]:
from glob import glob
import pandas as pd
import yfinance as yf

# Flag carried by the notebook; it is not referenced anywhere else in the
# visible cells.
MAIN_SOURCE = True

# Inclusive hour-of-day bounds used later to keep only the wanted hourly rows
# of the live-prediction day.
START_HOUR = 9
END_HOUR = 18

# Live-prediction window as "YYYY-MM-DD" strings: it starts tomorrow
# (relative to the moment the cell runs) and ends one day after that.
LIVE_PRED_START = (
    pd.Timestamp.today() + pd.Timedelta(days=1)
).strftime("%Y-%m-%d")
LIVE_PRED_END = (
    pd.Timestamp(LIVE_PRED_START) + pd.Timedelta(days=1)
).strftime("%Y-%m-%d")

print("Pred Date Start: ", LIVE_PRED_START, "Pred Date End: ", LIVE_PRED_END)

# Read every file found under ../Dataset and stack them into one long frame.
stock_price_files = glob("../Dataset/*")
stock_price_list = [pd.read_csv(csv_path) for csv_path in stock_price_files]

stock_price_df = (
    pd.concat(stock_price_list, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Parse the timestamps and strip any timezone information from them.
stock_price_df["timestamp"] = pd.to_datetime(
    stock_price_df["timestamp"]
).dt.tz_localize(None)

# Order by stock and time, then de-duplicate once more now that the
# timestamps have been normalised.
stock_price_df = stock_price_df.sort_values(
    by=["short_name", "timestamp"], ignore_index=True
).drop_duplicates(ignore_index=True)

# Calendar date and hour-of-day of every observation.
stock_price_df["date"] = stock_price_df["timestamp"].dt.date
stock_price_df["hour"] = stock_price_df["timestamp"].dt.hour

# Yahoo Features

# Date range requested from Yahoo Finance; the end follows the
# live-prediction window.
START_DATE = "2018-01-02"
END_DATE = LIVE_PRED_END

def pull_yahoo_features(stock_name, start_date, end_date, action=True):
    """Download daily Yahoo Finance history for one ticker in long format.

    Args:
        stock_name: Yahoo ticker symbol, e.g. ``"XU030.IS"`` or ``"GC=F"``.
        start_date: Passed to ``Ticker.history(start=...)``.
        end_date: Passed to ``Ticker.history(end=...)``.
        action: When true, the dividend and stock-split columns are requested
            and kept; otherwise only the OHLC and volume columns are used.

    Returns:
        A melted frame with the columns ``date``, ``variable``, ``value`` and
        ``short_name`` (the ticker text before its first ``"."``). For
        tickers that do not contain ``".IS"`` the ``VOLUME`` rows are absent.

    Raises:
        KeyError: If Yahoo does not return one of the expected columns.
    """
    stock_data = yf.Ticker(stock_name).history(
        start=start_date, end=end_date, interval="1d", actions=action
    ).reset_index()

    # Rename by source column name instead of by position, so an extra column
    # in the response cannot shift the labels or trigger a length-mismatch
    # error; unexpected columns are simply dropped.
    column_map = {
        "Open": "OPEN",
        "High": "HIGH",
        "Low": "LOW",
        "Close": "CLOSE",
        "Volume": "VOLUME",
    }
    if action:
        column_map.update(
            {"Dividends": "DIVIDENDS", "Stock Splits": "STOCKSPLITS"}
        )
    # The first column is the former date index, whatever its name was.
    stock_data = stock_data.rename(
        columns={stock_data.columns[0]: "date", **column_map}
    )
    stock_data = stock_data[["date", *column_map.values()]].copy()

    dates = pd.to_datetime(stock_data["date"])
    if ".IS" in stock_name:
        # ".IS" tickers: keep the wall-clock value and drop the timezone.
        dates = dates.dt.tz_localize(None)
    else:
        # Other tickers: express the timestamp in Istanbul time before
        # dropping the timezone, and discard their volume column.
        dates = dates.dt.tz_convert("Europe/Istanbul").dt.tz_localize(None)
        stock_data = stock_data.drop(columns=["VOLUME"])

    stock_data["date"] = dates.dt.date
    stock_data = stock_data.melt(id_vars=["date"])
    stock_data["short_name"] = stock_name.split(".")[0]

    return stock_data

# Yahoo tickers: every stock in the price data with an ".IS" suffix, plus the
# extra "XU030.IS" ticker.
stock_name_yahoo = [
    f"{short_name}.IS" for short_name in stock_price_df["short_name"].unique()
]
stock_name_yahoo.append("XU030.IS")

# bist features

bist_stocks = pd.concat(
    [
        pull_yahoo_features(ticker, START_DATE, END_DATE)
        for ticker in stock_name_yahoo
    ],
    ignore_index=True,
)

# Remove the dividend / stock-split rows that belong to XU030.
drop_indices = bist_stocks.loc[
    (bist_stocks["short_name"] == "XU030")
    & bist_stocks["variable"].isin(["DIVIDENDS", "STOCKSPLITS"])
].index.to_list()

bist_stocks = bist_stocks.drop(index=drop_indices).reset_index(drop=True)

# global features

global_stocks_name = ["CL=F", "USDTRY=X", "EURTRY=X", "GC=F"]

global_stocks = pd.concat(
    [
        pull_yahoo_features(ticker, START_DATE, END_DATE, action=False)
        for ticker in global_stocks_name
    ],
    ignore_index=True,
)

# global markets

global_market_name = ["^N225", "^NY", "^NDX"]

global_markets = pd.concat(
    [
        pull_yahoo_features(ticker, START_DATE, END_DATE, action=False)
        for ticker in global_market_name
    ],
    ignore_index=True,
)

# One long table holding every Yahoo-sourced series.
all_ohlc_features = pd.concat(
    [bist_stocks, global_stocks, global_markets], ignore_index=True
)

# TR related features

# Inflation table read from a local CSV; its three columns are relabelled as
# date, yearly change and monthly change.
inflation_tr = pd.read_csv("/home/ssc/Desktop/Untitled 1.csv")
inflation_tr.columns = ["date", "YEARLYCHNG", "MONTHLYCHNG"]
inflation_tr["date"] = pd.to_datetime(inflation_tr["date"]).dt.date

# Divide both change columns by 100.
inflation_tr[["YEARLYCHNG", "MONTHLYCHNG"]] = inflation_tr[
    ["YEARLYCHNG", "MONTHLYCHNG"]
].div(100)

# Same long layout as the Yahoo features, tagged with the INFRATE name.
inflation_tr = inflation_tr.melt(id_vars=["date"])
inflation_tr["short_name"] = "INFRATE"

overall_external_features = pd.concat(
    [all_ohlc_features, inflation_tr], ignore_index=True
)

# Prefix each variable with its series name, giving "<short_name>_<variable>".
overall_external_features["variable"] = overall_external_features[
    "short_name"
].str.cat(overall_external_features["variable"], sep="_")

# Stock identifiers in order of first appearance in the price data.
org_stock_names = stock_price_df["short_name"].unique().tolist()

# Wide layout: one row per (timestamp, date, hour), one price column per stock.
stock_price_pivot = pd.pivot_table(
    stock_price_df,
    index=["timestamp", "date", "hour"],
    columns="short_name",
    values="price",
).reset_index().rename_axis(None, axis=1)

# Fill gaps in each price column with DataFrame.interpolate() defaults.
stock_price_pivot[org_stock_names] = (
    stock_price_pivot[org_stock_names].interpolate()
)

# Hourly timestamps covering [LIVE_PRED_START, LIVE_PRED_END). The right edge
# is trimmed with a mask instead of date_range's `closed=` argument, which was
# deprecated in pandas 1.4 and removed in 2.0; the mask behaves the same on
# every pandas version.
live_pred_empty_data = pd.date_range(
    start=LIVE_PRED_START, end=LIVE_PRED_END, freq="h"
)
live_pred_empty_data = live_pred_empty_data[
    live_pred_empty_data < pd.Timestamp(LIVE_PRED_END)
]

live_pred_empty_df = pd.DataFrame(live_pred_empty_data, columns=["timestamp"])
live_pred_empty_df["date"] = live_pred_empty_df.timestamp.dt.date
live_pred_empty_df["hour"] = live_pred_empty_df.timestamp.dt.hour

# Keep only the hours between START_HOUR and END_HOUR (both inclusive).
live_pred_empty_df = live_pred_empty_df[
    (live_pred_empty_df["hour"] >= START_HOUR)
    & (live_pred_empty_df["hour"] <= END_HOUR)
].reset_index(drop=True)

# Append the still-empty live-prediction rows to the price table; their price
# columns are NaN after the outer merge.
stock_price_pivot = stock_price_pivot.merge(
    live_pred_empty_df, on=["date", "hour", "timestamp"], how="outer"
)

# Wide layout of the external features: one row per date, one column per
# "<short_name>_<variable>" series.
overall_external_features_pivot = pd.pivot_table(
    overall_external_features,
    index=["date"],
    columns="variable",
    values="value",
).reset_index().rename_axis(None, axis=1)

# Carry the last known inflation figures forward onto the following dates.
overall_external_features_pivot[
    ["INFRATE_MONTHLYCHNG", "INFRATE_YEARLYCHNG"]
] = overall_external_features_pivot[
    ["INFRATE_MONTHLYCHNG", "INFRATE_YEARLYCHNG"]
].ffill()

# Fill the remaining gaps with DataFrame.interpolate() defaults.
overall_external_features_pivot = overall_external_features_pivot.interpolate()

from itertools import product


# Rolling-window lengths in days; each is turned into `day * 10` rows below
# (presumably 10 hourly rows per day, hours 9..18 -- TODO confirm).
ROLL_DAYS = [5, 20, 60, 120]
rolling_iterations = product(org_stock_names, ROLL_DAYS)

rolling_df = stock_price_pivot[["timestamp", "date", "hour"]].copy()
for stock, day in rolling_iterations:
    for stat_name in ("mean", "std", "median", "min", "max"):
        # closed="left" leaves the current row out of its own window.
        window = stock_price_pivot[stock].rolling(day * 10, closed="left")
        rolling_df[f"ROLLING_{day}DAY_{stat_name.upper()}_{stock}"] = getattr(
            window, stat_name
        )()

# Lag lengths in days; each lag shifts a price column by `10 * day` rows.
LAG_DAYS = [1, 2, 3, 4, 5]
lag_iterations = product(org_stock_names, LAG_DAYS)

# Key columns plus one lagged price column per (stock, lag) pair, in
# iteration order.
lag_df = stock_price_pivot[["timestamp", "date", "hour"]].assign(
    **{
        f"LAG_{day}DAY_{stock}": stock_price_pivot[stock].shift(10 * day)
        for stock, day in lag_iterations
    }
)

# Rolling and lag features side by side, joined on their shared key columns.
ar_features = pd.merge(rolling_df, lag_df, how="left")

# For each window length, the rolling mean of every column whose name
# contains "VOLUME", with the columns prefixed "ROLLING_<day>DAY_MEAN_".
overall_rolling_vols = [
    overall_external_features_pivot.filter(like="VOLUME")
    .rolling(day)
    .mean()
    .add_prefix(f"ROLLING_{day}DAY_MEAN_")
    for day in ROLL_DAYS
]
overall_rolling_vols_df = pd.concat(overall_rolling_vols, axis=1)

# Attach the rolling volume columns to the external feature table.
overall_external_features_pivot = pd.concat(
    [overall_external_features_pivot, overall_rolling_vols_df], axis=1
)

# Target
import numpy as np

response_df = stock_price_pivot.copy()

# Replace each price with the relative change from this row to the next one
# (percentage change, shifted one row up).
response_df[org_stock_names] = (
    response_df[org_stock_names]
    .pct_change(periods=1, limit=1)
    .shift(-1)
)

drop_cols = ["timestamp", "date", *org_stock_names]

# Attach the daily external features by date, then the rolling / lag features
# on their shared key columns.
model_df = pd.merge(
    response_df, overall_external_features_pivot, how="left", on=["date"]
)
model_df = pd.merge(model_df, ar_features, how="left")
model_df["date"] = pd.to_datetime(model_df["date"])

# Calendar features derived from the date.
model_df = model_df.assign(
    year=model_df["date"].dt.year,
    month=model_df["date"].dt.month,
    day=model_df["date"].dt.day,
    dow=model_df["date"].dt.dayofweek,
    quarter=model_df["date"].dt.quarter,
    doy=model_df["date"].dt.dayofyear,
    woy=model_df["date"].dt.isocalendar().week.astype("int"),
)

# 0/1 flags: day-of-week 0 before hour 13, and day-of-week 4 from hour 13 on.
model_df["is_monday_morning"] = (
    model_df["dow"].eq(0) & model_df["hour"].lt(13)
).astype("int64")
model_df["is_friday_noon"] = (
    model_df["dow"].eq(4) & model_df["hour"].ge(13)
).astype("int64")

# Store the date as text again.
model_df["date"] = model_df["date"].astype(str)

## Modelling Phase

In [None]:
import pandas as pd
from glob import glob

# Live-prediction window as "YYYY-MM-DD" strings: it starts tomorrow
# (relative to the moment the cell runs) and ends one day after that.
LIVE_PRED_START = (
    pd.Timestamp.today() + pd.Timedelta(days=1)
).strftime("%Y-%m-%d")
LIVE_PRED_END = (
    pd.Timestamp(LIVE_PRED_START) + pd.Timedelta(days=1)
).strftime("%Y-%m-%d")

# Day whose hour-18 prices serve as the baseline when predicted returns are
# turned back into prices in the submission cell.
latest_actual_date = "2024-01-11"

# Inclusive hour-of-day bounds for the live-prediction rows.
START_HOUR = 9
END_HOUR = 18

# Read every file found under ../Dataset and stack them into one long frame.
stock_price_files = glob("../Dataset/*")
stock_price_list = [pd.read_csv(csv_path) for csv_path in stock_price_files]

stock_price_df = (
    pd.concat(stock_price_list, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Parse the timestamps and strip any timezone information from them.
stock_price_df["timestamp"] = pd.to_datetime(
    stock_price_df["timestamp"]
).dt.tz_localize(None)

# Order by stock and time, then de-duplicate once more now that the
# timestamps have been normalised.
stock_price_df = stock_price_df.sort_values(
    by=["short_name", "timestamp"], ignore_index=True
).drop_duplicates(ignore_index=True)

# Calendar date and hour-of-day of every observation.
stock_price_df["date"] = stock_price_df["timestamp"].dt.date
stock_price_df["hour"] = stock_price_df["timestamp"].dt.hour

# Stock identifiers in order of first appearance; used as target columns.
stock_names = stock_price_df["short_name"].unique().tolist()

# Feature table from the preprocessing cell, without the per-stock columns.
processed = model_df.copy()
predictors = processed.drop(stock_names, axis=1)

# Wide raw prices: one row per (timestamp, date, hour), one column per stock.
target_raw = pd.pivot_table(
    stock_price_df,
    index=["timestamp", "date", "hour"],
    columns="short_name",
    values="price",
).reset_index().rename_axis(None, axis=1)

# Fill gaps in each price column with DataFrame.interpolate() defaults.
target_raw[stock_names] = target_raw[stock_names].interpolate()

# Target TL Diff
# The price rose or fell by X TL in the following hour

# Hourly timestamps covering [LIVE_PRED_START, LIVE_PRED_END). The right edge
# is trimmed with a mask instead of date_range's `closed=` argument, which was
# deprecated in pandas 1.4 and removed in 2.0; the mask behaves the same on
# every pandas version.
live_pred_empty_data = pd.date_range(
    start=LIVE_PRED_START, end=LIVE_PRED_END, freq="h"
)
live_pred_empty_data = live_pred_empty_data[
    live_pred_empty_data < pd.Timestamp(LIVE_PRED_END)
]

live_pred_empty_df = pd.DataFrame(live_pred_empty_data, columns=["timestamp"])
live_pred_empty_df["date"] = live_pred_empty_df.timestamp.dt.date
live_pred_empty_df["hour"] = live_pred_empty_df.timestamp.dt.hour

# Keep only the hours between START_HOUR and END_HOUR (both inclusive).
live_pred_empty_df = live_pred_empty_df[
    (live_pred_empty_df["hour"] >= START_HOUR)
    & (live_pred_empty_df["hour"] <= END_HOUR)
].reset_index(drop=True)

# Only the (date, hour) keys are needed to extend the target frames.
live_pred_empty_df = live_pred_empty_df[["date", "hour"]].copy()

# Three target frames built from the raw prices. Every row describes the move
# from that row to the next one (computed, then shifted one row up), and each
# frame is extended with the empty live-prediction (date, hour) rows.

# Absolute price difference.
target_try_return = pd.concat(
    [
        target_raw[["date", "hour"]],
        target_raw[stock_names].diff(periods=1).shift(-1),
    ],
    axis=1,
).merge(live_pred_empty_df, on=["date", "hour"], how="outer")

# Difference of log prices.
target_log_return = pd.concat(
    [
        target_raw[["date", "hour"]],
        np.log(target_raw[stock_names]).diff(periods=1).shift(-1),
    ],
    axis=1,
).merge(live_pred_empty_df, on=["date", "hour"], how="outer")

# Relative (percentage) change.
target_pct_return = pd.concat(
    [
        target_raw[["date", "hour"]],
        target_raw[stock_names].pct_change(periods=1, limit=1).shift(-1),
    ],
    axis=1,
).merge(live_pred_empty_df, on=["date", "hour"], how="outer")

import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

# First day of the test (prediction) period; set to the first day of the
# live-prediction window.
TEST_START = LIVE_PRED_START

# Base LightGBM parameter set; copies of it with a different objective are
# derived for the other target types.
param = dict(
    objective="regression",
    boosting_type="rf",
    lambda_l1=8.816979942542755,
    lambda_l2=3.281542231961584e-06,
    num_leaves=138,
    feature_fraction=0.9948710619824198,
    bagging_fraction=0.43847967094014917,
    min_child_samples=92,
    max_depth=5,
    num_iterations=114,
    boost_from_average=True,
    learning_rate=0.04548612752166902,
    bagging_freq=1,
)

def different_targets_to_model(predictors_df, target_df, test_start, params):
    """Fit one LightGBM regressor per stock and predict the test period.

    Args:
        predictors_df: Feature frame with a ``date`` column and the key
            columns it shares with ``target_df``.
        target_df: Target frame with a ``date`` column and one column per
            entry of the module-level ``stock_names``.
        test_start: Rows dated before this value are used for training; rows
            dated on or after it are predicted.
        params: Keyword arguments for ``lgb.LGBMRegressor``.

    Returns:
        The array returned by ``MultiOutputRegressor.predict`` for the test
        rows, with one output per entry of ``stock_names``.
    """
    # Convert the dates on fresh frames so the caller's inputs are left
    # untouched (previously their ``date`` columns were overwritten in place).
    predictors_df = predictors_df.assign(
        date=pd.to_datetime(predictors_df["date"])
    )
    target_df = target_df.assign(date=pd.to_datetime(target_df["date"]))

    # Right join on the shared columns: every predictor row is kept.
    model_df = target_df.merge(predictors_df, how="right")

    drop_cols = ["timestamp", "date"] + stock_names

    # Training rows must be complete; test rows are kept even though their
    # targets are unknown.
    train_data = (
        model_df[model_df["date"] < test_start]
        .dropna(axis=0)
        .reset_index(drop=True)
    )
    test_data = model_df[model_df["date"] >= test_start].reset_index(drop=True)

    X_train = train_data.drop(columns=drop_cols)
    y_train = train_data[stock_names].copy()
    X_test = test_data.drop(columns=drop_cols)

    model = MultiOutputRegressor(
        lgb.LGBMRegressor(**params),
        n_jobs=-1)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    return preds

# Same hyper-parameters as `param`, with only the objective swapped.
log_param = {**param, "objective": "mape"}
pct_param = {**param, "objective": "regression_l1"}

# One model run per target definition; each call receives copies of the
# shared frames.
pct_return_preds = different_targets_to_model(
    predictors.copy(),
    target_pct_return.copy(),
    TEST_START,
    params=pct_param,
)
log_return_preds = different_targets_to_model(
    predictors.copy(),
    target_log_return.copy(),
    TEST_START,
    params=log_param,
)
try_return_preds = different_targets_to_model(
    predictors.copy(),
    target_try_return.copy(),
    TEST_START,
    params=param,
)

## Submission Phase

In [None]:
def final_predicion(raw_preds, latest_actual_date, is_pct_return=False,
                    is_log_return=False):
    """Turn predicted returns into predicted prices.

    The baseline for every stock is its price at hour 18 on
    ``latest_actual_date`` in the module-level ``target_raw``; each predicted
    return is applied to that baseline.

    Args:
        raw_preds: Predictions for the test rows, one column per entry of
            ``stock_names``.
        latest_actual_date: Day whose hour-18 prices are the baseline.
        is_pct_return: Predictions are relative changes; the price is
            ``baseline * (1 + r)``.
        is_log_return: Predictions are log-price differences; the price is
            ``baseline * exp(r)``.

    Returns:
        A long frame with ``date``, ``hour``, ``short_name``,
        ``prediction_return``, ``value`` (the baseline) and
        ``prediction_price``. With both flags false the predictions are
        treated as absolute differences: ``baseline + r``.

    Raises:
        ValueError: If both ``is_pct_return`` and ``is_log_return`` are set.
    """
    if is_pct_return and is_log_return:
        raise ValueError("is_pct_return and is_log_return are mutually exclusive")

    # The date columns hold datetime.date objects. Compare on datetime64
    # values, because comparing date objects with a Timestamp is deprecated
    # in pandas 1.x and no longer works in pandas 2.
    is_baseline_row = (
        pd.to_datetime(target_raw["date"]) == pd.Timestamp(latest_actual_date)
    ) & (target_raw["hour"] == 18)
    close_baseline = target_raw[is_baseline_row].reset_index(drop=True)
    close_baseline = close_baseline.melt(id_vars=["timestamp", "date", "hour"]).reset_index(drop=True)
    close_baseline.drop(columns=["date", "hour", "timestamp"], inplace=True)

    in_test_period = (
        pd.to_datetime(target_pct_return["date"]) >= pd.Timestamp(TEST_START)
    )
    test_period = target_pct_return.loc[in_test_period, ["date", "hour"]].copy()
    test_period[stock_names] = raw_preds
    test_period = test_period.melt(id_vars=["date", "hour"], value_name="prediction_return")

    # Joined on the shared "variable" column (the stock name).
    test_results = test_period.merge(close_baseline, how="left")

    if is_log_return:
        # A log return r = log(p_next) - log(p) maps back to p * exp(r);
        # adding it to the price as if it were a TL difference is wrong.
        test_results["prediction_price"] = test_results["value"] * np.exp(test_results["prediction_return"])
    elif is_pct_return:
        test_results["prediction_price"] = test_results["value"] * (1 + test_results["prediction_return"])
    else:
        test_results["prediction_price"] = test_results["value"] + test_results["prediction_return"]
    test_results.rename(columns={"variable": "short_name"}, inplace=True)

    return test_results

pct_df = final_predicion(pct_return_preds, latest_actual_date, True)[["date", "hour", "short_name", "prediction_price"]]
log_df = final_predicion(log_return_preds, latest_actual_date, is_log_return=True)[["date", "hour", "short_name", "prediction_price"]]
try_df = final_predicion(try_return_preds, latest_actual_date, False)[["date", "hour", "short_name", "prediction_price"]]

# Simple ensemble: the row-wise mean of the three price predictions. The
# values are combined by position, so the three frames must share row order.
ensemble_pred_df = pct_df[["date", "hour", "short_name"]].copy()
ensemble_pred_df["prediction"] = np.stack(
    [
        pct_df.prediction_price.values,
        log_df.prediction_price.values,
        try_df.prediction_price.values,
    ]
).mean(axis=0)

import json

# Keep only the live-prediction day. The comparison is done on datetime64
# values so it also works when the date column holds datetime.date objects,
# which pandas 2 never treats as equal to a Timestamp.
submission_data = ensemble_pred_df[
    pd.to_datetime(ensemble_pred_df["date"]) == pd.Timestamp(LIVE_PRED_START)
].reset_index(drop=True)

# One row per stock, one column per hour.
submission_data = submission_data.pivot_table(
    index=["date", "short_name"], columns="hour", values="prediction"
).reset_index().rename_axis(None, axis=1)

submission_dict = {}
for stock in stock_names:
    # A boolean mask instead of a query string, so stock names containing
    # quotes cannot break the expression.
    stock_rows = submission_data[submission_data["short_name"] == stock]
    if stock_rows.empty:
        raise ValueError(
            f"No predictions found for {stock} on {LIVE_PRED_START}"
        )
    # Columns 0 and 1 are date and short_name; the rest are the hourly values.
    preds = stock_rows.iloc[0, 2:].to_list()
    submission_dict[stock] = preds

with open(f"./{LIVE_PRED_START}_stock_predictions.json", "w") as stock_file:
    json.dump(submission_dict, stock_file)