# Machine Learning
## Final Project LightGBM Model

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LassoCV

In [2]:
# --- LightGBM hyper-parameters -------------------------------------------
N_ESTIMATORS = 8000
NUM_LEAVES = 400
# NOTE(review): 1.5 is far above LightGBM's default learning rate of 0.1;
# confirm this is intentional and not a typo.
LEARNING_RATE = 1.5
SEED = 1231
DEVICE_TYPE = "cpu"

# --- Pipeline switches (0 = off, any truthy value = on) -------------------
SUP_DATA = 1                # also load and append supplemental_train.csv
TEST = 0                    # use the experimental LightGBM parameter set
ASSETS_COMBINATIONS = 0     # train one shared model for selected asset pairs
FEATURE_ENGINEERING = 0     # build derived features and run feature selection

# Axis handed to the row-wise mean/median in get_features
# (1 = reduce across the columns of a DataFrame).
feature_axis = 1

# Parameters common to both configurations.
lgb_params = dict(
    objective="regression",
    n_estimators=N_ESTIMATORS,
    num_leaves=NUM_LEAVES,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    verbose=1,
    device_type=DEVICE_TYPE,
)

# The experimental configuration only adds two settings on top of the base.
if TEST:
    lgb_params.update(tree_learner="feature", feature_fraction=0.88)

In [3]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv
/kaggle/input/g-research-crypto-forecasting/asset_details.csv
/kaggle/input/g-research-crypto-forecasting/example_test.csv
/kaggle/input/g-research-crypto-forecasting/train.csv
/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/__init__.py


In [4]:
# Competition API module shipped with the dataset (see the file listing above).
import gresearch_crypto
# Build the competition environment; predictions are submitted through `env.predict`.
# NOTE(review): presumably make_env() may only be called once per kernel session — confirm.
env = gresearch_crypto.make_env()
# Iterator consumed by the inference loop; it yields (test frame, prediction frame) pairs.
iter_test = env.iter_test()

In [5]:
# Load the main training set; it is filtered per asset by data_preprocess.
df_train = pd.read_csv('../input/g-research-crypto-forecasting/train.csv')

In [6]:
# Optionally load the supplemental training set; it is only defined when SUP_DATA is on.
if (SUP_DATA):
    df_train_sup = pd.read_csv('../input/g-research-crypto-forecasting/supplemental_train.csv')

In [7]:
# Asset metadata; the training loop reads its Asset_ID and Asset_Name columns for log messages.
df_asset = pd.read_csv('../input/g-research-crypto-forecasting/asset_details.csv')

In [8]:
# Drop every row that holds a NaN or +/-inf in any column, then renumber the index.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally (`.any(1)`).
df_train = df_train[~df_train.isin([np.nan, np.inf, -np.inf]).any(axis=1)].reset_index(drop=True)

In [9]:
# Apply the same cleaning to the supplemental set: drop rows holding a NaN or +/-inf
# in any column. `axis` is passed by keyword because pandas >= 2.0 rejects `.any(1)`.
if (SUP_DATA):
    df_train_sup = df_train_sup[~df_train_sup.isin([np.nan, np.inf, -np.inf]).any(axis=1)].reset_index(drop=True)

In [10]:
def get_features(df_feat):
    """Add derived candle features to ``df_feat`` and return it.

    Parameters
    ----------
    df_feat : pandas.DataFrame or pandas.Series
        Either a frame of candles (one per row) or a single candle as a
        Series. It must provide ``Count``, ``Open``, ``High``, ``Low``,
        ``Close``, ``Volume`` and ``VWAP``.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The very same object that was passed in, extended in place with 38
        new entries. Several of them are exact duplicates under different
        names (e.g. ``spread`` / ``High-Low``, ``upper_Shadow`` / ``UPS``);
        they are kept so that the produced column set stays unchanged.

    Notes
    -----
    The reduction axis for the OHLC mean/median is derived from the input
    type rather than read from the notebook-level ``feature_axis`` variable,
    so the result no longer depends on which cells were executed before.
    Divisions are not guarded: a zero ``Count``, ``Volume``, ``Low`` etc.
    yields ``inf``/``NaN`` in the corresponding feature.
    """
    # A DataFrame is reduced across its columns (per row); a Series holds a
    # single candle, so its only axis is reduced.
    ohlc_axis = 1 if isinstance(df_feat, pd.DataFrame) else 0
    ohlc_columns = ['Open', 'High', 'Low', 'Close']

    df_feat["high_div_low"] = df_feat["High"] / df_feat["Low"]
    df_feat["open_sub_close"] = df_feat["Open"] - df_feat["Close"]
    df_feat['trade'] = df_feat['Close'] - df_feat['Open']
    df_feat['gtrade'] = df_feat['trade'] / df_feat['Count']

    # Candle wicks: distance from the body to the high / low.
    df_feat['upper_Shadow'] = df_feat['High'] - np.maximum(df_feat['Close'], df_feat['Open'])
    df_feat['lower_Shadow'] = np.minimum(df_feat['Close'], df_feat['Open']) - df_feat['Low']
    df_feat['shadow1'] = df_feat['trade'] / df_feat['Volume']
    df_feat['shadow2'] = df_feat['upper_Shadow'] / df_feat['Low']
    df_feat['shadow3'] = df_feat['upper_Shadow'] / df_feat['Volume']
    df_feat['shadow4'] = df_feat['lower_Shadow'] / df_feat['High']
    df_feat['shadow5'] = df_feat['lower_Shadow'] / df_feat['Volume']

    df_feat['spread'] = df_feat['High'] - df_feat['Low']
    df_feat['mean_trade'] = df_feat['Volume'] / df_feat['Count']
    df_feat['diff1'] = df_feat['Volume'] - df_feat['Count']

    # Pairwise averages of previously computed features.
    df_feat['mean1'] = (df_feat['shadow5'] + df_feat['shadow3']) / 2
    df_feat['mean2'] = (df_feat['shadow1'] + df_feat['Volume']) / 2
    df_feat['mean3'] = (df_feat['trade'] + df_feat['gtrade']) / 2
    df_feat['mean4'] = (df_feat['diff1'] + df_feat['upper_Shadow']) / 2
    df_feat['mean5'] = (df_feat['diff1'] + df_feat['lower_Shadow']) / 2

    # Same wick definitions again under short names, plus VWAP-normalised moves.
    df_feat['UPS'] = df_feat['High'] - np.maximum(df_feat['Close'], df_feat['Open'])
    df_feat['LOS'] = np.minimum(df_feat['Close'], df_feat['Open']) - df_feat['Low']
    df_feat['RNG'] = (df_feat['High'] - df_feat['Low']) / df_feat['VWAP']
    df_feat['MOV'] = (df_feat['Close'] - df_feat['Open']) / df_feat['VWAP']
    df_feat['CLS'] = (df_feat['Close'] - df_feat['VWAP']) / df_feat['VWAP']

    df_feat["Close/Open"] = df_feat["Close"] / df_feat["Open"]
    df_feat["Close-Open"] = df_feat["Close"] - df_feat["Open"]
    df_feat["High-Low"] = df_feat["High"] - df_feat["Low"]
    df_feat["High/Low"] = df_feat["High"] / df_feat["Low"]

    df_feat['Mean'] = df_feat[ohlc_columns].mean(axis=ohlc_axis)
    df_feat["High/Mean"] = df_feat["High"] / df_feat["Mean"]
    df_feat["Low/Mean"] = df_feat["Low"] / df_feat["Mean"]
    # The +1 in the denominator avoids a division by zero when Count is 0.
    df_feat["Volume/Count"] = df_feat["Volume"] / (df_feat["Count"] + 1)

    df_feat['Median'] = df_feat[ohlc_columns].median(axis=ohlc_axis)
    df_feat['high2mean'] = df_feat['High'] / df_feat['Mean']
    df_feat['low2mean'] = df_feat['Low'] / df_feat['Mean']
    df_feat['high2median'] = df_feat['High'] / df_feat['Median']
    df_feat['low2median'] = df_feat['Low'] / df_feat['Median']
    df_feat['volume2count'] = df_feat['Volume'] / (df_feat['Count'] + 1)
    return df_feat

In [11]:
def features_selection(df, threshold=0.01):
    """Select the feature columns of ``df`` that look informative for ``Target``.

    A column is kept when at least one of the following holds:

    * a cross-validated Lasso fit (``LassoCV`` with 5 folds) gives it a
      non-zero coefficient;
    * the absolute value of its correlation with ``Target`` (as computed by
      ``DataFrame.corrwith``) exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing ``timestamp``, ``Asset_ID``, ``Target`` and
        the candidate feature columns.
    threshold : float, default 0.01
        Minimum absolute correlation with ``Target`` for the correlation
        criterion.

    Returns
    -------
    list
        Names of the selected columns, in the order in which they appear in
        ``df``. The order is deterministic; building the result from a
        ``set`` made it vary from one kernel run to the next.
    """
    candidates = df.drop(["timestamp", "Asset_ID"], axis=1)
    y_sub = candidates["Target"]
    # Missing feature values are replaced by 0 before fitting.
    # NOTE(review): fillna does not touch +/-inf, which the ratio features can
    # produce on zero denominators — confirm the inputs are finite.
    X_sub = candidates.loc[:, candidates.columns != "Target"].fillna(0)
    del candidates

    # Criterion 1: non-zero L1-regularised coefficient.
    clf = LassoCV(random_state=SEED, max_iter=50000, cv=5).fit(X_sub, y_sub)
    lasso_selected = X_sub.columns[(abs(clf.coef_) > 0).flatten()]

    # Criterion 2: absolute correlation with the target above the threshold.
    corr_abs = abs(X_sub.corrwith(y_sub))
    corr_selected = corr_abs[corr_abs > threshold].index

    # Union of both criteria, emitted in the frame's own column order so the
    # feature order handed to the model is reproducible.
    selected = set(lasso_selected).union(corr_selected)
    return [column for column in X_sub.columns if column in selected]

In [12]:
def data_preprocess(asset_id):
    """Build the training frame for one asset.

    Rows of ``df_train`` whose ``Asset_ID`` equals ``asset_id`` are copied;
    when ``SUP_DATA`` is on, the matching rows of ``df_train_sup`` are
    appended after them (original index labels are kept). When
    ``FEATURE_ENGINEERING`` is on, the derived features of ``get_features``
    are added.

    Parameters
    ----------
    asset_id : int
        Value of the ``Asset_ID`` column to select.

    Returns
    -------
    pandas.DataFrame
        The per-asset training frame.
    """
    df = df_train[df_train["Asset_ID"] == asset_id].copy()
    if (SUP_DATA):
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat([df, df_train_sup[df_train_sup["Asset_ID"] == asset_id]])
    if (FEATURE_ENGINEERING):
        df = get_features(df)
    return df

In [13]:
def data_postprocess(df, features):
    """Return an independent copy of ``df`` restricted to ``features``.

    ``df`` may be a DataFrame (columns are selected) or a Series (labels are
    selected); the caller's object is left untouched.
    """
    return df[features].copy()

In [14]:
def training(asset_id):
    """Train one LightGBM regressor for ``asset_id``.

    When ``ASSETS_COMBINATIONS`` is on, assets 0, 1 and 2 are trained on their
    own rows plus the rows of assets 3, 6 and 9 respectively; every other
    asset is trained on its own rows only.

    Parameters
    ----------
    asset_id : int
        Asset whose model is trained.

    Returns
    -------
    tuple
        ``(model, features)``: the fitted ``lgb.LGBMRegressor`` and the list
        of column names it was trained on.
    """
    # Asset whose data is merged into the training set of the key asset.
    combined_with = {0: 3, 1: 6, 2: 9}

    df = data_preprocess(asset_id)
    if ASSETS_COMBINATIONS and asset_id in combined_with:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat(
            [df, data_preprocess(combined_with[asset_id])],
            ignore_index=True,
        )

    if (FEATURE_ENGINEERING):
        features = features_selection(df)
    else:
        # Raw candle columns, used as-is when no features are engineered.
        features = ["Count", "Open", "High", "Low", "Close", "Volume", "VWAP", "timestamp"]

    x = data_postprocess(df, features)
    y = df['Target'].copy()
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(x, y)
    return model, features

In [15]:
# Fitted model and feature list per asset id, filled by the loop below.
models = {}
models_features = {}

# With ASSETS_COMBINATIONS on, the asset on the left shares one model with
# the asset on the right; with it off, every asset gets its own model.
shared_model_partner = {0: 3, 1: 6, 2: 9} if ASSETS_COMBINATIONS else {}

for asset_id in range(14):
    # Partner assets are covered when their lead asset is trained.
    if asset_id in shared_model_partner.values():
        continue

    partner_id = shared_model_partner.get(asset_id)
    asset_name = df_asset[df_asset["Asset_ID"] == asset_id].iloc[0]["Asset_Name"]
    if partner_id is None:
        print("Training model for", asset_name)
    else:
        partner_name = df_asset[df_asset["Asset_ID"] == partner_id].iloc[0]["Asset_Name"]
        print("Training model for", asset_name, "and", partner_name)

    model, features = training(asset_id)
    models[asset_id] = model
    models_features[asset_id] = features
    if partner_id is not None:
        # Register the very same model object and feature list for the partner.
        models[partner_id] = model
        models_features[partner_id] = features

Training model for Binance Coin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 2088840, number of used features: 8
[LightGBM] [Info] Start training from score 0.000031
Training model for Bitcoin
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 2115798, number of used features: 8
[LightGBM] [Info] Start training from score -0.000001
Training model for Bitcoin Cash
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 2108463, number of used features: 8
[LightGBM] [Info] Start training from score -0.000006
Training model for Cardano
You can set `force_col_wise=true` t

In [16]:
# Score every batch served by the competition API and submit the predictions.
#
# Each batch is predicted one asset at a time on whole DataFrames instead of
# one row at a time. get_features therefore receives the same kind of input
# as during training, so the notebook-level `feature_axis` no longer has to be
# switched here (flipping it made re-running the training cells unreliable).
for df_test, df_pred in iter_test:
    predicted_targets = {}  # row_id -> predicted Target for this batch
    for asset_id, asset_rows in df_test.groupby("Asset_ID"):
        if (FEATURE_ENGINEERING):
            # Copy first: get_features adds its columns to the object it is given.
            asset_rows = get_features(asset_rows.copy())
        x_test = data_postprocess(asset_rows, models_features[asset_id])
        y_pred = models[asset_id].predict(x_test)
        predicted_targets.update(zip(asset_rows["row_id"], y_pred))

    # Rows of df_pred without a matching row_id keep their current Target.
    if "Target" in df_pred:
        current_targets = df_pred["Target"]
    else:
        current_targets = [np.nan] * len(df_pred)
    df_pred["Target"] = [
        predicted_targets.get(row_id, current)
        for row_id, current in zip(df_pred["row_id"], current_targets)
    ]
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
