# Machine Learning
## Final Project LGBM

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt

In [None]:
# Keyword arguments for the LightGBM regressor. In this notebook they are
# only referenced by the commented-out training cell
# (`lgb.LGBMRegressor(**lgb_params)`).
lgb_params = dict(
    objective="regression",
    n_estimators=8000,
    num_leaves=1000,
    learning_rate=0.1,
    seed=1231,
    tree_learner="feature",
    feature_fraction=0.88,
    verbose=1,
    device_type="gpu",
)

In [None]:
def upper_shadow(df):
    """Return ``High`` minus the element-wise larger of ``Close`` and ``Open``."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top
def lower_shadow(df):
    """Return the element-wise smaller of ``Close`` and ``Open`` minus ``Low``."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# A utility function to build features from the original df
def get_features(df_feat):
    """Return a copy of ``df_feat`` extended with engineered price/volume features.

    The input must provide the columns ``Open``, ``High``, ``Low``, ``Close``,
    ``Volume``, ``Count`` and ``VWAP``. Any other columns are carried over
    unchanged. 37 feature columns are appended, in a fixed order.

    The caller's frame is NOT modified: features are written to a copy. This
    avoids pandas' SettingWithCopyWarning when a filtered slice is passed in,
    and keeps raw data and features separate.

    Several features are intentionally duplicated under different names
    (e.g. ``upper_Shadow``/``UPS``, ``spread``/``High-Low``,
    ``High/Mean``/``high2mean``) so that the produced column set stays the
    same as before.

    Divisions are not guarded: a zero denominator yields ``inf``/``NaN`` in
    the corresponding feature, which the caller has to handle.
    """
    # Work on a copy so the caller's data is left untouched.
    df_feat = df_feat.copy()

    open_ = df_feat["Open"]
    high = df_feat["High"]
    low = df_feat["Low"]
    close = df_feat["Close"]
    volume = df_feat["Volume"]
    count = df_feat["Count"]
    vwap = df_feat["VWAP"]

    # Quantities that feed more than one output column are computed once.
    trade = close - open_
    spread = high - low
    high_over_low = high / low
    # Same formulas as the upper/lower shadow helpers in this notebook.
    upper = high - np.maximum(close, open_)
    lower = np.minimum(close, open_) - low
    ohlc = df_feat[["Open", "High", "Low", "Close"]]
    mean_price = ohlc.mean(axis=1)
    median_price = ohlc.median(axis=1)
    # The +1 keeps this ratio finite when Count is 0.
    volume_per_count = volume / (count + 1)

    df_feat["high_div_low"] = high_over_low
    df_feat["open_sub_close"] = open_ - close

    df_feat["trade"] = trade
    df_feat["gtrade"] = trade / count

    df_feat["upper_Shadow"] = upper
    df_feat["lower_Shadow"] = lower
    df_feat["shadow1"] = trade / volume
    df_feat["shadow2"] = upper / low
    df_feat["shadow3"] = upper / volume
    df_feat["shadow4"] = lower / high
    df_feat["shadow5"] = lower / volume

    df_feat["spread"] = spread
    df_feat["mean_trade"] = volume / count
    df_feat["diff1"] = volume - count
    df_feat["mean1"] = (df_feat["shadow5"] + df_feat["shadow3"]) / 2
    df_feat["mean2"] = (df_feat["shadow1"] + volume) / 2
    df_feat["mean3"] = (trade + df_feat["gtrade"]) / 2
    df_feat["mean4"] = (df_feat["diff1"] + upper) / 2
    df_feat["mean5"] = (df_feat["diff1"] + lower) / 2

    # Shadow and range features, the latter normalised by VWAP.
    df_feat["UPS"] = upper
    df_feat["LOS"] = lower
    df_feat["RNG"] = spread / vwap
    df_feat["MOV"] = trade / vwap
    df_feat["CLS"] = (close - vwap) / vwap

    df_feat["Close/Open"] = close / open_
    df_feat["Close-Open"] = trade
    df_feat["High-Low"] = spread
    df_feat["High/Low"] = high_over_low
    df_feat["Mean"] = mean_price
    df_feat["High/Mean"] = high / mean_price
    df_feat["Low/Mean"] = low / mean_price
    df_feat["Volume/Count"] = volume_per_count

    df_feat["high2mean"] = high / mean_price
    df_feat["low2mean"] = low / mean_price
    df_feat["high2median"] = high / median_price
    df_feat["low2median"] = low / median_price
    df_feat["volume2count"] = volume_per_count
    return df_feat

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# import gresearch_crypto
# env = gresearch_crypto.make_env()
# iter_test = env.iter_test()

In [None]:
# Locations of the three competition CSV files.
train_csv = '../input/g-research-crypto-forecasting/train.csv'
supplemental_csv = '../input/g-research-crypto-forecasting/supplemental_train.csv'
asset_details_csv = '../input/g-research-crypto-forecasting/asset_details.csv'

# Load each file into its own DataFrame.
df_train = pd.read_csv(train_csv)
df_train_sup = pd.read_csv(supplemental_csv)
df_asset = pd.read_csv(asset_details_csv)

In [None]:
# def features_selection(df) :
#     features = ["Count", "Open", "High", "Low", "Close", "Volume", "VWAP", "timestamp"]
#     x = df[features].copy()
#     return x

In [None]:
def _drop_non_finite_rows(df):
    """Return ``df`` without rows holding NaN or +/-inf, re-indexed from 0."""
    # `axis` is passed by keyword: older pandas releases deprecated positional
    # arguments to DataFrame.any, and pandas 2.x accepts them by keyword only,
    # so the previous `.any(1)` raises a TypeError there.
    has_bad_value = df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[~has_bad_value].reset_index(drop=True)


df_train = _drop_non_finite_rows(df_train)
df_train_sup = _drop_non_finite_rows(df_train_sup)

# Stack both cleaned sets. ignore_index gives the combined frame a unique
# 0..n-1 index; without it the labels of the two inputs would repeat.
df_combine = pd.concat([df_train, df_train_sup], axis=0, ignore_index=True)

# The individual frames are no longer needed; release their memory.
del df_train, df_train_sup

In [None]:
# Silences pandas' SettingWithCopyWarning for the rest of the session.
pd.options.mode.chained_assignment = None
SEED = 1126
N_ASSETS = 14
# A feature is kept by the correlation filter when its absolute correlation
# with Target exceeds this value.
THRESHOLD = 0.01

# asset_id -> sorted list of the features selected for that asset, kept so the
# result of the selection survives the loop instead of only being printed.
selected_features = {}

for asset_id in range(N_ASSETS):
    print(f"Asset ID = {asset_id}")
    df_sub = df_combine[df_combine["Asset_ID"] == asset_id]
    if df_sub.empty:
        # Nothing to fit on; LassoCV would fail on an empty frame.
        print(f"No rows for Asset ID = {asset_id}, skipping")
        continue
    df_sub = get_features(df_sub)

    # preprocess: drop identifier columns, then split features from the target
    df_sub = df_sub.drop(["timestamp", "Asset_ID"], axis=1)
    X_sub = df_sub.loc[:, df_sub.columns != "Target"]
    y_sub = df_sub["Target"]

    del df_sub

    # The engineered ratio features divide by raw columns, so a zero
    # denominator yields +/-inf. fillna alone leaves those in place and
    # LassoCV rejects non-finite input, so map them to NaN first.
    X_sub = X_sub.replace([np.inf, -np.inf], np.nan).fillna(0)

    # use L1 reg.: keep the features whose Lasso coefficient is non-zero
    clf = LassoCV(random_state=SEED, max_iter=50000, cv=5).fit(X_sub, y_sub)
    final_features_l1Reg = X_sub.columns[(clf.coef_ != 0).flatten()]
    print(f"There are {len(final_features_l1Reg)} features selected by L1 regression")
    print(final_features_l1Reg.tolist())

    # correlation filter: keep the features correlated with the target
    corr_value = X_sub.corrwith(y_sub)
    corr_abs = abs(corr_value).sort_values(ascending=False)
    final_features_corr = corr_abs[corr_abs > THRESHOLD].index
    print(f"There are {len(final_features_corr)} features selected by correlation coefficient")
    print(final_features_corr.tolist())

    # Union of both selections, sorted so the order is the same on every run
    # (plain set iteration order is not stable across interpreter sessions).
    final_features = sorted(set(final_features_l1Reg) | set(final_features_corr))
    selected_features[asset_id] = final_features
    print(f"There are {len(final_features)} final features selected, union by L1 and corr coef. features:")
    print(final_features)

    # Plot the absolute correlations in descending order.
    plt.figure(figsize=(10, 10))
    plt.plot(corr_abs)
    plt.xticks(rotation=45)
    plt.show()
    del X_sub, y_sub

In [None]:
# def data_preprocess(df_data ,asset_id):
#     df = df_data[df_data["Asset_ID"] == asset_id]
#     df = get_features(df)
#     return df

In [None]:
# def training(asset_id):
#     df = data_preprocess(df_train, asset_id)
#     df = df.append(data_preprocess(df_train_sup, asset_id), ignore_index = True)
#     x = features_selection(df)
#     y = df['Target'].copy()
#     model = lgb.LGBMRegressor(**lgb_params)
#     model.fit(x, y)
#     return model

In [None]:
# models = {}
# for asset_id in range(14):
#     print("Training model for", df_asset[df_asset["Asset_ID"] == asset_id].iloc[0]["Asset_Name"])
#     model = training(asset_id)
#     models[asset_id] = model

In [None]:
# for i, (df_test, df_pred) in enumerate(iter_test):
#     for j, df_row in df_test.iterrows():              
#         model = models[df_row['Asset_ID']]
#         x_test = features_selection(df_row)
#         y_pred = model.predict(pd.DataFrame([x_test]))[0]
#         df_pred.loc[df_pred['row_id'] == df_row['row_id'], 'Target'] = y_pred    
#     env.predict(df_pred)