**Parent kernel: [link](https://www.kaggle.com/julian3833/g-research-starter-lgbm-pipeline-lb)**

**Changes from the parent kernel: `n_estimators=1500`, `num_leaves=700`, `learning_rate=0.09`**

In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import gresearch_crypto


# Kaggle input locations: the training table and the per-asset details table.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

In [2]:
# Load the whole training table; later cells filter it by Asset_ID.
df_train = pd.read_csv(TRAIN_CSV)
# Preview the first rows to see the available columns.
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [3]:
# Load the asset table sorted by Asset_ID, so that iterating over it
# (as the training loop does) visits the assets in ID order.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


# Training

## Utility functions to train a model for one asset

In [4]:
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    ``df`` may be a DataFrame (element-wise result) or a single row.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    ``df`` may be a DataFrame (element-wise result) or a single row.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# Builds the feature set from the raw candle columns.
# It accepts a whole DataFrame or a single row too, so the same code is
# reused for training and for row-by-row prediction.
def get_features(df):
    """Return a copy of the raw candle columns extended with derived features.

    The result keeps the seven raw columns first and then appends, in this
    order: ``upper_Shadow``, ``lower_Shadow``, ``high_div_low``, ``trade``,
    ``gtrade``, ``shadow1``, ``shadow3``, ``shadow5``. The input is not
    modified.

    Experiments that are currently disabled: ``open_sub_close`` (Open - Close),
    ``shadow2`` (upper shadow / Low) and ``shadow4`` (lower shadow / High).
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[raw_columns].copy()

    # Candle shadows and the high/low ratio.
    df_feat['upper_Shadow'] = upper_shadow(df_feat)
    df_feat['lower_Shadow'] = lower_shadow(df_feat)
    df_feat["high_div_low"] = df_feat["High"] / df_feat["Low"]

    # Close-minus-open move, raw and scaled by trade count / traded volume.
    volume = df_feat['Volume']
    candle_move = df_feat['Close'] - df_feat['Open']
    df_feat['trade'] = candle_move
    df_feat['gtrade'] = candle_move / df_feat['Count']
    df_feat['shadow1'] = candle_move / volume

    # Shadows scaled by traded volume.
    df_feat['shadow3'] = df_feat['upper_Shadow'] / volume
    df_feat['shadow5'] = df_feat['lower_Shadow'] / volume
    return df_feat
def log(model,X_train, X_valid, y_train, y_valid,train_split=1.0):
    """Print MSE and Pearson correlation of ``model`` on train and validation data.

    Parameters
    ----------
    model : object with a ``predict(X)`` method.
    X_train, y_train : training features and targets. Only the leading
        ``train_split`` fraction of rows is scored.
    X_valid, y_valid : validation features and targets, always scored in full.
    train_split : float, default 1.0
        Fraction of the training rows to score. A value ``<= 0`` skips the
        training report entirely.

    Returns
    -------
    None. The metrics are printed; the value labelled ``CV`` is the Pearson
    correlation between predictions and targets.
    """
    if train_split > 0:
        # Derive the cut-off once, from the untruncated length, so features
        # and targets are sliced to the same number of rows. Computing it
        # again after truncating X_train would shorten y_train a second time.
        n_rows = int(train_split * X_train.shape[0])
        X_train = X_train[:n_rows]
        y_train = y_train[:n_rows]

        pred = model.predict(X_train)
        print('Training :- ')
        print(f'MSE : {np.mean((y_train-pred)**2)}')
        print(f'CV : {pearsonr(pred,y_train)[0]}')

    pred = model.predict(X_valid)
    print('Validation :- ')
    print(f'MSE : {np.mean((y_valid-pred)**2)}')
    print(f'CV : {pearsonr(pred,y_valid)[0]}')

def get_Xy_and_model_for_asset(df_train, asset_id):
    """Build features for one asset, fit a LightGBM regressor, and report validation metrics.

    Parameters
    ----------
    df_train : DataFrame holding the raw candle columns plus ``Asset_ID`` and ``Target``.
    asset_id : value matched against the ``Asset_ID`` column.

    Returns
    -------
    (X, y, model) : the full feature frame and target for the asset (rows with
    any missing value dropped) and the model, which is fitted on the training
    part of the split only.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    dataset = get_features(asset_rows)
    dataset['y'] = asset_rows['Target']
    dataset = dataset.dropna(how="any")

    y = dataset["y"]
    X = dataset.drop(columns="y")

    # NOTE(review): train_test_split shuffles rows by default. If these rows
    # are time-ordered, the hold-out score may be optimistic -- confirm whether
    # a chronological split is wanted.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42
    )

    # TODO: Try different models here!
    model = LGBMRegressor(n_estimators=1500, num_leaves=700, learning_rate=0.09)
    model.fit(X_train, y_train)
    print('[Finished Training] evaluating')
    # train_split=0.0 makes log() skip the training report and print validation only.
    log(model, X_train, X_test, y_train, y_test, 0.0)

    return X, y, model

## Loop over all assets

In [5]:
# Per-asset features, targets and fitted models, keyed by Asset_ID.
Xs, ys, models = {}, {}, {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    X, y, model = get_Xy_and_model_for_asset(df_train, asset_id)
    Xs[asset_id] = X
    ys[asset_id] = y
    models[asset_id] = model

Training model for Binance Coin     (ID=0 )
[Finished Training] evaluating
Validation :- 
MSE : 3.210707062999415e-05
CV : 0.058383789318573416
Training model for Bitcoin          (ID=1 )
[Finished Training] evaluating
Validation :- 
MSE : 4.135510510165544e-06
CV : 0.1196698213099989
Training model for Bitcoin Cash     (ID=2 )
[Finished Training] evaluating
Validation :- 
MSE : 4.68995497741847e-05
CV : 0.07744650727432165
Training model for Cardano          (ID=3 )
[Finished Training] evaluating
Validation :- 
MSE : 2.1129872847109772e-05
CV : 0.06950121312370172
Training model for Dogecoin         (ID=4 )
[Finished Training] evaluating
Validation :- 
MSE : 7.270779763979233e-05
CV : 0.09513358055549055
Training model for EOS.IO           (ID=5 )
[Finished Training] evaluating
Validation :- 
MSE : 2.4185451795343348e-05
CV : 0.07854920452595515
Training model for Ethereum         (ID=6 )
[Finished Training] evaluating
Validation :- 
MSE : 6.338275611618649e-06
CV : 0.1222014181060854

In [6]:
# Check the model interface
x = get_features(df_train.iloc[1])
y_pred = models[0].predict([x])
y_pred[0]

-0.0007695276382153975

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to use it and you don't know how to.


In [7]:
# Create the competition environment and its test iterator.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Each iteration provides a batch of test rows (df_test) and a prediction
# frame (df_pred) whose 'Target' column is filled in and sent back.
for i, (df_test, df_pred) in enumerate(iter_test):
    # NOTE(review): rows are predicted one at a time; predicting per asset in
    # batches would likely be faster -- confirm before changing.
    for j , row in df_test.iterrows():
        
        # Select the model trained for this row's asset.
        model = models[row['Asset_ID']]
        x_test = get_features(row)
        y_pred = model.predict([x_test])[0]
        
        # Write the prediction into the df_pred row with the matching row_id.
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
        
        
        # Print just one sample row to get a feeling of what it looks like
        # (j is the row's index label; assumes the first batch has label 0 -- TODO confirm)
        if i == 0 and j == 0:
            display(x_test)

    # Display the first prediction dataframe
    if i == 0:
        display(df_pred)

    # Send submissions
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Count           1.201000e+03
Open            1.478556e+00
High            1.486030e+00
Low             1.478000e+00
Close           1.483681e+00
Volume          6.547996e+05
VWAP            1.481439e+00
upper_Shadow    2.348667e-03
lower_Shadow    5.558333e-04
high_div_low    1.005433e+00
trade           5.125500e-03
gtrade          4.267694e-06
shadow1         7.827586e-09
shadow3         3.586848e-09
shadow5         8.488603e-10
Name: 0, dtype: float64

Unnamed: 0,row_id,Target
0,0,-3.3e-05
1,1,-0.004612
2,2,-0.000384
3,3,-0.000277
4,4,0.000394
5,5,0.00023
6,6,-0.000631
7,7,0.000552
8,8,-0.000716
9,9,-0.000153
