# Import

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import gresearch_crypto


# Input file locations (absolute paths inside the Kaggle input directory).
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

# Load Data

In [None]:
# Read the whole training CSV into memory and preview its first rows.
df_train = pd.read_csv(TRAIN_CSV)
df_train.head()

In [None]:
# Asset metadata, ordered by Asset_ID; the training loop iterates over it in this order.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

# Features

EDA of these features: https://www.kaggle.com/swaralipibose/new-features-eda-using-elon-musk-and-crypto-trends

In [None]:
def upper_shadow(df):
    """Return 'High' minus the larger of 'Close' and 'Open' (element-wise)."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of 'Close' and 'Open' minus 'Low' (element-wise)."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def add_features(dataframe):
    """Derive calendar and candle-shape features from raw OHLCV rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide 'Count', 'Open', 'High', 'Low', 'Close', 'Volume',
        'VWAP' and 'timestamp' (epoch seconds).  'Target' is carried through
        when present; every other column is discarded.  The caller's frame
        is never modified.

    Returns
    -------
    pandas.DataFrame
        The kept input columns (minus 'timestamp') followed by, in order:
        'Day of Week', 'weekend', 'after', 'afterdoge', 'Month',
        'upper_Shadow', 'lower_Shadow', 'high_div_low', 'trade', 'gtrade',
        'shadow1', 'shadow3', 'shadow5'.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'timestamp']
    # 'Target' is optional so the same function also serves unlabeled rows.
    # An explicit membership test replaces the old bare `except:`, which
    # would have hidden any unrelated error raised by the selection.
    if 'Target' in dataframe.columns:
        columns.append('Target')

    # Work on an explicit copy: assigning new columns onto a column slice
    # triggers SettingWithCopyWarning and leaves ownership ambiguous.
    dataframe = dataframe[columns].copy()

    # Kept as a local Series instead of a temporary column, so only
    # 'timestamp' has to be dropped at the end.
    when = pd.to_datetime(dataframe['timestamp'], unit='s')

    # Calendar features.
    dataframe["Day of Week"] = when.dt.dayofweek
    dataframe['weekend'] = np.where(dataframe['Day of Week'] > 4, 1, 0)

    # 0/1 flags marking rows strictly after two fixed cutoff dates (the
    # original code labelled both as dates of Elon Musk tweets).
    dataframe['after'] = (when > pd.Timestamp('2018-06-04')).astype(int)
    dataframe['afterdoge'] = (when > pd.Timestamp('2021-03-29')).astype(int)
    dataframe['Month'] = when.dt.month

    # Candle-shape features.
    dataframe['upper_Shadow'] = upper_shadow(dataframe)
    dataframe['lower_Shadow'] = lower_shadow(dataframe)
    dataframe["high_div_low"] = dataframe["High"] / dataframe["Low"]
    dataframe['trade'] = dataframe['Close'] - dataframe['Open']

    # NOTE: a zero 'Count' or 'Volume' produces inf/NaN in the ratios below;
    # such rows are passed through unchanged, exactly as before.
    dataframe['gtrade'] = dataframe['trade'] / dataframe['Count']
    dataframe['shadow1'] = dataframe['trade'] / dataframe['Volume']
    dataframe['shadow3'] = dataframe['upper_Shadow'] / dataframe['Volume']
    dataframe['shadow5'] = dataframe['lower_Shadow'] / dataframe['Volume']

    return dataframe.drop(columns=['timestamp'])

# Training and feature importance

In [None]:
import lightgbm

In [None]:
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

def log(model, X_train, X_valid, y_train, y_valid, train_split=1.0):
    """Print MSE and Pearson correlation of ``model`` on train and validation data.

    When ``train_split`` is positive, only the leading ``train_split``
    fraction of the training rows is scored (and reported first); otherwise
    the training section is skipped.  The validation set is always scored
    in full.  Nothing is returned.
    """
    sections = []
    if train_split > 0:
        n_features = int(train_split * X_train.shape[0])
        n_targets = int(train_split * y_train.shape[0])
        sections.append(('Training :- ', X_train[:n_features], y_train[:n_targets]))
    sections.append(('Validation :- ', X_valid, y_valid))

    # Each section is predicted and reported before the next one is touched.
    for header, features, target in sections:
        pred = model.predict(features)
        print(header)
        print(f'MSE : {np.mean((target-pred)**2)}')
        print(f'CV : {pearsonr(pred,target)[0]}')


def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit, evaluate and return a LightGBM regressor for a single asset.

    Rows of ``df_train`` whose 'Asset_ID' equals ``asset_id`` are passed
    through ``add_features``; rows containing any NaN are dropped.  The
    first 70% of the remaining rows (in their existing order) train the
    model and the last 30% are used for evaluation.  Metrics are printed
    via ``log`` and the feature-importance plot is saved to
    '<asset_id>.png' in the working directory.

    Returns the fitted ``LGBMRegressor``.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    features = add_features(asset_rows).dropna(how="any")

    X = features.drop(columns="Target")
    y = features["Target"]

    # Positional split: no shuffling, the tail of the frame is the hold-out.
    cut = int(0.7 * X.shape[0])
    X_train, X_test = X.iloc[:cut], X.iloc[cut:]
    y_train, y_test = y.iloc[:cut], y.iloc[cut:]

    # TODO: Try different models here!
    model = LGBMRegressor(n_estimators=200, num_leaves=300, learning_rate=0.09)
    model.fit(X_train, y_train)
    print('[Finished Training] evaluating')
    # Only the first 30% of the training rows are scored for the train report.
    log(model, X_train, X_test, y_train, y_test, 0.3)

    print(X_train.columns)

    # plot_importance draws on a new current figure, which savefig then writes.
    lightgbm.plot_importance(model)
    plt.savefig(f'{asset_id}.png')

    return model

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Per-asset containers; only `models` is filled by the loop below.
Xs, ys, models = {}, {}, {}

# One model per asset, trained in Asset_ID order.
for asset_id, asset_name in df_asset_details[['Asset_ID', 'Asset_Name']].itertuples(index=False, name=None):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    model = get_Xy_and_model_for_asset(df_train, asset_id)
    models[asset_id] = model

In [None]:
# Unsorted copy of the asset metadata, used below to label the plots.
# Read through the shared path constant instead of a second hard-coded
# relative path, so both loads of this file always point at the same place.
asset_details = pd.read_csv(ASSET_DETAILS_CSV)
asset_details.head()

In [None]:
# Lookup table: Asset_ID -> Asset_Name.
dic = {
    asset_id: asset_name
    for asset_id, asset_name in zip(asset_details['Asset_ID'], asset_details['Asset_Name'])
}

In [None]:
import cv2

# One panel per asset, stacked vertically, showing the feature-importance
# PNG that training saved as '<Asset_ID>.png'.
# The panel count follows the asset mapping instead of a hard-coded 14;
# squeeze=False + ravel keeps `subplots` a 1-D array even for a single asset.
asset_ids = sorted(dic)
subplots = plt.subplots(len(asset_ids), 1, figsize=(100, 100), squeeze=False)[1].ravel()
for panel, asset in zip(subplots, asset_ids):
    img = cv2.imread(f'{asset}.png')
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError(f'could not read feature-importance image {asset}.png')
    # OpenCV loads pixels as BGR while matplotlib expects RGB; convert so
    # the plot colours are not swapped.
    panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    panel.text(150, 15, dic[asset])
    
    

In [None]:
# Check the model interface
x = add_features(df_train.iloc[[1,2]]).drop(columns=['Target']
                                        )
y_pred = models[0].predict(x)
y_pred[0]

In [None]:
# Disabled submission loop: it is held in a bare string literal, so none of
# the code inside it is executed by this cell.
# NOTE(review): as written it would fail if re-enabled, because
# add_features() returns neither 'Asset_ID' nor 'row_id', yet both are read
# from each row below.
''' 
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for i, (df_test, df_pred) in enumerate(iter_test):
    df_test=add_features(df_test)
    for j , row in df_test.iterrows():
        
        model = models[row['Asset_ID']]
        x_test = row
        y_pred = model.predict([x_test])[0]
        
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
        
        
        # Print just one sample row to get a feeling of what it looks like
        if i == 0 and j == 0:
            display(x_test)

    # Display the first prediction dataframe
    if i == 0:
        display(df_pred)

    # Send submissions
    env.predict(df_pred)
''' 