# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Wed Dec  8 17:10:17 2021

@source1: https://www.kaggle.com/tarlannazarov/g-research-crypto-starter-xgb-pipeline

@source2: Memory reduction function from https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/285721

@source3: class MemReducer(...) from https://www.kaggle.com/jpmiller/skmem

@source4: https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/285289

@source5: https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition
"""

#!pip install skmem #Cannot install on Kaggle. Class MemReducer(...) copied from Source3 mentioned above.

##Import and load dfs
#References: Tutorial to the G-Research Crypto Competition
import pandas as pd
import numpy as np
import gresearch_crypto
import xgboost as xgb
import traceback
import datatable as dt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import validation
from datetime import datetime
import time

def totimestamp(s):
    """Convert a 'dd/mm/YYYY' date string into an ``np.int32`` timestamp.

    The string is parsed with ``datetime.strptime`` and converted to seconds
    with ``time.mktime``, which interprets the date in the machine's local
    timezone.
    """
    parsed = datetime.strptime(s, "%d/%m/%Y")
    seconds = time.mktime(parsed.timetuple())
    return np.int32(seconds)


# Training rows with a timestamp at or after this value are discarded.
cutoff_timestamp = totimestamp('13/06/2021')

class MemReducer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that shrinks DataFrame column dtypes.

    ``transform`` runs up to four passes over the dataframe: integer
    downcasting, optional float64 -> float32 downcasting, object -> numeric /
    category conversion, and optional float -> nullable-integer conversion.
    Progress is reported with ``print``.

    Parameters
    ----------
    max_unique_pct : float, default 0.2
        Object columns whose ratio of unique values to row count is at most
        this value are converted to ``category``. Must lie in [0, 1].
    nullables : bool, default True
        Whether ``transform`` also runs the nullable-integer pass.
    """

    def __init__(self, max_unique_pct=0.2, nullables=True):
        self.max_unique_pct = max_unique_pct
        self.nullables = nullables

    def fit(self, df, float_cols=None):
        """Record which columns may be downcast from float64 to float32.

        Parameters
        ----------
        df : pandas DataFrame
            Only type-checked here; no statistics are learned from it.
        float_cols : list of column labels, optional
            Candidate columns for the float pass. ``None`` (or an empty
            list) makes ``transform`` skip that pass.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            # Adjacent literals instead of a backslash continuation, which
            # used to embed the next line's indentation in the message.
            raise TypeError(
                f"'{type(df).__name__}' object is not a pandas "
                "dataframe."
            )

        self.float_candidates = float_cols
        return self

    # Helper functions for .transform()
    def reduce_ints(self, df):
        """Downcast integer columns and return the (mutated) dataframe.

        Columns whose minimum is >= 0 are downcast with
        ``downcast='unsigned'``; columns with a negative minimum use
        ``downcast='signed'``.
        """
        int_cols = df.select_dtypes('integer').columns
        if not int_cols.empty:
            print("Starting integers.", flush=True)
            start_types = df[int_cols].dtypes

            mins = df[int_cols].min()
            unsigneds = mins.index[mins >= 0]
            df[unsigneds] = df[unsigneds].apply(pd.to_numeric,
                                                downcast='unsigned')
            signeds = mins.index[mins < 0]
            df[signeds] = df[signeds].apply(pd.to_numeric,
                                            downcast='signed')
            end_types = df[int_cols].dtypes
            changed = end_types[~end_types.eq(start_types)]
            print(f"Downcast {len(changed)} standard integer columns.")
        return df

    def reduce_floats(self, df, float_cols):
        """Cast the listed float64 columns to float32.

        ``float_cols`` must be a list; anything else is reported and the
        pass is skipped. Listed columns that are not ``np.float64`` in
        ``df`` are left untouched.
        """
        print("Starting floats.", flush=True)

        if not isinstance(float_cols, list):
            print(f"'{type(float_cols).__name__}' object is not a list, "
                  "skipping floats.")
        else:
            true_float_cols = df.select_dtypes(np.float64).columns.tolist()
            non_float64s = [f for f in float_cols if f not in true_float_cols]
            if non_float64s:
                print("Skipping columns that are not np.float64")

            convertibles = [f for f in float_cols if f in true_float_cols]
            if convertibles:
                df[convertibles] = df[convertibles].astype(np.float32)
                print(f"Downcast {len(convertibles)} float columns.")
        return df

    def reduce_objs(self, df, max_pct):
        """Convert object columns to numbers or categories where possible.

        Each object column is first tried with ``pd.to_numeric``. Columns
        that remain ``object`` become ``category`` when their unique-value
        ratio is at most ``max_pct``.

        Raises
        ------
        ValueError
            If ``max_pct`` is outside [0, 1].
        """
        if not 0 <= max_pct <= 1:
            raise ValueError("max_unique_pct must be between 0 and 1")

        obj_cols = df.select_dtypes('object').columns
        if not obj_cols.empty:
            print("Starting objects.", flush=True)
            converted = 0
            for oc in obj_cols:
                try:
                    df[oc] = pd.to_numeric(df[oc], downcast='integer')
                except (ValueError, TypeError):
                    # Not parseable as numbers; left for the category
                    # check below.
                    continue
                converted += 1
            # Report the number of converted columns once. The previous
            # per-column message printed len(column_name) as the count.
            if converted:
                print(f"Converted {converted} columns to numbers.")

        new_obj_cols = df.select_dtypes('object').columns
        if not new_obj_cols.empty:
            category_mask = df[new_obj_cols].nunique().to_numpy()/len(df) <= max_pct
            cat_cols = new_obj_cols[category_mask]
            if not cat_cols.empty:
                df[cat_cols] = df[cat_cols].astype('category')
                print(f"Converted {len(cat_cols)} columns to categories.")
        return df

    def reduce_nullables(self, df):
        """Turn whole-number float columns containing nulls into nullable ints.

        Candidates are float columns whose largest ``value % 1`` is 0 and
        that hold at least one null. They go through ``convert_dtypes`` and
        are then narrowed to ``Int8``/``Int16``/``Int32`` by their largest
        absolute value; larger values keep whatever dtype
        ``convert_dtypes`` chose.
        """
        print("Starting nullables.", flush=True)

        true_float_cols = df.select_dtypes('float').columns
        remainders = df[true_float_cols].mod(1).max(axis=0)
        nulls = df[true_float_cols].isnull().sum()

        convertibles = remainders[remainders==0].index \
                        .intersection(nulls[nulls!=0].index) \
                        .tolist()
        if convertibles:
            start_types = df[convertibles].dtypes
            df[convertibles] = df[convertibles].convert_dtypes()
            end_types = df[convertibles].dtypes

            changed = end_types[~end_types.eq(start_types)]
            # Build the mask from `changed` itself so it shares its index,
            # instead of relying on alignment with the larger `end_types`.
            changed_nums = changed[changed != 'string'].index

            #TODO: change ifs and loops to np.arrays or similar
            #       and add in unsigned ints

            for cc in changed_nums:
                max_int = df[cc].abs().max()
                if 32767 < max_int <= 2147483647:
                    df[cc] = df[cc].astype('Int32')
                elif 127 < max_int <= 32767:
                    df[cc] = df[cc].astype('Int16')
                elif max_int <= 127:
                    df[cc] = df[cc].astype('Int8')

            print(f"Converted {len(changed)} columns to nullable types.")

        else:
            print("No candidates for nullable integers.")

        return df

    def transform(self, df):
        """ Convert dataframe columns to dtypes requiring lower memory.

        The passed dataframe is modified column by column and also
        returned. Memory usage before and after is printed.

        Parameters
        ----------
        df : pandas DataFrame
            The dataframe to be converted.

        Returns
        -------
        pandas DataFrame
            The same dataframe with reduced dtypes.
        """

        # Fails unless fit() has stored the float candidates.
        validation.check_is_fitted(self, 'float_candidates')

        print("Getting memory usage.")
        memory_MB_in = df.memory_usage(deep=True).sum()/(1024**2)
        print(f"Memory in: {memory_MB_in:.2f} MB")

        df = self.reduce_ints(df)
        if self.float_candidates:
            df = self.reduce_floats(df, self.float_candidates)
        df = self.reduce_objs(df, self.max_unique_pct)
        if self.nullables:
            df = self.reduce_nullables(df)

        memory_MB_out = df.memory_usage(deep=True).sum()/(1024**2)
        print(f"Memory out: {memory_MB_out:.2f} MB",
              f"Reduction: {1 - memory_MB_out/memory_MB_in:.1%}\n")

        return df
    
    
def drop_memory(path):
    """Load a CSV, keep rows before the cutoff and shrink its dtypes.

    The file is read with ``datatable`` and converted to pandas. Rows whose
    'timestamp' is not below the module-level ``cutoff_timestamp`` are
    dropped, then ``MemReducer`` is run with every float column offered as a
    float32 candidate.

    Parameters
    ----------
    path : str
        Location of the CSV file to load.

    Returns
    -------
    pandas DataFrame
        The filtered, memory-reduced dataframe.
    """
    frame = dt.fread(path).to_pandas()
    before_cutoff = frame['timestamp'] < cutoff_timestamp
    frame = frame[before_cutoff]
    float_names = frame.select_dtypes('float').columns.tolist()
    reducer = MemReducer()
    return reducer.fit_transform(frame, float_cols=float_names)

# Input files provided by the Kaggle competition dataset.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

# Load the training data (rows before the cutoff, with reduced dtypes).
df_train = drop_memory(TRAIN_CSV)
df_train.head()

df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

# Turn +/-inf into NaN and then drop every row that has any NaN.
# DataFrame.replace returns a new frame, so the result must be assigned;
# previously it was discarded and infinite values stayed in the data.
df_train = df_train.replace([np.inf, -np.inf], np.nan)
df_train = df_train.dropna(how="any")

##Training
#Utility functions to train a model for one asset

# Two new features from the competition tutorial
def upper_shadow(df):
    """Return 'High' minus the larger of 'Close' and 'Open'."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of 'Close' and 'Open' minus 'Low'."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

# A utility function to build features from the original df.
# It works for single rows too, so we can reuse it at prediction time.
def get_features(df):
    """Build the model feature set from raw candle data.

    Copies the seven raw columns 'Count', 'Open', 'High', 'Low', 'Close',
    'Volume' and 'VWAP', then appends 'Upper_Shadow' and 'Lower_Shadow'
    computed by the helpers of the same (lower-case) names.
    """
    base_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    features = df[base_cols].copy()
    derived = (
        ('Upper_Shadow', upper_shadow),
        ('Lower_Shadow', lower_shadow),
    )
    for name, compute in derived:
        features[name] = compute(features)
    return features

def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit an XGBoost regressor on the rows of a single asset.

    Parameters
    ----------
    df_train : pandas DataFrame
        Training data; must provide 'Asset_ID', 'Target' and the columns
        read by ``get_features``.
    asset_id
        Value matched against the 'Asset_ID' column.

    Returns
    -------
    tuple
        ``(X, y, model)``: the feature frame, the target series (named
        'y') and the fitted ``xgb.XGBRegressor``.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    # TODO: Try different features here!
    table = get_features(asset_rows)
    table['y'] = asset_rows['Target']

    y = table["y"]
    X = table.drop(columns="y")

    # TODO: Try different models here!
    # (an LGBMRegressor(random_state=1111, n_estimators=1200) was tried before)
    params = {
        'n_estimators': 1787,
        'learning_rate': 0.05,
        'max_depth': 12,
        'subsample': 0.9,
        'colsample_bytree': 0.7,
        # 'colsample_bylevel': 0.75,
        'missing': -999,
        'random_state': 1111,
        'tree_method': 'gpu_hist',
    }
    model = xgb.XGBRegressor(**params)
    model.fit(X, y)
    return X, y, model

##Loop over all assets
# Per-asset training artefacts, keyed by Asset_ID. An entry is None when
# training that asset failed.
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    try:
        X, y, model = get_Xy_and_model_for_asset(df_train, asset_id)
    except Exception:
        # Best effort: report the failure and continue with the next asset.
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop the run.
        traceback.print_exc()
        Xs[asset_id], ys[asset_id], models[asset_id] = None, None, None
    else:
        Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model

#sh?? Deal with the inf and nan

# Check the model interface
# x = get_features(df_train.iloc[1])
#y_pred = models[0].predict([x])
#y_pred[0]
# y_pred = models[0].predict(pd.DataFrame([x]))
# y_pred[0]

##Predict & submit
#References: Detailed API Introduction

#Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

# import pdb; pdb.set_trace()

#See Python Debugging With Pdb if you want to use it and you don't know how to.

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Each iteration yields a batch of test rows plus the prediction frame to
# fill in; the frame is handed back through env.predict().
for i, (df_test, df_pred) in enumerate(iter_test):
    for j, row in df_test.iterrows():
        # Locate this row's slot in the prediction frame once.
        target_mask = df_pred['row_id'] == row['row_id']

        # Fall back to 0 when the asset has no trained model (training
        # failed or the asset is unknown) or when prediction fails.
        y_pred = 0
        model = models.get(row['Asset_ID'])
        if model is not None:
            try:
                x_test = get_features(row)
                y_pred = model.predict(pd.DataFrame([x_test]))[0]
            except Exception:
                # Keep the 0 fallback, but let KeyboardInterrupt and
                # SystemExit propagate.
                y_pred = 0
                traceback.print_exc()

        df_pred.loc[target_mask, 'Target'] = y_pred

    env.predict(df_pred)

#This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.