In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
import gc

In [None]:
# Load every competition CSV this notebook refers to. The reads are independent
# of one another; the (presumably largest) training table is read first.
train = pd.read_csv(
    "../input/g-research-crypto-forecasting/train.csv"
)
supplemental_train = pd.read_csv(
    "../input/g-research-crypto-forecasting/supplemental_train.csv"
)

# Asset metadata; supplies the Asset_ID / Asset_Name columns used further down.
asset_details = pd.read_csv(
    "../input/g-research-crypto-forecasting/asset_details.csv"
)

# Example inputs/outputs of the submission API.
example_test = pd.read_csv(
    "../input/g-research-crypto-forecasting/example_test.csv"
)
ss = pd.read_csv(
    "../input/g-research-crypto-forecasting/example_sample_submission.csv"
)

In [None]:
# Lookup table: Asset_ID -> Asset_Name (used for debug output in pre_process).
asset_details_dict = {
    asset_id: asset_name
    for asset_id, asset_name in zip(
        asset_details["Asset_ID"], asset_details["Asset_Name"]
    )
}

In [None]:
def pre_process(df: pd.DataFrame, DEBUG = False, scaler = None):
    """Clean, gap-fill and standardise a frame of per-asset rows.

    For every ``Asset_ID`` group the rows are indexed by ``timestamp``,
    +/-inf and NaN are replaced by 0 (this includes ``Target`` when present),
    and the group is reindexed onto a regular 60-unit timestamp grid with
    missing steps forward-filled from the previous row. All columns except
    ``Asset_ID``, ``timestamp`` and ``Target`` are then scaled.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Asset_ID`` and ``timestamp``; ``Target`` is optional.
        The caller's frame is not modified.
    DEBUG : bool
        When true, prints NaN counts and per-asset timestamp-gap statistics
        (asset names are looked up in the module-level ``asset_details_dict``).
    scaler : fitted scaler or None
        ``None`` fits a new ``StandardScaler`` on this frame; otherwise the
        given object's ``transform`` is applied.

    Returns
    -------
    (pd.DataFrame, scaler)
        Rows ordered by ``Asset_ID`` then ``timestamp`` with a fresh
        RangeIndex; columns are the scaled features followed by ``Asset_ID``,
        ``timestamp`` and (if it was present) ``Target``.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    def nans(df_group: pd.DataFrame):
        if DEBUG:
            print(asset_details_dict[df_group["Asset_ID"].iloc[0]])
        # set_index returns a new frame; the result must be kept, otherwise the
        # reindex below runs over row positions instead of timestamps.
        df_group = df_group.set_index("timestamp").sort_index()
        # reindex(method='pad') needs a unique, monotonic index.
        df_group = df_group[~df_group.index.duplicated(keep="last")]
        df_group = df_group.replace([np.inf, -np.inf], np.nan).fillna(0)
        if DEBUG:
            print("missing rows  gaps before [above 60 - missing rows]:\n {}".format((df_group.index[1:]-df_group.index[:-1]).value_counts().head()))
        # Regular 60-step grid from first to last timestamp; gaps copy the previous row.
        df_group = df_group.reindex(range(df_group.index[0], df_group.index[-1] + 60, 60), method='pad')
        if DEBUG:
            print("missing rows  gaps after [above 60 - missing rows]:\n {}".format((df_group.index[1:]-df_group.index[:-1]).value_counts().head()))
        # Turn the timestamp index back into a column so it survives concatenation.
        return df_group.rename_axis("timestamp").reset_index()

    if df.empty:
        raise ValueError("pre_process received an empty frame")

    if DEBUG:
        print("nan count before: {}".format(df.isna().sum().sum()))
    column_order = list(df.columns)
    # Explicit iteration keeps Asset_ID inside each group regardless of how the
    # installed pandas treats grouping columns in groupby.apply.
    cleaned_groups = [nans(df_group) for _, df_group in df.groupby("Asset_ID")]
    df = pd.concat(cleaned_groups, ignore_index=True)[column_order]
    if DEBUG:
        print("nan count after: {}".format(df.isna().sum().sum()))

    return_target = "Target" in df
    if return_target:
        target = df["Target"]
        df = df.drop("Target", axis = 1)

    # Identifier columns are carried through unscaled.
    keep_Asset_ID = df["Asset_ID"].copy()
    keep_timestamp = df["timestamp"].copy()
    features = df.drop(["Asset_ID", "timestamp"], axis = 1)

    if scaler is None:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(features)
    else:
        scaled = scaler.transform(features)

    df = pd.DataFrame(scaled, columns = features.columns)
    df["Asset_ID"] = keep_Asset_ID
    df["timestamp"] = keep_timestamp

    if return_target:
        df["Target"] = target

    return df, scaler



In [None]:
def FE(df):
    """Append per-asset lag features for the raw market columns.

    For each of Count/Open/High/Low/Close/Volume/VWAP a column named
    ``"<col>_<gap>"`` is added holding that column shifted down by ``gap``
    rows *within the same Asset_ID*; positions with no earlier row are
    filled with 0. With the depth/reps used here only ``gap == 1`` is produced.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Asset_ID`` and the seven columns listed above. Rows are
        assumed to already be in time order within each asset.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and row order as ``df`` plus the lag
        columns; ``df`` itself is left untouched.
    """
    cols = ["Count", "Open" , "High", "Low", "Close", "Volume", "VWAP"]
    depth, reps = 1, 2

    # A grouped shift keeps the caller's index. groupby.apply with a mutating
    # helper produced a result whose index layout depended on the pandas version.
    by_asset = df.groupby("Asset_ID")
    lagged = {}
    for col in cols:
        for gap in range(depth, reps * depth):
            gap_name = "{}_{}".format(col, gap)
            lagged[gap_name] = by_asset[col].shift(periods=gap, fill_value=0)

    return df.assign(**lagged)

In [None]:
all_vars = %who_ls
if "train" in  all_vars:
    if train.isna().sum().sum() == 0: # pre_process already ran, restart
        print("restart")
        train = pd.read_csv("../input/g-research-crypto-forecasting/train.csv")
        
else: # train not loaded
    print("loading train")
    train = pd.read_csv("../input/g-research-crypto-forecasting/train.csv")
train, scaler = pre_process(train, DEBUG=False)
train = FE(train)

In [None]:
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

# Every column except the label is used as a model input.
train_columns = train.columns.drop("Target")

# One linear regression per asset, stored under the string form of its Asset_ID.
models_dict = {}
for group_id in tqdm(train["Asset_ID"].unique()):
    is_this_asset = train["Asset_ID"] == int(group_id)
    temp_train = train.loc[is_this_asset]
    asset_model = LinearRegression()
    asset_model.fit(temp_train[train_columns], temp_train["Target"])
    models_dict[str(group_id)] = asset_model

In [None]:
import gresearch_crypto

# gresearch_crypto.competition.make_env.__called__ = False

env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    # pre_process regroups rows by asset, so remember which submission row each
    # (timestamp, asset) pair belongs to before the frame is transformed.
    row_keys = test_df[["timestamp", "Asset_ID", "row_id"]]

    # Hand the scaler only columns the models were trained on, in training
    # order; this drops row_id and any other test-only column.
    raw_columns = [col for col in train_columns if col in test_df.columns]
    features, _ = pre_process(test_df[raw_columns], False, scaler)
    # NOTE(review): the lag features are computed from this batch alone, so they
    # are 0 whenever a batch holds a single row per asset — confirm intended.
    features = FE(features).reset_index(drop=True)

    # Predict each asset with the model fitted for that asset; assets without a
    # trained model keep the 0.0 default.
    predictions = pd.Series(0.0, index=features.index)
    for group_id in features["Asset_ID"].unique():
        clf = models_dict.get(str(group_id))
        if clf is None:
            continue
        asset_rows = features["Asset_ID"] == group_id
        predictions.loc[asset_rows] = clf.predict(features.loc[asset_rows, train_columns])

    # Map the predictions back onto row_id rather than relying on row position.
    predicted = (
        features[["timestamp", "Asset_ID"]]
        .assign(Target=predictions)
        .drop_duplicates(["timestamp", "Asset_ID"])
    )
    target_by_row = (
        row_keys.merge(predicted, on=["timestamp", "Asset_ID"], how="left")
        .set_index("row_id")["Target"]
    )
    # assumes sample_prediction_df carries a row_id column matching test_df — TODO confirm
    sample_prediction_df['Target'] = sample_prediction_df["row_id"].map(target_by_row)  # make your predictions here
    sample_prediction_df.fillna(0, inplace=True)
    env.predict(sample_prediction_df)   # register your predictions