In [3]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import time
import datetime
import pickle
import gc

In [4]:
# Number of fold models loaded per metal; their predictions are combined at inference time.
n_fold = 10

In [5]:
# Directory holding the pickled per-metal, per-fold model files.
models_dir = 'models/'

In [6]:
# Metals covered by this notebook: each entry has its own set of fold models
# and its own `Price_<name>` column in the training table.
names = ['Silver', 'Gold', 'Copper', 'Palladium', 'Platinum']

In [7]:
# Lag windows; the largest one sets how many history rows are kept during inference.
# Presumably these are the 10/20/50 windows seen in the feature column names -- confirm
# against the feature-building code.
lags = [10, 20, 50]

In [8]:
# models[n][split] is the model trained for metal names[n] on fold `split`.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
models = []
for name in names:
    fold_models = []
    for split in range(n_fold):
        model_path = os.path.join(
            models_dir, f'trained_fullmetal_model_id_{name}_fold{split}.pkl'
        )
        # The context manager closes each file handle as soon as the model is read,
        # instead of leaving len(names) * n_fold handles open until garbage collection.
        with open(model_path, 'rb') as model_file:
            fold_models.append(pickle.load(model_file))
    models.append(fold_models)

In [9]:
# Sanity check: expect (len(names), n_fold).
len(models),len(models[0])

(5, 10)

In [10]:
# Run a garbage-collection pass after loading the models; the cell displays the call's return value.
gc.collect()

0

In [14]:
# Path of the CSV that is read into df_train.
TRAIN_CSV = 'train_metals_features.csv'

In [11]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * ``object`` columns are converted to ``category``.
    * NumPy integer columns (signed or unsigned) are converted to the narrowest
      signed integer type whose inclusive range holds the column's min and max,
      but only when that type is actually smaller than the current one.
    * NumPy floating columns are converted to float16 if their range fits,
      otherwise float32, otherwise float64.
    * Every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes, ...) is left untouched.

    Note that float16/float32 keep the *range* but not the *precision* of the
    original values, so the float downcast is lossy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable Int64, ...) are not NumPy dtypes;
        # min()/astype() on them can fail or drop information, so skip them.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            if df[col].empty:
                # min()/max() of an empty column are NaN; nothing to measure.
                continue
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    # Never "downcast" to a type that is not smaller (e.g. uint8 -> int16).
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # bool, datetime64, timedelta64, complex, ...: leave as they are.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [24]:
# Load the training table from TRAIN_CSV.
df_train = pd.read_csv(TRAIN_CSV)

In [27]:
# Remove the 'Unnamed: 0' column (presumably a saved index -- confirm against the CSV writer).
# Reassigning with errors='ignore' keeps the cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')

In [49]:
# Work on a copy so df_train itself is not touched by the later downcasting.
train_merged = df_train.copy()

# Columns that are not model inputs: the date plus every metal's price column.
not_use_features_train = ['Date']
for name in names:
    not_use_features_train += [f'Price_{name}']

# Whatever remains after dropping those columns is the feature list.
features = list(train_merged.drop(columns=not_use_features_train).columns)
print(len(features), features)

108 ['Price_Lag_10_Silver', 'log_close/mean_10_Silver', 'log_return_10_Silver', 'Price_Lag_20_Silver', 'log_close/mean_20_Silver', 'log_return_20_Silver', 'Price_Lag_50_Silver', 'log_close/mean_50_Silver', 'log_return_50_Silver', 'mean_close/mean_10_Silver', 'mean_log_returns_10_Silver', 'log_close/mean_10-mean_close/mean_10_Silver', 'log_return_10-mean_log_returns_10_Silver', 'mean_close/mean_20_Silver', 'mean_log_returns_20_Silver', 'log_close/mean_20-mean_close/mean_20_Silver', 'log_return_20-mean_log_returns_20_Silver', 'mean_close/mean_50_Silver', 'mean_log_returns_50_Silver', 'log_close/mean_50-mean_close/mean_50_Silver', 'log_return_50-mean_log_returns_50_Silver', 'Year', 'Month', 'Week_Number', 'Price_Lag_10_Gold', 'log_close/mean_10_Gold', 'log_return_10_Gold', 'Price_Lag_20_Gold', 'log_close/mean_20_Gold', 'log_return_20_Gold', 'Price_Lag_50_Gold', 'log_close/mean_50_Gold', 'log_return_50_Gold', 'mean_close/mean_10_Gold', 'mean_log_returns_10_Gold', 'log_close/mean_10-mean_cl

In [50]:
# List the distinct column dtypes present before downcasting.
train_merged.dtypes.unique()

array([dtype('float64'), dtype('O'), dtype('int64')], dtype=object)

In [51]:
# Shrink train_merged: reduce_mem_usage converts object columns to category and
# downcasts numeric columns (floats may end up as float16, which loses precision).
train_merged = reduce_mem_usage(train_merged)

Memory usage of dataframe is 2.16 MB
Memory usage after optimization is: 0.62 MB
Decreased by 71.4%


In [None]:
def merge_for_infer(df):
    """Spread per-asset ``Close`` values of ``df`` into one column per asset id.

    For each asset id in ``range(14)`` the rows of ``df`` with that ``Asset_ID``
    are outer-merged on ``timestamp``; each asset's ``Close`` ends up in a
    ``Close_<id>`` column. The placeholder ``Asset_ID`` and ``Close`` columns
    used to seed the merge are dropped before returning.

    NOTE(review): the asset count is hard-coded to 14, while ``names`` in this
    notebook lists 5 metals -- confirm which is intended. The visible inference
    loop calls ``merge_infer_2`` rather than this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``timestamp``, ``Asset_ID`` and ``Close`` columns.

    Returns
    -------
    pandas.DataFrame
        ``timestamp`` plus one ``Close_<id>`` column per asset id.
    """
    # Seed frame: carries a 'Close' column so that every merged-in 'Close'
    # collides with it and receives the '_<id>' suffix.
    df_merged = pd.DataFrame()
    df_merged[['timestamp', 'Asset_ID', 'Close']] = 0
    # NOTE(review): `id` shadows the Python builtin of the same name inside this loop.
    for id in range(14):
        df_merged = df_merged.merge(df.loc[df["Asset_ID"] == id, ['timestamp', 'Close']].copy(), on="timestamp", how='outer',suffixes=['', "_"+str(id)])
 
    # Remove the seed columns; only timestamp and the Close_<id> columns remain.
    df_merged = df_merged.drop(['Asset_ID', 'Close'], axis=1)
#     df_merged = df_merged.sort_values('timestamp', ascending=True)
    return df_merged

In [53]:
# Number of trailing history rows retained between inference iterations:
# the largest lag window.
keep_hist = max(lags)
keep_hist

50

In [None]:
# Streaming inference: for every batch from iter_test, append the new rows to the
# rolling history, rebuild features, predict each requested row with the median
# of the n_fold models of its asset, and submit the batch through env.predict.
# NOTE(review): iter_test, env, merge_infer_2, one_line, get_features and the initial
# history_merged are not defined in the visible cells -- they must exist before this runs.
# NOTE(review): models is indexed by df_test['Asset_ID']; confirm those ids are the
# positions 0..len(names)-1 used when the models were loaded.
start = time.time()

for df_test, df_pred in iter_test:
    df_test_merged = merge_infer_2(df_test, one_line)
    history_merged = pd.concat([history_merged, df_test_merged])
    x_test = get_features(history_merged, train=False)
    # Keep the newest row as a one-row DataFrame (2-D): predict() expects input of
    # shape (n_samples, n_features), and a 1-D Series is not reliably accepted.
    x_calc = x_test.iloc[[-1]]

    for asset_id, row_id in zip(df_test['Asset_ID'].values, df_test['row_id'].values):
        try:
            y_pred_list = [
                models[asset_id][split].predict(x_calc[features])
                for split in range(n_fold)
            ]
            y_pred = np.median(y_pred_list)
        except Exception as exc:
            # Best-effort fallback is kept, but the failure is reported instead of
            # silently turning every broken prediction into 0.
            print(f'Prediction failed for asset_id={asset_id}, row_id={row_id}: {exc!r}; using 0')
            y_pred = 0
        df_pred.loc[df_pred['row_id'] == row_id, 'Target'] = y_pred

    # Bound the history to the rows needed by the largest lag window.
    history_merged = history_merged.tail(keep_hist)

    env.predict(df_pred)

stop = time.time()
print(stop-start)