In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition CSVs live in the same input directory; build each path from it.
_competition_dir = '/kaggle/input/g-research-crypto-forecasting'
data_train = f'{_competition_dir}/train.csv'
data_asset_details = f'{_competition_dir}/asset_details.csv'
data_supplemental_train = f'{_competition_dir}/supplemental_train.csv'

In [None]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
import datetime
import plotly.graph_objects as go

In [None]:
# Explicit dtypes for the training CSV. Each column is listed exactly once:
# a repeated key in a dict literal is silently collapsed to its last value,
# which hides typos, so duplicates are avoided.
train_dtypes = {
    'Asset_ID': 'int8',
    'Count': 'int32',
    'row_id': 'int32',
    'Open': 'float64',
    'High': 'float64',
    'Low': 'float64',
    'Close': 'float64',
    'Volume': 'float64',
    'VWAP': 'float64',
}
df_train = pd.read_csv(data_train, dtype=train_dtypes)
df_train.head()

In [None]:
import gresearch_crypto

In [None]:
# Load the asset details table and order it by Asset_ID.
asset_details_raw = pd.read_csv(data_asset_details)
df_assets = asset_details_raw.sort_values(by='Asset_ID')
df_assets.head()

In [None]:
import xgboost as xgb

# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the element-wise larger of ``Close`` and ``Open``."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the element-wise smaller of ``Close`` and ``Open`` minus ``Low``."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

def get_features(df):
    """Build the model feature frame from a raw candle DataFrame.

    Copies the columns Count, Open, High, Low, Close, Volume and VWAP from
    ``df`` and appends, in this order: Upper_Shadow, Lower_Shadow,
    Close/Open, Close-Open, High-Low, High/Low, Mean and Median (the last
    two are row-wise statistics over Open, High, Low and Close).

    The ratio columns are plain divisions, so a zero ``Open`` or ``Low``
    yields inf/NaN in the result.
    """
    base_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    feats = df[base_columns].copy()

    feats['Upper_Shadow'] = upper_shadow(feats)
    feats['Lower_Shadow'] = lower_shadow(feats)

    # Simple price-relationship features (insertion order fixes column order).
    open_, high, low, close = (feats[name] for name in ('Open', 'High', 'Low', 'Close'))
    derived = {
        "Close/Open": close / open_,
        "Close-Open": close - open_,
        "High-Low": high - low,
        "High/Low": high / low,
    }
    for column, values in derived.items():
        feats[column] = values

    # Row-wise summary statistics over the four price columns.
    ohlc = feats[['Open', 'High', 'Low', 'Close']]
    feats['Mean'] = ohlc.mean(axis=1)
    feats["Median"] = ohlc.median(axis=1)
    return feats

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.metrics import f1_score
import pandas as pd
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.model_selection import train_test_split
from functools import partial

In [None]:
def _per_sample_weights(x, w):
    """Return ``w`` unchanged if it is array-like, else one copy of the scalar per element of ``x``."""
    if np.ndim(w) == 0:
        return np.full(np.shape(x), w, dtype=float)
    return w


def m(x, w):
    """Weighted mean of ``x`` with weights ``w``.

    ``w`` may be an array-like with one weight per sample or a scalar.
    A scalar is expanded to a constant weight per sample; without that,
    ``np.sum(x * w) / np.sum(w)`` would return the *sum* of ``x``.
    """
    w = _per_sample_weights(x, w)
    return np.sum(x * w) / np.sum(w)


def cov(x, y, w):
    """Weighted covariance of ``x`` and ``y`` with weights ``w`` (scalar or per-sample)."""
    w = _per_sample_weights(x, w)
    return np.sum(w * (x - m(x, w)) * (y - m(y, w))) / np.sum(w)


def wcc(x, y, w):
    """Weighted correlation coefficient of ``x`` and ``y``.

    Computed as ``cov(x, y, w) / sqrt(cov(x, x, w) * cov(y, y, w))``. With a
    scalar ``w`` every sample is weighted equally, so this is the ordinary
    Pearson correlation. A zero-variance input makes the denominator zero and
    the result is NaN/inf, as with the plain formula.
    """
    w = _per_sample_weights(x, w)
    return cov(x, y, w) / np.sqrt(cov(x, x, w) * cov(y, y, w))

In [None]:
# Search space for the optimiser. The order of the dimensions matters:
# it must match the order of names in `curr_model_hyper_params`.
space = [
    Real(low=0.5, high=0.8, name="colsample_bytree"),
    Real(low=0.001, high=0.5, name="learning_rate"),
    Integer(low=6, high=15, name="max_depth"),
    Integer(low=100, high=1000, name="n_estimators"),
    Real(low=0.5, high=0.95, name="subsample"),
]

In [None]:
# Fraction of rows kept for training; `1 - train_ratio` is passed as
# `test_size` to train_test_split further down.
train_ratio = 0.75

In [None]:
# Number of training rows per Asset_ID value.
df_train['Asset_ID'].value_counts()

In [None]:
# Weight column value of the first df_assets row whose Asset_ID is 12.
is_selected_asset = df_assets['Asset_ID'] == 12
weight = df_assets.loc[is_selected_asset, 'Weight'].values[0]

In [None]:
# Display the weight looked up for Asset_ID 12.
weight

In [None]:
# Keep only the training rows of Asset_ID 12, renumbered from 0.
asset_rows = df_train['Asset_ID'] == 12
df = df_train.loc[asset_rows].reset_index(drop=True)

In [None]:
# Display the rows selected for Asset_ID 12.
df

In [None]:
# Build the feature frame and attach the target column as `y`.
df_proc = get_features(df)
df_proc['y'] = df['Target']

# Keep only rows in which every value is finite (drops rows containing NaN
# or +/-inf in any column), then renumber the rows from 0.
all_finite = np.isfinite(df_proc).all(axis=1)
df_proc = df_proc[all_finite].reset_index(drop=True)

y = df_proc["y"]
X = df_proc.drop("y", axis=1)

# NOTE: the scaler is fit on all rows here, i.e. before the train/test split
# performed in a later cell.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Display the cleaned feature frame (features plus the `y` target column).
df_proc

In [None]:
# Feature column labels: every column of df_proc except the target `y`.
col_names = df_proc.columns.drop('y')

In [None]:
# Display the feature column labels.
col_names

In [None]:
# Shape of the scaled feature array returned by scaler.fit_transform.
X.shape

In [None]:
# Shape of the target series.
y.shape

In [None]:
# Split features and target into train/test parts with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default, so this is a random
# split rather than a chronological one - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=42,
)
target_col = "y"

# Wrap the split feature arrays in labelled frames (the target is not attached).
# NOTE: this rebinds `df_train`, which previously held the raw training CSV;
# cells that filter the raw data must not be re-run after this one without
# reloading it.
df_train, df_test = (
    pd.DataFrame(part, columns=col_names) for part in (X_train, X_test)
)

In [None]:
# Shape of the training feature frame built from X_train.
df_train.shape

In [None]:
# Shape of the test feature frame built from X_test.
df_test.shape

In [None]:
def return_model_assessment(args, X_train, y_train, X_test, w, y_test=None):
    """Objective for the optimiser: fit one XGBoost regressor and score it.

    Parameters
    ----------
    args : sequence
        Hyper-parameter values, in the same order as the names in the
        module-level ``curr_model_hyper_params``.
    X_train, y_train : training features and target.
    X_test : test features.
    w : scalar or array-like
        Weight(s) handed to ``wcc``. A scalar is expanded to one constant
        weight per sample, because ``wcc`` sums over per-sample weights.
    y_test : optional
        Test target. When omitted, the notebook-level ``y_test`` is used, so
        existing callers that do not pass it keep working.

    Returns
    -------
    float
        ``1 - test_score`` (the optimiser minimises, so a higher test
        correlation gives a lower objective).

    Side effects
    ------------
    Appends the fitted model to ``models`` and the scores to
    ``train_scores`` / ``test_scores``.
    """
    global models, train_scores, test_scores, curr_model_hyper_params
    if y_test is None:
        # Fall back to the notebook global for callers that do not pass it.
        y_test = globals()["y_test"]

    def _weights_for(target):
        # Expand a scalar weight to one weight per sample of `target`.
        if np.ndim(w) == 0:
            return np.full(np.shape(target), w, dtype=float)
        return w

    params = dict(zip(curr_model_hyper_params, args))
    model = xgb.XGBRegressor(missing=-999, random_state=2022)
    model.set_params(**params)
    fitted_model = model.fit(X_train, y_train, sample_weight=None)
    models.append(fitted_model)

    train_predictions = fitted_model.predict(X_train)
    test_predictions = fitted_model.predict(X_test)

    train_score = wcc(train_predictions, y_train, _weights_for(y_train))
    test_score = wcc(test_predictions, y_test, _weights_for(y_test))
    train_scores.append(train_score)
    test_scores.append(test_score)
    return 1 - test_score

In [None]:
# Fresh bookkeeping for this optimisation run; the objective appends to these.
models = []
train_scores = []
test_scores = []

# Names of the tuned hyper-parameters, in the same order as the dimensions of
# `space` (the objective pairs them up positionally).
curr_model_hyper_params = ['colsample_bytree', 'learning_rate',
                           'max_depth', 'n_estimators', 'subsample']
assert curr_model_hyper_params == [dim.name for dim in space], \
    "hyper-parameter names must match the order of `space`"

objective_function = partial(return_model_assessment, X_train=X_train, y_train=y_train, X_test=X_test, w=weight)

# Run the optimiser. `n_calls` is the single source of truth for the number of
# model fits: it drives the optimiser, the number of random starts, and the
# plotting cell below.
n_calls = 10  # number of times you want to train your model
results = gp_minimize(objective_function, space, base_estimator=None, n_calls=n_calls, n_random_starts=n_calls - 1, random_state=42)

In [None]:
# Print the result object returned by gp_minimize.
print(results)

In [None]:
import plotly.express as px

# Long-form table of the recorded scores: one row per (dataset, iteration).
# Lengths are taken from the score lists themselves, so the table is always
# consistent with what was actually recorded.
n_train_scores = len(train_scores)
n_test_scores = len(test_scores)
metrics = pd.DataFrame({
    "WCC": list(train_scores) + list(test_scores),
    "dataset": ["train_score"] * n_train_scores + ["test_score"] * n_test_scores,
    "Iteration Number": list(range(1, n_train_scores + 1)) + list(range(1, n_test_scores + 1)),
})

# One line per dataset, weighted correlation against iteration number.
fig = px.line(metrics, x="Iteration Number", y="WCC", color="dataset")
fig.show()

In [None]:
# Print the `x` attribute of the optimisation result - presumably the best
# hyper-parameter values found, in the order of `space` (confirm against the
# scikit-optimize documentation).
print(results.x)