In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into a single stream of full file paths,
# then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing required libraries

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
import gresearch_crypto
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

### Reading the input files

In [None]:
# Name of the column the models are trained to predict.
target = 'Target'

# Per-asset reference table; transform_data() joins it onto the rows by Asset_ID.
asset_details = pd.read_csv(filepath_or_buffer="../input/g-research-crypto-forecasting/asset_details.csv")
# Raw training rows, including the target column.
train_data = pd.read_csv(filepath_or_buffer="../input/g-research-crypto-forecasting/train.csv")

In [None]:
# Keep only the first 2000 rows of each asset (presumably to keep the grid
# searches quick - confirm before scaling up), then renumber the rows from 0.
rows_by_asset = train_data.groupby(['Asset_ID'])
train_data = rows_by_asset.head(2000)
train_data = train_data.reset_index(drop=True)

## Data Transformation

In [None]:
def transform_data(df, scaler_obj=None):
    """Build the standardised feature matrix for a frame of raw candle rows.

    The module-level ``asset_details`` frame is left-joined onto ``df`` by
    ``Asset_ID``, ratio features are derived from the price/volume columns,
    and the selected feature columns are standardised.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Asset_ID``, ``Count``, ``Open``, ``High``, ``Low``,
        ``Close``, ``Volume`` and ``VWAP``. The caller's frame is not modified.
        NOTE(review): ``Weight`` is not computed here, so it is expected to
        come from ``asset_details`` (or already be present in ``df``).
    scaler_obj : scaler, optional
        An already fitted scaler to reuse (e.g. the one fitted on the training
        rows). When ``None``, a new ``StandardScaler`` is fitted on ``df``.

    Returns
    -------
    tuple
        ``(features_df, scaler_obj)`` where ``features_df`` holds the scaled
        feature columns with a fresh 0..n-1 index, and ``scaler_obj`` is the
        scaler that produced them.
    """
    features = [
        'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Weight',
        'upper_shadow', 'lower_shadow', 'high_low',
        'high_mean', 'high_median', 'low_mean', 'low_median',
        'volume_count',
    ]

    # merge() returns a new frame, so the columns added below never leak
    # back into the caller's data.
    df = df.merge(asset_details, on='Asset_ID', how='left')

    body = df[['Close', 'Open']]
    ohlc = df[['Low', 'High', 'Open', 'Close']]
    mean_price = ohlc.mean(axis=1)
    median_price = ohlc.median(axis=1)

    # Candle-shape ratios.
    df['upper_shadow'] = df['High'] / body.max(axis=1)
    df['lower_shadow'] = body.min(axis=1) / df['Low']
    df['high_low'] = df['High'] / df['Low']

    # Extremes relative to the row's mean / median price.
    df['high_mean'] = df['High'] / mean_price
    df['low_mean'] = df['Low'] / mean_price
    df['high_median'] = df['High'] / median_price
    df['low_median'] = df['Low'] / median_price

    # +1 keeps the denominator non-zero when a row reports zero trades.
    df['volume_count'] = df['Volume'] / (df['Count'] + 1)

    # Explicit None check: truthiness would misfire for estimator objects that
    # define __len__ (an empty-looking object would silently trigger a refit).
    if scaler_obj is None:
        scaler_obj = StandardScaler()
        scaled_values = scaler_obj.fit_transform(df[features])
    else:
        scaled_values = scaler_obj.transform(df[features])
    return pd.DataFrame(scaled_values, columns=features), scaler_obj

## Fitting the model and finding the mean squared error

In [None]:
def calculate_mse(cv_model, cv_params, x_train, x_test, y_train, y_test):
    """Tune ``cv_model`` with a 3-fold grid search and score it on held-out data.

    Parameters
    ----------
    cv_model : estimator
        Unfitted scikit-learn compatible regressor.
    cv_params : dict
        Parameter grid handed to ``GridSearchCV``.
    x_train, y_train
        Data used for the cross-validated search (and the final refit).
    x_test, y_test
        Held-out data used only for the reported error.

    Returns
    -------
    tuple
        ``(best_model, mse)``: the refitted best estimator and its mean squared
        error on the held-out data. The error is also printed.

    Notes
    -----
    No ``scoring`` is passed, so candidates are ranked by ``GridSearchCV``'s
    default scorer (the estimator's own ``score`` method), not by MSE.
    """
    gs_model = GridSearchCV(cv_model, cv_params, cv=3)
    gs_model.fit(x_train, y_train)
    best_model = gs_model.best_estimator_
    y_pred = best_model.predict(x_test)
    # scikit-learn's signature is (y_true, y_pred); compute once and reuse.
    mse = mean_squared_error(y_test, y_pred)
    print(mse)
    return best_model, mse

## Linear Regression

In [None]:
# Baseline: linear regression searched over an empty grid (default settings only).
cv_params = dict()
cv_model = LinearRegression()

## SVM

In [None]:
# Support vector regression: search three iteration caps and 15 log-spaced
# values of C between 1e1 and 1e7.
cv_params = dict(
    max_iter=[100, 200, 250],
    C=np.logspace(1, 7, 15),
)
cv_model = SVR()

## Random Forest

In [None]:
# Random forest: search 12 log-spaced pruning strengths from 1e-2 down to 1e-3
# and two forest sizes, with a fixed seed and four parallel jobs.
cv_params = dict(
    ccp_alpha=np.logspace(-2, -3, 12),
    random_state=[1],
    n_estimators=[50, 100],
    n_jobs=[4],
)
cv_model = RandomForestRegressor()

## Gradient Boosting Regressor

In [None]:
# Gradient boosting: search four log-spaced pruning strengths from 1e-1 down
# to 1e-2 with a fixed seed.
# The loss is left at the estimator default (squared error). The former
# 'ls' spelling was removed in scikit-learn 1.2 and would make every fit in
# the grid search fail on current versions.
cv_model = GradientBoostingRegressor()
cv_params = {'ccp_alpha': np.logspace(-1, -2, 4), 'random_state': [1]}

## Ada Boosting Regressor

In [None]:
# cv_model = AdaBoostRegressor()
# cv_params = {}

## XGBRegressor

In [None]:
# cv_model = XGBRegressor()
# cv_params = {'n_jobs':[4]}

In [None]:
# Display every hyper-parameter (nested ones included) of the estimator that
# was assigned to cv_model most recently.
cv_model.get_params(deep=True)

## Splitting and transforming the train and test data

In [None]:
# Treat infinities as missing values, then discard every incomplete row.
train_data = train_data.replace([np.inf, -np.inf], np.nan).dropna()

# Hold out 20% of the rows (fixed seed) for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train_data[target],
    test_size=0.2,
    random_state=1,
)

# Fit the scaler on the training rows only, then reuse it for the held-out rows.
feature_df, scaler_obj = transform_data(x_train)
x_test, scaler_obj = transform_data(x_test, scaler_obj)

In [None]:
# Run the grid search for the currently selected model and score it on the
# held-out rows.
best_model, mse = calculate_mse(
    cv_model=cv_model,
    cv_params=cv_params,
    x_train=feature_df,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Display the estimator (with its selected hyper-parameters) returned by calculate_mse().
best_model

## Mean Squared Error Values
* Linear Regression -> 8.979067012125199e-05
* SVR -> 0.00029163926796389863
* Random Forest Regressor -> 9.053569723455033e-05
* Gradient Boosting Regressor -> 9.053079163170843e-05
* AdaBoost Regressor -> 0.00014881919823361302
* XGB Regressor -> 8.08510457127688e-05

## Preparing the test environment

**Commented out the test environment to avoid errors**

In [None]:
# import gresearch_crypto
# env = gresearch_crypto.make_env()
# iter_test = env.iter_test()

## Running against the test environment

In [None]:
# for (test_df, sample_prediction_df) in iter_test:
#     scaled_data, scaled_obj = transform_data(test_df, scaler_obj)
#     y_pred = best_model.predict(scaled_data)
#     sample_prediction_df['Target'] = y_pred
#     env.predict(sample_prediction_df)