# Introduction

## What do we have?

In [None]:
import os
import numpy as np
import pandas as pd

# Folder holding the competition CSV files; point this at your own copy of the data.
directory = "../input/g-research-crypto-forecasting"

# Read the full training table from that folder.
train_csv_path = os.path.join(directory, "train.csv")
crypto_df = pd.read_csv(train_csv_path)

# Optional: derive a datetime column from the `timestamp` column (in seconds), e.g.
#   crypto_df["Time"] = pd.to_datetime(crypto_df["timestamp"], unit="s")
crypto_df.head()

In [None]:
# Read the per-asset metadata table stored in the same data folder.
details_csv_path = os.path.join(directory, "asset_details.csv")
details_df = pd.read_csv(details_csv_path)

# Optional: attach the metadata to every training row, e.g.
#   crypto_df = pd.merge(crypto_df, details_df, on="Asset_ID", how="left")
details_df.head()

In [None]:
# Map every asset name to the training rows that carry that asset's id.
data = {
    asset_name: crypto_df[crypto_df["Asset_ID"] == asset_id]
    for asset_id, asset_name in zip(details_df["Asset_ID"], details_df["Asset_Name"])
}

## What is our goal?

The goal is to predict the `Target` value of each asset.

## Useful Links
- [Competition Info](https://www.kaggle.com/c/g-research-crypto-forecasting/overview)
- [API tool for submission](https://www.kaggle.com/sohier/detailed-api-introduction)
- [Tutorial](https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition)
- [Eval Metrics](https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/291845), [Eval Metrics](https://www.kaggle.com/c/g-research-crypto-forecasting/discussion/286778)

# Data Preprocess

## Cleaning and preparing time series data

In [None]:
# Take the last 2000 Ethereum rows, print a column summary that includes
# non-null counts, then index the sample by its `timestamp` column.
recent_eth = data["Ethereum"].tail(2000)
recent_eth.info(show_counts=True)
eth_df = recent_eth.set_index("timestamp")

## Check the time gap between rows

According to the tutorial:
> Missing asset data, for a given minute, is not represented by NaN's, but instead by the absence of those rows. 

We can check the timestamp difference between consecutive rows to see if there is missing data.

In [None]:
# Gap between each timestamp and the one before it. The most frequent gaps are
# listed first; more than one distinct gap suggests that rows are missing.
current_stamps = eth_df.index[1:]
previous_stamps = eth_df.index[:-1]
(current_stamps - previous_stamps).value_counts().head()

## Check for values that are still missing

In [None]:
# Count the NaN entries in each column of the sample.
eth_df.isna().sum()

## Define the data preparation flow

1. Pad the missing rows (as the tutorial suggests).
2. Fill the remaining missing values with the previous observation (forward fill).

In [None]:
def prepare_data(df, index_name="timestamp", step=60):
    """Put a frame on a regular time grid and forward-fill its gaps.

    Args:
        df: Frame that either has a column named ``index_name`` or is
            already indexed by it.
        index_name: Name of the time column/index. Defaults to "timestamp".
        step: Spacing of the regular grid, in the same unit as the index.
            Defaults to 60.

    Returns:
        A frame indexed from the first to the last original index value in
        increments of ``step``. Rows that were absent are copied from the
        closest earlier row, and NaN values are replaced by the previous
        non-NaN value in the same column. NaNs with no earlier valid value
        are left as they are. An empty input is returned empty (indexed by
        ``index_name``).
    """
    if df.index.name != index_name:
        df = df.set_index(index_name)

    # Nothing to pad; also avoids indexing into an empty index below.
    if len(df.index) == 0:
        return df

    # The grid is built from the first/last index value and padding needs an
    # ordered index, so make sure the rows are in ascending order.
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()

    first, last = int(df.index[0]), int(df.index[-1])
    df = df.reindex(range(first, last + step, step), method="pad")

    # `fillna(method="ffill")` is deprecated in recent pandas releases;
    # `ffill()` is the supported spelling of the same operation.
    return df.ffill()

In [None]:
# Replace the sample with the output of the preparation step defined above.
eth_df = prepare_data(eth_df)

## Explore Data

## Autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

def log_return(series, periods=1):
    """Return the log return of ``series`` over ``periods`` steps.

    Args:
        series: pandas Series (or DataFrame) of strictly positive values.
        periods: Number of rows between the two values being compared.
            Negative values compare each row with a later one.

    Returns:
        ``log(series).diff(periods)`` without the rows that have no
        counterpart to be compared with: the first ``periods`` rows for a
        positive ``periods``, the last ``-periods`` rows for a negative one.
        With ``periods=0`` every row is kept.
    """
    diffs = np.log(series).diff(periods=periods)
    # `diff` leaves |periods| undefined rows at one end of the result; drop
    # all of them, not just the first one, so no NaN padding is returned.
    if periods >= 0:
        return diffs.iloc[periods:]
    return diffs.iloc[:periods]

# Log returns of the Ethereum sample's `Close` column.
lret = log_return(eth_df["Close"])

In [None]:
# Autocorrelation plot of the log returns, for lags up to 15.
plot_acf(lret, lags=15)
plt.show()

In [None]:
# Partial autocorrelation plot of the log returns, for lags up to 15.
plot_pacf(lret, lags=15)
plt.show()

## Correlation between assets

# Baselines

In [None]:
# Positional split in row order: the first 1400 rows are used for fitting,
# everything after them is held out for scoring.
split_at = 1400
train_data = eth_df.iloc[:split_at]
test_data = eth_df.iloc[split_at:]

In [None]:
# `statsmodels.tsa.arima_model.ARIMA` was deprecated and later removed from
# statsmodels; `statsmodels.tsa.arima.model.ARIMA` is the supported replacement.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_predict

# Fit an ARIMA(2, 1, 0) baseline on the raw training targets. The d=1 term of
# `order` already differences the series, so it is not differenced by hand as
# well; this keeps the model's predictions in the same units as `Target`.
model = ARIMA(train_data["Target"].values, order=(2, 1, 0))
result = model.fit()

# Draw the model's predictions on top of the observed training targets. The
# results object of the current API has no `plot_predict` method, so the
# module-level helper from `statsmodels.graphics.tsaplots` is used instead.
_, ax = plt.subplots()
ax.plot(train_data["Target"].values, label="Target")
plot_predict(result, start=1, end=len(train_data), ax=ax)
ax.legend()
plt.show()

In [None]:
# Forecast exactly as many steps beyond the training sample as there are
# held-out rows, so each prediction lines up with one test target. Predicting
# positions 1..600 would return in-sample training predictions and would only
# match the test length by coincidence.
forecast = result.forecast(steps=len(test_data))
# The legacy `statsmodels.tsa.arima_model` results return a
# (forecast, stderr, conf_int) tuple; the current API returns the forecast only.
y_hat = forecast[0] if isinstance(forecast, tuple) else forecast
print('Test score', f"{np.corrcoef(y_hat, test_data['Target'])[0,1]:.2f}")

In [None]:
from sklearn.neural_network import MLPRegressor

# Features are every column except the label; the label is `Target`.
X_train, y_train = train_data.drop(["Target"], axis=1), train_data["Target"]
X_test, y_test = test_data.drop(["Target"], axis=1), test_data["Target"]

# Fit and predict on plain arrays so both calls receive the same kind of input.
# Fitting on an array and predicting on a DataFrame makes scikit-learn warn
# about feature names that were not seen during fit.
clf = MLPRegressor(random_state=1, max_iter=300).fit(X_train.values, y_train.values)
y_test_hat = clf.predict(X_test.values)

# Score: Pearson correlation between the predictions and the held-out targets.
print('Test score', f"{np.corrcoef(y_test_hat, y_test.values)[0,1]:.2f}")