In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session




import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import plotly.express as px
from itertools import product
import warnings
import statsmodels.api as sm
# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*";
# use the new name when this installation provides it, otherwise the legacy one.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
plt.style.use(_darkgrid_style if _darkgrid_style in plt.style.available else 'seaborn-darkgrid')

#matplotlib inline

from pandas.plotting import lag_plot
import datetime as dt
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

#arime as in notebook https://www.kaggle.com/akashmathur2212/bitcoin-price-prediction-arima-xgboost-lstm-fbprop/data#ARIMA-Model
#import pmdarima as pm

In [None]:
# Load the minute-level Bitstamp BTC/USD export and preview the first rows.
bitstamp_csv_path = "/kaggle/input/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2020-09-14.csv"
bitstamp = pd.read_csv(bitstamp_csv_path)
bitstamp.head()

In [None]:
# Structural overview of the frame as reported by DataFrame.info().
bitstamp.info()
# The backslashes are escaped explicitly so the banner still prints "\/\/ ... \/\/"
# without relying on the invalid "\/" escape sequence (a warning on newer Python versions).
print("\n\\/\\/Some basic aggregations\\/\\/")
# Summary statistics per column; shown as the cell's output.
bitstamp.describe()

In [None]:
# The Timestamp column holds Unix epoch seconds. Convert it in one vectorised call:
# this avoids a Python-level loop over every row and, unlike datetime.fromtimestamp,
# does not depend on the machine's local timezone (values are UTC wall-clock times).
bitstamp['Timestamp'] = pd.to_datetime(bitstamp['Timestamp'], unit='s')

In [None]:
# Quick look at the full weighted-price history, with the timestamps on the x-axis.
weighted_price_by_time = bitstamp.set_index("Timestamp")["Weighted_Price"]
weighted_price_by_time.plot(figsize=(14,7), title="Bitcoin Weighted Price")

## **Handling missing or defective data:**

In [None]:
# Count the NaN entries in every column ...
missing_values = bitstamp.isnull().sum()
# ... and express each count as a percentage of the total number of rows.
missing_per = missing_values / len(bitstamp) * 100

# Put both measures side by side in one labelled table.
missing_table = pd.concat(
    [missing_values, missing_per], axis=1, ignore_index=True
).rename(columns={0: 'Total Missing Values', 1: 'Missing %'})

# Show the table as the cell's output.
missing_table


### Over a quarter of the dataset consists of NaN values

## Imputation using Linear Interpolation method:

In [None]:
def fill_missing(df):
    """Fill NaN gaps in the price and volume columns by interpolation.

    Each listed column of ``df`` is replaced, on ``df`` itself, by the result of
    ``Series.interpolate()`` with pandas' default settings. Nothing is returned.
    """
    columns_to_fill = (
        'Open', 'Close', 'Weighted_Price',
        'Volume_(BTC)', 'Volume_(Currency)', 'High', 'Low',
    )
    for column in columns_to_fill:
        df[column] = df[column].interpolate()

In [None]:
# Clean the minute-level data: fill_missing interpolates the price/volume columns
# and assigns them back onto `bitstamp` itself (it returns nothing).
fill_missing(bitstamp)

In [None]:
# Keep an independent snapshot that still has Timestamp as an ordinary column.
bitstamp_non_indexed = bitstamp.copy()

# From here on the working frame is indexed by its timestamps.
bitstamp = bitstamp.set_index(keys='Timestamp')
bitstamp.head(5)

In [None]:
# Kernel density estimate of the weighted price, drawn with a shaded area under the curve.
# NOTE(review): recent seaborn releases deprecate `shade` in favour of `fill` - confirm against the installed version.
sns.kdeplot(bitstamp.Weighted_Price, shade=True)

### Lag plots

In [None]:
# One lag plot per horizon. The data are minute-level, so every lag is given in minutes.
lag_panels = [
    (1, '1-Minute Lag'),
    (60, '1-Hour Lag'),
    (1440, 'Daily Lag'),      # 24 * 60
    (10080, 'Weekly Lag'),    # 7 * 1440
    (43200, '1-Month Lag'),   # 30 * 1440
]

plt.figure(figsize=(15, 9))
plt.suptitle('Lag Plots', fontsize=22)

# A 2x3 grid is enough for the five panels.
for position, (lag_minutes, panel_title) in enumerate(lag_panels, start=1):
    plt.subplot(2, 3, position)
    pd.plotting.lag_plot(bitstamp['Weighted_Price'], lag=lag_minutes)
    plt.title(panel_title)

# No legend: none of the scatter plots carries a label, so a legend call would
# only emit a "no handles with labels" warning.
plt.show()

In [None]:
# Average every column over 1-hour bins and turn the timestamps back into a column.
hourly_data = bitstamp.resample('1H').mean().reset_index()

# Independent copy of the hourly frame, kept under the short name `hd`.
hd = hourly_data.copy()

hourly_data.head()

In [None]:
# Downsample the timestamp-indexed frame to 24-hour bins, averaging every column within each bin.
bitstamp_daily = bitstamp.resample("24H").mean() #daily resampling

### Time series decomposition and statistical tests

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Interpolate any NaN values remaining in the daily averages; fill_missing writes
# the filled columns back onto `bitstamp_daily` itself.
fill_missing(bitstamp_daily)

In [None]:
# Additive decomposition of the daily weighted price into trend / seasonal / residual parts.
# NOTE(review): period=1 leaves no room for a seasonal cycle; a weekly (7) or yearly (365)
# period is probably what the analysis needs - confirm the intended periodicity.
series = bitstamp_daily.Weighted_Price
decomposition = seasonal_decompose(series, model='additive', period=1)

# DecomposeResult.plot() creates its own figure, so a preceding plt.figure(figsize=...)
# would only leave an empty extra figure; resize the returned figure instead.
result = decomposition.plot()
result.set_size_inches(17, 14)

In [None]:
# Plot the autocorrelation function of the daily weighted price for lags up to 50;
# alpha=0.05 asks statsmodels for 95% confidence bands.
# `acf` keeps whatever plot_acf returns (presumably the matplotlib Figure - verify).
acf = plot_acf(series, lags=50, alpha=0.05)
# plt.title acts on the current axes, i.e. the one plot_acf has just drawn on.
plt.title("ACF for Weighted Price", size=20)
plt.show()

In [None]:
# Plot the partial autocorrelation function of the daily weighted price for lags up to 50,
# with 95% confidence bands (alpha=0.05); method='ols' selects the regression-based estimator.
plot_pacf(series, lags=50, alpha=0.05, method='ols')
# plt.title acts on the current axes, i.e. the one plot_pacf has just drawn on.
plt.title("PACF for Weighted Price", size=20)
plt.show()

In [None]:
# `df` is bound to the same object as `bitstamp_daily`; no copy is made here.
df = bitstamp_daily

In [None]:
# Derive the feature frame from `bitstamp_daily` without mutating it, so this cell can be
# re-run safely (an in-place reset_index fails on a second run because the Timestamp
# column already exists).
df = bitstamp_daily.reset_index(drop=False)

lag_features = ["Open", "High", "Low", "Close", "Volume_(BTC)"]
window1 = 3
window2 = 7
window3 = 30
windows = (window1, window2, window3)

# Rolling mean / standard deviation over each window, shifted by one row so that the
# features for a given day only use values from earlier days.
for feature in lag_features:
    rolled = {
        window: df[feature].rolling(window=window, min_periods=0)
        for window in windows
    }
    for window in windows:
        df[f"{feature}_mean_lag{window}"] = rolled[window].mean().shift(1)
    for window in windows:
        df[f"{feature}_std_lag{window}"] = rolled[window].std().shift(1)

# The shift (and the std of a single observation) leaves NaN in the first rows; fill them
# with the column means. numeric_only=True keeps the datetime Timestamp column out of the mean.
# NOTE(review): these means are taken over the whole series, including the later validation
# period - consider filling from the training period only.
df = df.fillna(df.mean(numeric_only=True))

# Index by timestamp while also keeping Timestamp available as a regular column.
df = df.set_index("Timestamp", drop=False)
df.head()



In [None]:
# Calendar features derived from the timestamp of each daily row.
df["month"] = df.Timestamp.dt.month
# Series.dt.week is deprecated (removed in pandas 2); isocalendar().week returns the same
# ISO week number. Cast to a plain integer dtype because isocalendar() yields nullable UInt32.
df["week"] = df.Timestamp.dt.isocalendar().week.astype(int)
df["day"] = df.Timestamp.dt.day
df["day_of_week"] = df.Timestamp.dt.dayofweek
df.head(10)

# Model building

In [None]:
# Chronological split: everything before 2020 is used for training, 2020 onwards for validation.
# Explicit copies are taken because forecast columns are added to df_valid later; writing to a
# bare slice of `df` would raise SettingWithCopyWarning.
df_train = df[df.Timestamp < "2020"].copy()
df_valid = df[df.Timestamp >= "2020"].copy()

print('train shape :', df_train.shape)
print('validation shape :', df_valid.shape)

# ARIMA as in the Kaggle notebook

In [None]:
!pip install pmdarima

In [None]:
import pmdarima as pm

In [None]:
# Exogenous regressors: for each price/volume column the rolling mean and standard deviation
# over 3, 7 and 30 rows, followed by the four calendar features.
exogenous_features = [
    f"{column}_{statistic}_lag{window}"
    for column in ("Open", "High", "Low", "Close", "Volume_(BTC)")
    for statistic in ("mean", "std")
    for window in (3, 7, 30)
] + ['month', 'week', 'day', 'day_of_week']

In [None]:
# Options shared by all three order searches.
auto_arima_options = dict(trace=True, error_action="ignore", suppress_warnings=True)

# `model` and `model_cheat` are searched with identical inputs (target plus exogenous features).
# NOTE(review): the two searches are the same call, and `model` is later refit and used
# without exogenous data - confirm whether `model` should be searched without them.
model = pm.auto_arima(df_train.Weighted_Price, exogenous=df_train[exogenous_features], **auto_arima_options)
model_cheat = pm.auto_arima(df_train.Weighted_Price, exogenous=df_train[exogenous_features], **auto_arima_options)

# Search on the target series alone, without exogenous regressors.
model_test = pm.auto_arima(df_train.Weighted_Price, **auto_arima_options)

# found best models are
# 1: ARIMA(1,0,2)(0,0,0)[0]
# 2: 
# 3: 

In [None]:
# Fit all three models on the training target. Only `model_cheat` receives the exogenous
# features; `model` and `model_test` are fitted on the price series alone.
arima_target = df_train.Weighted_Price

model_cheat.fit(arima_target, exogenous=df_train[exogenous_features])
model.fit(arima_target)

model_test.fit(arima_target)

In [None]:
# Own the validation frame before adding columns to it, so the assignments below neither
# write through to `df` nor trigger SettingWithCopyWarning.
df_valid = df_valid.copy()

# Both forecasts cover exactly the validation period. A hard-coded horizon silently breaks
# the column assignment whenever the split (and hence len(df_valid)) changes.
n_valid = len(df_valid)
print(n_valid)

forecast_cheat, conf_int_cheat = model_cheat.predict(n_periods=n_valid, exogenous=df_valid[exogenous_features], return_conf_int=True)
forecast, conf_int = model.predict(n_periods=n_valid, return_conf_int=True)

df_valid_cheat = df_valid.copy()

# Assign by position: np.asarray drops any index a returned Series might carry, which
# would otherwise be aligned against the Timestamp index (presumably yielding NaN - verify
# against the installed pmdarima version).
df_valid["Forecast_ARIMAX"] = np.asarray(forecast)
df_valid_cheat["Forecast_ARIMAX"] = np.asarray(forecast_cheat)

In [None]:
# Each row of a confidence-interval result is a (lower, upper) pair; split the pairs into
# two parallel lists and attach them to the matching validation frame.

# model with exogenous features
conf_int_lower_cheat = [bounds[0] for bounds in conf_int_cheat]
conf_int_upper_cheat = [bounds[1] for bounds in conf_int_cheat]

df_valid_cheat["conf_int_lower"] = conf_int_lower_cheat
df_valid_cheat["conf_int_upper"] = conf_int_upper_cheat

# actual prediction
conf_int_lower = [bounds[0] for bounds in conf_int]
conf_int_upper = [bounds[1] for bounds in conf_int]

df_valid["conf_int_lower"] = conf_int_lower
df_valid["conf_int_upper"] = conf_int_upper

In [None]:
# Actual price, forecast and confidence band for the model without exogenous features ...
arima_plot_columns = ["Weighted_Price", "Forecast_ARIMAX", "conf_int_lower", "conf_int_upper"]
df_valid[arima_plot_columns].plot(figsize=(14, 7))

# ... and actual price versus forecast for the model that uses the exogenous features.
df_valid_cheat[arima_plot_columns[:2]].plot(figsize=(14, 7))


# Using the Facebook Prophet model

In [None]:
# Resample the original data to 24-hour bins and fill remaining gaps by interpolation.
pData_D = bitstamp.resample("24H").mean()
fill_missing(pData_D)

# Prophet expects exactly two columns: `ds` (timestamp) and `y` (value to forecast).
pData_D = (
    pData_D.reset_index()[['Timestamp', 'Close']]
    .rename(columns={"Timestamp": "ds", "Close": "y"})
)
pData_D.head()

In [None]:
# The package was renamed from `fbprophet` to `prophet`; accept whichever is installed.
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# NOTE(review): the input is one row per day, so daily_seasonality=True asks for a
# within-day component the data cannot inform - confirm this is intended.
m_prophet = Prophet(daily_seasonality = True) # the Prophet class (model)
m_prophet.fit(pData_D) # fit the model using all data

In [None]:
# `periods` is the number of rows appended after the last observed date; the model was
# fitted on daily rows, so 365 extends the frame by 365 days.
future = m_prophet.make_future_dataframe(periods=365)
# Predict over the whole frame returned above (history plus the appended future rows).
prediction = m_prophet.predict(future)
m_prophet.plot(prediction)

# The pyplot calls below act on the current axes, i.e. the figure Prophet has just drawn.
plt.title("Prediction of BTC price using Prophet")
plt.xlabel("Date")
plt.ylabel("Close BTC Price")
plt.show()

Legend:
* black dots: actual data
* blue line: prediction
* light blue area: confidence interval

In [None]:
# Show the individual components of the fitted forecast (trend and the seasonal terms).
m_prophet.plot_components(prediction)
plt.show()

Conclusions from the plots above:
* first plot: the estimated trend is positive -> Prophet expects prices to rise in the future
* second plot: based on the estimated trends, the price peaks mostly on Tuesdays and Saturdays
* third plot: based on the estimated trends, the price peaks in August and late December

# Using an LSTM model

In [None]:
import math
import matplotlib.pyplot as plt
import keras
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

In [None]:
# Preview `hd`, the copy of the hourly-resampled frame (`hourly_data`) used by the LSTM cells.
hd.head()

In [None]:
# Chronological 70/30 split of the hourly data.
split_row = int(len(hd)*0.7)
lstm_train = hd[:split_row]
lstm_valid = hd[split_row:]

print("train shape:", lstm_train.shape)
print("valid shape:", lstm_valid.shape)

# Keep only the column at position 1, as a 2-D array with a single column.
lstm_v = lstm_valid.iloc[:, 1:2].values
lstm_t = lstm_train.iloc[:, 1:2].values

print(lstm_t)

## Building the input features with a look-back window of 60 hourly time steps

In [None]:
# Scale the training values into [0, 1]; the scaler is fitted on the training data only.
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(lstm_t)

# Sliding windows: each sample holds the 60 previous scaled values, and its target is the
# value that immediately follows the window.
window_ends = range(60, len(lstm_train))
X_train = np.array([training_set_scaled[end-60:end, 0] for end in window_ends])
y_train = np.array([training_set_scaled[end, 0] for end in window_ends])

# Add a trailing feature axis: (samples, time steps, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)

print("xtrain shape: ", X_train.shape)
print("ytrain shape: ", y_train.shape)

In [None]:
# Build and train the LSTM network. This definition has to be live: with it commented out,
# the later `model.predict(X_test)` picks up the pmdarima model that was bound to `model`
# in the ARIMA section. From this cell on, `model` refers to the Keras network.
model = Sequential()

# First LSTM layer plus dropout; the input is (time steps, 1 feature).
model.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
model.add(Dropout(0.2))

# Second LSTM layer plus dropout.
model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

# Third LSTM layer plus dropout.
model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

# Fourth LSTM layer returns only its last output, followed by dropout.
model.add(LSTM(units = 50))
model.add(Dropout(0.2))

# Single-unit output layer: the next scaled value.
model.add(Dense(units = 1))

model.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Up to 100 epochs, stopping early once the training loss has not improved for 5 epochs
# so that a full notebook re-run stays feasible.
model.fit(
    X_train, y_train,
    epochs = 100, batch_size = 32,
    callbacks = [EarlyStopping(monitor = 'loss', patience = 5)],
)

In [None]:
# Same chronological 70/30 split as for training.
lstm_train, lstm_valid = hd[:int(len(hd)*0.7)], hd[int(len(hd)*0.7):]

print("train shape:", lstm_train.shape)
print("valid shape:", lstm_valid.shape)

# Here lstm_v / lstm_t stay DataFrames (single column at position 1) so they can be concatenated.
lstm_v = lstm_valid.iloc[:, 1:2]
lstm_t = lstm_train.iloc[:, 1:2]

dataset_total = pd.concat((lstm_t, lstm_v), axis = 0)

# Use the window length the training samples were built with, so both sets always agree.
lookback = X_train.shape[1]

# Validation rows plus the `lookback` rows before them, which seed the first window.
inputs = dataset_total[len(dataset_total) - len(lstm_v) - lookback:].values
print(inputs.shape)

inputs = inputs.reshape(-1,1)
# Reuse the scaler fitted on the training data.
inputs = sc.transform(inputs)

# One window per validation row. The upper bound follows the data instead of a fixed row
# count, so the whole validation period is covered.
X_test = np.array([inputs[end-lookback:end, 0] for end in range(lookback, len(inputs))])
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

print(X_test.shape)

In [None]:
# Predict on the test windows and map the scaled outputs back to price units.
# `model` is expected to be the Keras LSTM network trained on X_train / y_train.
predicted_btc_price = sc.inverse_transform(model.predict(X_test))

In [None]:
# Visualising the results
# Prediction k was built from the window that ends just before validation row k, so the
# predictions line up with the first len(predicted_btc_price) rows of lstm_valid.
n_predictions = len(predicted_btc_price)
prediction_times = lstm_valid["Timestamp"].iloc[:n_predictions]

plt.figure(figsize=(14, 7))
# Column 1 is the column the training and test windows were built from.
plt.plot(prediction_times, lstm_valid.iloc[:n_predictions, 1], color = "red", label = "Real btc Price")
plt.plot(prediction_times, predicted_btc_price, color = "blue", label = "Predicted btc Price")

# Tick placement is left to matplotlib's date locator; numeric tick positions such as
# 0, 50, 100, ... do not correspond to dates on a datetime axis.
plt.title('btc Price Prediction')
plt.xlabel('Time')
plt.ylabel('btc Price')

plt.legend()
plt.show()

# Using an ARIMA model

In [None]:
# # Resampling originial data to day level and forward fill the missing values
# aData_D = bitstamp_non_indexed
# fill_missing(aData_D)

# aData_D.Timestamp = aData_D.Timestamp.apply(lambda x: dt.datetime(x.year,x.month,x.day))

In [None]:
# aData_D = aData_D.groupby("Timestamp", as_index = False).agg("mean")

In [None]:
# #checking for cross-correlation in the dataset:
# plt.figure()
# lag_plot(aData_D['Open'], lag=3)
# plt.title('BTC price - Autocorrelation plot with lag = 3')
# plt.show()

In [None]:
# #splittin the data in test and training sets:
# #training data: 70% of dataset
# #test data: 30% of dataset
# arima_train_data, arima_test_data = aData_D[int(len(aData_D)*0.3):int(len(aData_D)*0.7)], aData_D[int(len(aData_D)*0.7):]

# arima_training_data = arima_train_data['Close'].values
# arima_test_data = arima_test_data['Close'].values

# #history array containing all observations, during model creation the real datapoint of the test data
# # will be appended step by step
# history = [x for x in arima_training_data]
# model_predictions = []
# N_test_observations = len(arima_test_data)


# #p: The number of lag observations included in the model, also called the lag order.
# #d: The number of times that the raw observations are differenced, also called the degree of dfferencing.
# #q: The size of the moving average window, also called the order of moving average.

# for time_point in range(N_test_observations):
#     model = ARIMA(history, order=(1,1,20)) #param: p, d, q
#     model_fit = model.fit(disp=0)
#     output = model_fit.forecast()
#     yhat = output[0]
#     model_predictions.append(yhat)
#     true_test_value = arima_test_data[time_point]
#     history.append(true_test_value)
    
# MSE_error = mean_squared_error(arima_test_data, model_predictions)
# print('Testing Mean Squared Error is {}'.format(MSE_error))

In [None]:
# test_set_range = aData_D[int(len(aData_D)*0.7):].index

# plt.plot(test_set_range, model_predictions, color='blue', marker='o', linestyle='dashed',label='Predicted Price')
# plt.plot(test_set_range, arima_test_data, color='red', label='Actual Price')

# plt.title('BTC Prices Prediction')
# plt.xlabel('Date')
# plt.ylabel('Prices')
# plt.xticks(np.arange(881,1259,50), aData_D.Timestamp[881:1259:50])
# plt.legend()
# plt.show()