# Bitcoin Price Prediction by ARIMA

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats
import statsmodels.api as sm
import warnings
from itertools import product
from datetime import datetime
# Silence every warning for the rest of the notebook (this also hides
# deprecation notices from pandas / statsmodels).
warnings.filterwarnings('ignore')
# NOTE(review): Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' — confirm this style name exists in the target environment.
plt.style.use('seaborn-poster')

## Data Exploration

In [None]:
import os

In [None]:
# Print each directory under the Kaggle input mount together with its files.
for dirpath, _subdirs, filenames in os.walk('/kaggle/input'):
    print(dirpath, filenames)

In [None]:
# Load the HODL sentiment scores, ordered by their 'date' column.
hodl_csv_path = '/kaggle/input/sentiment-bitcoin/sentiment_hodl.csv'
df_sen_hodl = pd.read_csv(hodl_csv_path).sort_values(by=['date'])

In [None]:
# Load the second sentiment file, drop its first 1620 date-sorted rows, merge
# it with the HODL sentiment rows and keep the first 8017 merged rows.
# NOTE(review): 1620 and 8017 are positional cut-offs tied to the current CSV
# contents — confirm they still select the intended date range.
df_sentiment = pd.read_csv('/kaggle/input/sentiment-bitcoin/sentiment_nb.csv')
df_sentiment = df_sentiment.sort_values(by=['date'])
df_sentiment = df_sentiment[1620:]
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
df_sentiment = pd.concat([df_sentiment, df_sen_hodl])
df_sentiment = df_sentiment.sort_values(by=['date'])
df_sentiment = df_sentiment[:8017]

In [None]:
import datetime
import dateutil.parser

# Collect every calendar day that is missing between two consecutive sentiment
# dates. The grouped index is parsed once up front, and dates are walked in
# (current, next) pairs so the last date never indexes past the end.
sentiment_dates = [
    dateutil.parser.parse(date_label)
    for date_label in df_sentiment.groupby(['date']).mean().index
]
one_day = datetime.timedelta(days=1)
temp = []
for current_date, next_date in zip(sentiment_dates, sentiment_dates[1:]):
    # When the two dates are adjacent the loop body never runs.
    y = current_date + one_day
    while y < next_date:
        temp.append(y)
        y += one_day
        
    

In [None]:
# Average the sentiment columns per 'date' value (one row per distinct date).
mean_sentiment = df_sentiment.groupby(['date']).mean()
mean_sentiment

In [None]:
# Turn each missing day into a [date-string, 0] row (zero sentiment placeholder).
new_temp = [[missing_day.strftime('%Y-%m-%d'), 0] for missing_day in temp]

In [None]:
# Build a frame of placeholder rows, indexed by 'date', with a single
# 'sentiment_value' column.
newdf = pd.DataFrame(new_temp, columns=['date', 'sentiment_value']).set_index('date')
newdf

In [None]:
# Add the placeholder rows for missing days and restore date order.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
mean_sentiment = pd.concat([mean_sentiment, newdf]).sort_values(by=['date'])
mean_sentiment

In [None]:
# Load the 1-minute Bitstamp BTC/USD history and preview the first rows.
df = pd.read_csv('/kaggle/input/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2020-12-31.csv')
df.head()

In [None]:
# Convert the Unix timestamps (seconds) to datetimes
df.Timestamp = pd.to_datetime(df.Timestamp, unit='s')

# Index the frame by timestamp so it can be resampled
df.index = df.Timestamp

# Resampling to daily frequency (mean of each day's rows)
df = df.resample('D').mean()

# Resampling to monthly frequency (computed from the daily means above)
df_month = df.resample('M').mean()

# Resampling to annual frequency (year ending in December)
df_year = df.resample('A-DEC').mean()

# Resampling to quarterly frequency (quarters of a December-ending year)
df_Q = df.resample('Q-DEC').mean()

In [None]:
# Drop the first 2130 daily rows.
# NOTE(review): presumably this aligns the price history with the start of the
# sentiment data — confirm the offset against the actual dates.
df = df.iloc[2130:]

In [None]:
df

In [None]:
# Day-over-day change of the first sentiment column; the first row has no
# predecessor, so its change is recorded as 0.
sentiment_rows = mean_sentiment.values
temp = [
    0 if position == 0 else row[0] - sentiment_rows[position - 1][0]
    for position, row in enumerate(sentiment_rows)
]
    

In [None]:
# Store the day-over-day changes next to the daily means (temp holds one value
# per row of mean_sentiment).
mean_sentiment['difference'] = temp
mean_sentiment

In [None]:
# Attach the day-over-day sentiment change to the price frame; pandas aligns
# the values on the index labels.
# NOTE(review): mean_sentiment appears to be indexed by date strings while df
# has a resampled datetime index — confirm the labels actually match.
df['sentiment_value'] = mean_sentiment['difference']

In [None]:
# PLOTS
# Draw the daily weighted price in the first cell of a 2x2 subplot grid.
fig = plt.figure(figsize=[15, 7])
plt.suptitle('Bitcoin exchanges, mean USD', fontsize=22)

plt.subplot(221)
plt.plot(df.Weighted_Price, '-', label='By Days')
plt.legend()

## Stationarity check and seasonal decomposition of the series

In [None]:
# Drop incomplete rows, plot the seasonal decomposition of the daily price and
# report the augmented Dickey–Fuller p-value for it.
df = df.dropna()
plt.figure(figsize=[15,7])
price_decomposition = sm.tsa.seasonal_decompose(df.Weighted_Price)
price_decomposition.plot()
adf_pvalue = sm.tsa.stattools.adfuller(df.Weighted_Price)[1]
print("Dickey–Fuller test: p=%f" % adf_pvalue)
plt.show()

The series is not stationary.

## Box-Cox Transformations

In [None]:
# Box-Cox transformation of the daily price; lmbda is the fitted parameter and
# is needed later to invert the transform.
df['Weighted_Price_box'], lmbda = stats.boxcox(df.Weighted_Price)
# Test the transformed series: testing the raw price here would only repeat
# the previous check.
print("Dickey–Fuller test: p=%f" % sm.tsa.stattools.adfuller(df.Weighted_Price_box)[1])

The series is not stationary.

## Seasonal differencing

In [None]:
# Seasonal differencing: subtract the Box-Cox price from 12 rows earlier.
box_prices = df.Weighted_Price_box
df['prices_box_diff'] = box_prices - box_prices.shift(12)
# The first 12 rows have no lagged value (NaN), so leave them out of the test.
seasonal_adf_pvalue = sm.tsa.stattools.adfuller(df.prices_box_diff[12:])[1]
print("Dickey–Fuller test: p=%f" % seasonal_adf_pvalue)

The series is not stationary.

## Regular differencing

In [None]:
# Regular (lag-1) differencing of the seasonally differenced series.
seasonal_diff = df.prices_box_diff
df['prices_box_diff2'] = seasonal_diff - seasonal_diff.shift(1)
plt.figure(figsize=(15,7))

# Seasonal decomposition and ADF test on the doubly differenced series. The
# first 13 rows are NaN after lagging by 12 and then by 1, so they are skipped.
differenced_prices = df.prices_box_diff2[13:]
sm.tsa.seasonal_decompose(differenced_prices).plot()
print("Dickey–Fuller test: p=%f" % sm.tsa.stattools.adfuller(differenced_prices)[1])

plt.show()

The series is stationary.

## Model Selection

In [None]:
# Keep a handle on the full feature frame before `df` is re-bound to smaller
# windows; this is a second name for the same object, not a copy.
real_df = df

In [None]:
# Positional slice: rows 1067 through 1096 of the full frame (at most 30 rows).
df = real_df[1067:1097]

In [None]:
df

In [None]:
# The 30 consecutive forecast dates, 31 Oct 2020 through 29 Nov 2020, each at
# midnight.
forecast_start = datetime.datetime(2020, 10, 31)
date_lists = [
    forecast_start + datetime.timedelta(days=day_offset)
    for day_offset in range(30)
]

In [None]:
real_df[1090:1097]

In [None]:
# Rolling one-step-ahead SARIMAX forecast, using the day-over-day sentiment
# change as an exogenous regressor. Each of the 10 iterations refits on a
# 7-row window of real_df, picks the order with the lowest AIC and stores the
# forecast for the following day.
# NOTE: this cell needs invboxcox(), lmbda, real_df and date_lists to exist
# already (invboxcox is declared in a later cell of this notebook).
result_forecast_win30 = []

# Candidate (p, q, P, Q) orders; they do not depend on the window, so the grid
# is built once.
Qs = range(0, 2)
qs = range(0, 3)
Ps = range(0, 3)
ps = range(0, 3)
D=1
d=1
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
warnings.filterwarnings('ignore')

# The daily mean sentiment is identical for every window: read and aggregate
# the CSV files once instead of on every iteration.
df_hodl_forecast = pd.read_csv('/kaggle/input/sentiment-bitcoin/sentiment_hodl.csv')
df_hodl_forecast = df_hodl_forecast.sort_values(by=['date'])
daily_sentiment = pd.read_csv('/kaggle/input/sentiment-bitcoin/sentiment_nb.csv')
daily_sentiment = daily_sentiment.sort_values(by=['date'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
daily_sentiment = pd.concat([daily_sentiment, df_hodl_forecast])
daily_sentiment = daily_sentiment.sort_values(by=['date'])
daily_sentiment = daily_sentiment.groupby(['date']).mean()
# NOTE(review): 523 is a positional offset — presumably it lines the sentiment
# rows up with real_df; confirm against the actual dates.
daily_sentiment = daily_sentiment[523:]

for iterator in range(10):
    print(iterator)
    df = real_df[1090+iterator:1097+iterator]

    # Model selection: keep the fitted model with the lowest AIC.
    results = []
    best_aic = float("inf")
    for param in parameters_list:
        try:
            model=sm.tsa.statespace.SARIMAX(df.Weighted_Price_box,exog=df.sentiment_value, order=(param[0], d, param[1]),
                                            seasonal_order=(param[2], D, param[3], 12),enforce_stationarity=False).fit(disp=-1)
        except ValueError:
            print('wrong parameters:', param)
            continue
        aic = model.aic
        if aic < best_aic:
            best_model = model
            best_aic = aic
            best_param = param
        results.append([param, model.aic])

    # AIC of every order that could be fitted on this window.
    result_table = pd.DataFrame(results)
    result_table.columns = ['parameters', 'aic']

    # Sentiment change between the two rows bracketing the forecast day; only
    # temp[1] is fed to the model as the future exogenous value.
    df_sentiment_forecast = daily_sentiment[1095+iterator:1097+iterator]
    forecast_sentiment = df_sentiment_forecast.values
    temp = [
        0 if position == 0 else row[0] - forecast_sentiment[position - 1][0]
        for position, row in enumerate(forecast_sentiment)
    ]

    # Extend the window by one empty row for the forecast date, then predict.
    df2 = df[['Weighted_Price']]
    date_list = [date_lists[iterator]]
    future = pd.DataFrame(index=date_list, columns= df2.columns)
    df2 = pd.concat([df2, future])
    # With a 7-row window, position 6 is the last observed row and position 7
    # is the first out-of-sample step.
    df2['forecast'] = invboxcox(best_model.predict(start=6,end=7,exog=[temp[1]]), lmbda)

    # Keep only the out-of-sample value (the last row).
    result_forecast_win30.append(df2['forecast'][-1:])

In [None]:
# Stitch the per-window one-step forecasts into a single Series.
# pd.concat replaces Series.append (removed in pandas 2.0) and avoids
# re-copying the accumulated data on every iteration.
if result_forecast_win30:
    a = pd.concat(result_forecast_win30)
else:
    # Nothing was forecast: keep the empty-Series result.
    a = pd.Series(dtype=float)
a

In [None]:
# Plot the actual daily price against the stitched one-step forecasts.
# NOTE(review): `iterator` is whatever value the last executed loop left
# behind, so the plotted window depends on cell run order — confirm.
plt.figure(figsize=(15,7))
df2 = real_df[1067+iterator:1098+iterator]
df2.Weighted_Price.plot()
# Assignment aligns on the index labels; rows without a forecast become NaN.
df2['forecast'] = a
df2.forecast.plot(color='r', ls='--', label='Predicted Weighted_Price')
plt.legend()
# NOTE(review): the title says "by months" but df2 is sliced from the daily
# frame — confirm the intended label.
plt.title('Bitcoin exchanges, by months')
plt.ylabel('mean USD')
plt.show()

In [None]:
df2['forecast'][1:]

In [None]:
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import mean_absolute_error

# Score the last 10 rows of df2: mean squared error first, then mean absolute
# error, between the forecast and the actual weighted price.
forecast_tail = df2['forecast'][-10:].values
actual_tail = df2['Weighted_Price'][-10:].values
print(mean_squared_error(forecast_tail, actual_tail))
print(mean_absolute_error(forecast_tail, actual_tail))

#### Without Using Sentiment
30 days before, used to predict 31 Oct to 29 Nov, result<br>
MSE = 192067.0320751612<br>
MAE = 333.51605274289125<br>

#### Using Sentiment
30 days before, used to predict 31 Oct to 29 Nov, result<br>
MSE = 188773.36671742506<br>
MAE = 333.7377<br>

In [None]:
# Inverse Box-Cox Transformation Function
def invboxcox(y,lmbda):
    """Invert a Box-Cox transform.

    Args:
        y: Transformed value(s); a scalar or anything NumPy can broadcast.
        lmbda: The Box-Cox parameter that was used for the forward transform.

    Returns:
        ``exp(y)`` when ``lmbda`` is 0, otherwise
        ``exp(log(lmbda * y + 1) / lmbda)``.
    """
    if lmbda != 0:
        return np.exp(np.log(lmbda * y + 1) / lmbda)
    # lmbda == 0 is the log-transform case.
    return np.exp(y)

In [None]:
# Store the most recently computed sentiment differences (`temp`) as a column
# on the current forecast slice.
# NOTE(review): both names are left over from the last executed loop, so the
# result depends on cell run order — confirm.
df_sentiment_forecast['sentiment_value'] = temp

In [None]:
# Mean squared error and mean absolute error between forecast and actual price
# on positions 1128-1157 of df2.
# NOTE(review): the plotting cell re-binds df2 to a 31-row slice, for which
# this positional range would be empty — confirm which df2 this cell expects.
from sklearn.metrics import mean_squared_error
import math
print(mean_squared_error(df2.forecast[1128:1158], df2.Weighted_Price[1128:1158]))
# print(math.sqrt(mean_squared_error(df2.forecast[1128:1158], df2.Weighted_Price[1128:1158])))

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(df2.forecast[1128:1158], df2.Weighted_Price[1128:1158]))
#MAE: 26745.1109986

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
# Rolling linear-regression baseline: for each of 30 windows, fit
# Weighted_Price on three feature columns over a 37-row window of real_df and
# predict the window's last row.
result_forecast_win30 = []
feature_columns = ['sentiment_value','prices_box_diff2','Close']

# The daily mean sentiment is identical for every window: read and aggregate
# the CSV files once instead of on every iteration.
df_hodl_forecast = pd.read_csv('/kaggle/input/sentiment-bitcoin/sentiment_hodl.csv')
df_hodl_forecast = df_hodl_forecast.sort_values(by=['date'])
daily_sentiment = pd.read_csv('/kaggle/input/sentiment-bitcoin/sentiment_nb.csv')
daily_sentiment = daily_sentiment.sort_values(by=['date'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
daily_sentiment = pd.concat([daily_sentiment, df_hodl_forecast])
daily_sentiment = daily_sentiment.sort_values(by=['date'])
daily_sentiment = daily_sentiment.groupby(['date']).mean()
# NOTE(review): 523 is a positional offset — presumably it lines the sentiment
# rows up with real_df; confirm against the actual dates.
daily_sentiment = daily_sentiment[523:]

for iterator in range(30):
    print(iterator)
    df = real_df[1060+iterator:1097+iterator]
    model = LinearRegression().fit(df[feature_columns],df['Weighted_Price'])

    # Sentiment change for the two-row forecast slice. The regression below
    # does not read `temp`; it is still computed so the name stays defined for
    # any cell that uses it afterwards.
    df_sentiment_forecast = daily_sentiment[1095+iterator:1097+iterator]
    forecast_sentiment = df_sentiment_forecast.values
    temp = [
        0 if position == 0 else row[0] - forecast_sentiment[position - 1][0]
        for position, row in enumerate(forecast_sentiment)
    ]

    # NOTE(review): row 1096+iterator is also the last row of the training
    # window above, and 'Close' is a same-day price — confirm this is the
    # intended evaluation setup.
    result_forecast_win30.append(model.predict(real_df[1096+iterator:1097+iterator][feature_columns]))

In [None]:
real_df[1097:1127]

In [None]:
# Fit Weighted_Price on sentiment_value alone over rows 1060-1096 of real_df.
# The target is taken from the same rows as the feature rather than from the
# global `df`, which earlier loops re-bind to a different window.
train_window = real_df[1060:1097]
model = LinearRegression(fit_intercept=True).fit(
    np.array(train_window['sentiment_value']).reshape(-1,1),
    train_window['Weighted_Price'],
)


In [None]:
real_df[1060:1097]['sentiment_value'] * model.coef_[0] + model.intercept_

In [None]:
# Compare the sentiment-only linear fit (default colour) with the actual
# weighted price (red) over rows 1060-1096.
fit_window = real_df[1060:1097]
fitted_prices = fit_window['sentiment_value'].values * model.coef_[0] + model.intercept_
plt.plot(fitted_prices)
plt.plot(fit_window['Weighted_Price'].values,color='r')
plt.show()