In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from statsmodels.tsa.api import VAR

In [None]:
# Load the CryptoPanic sentiment CSV, using its first column as the row index.
sentiment = pd.read_csv('/kaggle/input/another-sentiment-bitcoin/cryptopanic_sentiment.csv', index_col = 0)
# Preview the first 10 rows.
sentiment.head(10)

In [None]:
# Average every column per distinct `Date` value (one row per date).
mean_sentiment = sentiment.groupby(['Date']).mean()
# Number of distinct dates.
print(len(mean_sentiment))
# Preview the last 10 dates.
mean_sentiment.tail(10)

In [None]:
# Change of the first column of `mean_sentiment` relative to the previous row.
# The first row has no predecessor, so its difference is recorded as 0.
daily_values = mean_sentiment.values
temp = [
    0 if position == 0 else row[0] - daily_values[position - 1][0]
    for position, row in enumerate(daily_values)
]

mean_sentiment['difference'] = temp
mean_sentiment

In [None]:
# Load the Bitstamp 1-minute BTC/USD history (2012-01-01 to 2020-12-31, per the file name).
df = pd.read_csv('/kaggle/input/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2020-12-31.csv')
# Preview the first 10 rows.
df.head(10)

In [None]:
from dateutil.parser import parse

# Convert the `Date` strings to datetime objects.
# Only strings are handed to `parse`: it raises TypeError for values that are
# already datetimes, which made this cell fail when it was run a second time.
temp = [parse(x) if isinstance(x, str) else x for x in sentiment.Date]
sentiment.Date = temp

In [None]:
# Interpret the raw `Timestamp` column as Unix epoch seconds.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

# Use the timestamps as the index so the frame can be resampled by calendar period.
df.index = df['Timestamp']

# Daily means; the coarser aggregates below are all derived from this daily frame.
df = df.resample('D').mean()

# Monthly means
df_month = df.resample('M').mean()

# Annual means (years ending in December)
df_year = df.resample('A-DEC').mean()

# Quarterly means (quarters of a year ending in December)
df_Q = df.resample('Q-DEC').mean()

# Training window: every daily row from position 2130 onward.
train = df.iloc[2130:]
train


In [None]:
# `train` was taken as an `iloc` slice of `df`; work on an independent copy so the
# new column is written to `train` itself and pandas does not emit
# SettingWithCopyWarning.
train = train.copy()
# Attach the per-date mean sentiment; pandas aligns it with `train` on the index.
train['sentiment_value'] = sentiment.groupby(['Date']).mean()
# Keep only the days that have every value present.
train = train.dropna()
train

In [None]:
# (rows, columns) of the training frame after dropping incomplete days.
train.shape

In [None]:
# Display the training frame.
train

In [None]:
# Build a VAR model over the daily close price and the daily mean sentiment score.
model = VAR(train[['Close','sentiment_value']])

In [None]:
# Print the lag-order selection results for a model with a constant trend term ('c').
print(model.select_order(trend='c'))

In [None]:
# Fit the VAR, letting the AIC criterion choose the lag order.
model_fit = model.fit(ic = 'aic')
# number of lags used by the fitted model
num_lag = model_fit.k_ar
num_lag

In [None]:
# Display the fitted model's summary.
model_fit.summary()

In [None]:
# One-step-ahead forecast, passing the training history as the initial values.
model_fit.forecast(y = train[['Close','sentiment_value']].values, steps = 1)

In [None]:
def predict(data, fitted_model, lag_order, predict_steps):
    """Roll a fitted model over `data` and collect its forecasts.

    For every position `i`, the `lag_order` rows immediately before `i` are
    passed to `fitted_model.forecast`, and the forecast that lies
    `predict_steps` rows ahead of that window is kept. With `predict_steps = 1`
    this is the one-step-ahead forecast for row `i`.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations, one row per period, columns in the order the model expects.
    fitted_model : object
        Anything exposing `forecast(y, steps)` that returns an array of shape
        (steps, number of columns).
    lag_order : int
        Number of past rows handed to the model as its window.
    predict_steps : int
        Forecast horizon; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Forecasts with the columns of `data`, indexed by the rows of `data`
        they refer to. Empty (same columns) when `data` is too short to form
        a single window and target row.

    Raises
    ------
    ValueError
        If `predict_steps` is smaller than 1.
    """
    if predict_steps < 1:
        raise ValueError('predict_steps must be at least 1')

    columns = list(data.columns)
    prediction = []

    # Stop early enough that the row being forecast still exists in `data`.
    for i in range(lag_order, len(data) - predict_steps + 1):
        # window of lagged data that the model uses to predict ahead
        window = data.iloc[i - lag_order : i].copy()
        results = fitted_model.forecast(y = window.values, steps = predict_steps)
        # keep the forecast at the requested horizon (the only row when steps = 1)
        prediction.append(np.asarray(results)[-1])

    # np.vstack cannot stack an empty list; return an empty, correctly labelled frame.
    if not prediction:
        return pd.DataFrame(columns = columns, index = data.index[:0])

    df = pd.DataFrame(np.vstack(prediction), columns = columns)
    # the forecasts refer to the last len(prediction) rows of data
    df.index = data.iloc[len(data) - len(prediction) :].index
    return df

In [None]:
def rmse(predicted, actual):
    """Return the root mean squared error between `predicted` and `actual`."""
    squared_error = (predicted - actual) ** 2
    return np.sqrt(np.mean(squared_error))

# mean absolute error
def mae(predicted, actual):
    """Return the mean absolute error between `predicted` and `actual`."""
    absolute_error = np.absolute(predicted - actual)
    return np.mean(absolute_error)

In [None]:
def model_graphs(predicted, actual, title = ''):
    """Plot actual vs. predicted values and the distribution of their residuals.

    Parameters
    ----------
    predicted, actual : pandas.Series
        Series to compare. They are aligned on their index; positions present
        in only one of them are ignored in the residual histogram.
    title : str, optional
        Label prefixed to the plot titles. Defaults to an empty string (the
        previous default was the `str` type itself, which could not be
        concatenated with the title text).

    Two figures are drawn: a line plot annotated with RMSE and MAE, and a
    histogram of `actual - predicted` with its mean marked by a dashed line.
    """
    rmse_value = rmse(predicted = predicted, actual = actual)
    mae_value = mae(predicted = predicted, actual = actual)
    # text box in line plot
    text_str = 'RMSE = ' + str(rmse_value) + '\n MAE = ' + str(mae_value)

    # line plot; a fresh figure per call so repeated calls never draw over each other
    _, line_ax = plt.subplots()
    line_ax.plot(actual, color = 'blue', linewidth = 2, label = 'actual')
    line_ax.plot(predicted, color = 'red', linewidth = 1, label = 'predicted')
    line_ax.legend()
    line_ax.set_title(title + ' Actual vs Predicted')
    # Anchor the text in axes-fraction coordinates (top-left corner) so it stays
    # inside the plot whatever the scale of the data.
    line_ax.text(0.02, 0.95, text_str, transform = line_ax.transAxes, verticalalignment = 'top')

    # residual & hist
    # Index alignment leaves NaN where only one series has a value; a histogram
    # cannot be computed over NaN, so those positions are dropped.
    residual = (actual - predicted).dropna()
    _, hist_ax = plt.subplots()
    hist_ax.hist(residual, bins = 200)
    hist_ax.set_title('Distribution of ' + title + ' residual')
    hist_ax.axvline(residual.mean(), color = 'k', linestyle = 'dashed', linewidth = 1)

    # show graphics
    plt.show()

In [None]:
def category(x):
    """Label a value 'up' when it is >= 0 and 'down' when it is < 0.

    Values that satisfy neither comparison (such as NaN) yield None.
    """
    if x < 0:
        return 'down'
    if x >= 0:
        return 'up'
    return None

# function that returns confusion matrix of model with metrics
def confusion_matrix(predicted, actual, title = ''):
    """Print direction-of-movement metrics and a text confusion matrix.

    Parameters
    ----------
    predicted, actual : pandas.Series
        Numeric values; each is mapped through `category`, so values >= 0 count
        as 'up' and values < 0 as 'down'.
    title : str, optional
        Label shown in the printed headers. Defaults to an empty string (the
        previous default was the `str` type, which printed as "<class 'str'>").

    Returns
    -------
    pandas.DataFrame
        Columns 'predicted', 'actual' and 'code', where 'code' is one of
        'true_positive', 'false_positive', 'true_negative', 'false_negative'.
        Rows in which either side has no category keep a missing 'code' and
        are left out of the counts.

    Outcomes that never occur are counted as 0, and a rate whose denominator
    is 0 is reported as NaN.
    """
    def _rate(numerator, denominator):
        # A rate over an empty group is undefined; report NaN rather than divide by zero.
        return numerator / denominator if denominator else float('nan')

    df = pd.DataFrame()
    df['predicted'] = predicted.apply(category)
    df['actual'] = actual.apply(category)
    # Create the column up front so it exists even when no row matches any outcome.
    df['code'] = pd.Series(index = df.index, dtype = object)
    df.loc[(df['predicted'] == 'up') & (df['actual'] == 'up'), 'code'] = 'true_positive'
    df.loc[(df['predicted'] == 'up') & (df['actual'] == 'down'), 'code'] = 'false_positive'
    df.loc[(df['predicted'] == 'down') & (df['actual'] == 'down'), 'code'] = 'true_negative'
    df.loc[(df['predicted'] == 'down') & (df['actual'] == 'up'), 'code'] = 'false_negative'

    # confusion dictionary; reindex so an outcome with no rows is 0 instead of a KeyError
    outcomes = ['true_positive', 'false_positive', 'true_negative', 'false_negative']
    counts = df['code'].value_counts().reindex(outcomes, fill_value = 0)
    z = {outcome: int(counts[outcome]) for outcome in outcomes}
    total = z['true_positive'] + z['true_negative'] + z['false_positive'] + z['false_negative']

    # confusion metrics
    accuracy = _rate(z['true_positive'] + z['true_negative'], total)
    true_positive_rate = _rate(z['true_positive'], z['true_positive'] + z['false_negative'])
    false_positive_rate = _rate(z['false_positive'], z['false_positive'] + z['true_negative'])
    true_negative_rate = _rate(z['true_negative'], z['true_negative'] + z['false_positive'])
    false_negative_rate = _rate(z['false_negative'], z['false_negative'] + z['true_positive'])

    # print metrics
    print('\nMetrics for [{0}]\nAccuracy:{1:6.3f} \nTP Rate:{2:7.3f} \nFP Rate:{3:7.3f}\nTN Rate:{4:7.3f} \nFN Rate:{5:7.3f}'.format(str(title), accuracy, true_positive_rate, false_positive_rate, true_negative_rate, false_negative_rate))
    # print confusion matrix graph
    print('\n'+
      '            [{title}] Confusion Matrix\n'.format(title = str(title))+
      '\n'+
      '           |-------------|-------------|\n'+
      '  n= {0}  | Predicted:  | Predicted:  |\n'.format(total)+
      '           |    Down     |    Up       |\n'+
      '|----------|-------------|-------------|------------|\n'+
      '| Actual:  |             |             |            |\n'+
      '|  Down    |  tn: {0}    |  fp: {1}    |    {2}     |\n'.format(z['true_negative'], z['false_positive'], z['true_negative']+z['false_positive'])+
      '|----------|-------------|-------------|------------|\n'+
      '| Actual:  |             |             |            |\n'+
      '|   UP     |  fn: {0}    |  tp: {1}    |    {2}    |\n'.format(z['false_negative'], z['true_positive'] ,z['false_negative']+z['true_positive'])+
      '|----------|-------------|-------------|------------|\n'+
      '           |             |             |\n'+
      '           |      {0}    |      {1}   |\n'.format(z['true_negative']+z['false_negative'], z['false_positive']+z['true_positive'])+
      '           |-------------|-------------|\n')
    # return df
    return df

In [None]:
# In-sample results: the model's fitted values, and the observed rows they
# correspond to (the first `num_lag` rows are skipped).
train_predicted = model_fit.fittedvalues.copy()
train_actual = train.iloc[num_lag:]

In [None]:
# Inspect the fitted values.
train_predicted

In [None]:
# Plot the in-sample fit and residual distribution for the close price.
model_graphs(predicted = train_predicted['Close'], actual = train_actual['Close'], title = 'Training')

In [None]:
# Load the CryptoPanic CSV that is used as the test set below.
test_data = pd.read_csv('/kaggle/input/another-sentiment-bitcoin/jan_cryptopanic.csv')
test_data

In [None]:
# Convert the `published_at` strings to datetime objects.
# Only strings are handed to `parse`: it raises TypeError for values that are
# already datetimes, which made this cell fail when it was run a second time.
temp = [parse(x) if isinstance(x, str) else x for x in test_data.published_at]
test_data['published_at'] = temp
test_data

In [None]:
# Build the grouping key from the frame's own `published_at` column instead of
# the scratch list `temp`, which other cells overwrite.
# The values are converted to naive UTC and truncated to midnight so that all
# rows of one calendar day share a key that can match the date-indexed price
# table. NOTE(review): assumes `published_at` may carry a time of day and/or a
# UTC offset; for date-only values this is a no-op. Confirm against the CSV.
test_data['Date'] = (
    pd.to_datetime(test_data['published_at'], utc = True)
    .dt.tz_localize(None)
    .dt.normalize()
)
# Mean sentiment per day.
mean_data = test_data[['Date','sentiment_value']].groupby(['Date']).mean()

In [None]:
# Daily Bitcoin prices for 2021-01-07 .. 2021-02-03 as [date string, price]
# pairs, newest first.
bitcoinprice2021 = [
    ['2021-02-03', 37646.8],
    ['2021-02-02', 35485.2],
    ['2021-02-01', 33515.7],
    ['2021-01-31', 33108.1],
    ['2021-01-30', 34283.1],
    ['2021-01-29', 34301.8],
    ['2021-01-28', 33374.8],
    ['2021-01-27', 30404.0],
    ['2021-01-26', 32502.1],
    ['2021-01-25', 32252.3],
    ['2021-01-24', 32241.3],
    ['2021-01-23', 32088.9],
    ['2021-01-22', 33000.5],
    ['2021-01-21', 30842.1],
    ['2021-01-20', 35476.3],
    ['2021-01-19', 36002.9],
    ['2021-01-18', 36613.2],
    ['2021-01-17', 35839.6],
    ['2021-01-16', 36019.5],
    ['2021-01-15', 36845.8],
    ['2021-01-14', 39175.7],
    ['2021-01-13', 37382.2],
    ['2021-01-12', 34076.1],
    ['2021-01-11', 35544.3],
    ['2021-01-10', 38192.2],
    ['2021-01-09', 40151.9],
    ['2021-01-08', 40599.3],
    ['2021-01-07', 39460.2],
]
# Parse the date strings and tabulate the pairs as Date / Close columns.
temp = [[parse(day), price] for day, price in bitcoinprice2021]
df_2021 = pd.DataFrame(temp, columns=['Date','Close'])

In [None]:
# Index the 2021 prices by date so they can be aligned with other date-indexed frames.
df_2021.index = df_2021['Date']

In [None]:
# Attach the close price to the sentiment means. pandas aligns the two on their
# indexes, so a date without a matching price gets NaN.
mean_data['Close'] = df_2021['Close']
mean_data

In [None]:
# Drop days that lack a close price or a sentiment score, as was done for the
# training frame; a NaN inside a lag window would turn the forecast into NaN.
test_features = mean_data[['Close','sentiment_value']].dropna()
test_predicted = predict(data = test_features, fitted_model = model_fit, lag_order = num_lag, predict_steps = 1)
# Keep only the days that received a forecast, mirroring how `train_actual`
# skips the rows consumed as the first lag window.
test_actual = test_features.loc[test_predicted.index]

In [None]:
# Plot the test-set forecasts against the observed close price, plus the residual distribution.
model_graphs(predicted = test_predicted['Close'], actual = test_actual['Close'], title = 'Test')