In [3]:
import torch
import pandas as pd
import numpy as np
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.metrics import QuantileLoss
from sklearn.preprocessing import MinMaxScaler

In [10]:
BTC_Dataset = pd.read_csv('datasets/BTC-USD.csv')
Gold_Dataset = pd.read_csv('datasets/Gold Price.csv')
Google_Trends_Dataset = pd.read_csv('datasets/google_trends.csv')
Interest_Rate_Dataset = pd.read_csv('datasets/Interest_rate.csv')
VIX_Dataset = pd.read_csv('datasets/vix_historical.csv')

In [7]:
BTC_Dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-08-28,10203.426758,10279.366211,9716.65625,9754.422852,9754.422852,17603790323
1,2019-08-29,9756.786133,9756.786133,9421.629883,9510.200195,9510.200195,17045878501
2,2019-08-30,9514.844727,9656.124023,9428.302734,9598.173828,9598.173828,13595263986
3,2019-08-31,9597.539063,9673.220703,9531.799805,9630.664063,9630.664063,11454806419
4,2019-09-01,9630.592773,9796.755859,9582.944336,9757.970703,9757.970703,11445355859


preprocessing steps
    Fill NaN values
    Drop unwanted Columns
    Convert Data types
    Fill missing dates and value with prev values (if necessary)
    set same date range (if necessary)
    change the date format (if necessary)
    remove dates after 2024-08-28 (if necessary)
    Reverse the order (if necessary)

In [11]:
BTC_Dataset = BTC_Dataset.drop(['Open', 'High', 'Low', 'Adj Close'], axis=1)

In [15]:
BTC_Dataset.tail()

Unnamed: 0,Date,Close,Volume
1823,2024-08-24,64178.992188,21430585163
1824,2024-08-25,64333.542969,18827683555
1825,2024-08-26,62880.660156,27682040631
1826,2024-08-27,59504.132813,39103882198
1827,2024-08-28,59621.035156,42965946368


In [24]:
BTC_Dataset.head()

Unnamed: 0,Date,Close,Volume
0,2019-08-28,9754.422852,17603790323
1,2019-08-29,9510.200195,17045878501
2,2019-08-30,9598.173828,13595263986
3,2019-08-31,9630.664063,11454806419
4,2019-09-01,9757.970703,11445355859


In [14]:
total_nan = BTC_Dataset.isna().sum().sum()
print("Total number of NaN values in DataFrame:", total_nan)

Total number of NaN values in DataFrame: 0


In [18]:
print(BTC_Dataset.dtypes)

Date      datetime64[ns]
Close            float64
Volume             int64
dtype: object


In [17]:
# convert date data type
BTC_Dataset['Date'] = pd.to_datetime(BTC_Dataset['Date'])

In [22]:
BTC_Dataset = BTC_Dataset.drop(BTC_Dataset.index[-1])

In [23]:
BTC_Dataset.tail()

Unnamed: 0,Date,Close,Volume
1822,2024-08-23,64094.355469,42530509233
1823,2024-08-24,64178.992188,21430585163
1824,2024-08-25,64333.542969,18827683555
1825,2024-08-26,62880.660156,27682040631
1826,2024-08-27,59504.132813,39103882198


GOLD Dataset

In [25]:
Gold_Dataset.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,11/01/2024,2761.95,2756.6,2767.4,2754.5,,0.46%
1,10/31/2024,2749.3,2799.1,2801.2,2741.8,242.57K,-1.84%
2,10/30/2024,2800.8,2786.9,2801.8,2782.4,191.55K,1.16%
3,10/29/2024,2768.8,2741.9,2773.2,2741.9,0.85K,0.92%
4,10/28/2024,2743.6,2739.4,2745.6,2725.8,0.57K,0.05%


In [27]:
Gold_Dataset = Gold_Dataset.drop(['Open', 'High', 'Low', 'Vol.', 'Change %'], axis=1)
Gold_Dataset.head()

KeyError: "['Open', 'High', 'Low', 'Vol.', 'Change %'] not found in axis"

In [28]:
Gold_Dataset.head()

Unnamed: 0,Date,Price
0,11/01/2024,2761.95
1,10/31/2024,2749.3
2,10/30/2024,2800.8
3,10/29/2024,2768.8
4,10/28/2024,2743.6


In [29]:
# reverse the order
Gold_Dataset = Gold_Dataset.iloc[::-1].reset_index(drop=True)
Gold_Dataset.head()

Unnamed: 0,Date,Price
0,08/28/2019,1542.8
1,08/29/2019,1533.7
2,08/30/2019,1526.2
3,09/02/2019,1555.9
4,09/03/2019,1552.8


In [30]:
# format date type
Gold_Dataset['Date'] = pd.to_datetime(Gold_Dataset['Date']).dt.strftime('%Y-%m-%d')
Gold_Dataset.head()

Unnamed: 0,Date,Price
0,2019-08-28,1542.8
1,2019-08-29,1533.7
2,2019-08-30,1526.2
3,2019-09-02,1555.9
4,2019-09-03,1552.8


In [31]:
print(Gold_Dataset.dtypes)

Date     object
Price    object
dtype: object


In [32]:
# convert data types
Gold_Dataset['Date'] = pd.to_datetime(Gold_Dataset['Date'])
Gold_Dataset['Price'] = Gold_Dataset['Price'].str.replace(',', '').astype(float)

In [33]:
print(Gold_Dataset.dtypes)

Date     datetime64[ns]
Price           float64
dtype: object


In [34]:
Gold_Dataset.tail()

Unnamed: 0,Date,Price
1333,2024-10-28,2743.6
1334,2024-10-29,2768.8
1335,2024-10-30,2800.8
1336,2024-10-31,2749.3
1337,2024-11-01,2761.95


In [35]:
# check for NaN values
total_nan = Gold_Dataset.isna().sum().sum()
print("Total number of NaN values in DataFrame:", total_nan)

Total number of NaN values in DataFrame: 0


In [36]:
# remove unwanted dates
start_date = '2024-08-28'
end_date = '2024-11-01'

# Filter out rows within the specified date range
Gold_Dataset = Gold_Dataset[~((Gold_Dataset['Date'] >= start_date) & (Gold_Dataset['Date'] <= end_date))]

In [37]:
Gold_Dataset.tail()

Unnamed: 0,Date,Price
1286,2024-08-21,2524.1
1287,2024-08-22,2493.5
1288,2024-08-23,2522.6
1289,2024-08-26,2531.4
1290,2024-08-27,2529.3


In [39]:
Gold_Dataset.set_index('Date', inplace=True)

# Create a complete date range from the minimum to maximum date in the dataset
all_dates = pd.date_range(start=Gold_Dataset.index.min(), end=Gold_Dataset.index.max())

# Reindex the DataFrame to include all dates, filling missing dates with NaN
df = Gold_Dataset.reindex(all_dates)

# Forward-fill missing 'Price' values without using 'inplace'
Gold_Dataset['Price'] = Gold_Dataset['Price'].ffill()

# Reset the index to make 'Date' a column again
Gold_Dataset.reset_index(inplace=True)
Gold_Dataset.rename(columns={'index': 'Date'}, inplace=True)

           Date   Price
1822 2024-08-23  2522.6
1823 2024-08-24  2522.6
1824 2024-08-25  2522.6
1825 2024-08-26  2531.4
1826 2024-08-27  2529.3


In [40]:
Gold_Dataset.tail()

Unnamed: 0,Date,Price
1822,2024-08-23,2522.6
1823,2024-08-24,2522.6
1824,2024-08-25,2522.6
1825,2024-08-26,2531.4
1826,2024-08-27,2529.3


google trends

In [43]:
Google_Trends_Dataset.tail()

Unnamed: 0,Category: All categories
2024-07-28,20
2024-08-04,29
2024-08-11,19
2024-08-18,18
2024-08-25,19


In [46]:
Google_Trends_Dataset = pd.read_csv('datasets/google_trends.csv')

In [47]:
Google_Trends_Dataset.tail()

Unnamed: 0,Category: All categories
2024-07-28,20
2024-08-04,29
2024-08-11,19
2024-08-18,18
2024-08-25,19


In [49]:
Google_Trends_Dataset = Google_Trends_Dataset.rename(columns={'Category: All categories': 'Value', '': 'Date'})

In [51]:
Google_Trends_Dataset = Google_Trends_Dataset.drop(0)

KeyError: '[0] not found in axis'

Interest Rate

In [61]:
Interest_Rate_Dataset.tail()

Unnamed: 0,Date,Interest Rate
1822,2024-08-23,5.33
1823,2024-08-24,5.33
1824,2024-08-25,5.33
1825,2024-08-26,5.33
1826,2024-08-27,5.33


In [53]:
Interest_Rate_Dataset = Interest_Rate_Dataset.rename(columns={'DFF': 'Interest Rate'})

In [57]:
Interest_Rate_Dataset = Interest_Rate_Dataset.rename(columns={'DATE': 'Date'})

In [60]:
Interest_Rate_Dataset.dtypes

Date             datetime64[ns]
Interest Rate           float64
dtype: object

In [59]:
Interest_Rate_Dataset['Date'] = pd.to_datetime(Interest_Rate_Dataset['Date'])

VIX Data

In [63]:
VIX_Dataset.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1298,2024-08-19,15.94,16.07,14.46,14.65,14.65,0.0
1299,2024-08-20,14.89,15.93,14.78,15.88,15.88,0.0
1300,2024-08-21,16.25,17.17,15.92,16.27,16.27,0.0
1301,2024-08-22,16.27,18.059999,15.76,17.549999,17.549999,0.0
1302,2024-08-23,17.120001,17.209999,15.61,15.86,15.86,0.0


In [64]:
# drop columns
VIX_Dataset = VIX_Dataset.drop(['Open', 'High', 'Low', 'Adj Close', 'Volume'], axis=1)
VIX_Dataset.head()

Unnamed: 0,Date,Close
0,2019-08-28,19.35
1,2019-08-29,17.879999
2,2019-08-30,18.98
3,2019-09-02,
4,2019-09-03,19.66


In [74]:
# handle NaN values
total_nan = VIX_Dataset.isna().sum().sum()
print("Total number of NaN values in DataFrame:", total_nan)

Total number of NaN values in DataFrame: 0


In [73]:
VIX_Dataset.tail()

Unnamed: 0,Date,Close
1298,2024-08-19,14.65
1299,2024-08-20,15.88
1300,2024-08-21,16.27
1301,2024-08-22,17.549999
1302,2024-08-23,15.86


In [72]:
VIX_Dataset['Close'] = VIX_Dataset['Close'].ffill()

In [75]:
VIX_Dataset.dtypes

Date     datetime64[ns]
Close           float64
dtype: object

In [76]:
VIX_Dataset.set_index('Date', inplace=True)

# Create a complete date range from the minimum to maximum date in the dataset
all_dates = pd.date_range(start=VIX_Dataset.index.min(), end=VIX_Dataset.index.max())

# Reindex the DataFrame to include all dates, filling missing dates with NaN
VIX_Dataset = VIX_Dataset.reindex(all_dates)

# Forward-fill missing 'Price' values without using 'inplace'
VIX_Dataset['Close'] = VIX_Dataset['Close'].ffill()

# Reset the index to make 'Date' a column again
VIX_Dataset.reset_index(inplace=True)
VIX_Dataset.rename(columns={'index': 'Date'}, inplace=True)

In [80]:
VIX_Dataset.tail()

Unnamed: 0,Date,Close
1818,2024-08-19,14.65
1819,2024-08-20,15.88
1820,2024-08-21,16.27
1821,2024-08-22,17.549999
1822,2024-08-23,15.86


remove unwanted dates from all data frames

In [92]:
VIX_Dataset.head()

Unnamed: 0,Date,Interest Rate
0,2019-08-28,2.12
1,2019-08-29,2.12
2,2019-08-30,2.13
3,2019-08-31,2.13
4,2019-09-01,2.13


In [95]:
VIX_Dataset.tail()

Unnamed: 0,Date,Close
1818,2024-08-19,14.65
1819,2024-08-20,15.88
1820,2024-08-21,16.27
1821,2024-08-22,17.549999
1822,2024-08-23,15.86


In [93]:
start_date = '2024-08-24'
end_date = '2024-08-27'

# Filter out rows within the specified date range
Interest_Rate_Dataset = Interest_Rate_Dataset[~((Interest_Rate_Dataset['Date'] >= start_date) & (Interest_Rate_Dataset['Date'] <= end_date))]

In [96]:
BTC_Dataset.describe()

Unnamed: 0,Date,Close,Volume
count,1823,1823.0,1823.0
mean,2022-02-24 00:00:00,31513.505546,31186720000.0
min,2019-08-28 00:00:00,4970.788086,5331173000.0
25%,2020-11-25 12:00:00,16493.061524,19323220000.0
50%,2022-02-24 00:00:00,28701.779297,27868910000.0
75%,2023-05-25 12:00:00,44328.509765,38341090000.0
max,2024-08-23 00:00:00,73083.5,350967900000.0
std,,18481.856783,17932280000.0


In [97]:
Gold_Dataset.describe()

Unnamed: 0,Date,Price
count,1823,1823.0
mean,2022-02-24 00:00:00,1868.151728
min,2019-08-28 00:00:00,1457.2
25%,2020-11-25 12:00:00,1753.0
50%,2022-02-24 00:00:00,1843.6
75%,2023-05-25 12:00:00,1959.55
max,2024-08-23 00:00:00,2527.4
std,,213.61066


In [98]:
Interest_Rate_Dataset.describe()

Unnamed: 0,Date,Interest Rate
count,1823,1823.0
mean,2022-02-24 00:00:00,2.246912
min,2019-08-28 00:00:00,0.04
25%,2020-11-25 12:00:00,0.09
50%,2022-02-24 00:00:00,1.55
75%,2023-05-25 12:00:00,5.08
max,2024-08-23 00:00:00,5.33
std,,2.235756


In [99]:
VIX_Dataset.describe()

Unnamed: 0,Date,Close
count,1823,1823.0
mean,2022-02-24 00:00:00,21.065222
min,2019-08-28 00:00:00,11.54
25%,2020-11-25 12:00:00,15.145
50%,2022-02-24 00:00:00,19.190001
75%,2023-05-25 12:00:00,24.775001
max,2024-08-23 00:00:00,82.690002
std,,8.257763


In [100]:
Final_DataFrame = BTC_Dataset.merge(Gold_Dataset, on='Date', how='outer')\
                .merge(Interest_Rate_Dataset, on='Date', how='outer')\
                .merge(VIX_Dataset, on='Date', how='outer')

In [104]:
Final_DataFrame.tail()

Unnamed: 0,Date,BTC_Close,Volume,Price,Interest Rate,VIX_Close
1818,2024-08-19,59493.453125,25911207712,2518.1,5.33,14.65
1819,2024-08-20,59012.792969,31613400008,2527.4,5.33,15.88
1820,2024-08-21,61175.191406,32731154072,2524.1,5.33,16.27
1821,2024-08-22,60381.914063,27625734377,2493.5,5.33,17.549999
1822,2024-08-23,64094.355469,42530509233,2522.6,5.33,15.86


In [102]:
Final_DataFrame = Final_DataFrame.rename(columns={'Close_y': 'VIX_Close', 'Close_x': 'BTC_Close'})

In [105]:
Final_DataFrame = Final_DataFrame.rename(columns={'Volume': 'BTC_Volume', 'BTC_Close': 'BTC_Price', 'Price': 'Gold_Price'})

In [107]:
Final_DataFrame

Unnamed: 0,Date,BTC_Price,BTC_Volume,Gold_Price,Interest Rate,VIX_Close
0,2019-08-28,9754.422852,17603790323,1542.8,2.12,19.350000
1,2019-08-29,9510.200195,17045878501,1533.7,2.12,17.879999
2,2019-08-30,9598.173828,13595263986,1526.2,2.13,18.980000
3,2019-08-31,9630.664063,11454806419,1526.2,2.13,18.980000
4,2019-09-01,9757.970703,11445355859,1526.2,2.13,18.980000
...,...,...,...,...,...,...
1818,2024-08-19,59493.453125,25911207712,2518.1,5.33,14.650000
1819,2024-08-20,59012.792969,31613400008,2527.4,5.33,15.880000
1820,2024-08-21,61175.191406,32731154072,2524.1,5.33,16.270000
1821,2024-08-22,60381.914063,27625734377,2493.5,5.33,17.549999


In [108]:
# Scale features (0,1) using MinMaxScaler for faster convergence
scaler = MinMaxScaler()
Final_DataFrame[Final_DataFrame.columns] = scaler.fit_transform(Final_DataFrame[Final_DataFrame.columns])

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>)

In [109]:
Final_DataFrame.dtypes

Date             datetime64[ns]
BTC_Price               float64
BTC_Volume                int64
Gold_Price              float64
Interest Rate           float64
VIX_Close               float64
dtype: object

In [110]:
Final_DataFrame['BTC_Volume'] = Final_DataFrame['BTC_Volume'].str.replace(',', '').astype(float)

AttributeError: Can only use .str accessor with string values!

In [111]:
# Separate the DateTime columns and numeric columns
date_columns = Final_DataFrame.select_dtypes(include=['datetime64']).columns
numeric_columns = Final_DataFrame.select_dtypes(include=['float64', 'int64']).columns

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Scale only the numeric columns
Final_DataFrame[numeric_columns] = scaler.fit_transform(Final_DataFrame[numeric_columns])

In [114]:
Final_DataFrame

Unnamed: 0,Date,BTC_Price,BTC_Volume,Gold_Price,Interest Rate,VIX_Close
0,2019-08-28,0.070231,0.035507,0.079985,0.393195,0.109768
1,2019-08-29,0.066646,0.033893,0.071482,0.393195,0.089108
2,2019-08-30,0.067937,0.023910,0.064474,0.395085,0.104568
3,2019-08-31,0.068414,0.017717,0.064474,0.395085,0.104568
4,2019-09-01,0.070283,0.017690,0.064474,0.395085,0.104568
...,...,...,...,...,...,...
1818,2024-08-19,0.800477,0.059542,0.991310,1.000000,0.043710
1819,2024-08-20,0.793420,0.076040,1.000000,1.000000,0.060998
1820,2024-08-21,0.825168,0.079274,0.996916,1.000000,0.066479
1821,2024-08-22,0.813521,0.064503,0.968324,1.000000,0.084469
