## Jacob Roach

In [196]:
# Import the needed Packages.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, Input
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
from tensorflow import keras

## Data Collection and Feature Engineering
Before any modeling was performed, the necessary data was collected using two distinct platforms. The first data that was collected was Twitter data. This was done using the Twitter Developer API, as well as the `tweepy` module. Tweets containing the word "bitcoin" were streamed for several days. This data was written to a `.pkl` file, and saved for later feature engineering.

The other data that was collected was the value of a single Bitcoin. During the same interval (plus twenty-four hours after the last Tweet was recorded) that the Twitter data was collected, the value of a Bitcoin was recorded each minute, along with the corresponding time stamp.

Once the Twitter and Bitcoin data was recorded, further feature engineering was employed. For each Tweet stored, the corresponding price of Bitcoin at the time the Tweet was made was added as the `inital_price` for the Tweet. Then, for each Tweet, if the price of Bitcoin increased within three hours of the time the Tweet was made, the feature `increase` was assigned a value of `1`. Otherwise, `increase` is assigned the value of `0`.

Finally, for each Tweet recorded, the text of that Tweet is cleaned and standardized. This cleaned Tweet is then BERTified, and a vector of length 384 is returned. This vector is stored as the `embedding` feature.

In [183]:
# Read in the training data.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
data = pd.read_pickle("../data/training_data.pkl")

# Reset the index, convert each embedding to an array.
data = data.reset_index(drop=True)
data["embedding"] = data["embedding"].apply(np.asarray)

# Create a new train-test split (for aggregation).
# The split is made on whole time stamps, so every Tweet recorded at a given
# time stamp lands on the same side of the split.
stamps = np.unique(data.time)
data = data.set_index(["time"])

# Sample the held-out stamps WITHOUT replacement. Sampling with replacement
# (the np.random.choice default) can pick the same stamp more than once, which
# makes `data.loc[test_stamps, :]` return those rows several times and leaves
# fewer distinct stamps in the test set than requested.
# A fixed seed keeps the split identical across kernel restarts.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
test_stamps = split_rng.choice(stamps, size=int(stamps.shape[0] * .20), replace=False)

# Copy both partitions so later column assignments operate on independent
# frames rather than on slices of `data`.
test_data = data.loc[test_stamps, :].copy()
train_data = data.loc[~data.index.isin(test_stamps), :].copy()

With the training data read in, a quick inspection shows the size of the dataset and how the records are split between the two classes.

In [184]:
# Report the overall size of the DataFrame and how many rows carry each label.
n_rows = len(data)
n_increase = len(data[data["increase"] == 1])
n_decrease = len(data[data["increase"] == 0])

print("There are", n_rows, "rows in the DataFrame.")
print("There are", n_increase, "records with an increase, and",
      n_decrease, "with a decrease.\n")

There are 336030 rows in the DataFrame.
There are 174261 records with an increase, and 161769 with a decrease.



In [185]:
# Build the model inputs and labels for both partitions.
# Each column is turned into a Python list first and then converted to a
# Tensor: `embedding` supplies the features, `increase` supplies the labels.
x_train = tf.convert_to_tensor(train_data["embedding"].to_list())
y_train = tf.convert_to_tensor(train_data["increase"].to_list())

x_test = tf.convert_to_tensor(test_data["embedding"].to_list())
y_test = tf.convert_to_tensor(test_data["increase"].to_list())

In [186]:
# Define the classifier: a 384-wide input, one 128-unit ReLU hidden layer, and
# a 2-unit softmax output (one unit per class).
# Note that, despite its name, `rnn_model` contains only Dense layers.
input_layer = Input(shape=(384,))
dense = Dense(units=128, activation="relu")(input_layer)
output = Dense(units=2, activation="softmax")(dense)
rnn_model = Model(inputs=input_layer, outputs=output)

# Integer class labels + softmax probabilities -> sparse categorical cross-entropy.
rnn_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# TODO: MAKE SURE TO CHANGE THIS TO 25 EPOCHS.
# The held-out partition is passed as validation data for per-epoch metrics.
rnn_model.fit(
    x=x_train,
    y=y_train,
    epochs=5,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x174429220>

In [219]:
# Try a new model: two stacked bidirectional LSTM layers that read each
# embedding as a sequence of 384 steps with one feature per step.
model = keras.Sequential()
model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(384, 1)))
model.add(layers.Bidirectional(layers.LSTM(32)))

# The labels are binary (0/1), so the head needs one unit per class, and the
# loss below expects probabilities (its default is from_logits=False), so the
# head must apply softmax. This matches the output layer of `rnn_model`.
model.add(layers.Dense(2, activation="softmax"))
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Add the trailing feature axis explicitly so the tensors match the declared
# input_shape of (384, 1) instead of depending on implicit reshaping.
model.fit(
    tf.expand_dims(x_train, axis=-1),
    y_train,
    epochs=5,
    validation_data=(tf.expand_dims(x_test, axis=-1), y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x170ad7940>

In [None]:
# Other models to try:
### - SVM
### - Naive Bayes
### - kNN
### - Random Forests

In [142]:
# Confusion Matrix.

In [187]:
# Apply to DataFrame.
# `predict` returns one row of two class scores per test Tweet.
predictions = rnn_model.predict(x_test)

# Collapse each score pair to a class label in one vectorised step:
# class 0 only when its score is strictly larger, otherwise class 1
# (so ties resolve to class 1, as in the previous row-by-row version).
predictions = np.where(predictions[:, 0] > predictions[:, 1], 0, 1)

# Rebind `test_data` to a new frame carrying the labels instead of assigning a
# column onto a frame that was sliced out of `data`, which can trigger
# pandas' SettingWithCopyWarning. Re-running this cell stays idempotent.
test_data = test_data.assign(prediction=predictions)

In [188]:
# Frame holding the held-out time stamps.
aggregated = pd.DataFrame(test_stamps, columns=["time"])

# Group the labels and predictions once by the `time` index level and reuse
# the grouping for both the row counts and the sums.
by_time = test_data[["increase", "prediction"]].groupby("time")

# Number of Tweets per time stamp.
agg_count = by_time[["increase"]].count().rename(columns={"increase": "total_count"})

# Per time stamp: sum of the true labels and number of Tweets predicted as class 1.
agg_sum = by_time.sum().rename(
    columns={"increase": "actual", "prediction": "pred_count"}
)

# Combine both summaries on the shared time index.
agg = agg_count.join(agg_sum)

# Binarise the summed label: 0 when no Tweet in the group was labelled 1, else 1.
agg["actual"] = (agg["actual"] != 0).astype(int)

# Share of Tweets in each group that were predicted as class 1.
agg["pred_perc"] = agg["pred_count"].div(agg["total_count"])

column_order = ["actual", "total_count", "pred_count", "pred_perc"]
agg = agg[column_order]
agg

Unnamed: 0_level_0,actual,total_count,pred_count,pred_perc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-02-16 09:15:00,0,219,182,0.831050
2022-02-16 09:30:00,0,213,161,0.755869
2022-02-16 09:31:00,0,193,146,0.756477
2022-02-16 09:33:00,1,166,136,0.819277
2022-02-16 09:40:00,0,202,163,0.806931
...,...,...,...,...
2022-02-17 18:40:00,0,182,95,0.521978
2022-02-17 18:43:00,0,154,95,0.616883
2022-02-17 18:54:00,0,149,75,0.503356
2022-02-17 18:55:00,0,149,89,0.597315


In [192]:
# Best cutoff.
# Average every aggregated column separately for each actual outcome, so the
# mean predicted share (`pred_perc`) of the two outcomes can be compared.
by_outcome = agg.groupby("actual")
check = by_outcome.mean()
check

Unnamed: 0_level_0,total_count,pred_count,pred_perc
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,184.4,108.567568,0.605305
1,186.978261,136.603261,0.736755


In [None]:
# New Confusion Matrix.