<a href="https://colab.research.google.com/github/stenforsm/predict-stock-news/blob/main/functional.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('g_news.csv')
spo = df.spo.values
price = df["close price change"].values

In [3]:
# class_plus = 1; class_minus = 0
y = np.sign(price)
y = np.where(y==1,1,0)

In [5]:
# train split
from sklearn.model_selection import train_test_split
spo_train, spo_split, y_train, y_split = train_test_split(spo, y, train_size = 0.8, random_state = 42)
# test valid split
spo_valid, spo_test, y_valid, y_test = train_test_split(spo_split, y_split, test_size = 0.5, random_state = 42)

In [6]:
# BOW text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(binary=True)
tfidf_train = tfidf.fit_transform(spo_train).astype('float16')
tfidf_valid = tfidf.transform(spo_valid).astype('float16')
tfidf_test = tfidf.transform(spo_test).astype('float16')
n = tfidf_train.shape[1] # to determine input shape

In [7]:
# count text vectorization
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
count_train = (count.fit_transform(spo_train)).toarray()
count_valid = (count.transform(spo_valid)).toarray()
count_test = (count.transform(spo_test)).toarray()

In [8]:
from tensorflow.keras import layers

tfidf_input = tf.keras.Input(shape=(n), name='tfidf')
count_input = tf.keras.Input(shape=(n), name='count')

x = layers.concatenate([tfidf_input, count_input]) # merge all available features into a single large vector via concatenation
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(16, activation='relu')(x)
outputs = layers.Dense(10)(x)

model = tf.keras.Model(inputs=[tfidf_input, count_input], outputs=outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tfidf (InputLayer)              [(None, 7437)]       0                                            
__________________________________________________________________________________________________
count (InputLayer)              [(None, 7437)]       0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 14874)        0           tfidf[0][0]                      
                                                                 count[0][0]                      
__________________________________________________________________________________________________
dense (Dense)                   (None, 64)           952000      concatenate[0][0]            

In [9]:
from tensorflow.keras import losses

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.RMSprop(),
    metrics=["accuracy"]
)

callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)

model.fit({'tfidf': tfidf_train, 'count': count_train}, y_train, validation_data= ({'tfidf': tfidf_valid, 'count': count_valid}, y_valid), callbacks=callback, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<keras.callbacks.History at 0x7f83a895bd90>

In [10]:
model.evaluate({'tfidf': tfidf_test, 'count': count_test}, y_test)



[0.9753336310386658, 0.6899293065071106]

In [11]:
predictions = model.predict({'tfidf': tfidf_test, 'count': count_test})
y_predict = np.argmax(predictions, axis=1)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predict))

from sklearn.metrics import matthews_corrcoef
print(matthews_corrcoef(y_test, y_predict))

[[543 148]
 [203 238]]
0.33484707205914116
