<a href="https://colab.research.google.com/github/stenforsm/predict-stock-news/blob/main/sequential.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Load the headline/price table for one company; change the file name here
# to work with a different company file.
df = pd.read_csv('b_news.csv')
x = df["title"].values  # the full headline text is the model input
price = df["close price change"].values

In [3]:
# Binary target: class_plus = 1 when the price change is positive,
# class_minus = 0 otherwise (a change of exactly zero lands in class 0).
y = np.where(price > 0, 1, 0)
# Sample weight is the absolute price change value.
weight = np.absolute(price)

In [4]:
# Train/test split: 80% of the rows for training, 20% held out for testing.
# The features, labels and sample weights are split together so they stay
# aligned row by row.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle deterministic, so the split (and
# every metric computed from it) is the same on each Restart & Run All.
x_train, x_test, y_train, y_test, w_train, w_test = train_test_split(
    x, y, weight, test_size=0.2, random_state=42)

In [5]:
# text vectorization

def custom_standardization(input_data):
  """Normalise raw text before it is tokenised.

  Lower-cases the input, replaces every literal ``<br />`` with a single
  space, then deletes every character listed in ``string.punctuation``.
  Returns the cleaned string tensor.
  """
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  text = tf.strings.regex_replace(text, punctuation_pattern, '')
  return text

# Vocabulary cap for the vectorizer and the fixed length of every encoded
# sequence it outputs.
max_features = 10000
sequence_length = 250

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

# Fit the vectorizer on the training headlines only (no labels involved).
vectorize_layer.adapt(x_train)

def vectorize_text(text):
  """Encode text with ``vectorize_layer``.

  A trailing axis of size one is appended to ``text`` before it is passed
  through the layer; the layer's output is returned unchanged.
  """
  column = tf.expand_dims(text, axis=-1)
  return vectorize_layer(column)

# Replace the raw headline arrays with their integer-encoded versions.
# NOTE(review): this overwrites x_train/x_test, so the cell is not safe to
# re-run without first re-running the train/test split cell.
x_train, x_test = vectorize_text(x_train), vectorize_text(x_test)

In [6]:
embedding_dim = 16

# Embedding -> average over the sequence axis -> single-unit Dense layer
# with no activation.
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(1))

model.summary()

# Per-class weights handed to fit(); this can be adjusted.
class_weight = {0: 1.0, 1: 50.0}

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 2)                 34        
Total params: 160,050
Trainable params: 160,050
Non-trainable params: 0
_________________________________________________________________


In [7]:
# The loss is configured with from_logits=True and the accuracy metric
# thresholds at 0.0, i.e. both treat the model output as a raw logit.
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)],
)

# Train for 5 epochs, passing both the per-class and the per-sample weights.
model.fit(
    x_train,
    y_train,
    epochs=5,
    sample_weight=w_train,
    class_weight=class_weight,
)

Epoch 1/5


ValueError: ignored

In [None]:
# Score the trained model on the held-out split (no sample weights passed).
model.evaluate(x=x_test, y=y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# The model ends in Dense(1), so predict() returns one logit per headline in
# an (n, 1) array. argmax over axis 1 of such an array is always 0, which
# would label every headline as class 0. Threshold the logit at 0.0 instead,
# the same cut-off used by the BinaryAccuracy metric at compile time.
predictions = model.predict(x_test)
y_predict = (np.ravel(predictions) > 0.0).astype(int)

# Confusion matrix and Matthews correlation coefficient on the test split.
print(confusion_matrix(y_test, y_predict))
print(matthews_corrcoef(y_test, y_predict))