In [170]:
import numpy as np
import pandas as pd
import re
import string
import tensorflow as tf

# for model 
import nltk
nltk.download('stopwords')
from textblob import Word
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import StringLookup

[nltk_data] Downloading package stopwords to /Users/sarah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
# read in good news data
df = pd.read_csv("articles.csv")
df.head()

Unnamed: 0,title,content
0,Butler University Creates 2-Year Debt-Free Col...,Butler University of Indianapolis has created ...
1,"Since Pandemic Closed His Business, New Jersey...",The owner of a New Jersey frame shop has been ...
2,PayPal Commits Over $500 Million to Support Mi...,PayPal yesterday announced a $530 million comm...
3,"9-Year-Old and Friends Have Raised $100,000 fo...",Some unlikely heroes in Minneapolis have raise...
4,Hanes is Equipping America’s Homeless With 1 M...,Hanes basic apparel is not only encouraging Am...


# **MODEL 1**

## Read Data

In [225]:
# load financial sentiment analysis data
# https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis
# data provides a column for financial sentences 
# and a column for sentiment ('positive', 'negative', or 'neutral')
fin_data = pd.read_csv("fin_data.csv")
fin_data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


## Pre-processing

### Label Encoder
We want to transform the Sentiment column into integers using a label encoder.

In [226]:
le = LabelEncoder()
fin_data.loc[:, "Sentiment"] = le.fit_transform(fin_data["Sentiment"])
fin_data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,2
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",0
2,"For the last quarter of 2010 , Componenta 's n...",2
3,According to the Finnish-Russian Chamber of Co...,1
4,The Swedish buyout firm has sold its remaining...,1


In [227]:
le.classes_ # 0 = negative, 1 = neautral, 2 = positive

array(['negative', 'neutral', 'positive'], dtype=object)

### Standardization
We create a `standardization()` function to clean the text for our data later on. The `standardization()` function outputs text in lower case, removes punctuation, digits, and stop words, and lemmatizes the sentences.

In [230]:
def standardization(df, col_name):
    stop_words = stopwords.words('english')  
    # transforms text to lowercase letters
    df[col_name] = df[col_name].str.lower()
    # remove digits 
    df[col_name] = df[col_name].apply(lambda x: ''.join(d for d in x if not d.isdigit()))
    # remove punctuation
    df[col_name] = df[col_name].apply(lambda x: ''.join(x for x in x if x not in string.punctuation))
    # removes stop words for each word in Sentence column
    df[col_name] = df[col_name].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    # lemmatization: reduce inflected words to root form for each word in Sentence column
    df[col_name] = df[col_name].apply(lambda x: ' '.join([Word(x).lemmatize() for x in x.split()]))
    # remove empty strings and strings with whitespace
    df = df[df[col_name].str.strip().astype(bool)]
    return df.dropna(ignore_index=True)

In [231]:
fin_data = standardization(fin_data, "Sentence")
fin_data.head()

Unnamed: 0,Sentence,Sentiment
0,geosolutions technology leverage benefon gps s...,2
1,esi low bk real possibility,0
2,last quarter componenta net sale doubled eurm ...,2
3,according finnishrussian chamber commerce majo...,1
4,swedish buyout firm sold remaining percent sta...,1


### Splitting Data 
Create a dataset with predictor data (Sentence) and target data (Sentiment) and then split into training, validation, and testing sets.

In [232]:
data = tf.data.Dataset.from_tensor_slices((fin_data["Sentence"].astype(str).values, fin_data["Sentiment"].astype(int).values))
data = data.shuffle(buffer_size = len(data))

train_size = int(0.7*len(data))
val_size   = int(0.1*len(data))

train = data.take(train_size)
val   = data.skip(train_size).take(val_size)
test  = data.skip(train_size + val_size)

len(train), len(val), len(test)

(4088, 584, 1169)

### Vectorization
We will now convert our text data into a vector where the frequency of the word defines the integer representation of the word. 

In [233]:
# only the top distinct words will be tracked
max_tokens = 2500

# each headline will be a vector of length 25
# avg words in the Sentence column is 80
sequence_length = 150

vectorize_layer = TextVectorization(
    max_tokens = max_tokens, # only consider this many words
    output_mode = 'int',
    output_sequence_length = sequence_length)

In order for the vectorization to know the most frequent words, it needs to undergo an adaptation process.

In [235]:
def vectorize_sentence(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), [label]

train_vec = train.map(vectorize_sentence)
val_vec   = val.map(vectorize_sentence)
test_vec  = test.map(vectorize_sentence)

Vectorize each data set using a `vectorize_sentence()` function that will accept two arguments (representing sentence and sentiment) and return two variables.

In [235]:
def vectorize_sentence(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), [label]

train_vec = train.map(vectorize_sentence)
val_vec   = val.map(vectorize_sentence)
test_vec  = test.map(vectorize_sentence)

In [243]:
model1 = tf.keras.Sequential([
    layers.Embedding(max_tokens, output_dim =120, name="embedding"),
    layers.SpatialDropout1D(0.4),
    layers.LSTM(704, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(352, activation='LeakyReLU'),
    layers.Dense(3, activation='softmax')
])

model2 = tf.keras.Sequential([
  layers.Embedding(max_tokens, output_dim = 3, name="embedding"),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(3)]
)

In [244]:
model1.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model1.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 120)         300000    
                                                                 
 spatial_dropout1d_2 (Spati  (None, None, 120)         0         
 alDropout1D)                                                    
                                                                 
 lstm_2 (LSTM)               (None, 704)               2323200   
                                                                 
 dense_4 (Dense)             (None, 352)               248160    
                                                                 
 dense_5 (Dense)             (None, 3)                 1059      
                                                                 
Total params: 2872419 (10.96 MB)
Trainable params: 2872419 (10.96 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [245]:
model2.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 3)           7500      
                                                                 
 dropout (Dropout)           (None, None, 3)           0         
                                                                 
 global_average_pooling1d (  (None, 3)                 0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_1 (Dropout)         (None, 3)                 0         
                                                                 
 dense_6 (Dense)             (None, 3)                 12        
                                                                 
Total params: 7512 (29.34 KB)
Trainable params: 7512 (29.34 KB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [246]:
# train model
# getting error
# history = model1.fit(train_vec, epochs = 20, validation_data = val_vec)

In [247]:
history = model2.fit(train_vec, epochs = 20, validation_data = val_vec)

Epoch 1/20


ValueError: in user code:

    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/engine/training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/engine/training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/losses.py", line 2221, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/sarah/anaconda3/envs/PIC16B-2/lib/python3.11/site-packages/keras/src/backend.py", line 5575, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (1, 1) and (None, 3) are incompatible


In [None]:
model.evaluate(X_test,y_test)