In [74]:
import numpy as np
import pandas as pd
import re
import string
import tensorflow as tf

# for model 
import nltk
nltk.download('stopwords')
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split 

[nltk_data] Downloading package stopwords to /Users/sarah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# read in good news data
df = pd.read_csv("articles.csv")
df.head()

Unnamed: 0,title,content
0,Butler University Creates 2-Year Debt-Free Col...,Butler University of Indianapolis has created ...
1,"Since Pandemic Closed His Business, New Jersey...",The owner of a New Jersey frame shop has been ...
2,PayPal Commits Over $500 Million to Support Mi...,PayPal yesterday announced a $530 million comm...
3,"9-Year-Old and Friends Have Raised $100,000 fo...",Some unlikely heroes in Minneapolis have raise...
4,Hanes is Equipping America’s Homeless With 1 M...,Hanes basic apparel is not only encouraging Am...


In [75]:
# clean up the content from the good news articles
def clean_news(input_data):
    """
    This function transforms the text to all lowercase letters and removes special punctuation
    input: text data
    output: standardized string
    """
    lowercase = tf.strings.lower(input_data)
    no_punctuation = tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation),'')
    return no_punctuation.numpy().decode('utf-8')

In [42]:
# standardize the content 
df['content'] = df['content'].apply(clean_news)
df.head()

Unnamed: 0,title,content
0,Butler University Creates 2-Year Debt-Free Col...,butler university of indianapolis has created ...
1,"Since Pandemic Closed His Business, New Jersey...",the owner of a new jersey frame shop has been ...
2,PayPal Commits Over $500 Million to Support Mi...,paypal yesterday announced a 530 million commi...
3,"9-Year-Old and Friends Have Raised $100,000 fo...",some unlikely heroes in minneapolis have raise...
4,Hanes is Equipping America’s Homeless With 1 M...,hanes basic apparel is not only encouraging am...


In [44]:
df.shape

(2487, 2)

In [48]:
# ** MODEL 1 **

In [87]:
# load financial sentiment analysis data
# https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis
# data provides a column for financial sentences 
# and a column for sentiment ('positive', 'negative', or 'neutral')
fin_data = pd.read_csv("fin_data.csv")
fin_data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [77]:
fin_data.shape

(5842, 2)

In [51]:
# pre-processing

In [78]:
def standardization(df, stop_words):
    # transforms text to lowercase letters
    df['Sentence'] = df['Sentence'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    # remove digits 
    df['Sentence'] = df['Sentence'].str.replace('\d', '')
    # removes stop words for each word in Sentence column
    df['Sentence'] = df['Sentence'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    # lemmatization: reduce inflected words to root form for each word in Sentence column
    df['Sentence'] = df['Sentence'].apply(lambda x: ' '.join([Word(x).lemmatize() for x in x.split()]))
    return df

In [88]:
stop_words = stopwords.words('english')
fin_data = standardization(fin_data, stop_words)
fin_data.head()

Unnamed: 0,Sentence,Sentiment
0,geosolutions technology leverage benefon 's gp...,positive
1,"$esi lows, $1.50 $2.50 bk real possibility",negative
2,"last quarter 2010 , componenta 's net sale dou...",positive
3,"according finnish-russian chamber commerce , m...",neutral
4,swedish buyout firm sold remaining 22.4 percen...,neutral


In [69]:
# issue: not removing digits

In [90]:
# add code to split data 

In [81]:
# use tokenizer to vectorize words into list of integers
max_num = 2000 # top 2000 common words in data
tokenizer = Tokenizer(num_words = max_num, split = ' ') 
tokenizer.fit_on_texts(fin_data['Sentence'].values)
# convert Sentence column into sequence of integers
X = tokenizer.texts_to_sequences(fin_data['Sentence'].values)
# ensures equal length
X = pad_sequences(X)

In [83]:
model1 = tf.keras.Sequential([
    layers.Embedding(max_num, output_dim = 120),
    layers.SpatialDropout1D(0.4),
    layers.LSTM(704, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(352, activation='LeakyReLU'),
    layers.Dense(3, activation='softmax')
])

In [86]:
model1.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 120)         240000    
                                                                 
 spatial_dropout1d_1 (Spati  (None, None, 120)         0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 704)               2323200   
                                                                 
 dense_2 (Dense)             (None, 352)               248160    
                                                                 
 dense_3 (Dense)             (None, 3)                 1059      
                                                                 
Total params: 2812419 (10.73 MB)
Trainable params: 2812419 (10.73 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [None]:
# train model
model.fit(X_train, y_train, epochs = 20, batch_size=32, verbose =1)

In [None]:
model.evaluate(X_test,y_test)