In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import string
import nltk
import ssl
import re

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

td = pd.read_csv('resources/all-data.csv', encoding="iso-8859-1", header=None)
td.drop_duplicates()
td.columns = ["sentiment", "headline"]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prateekjukalkar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prateekjukalkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/prateekjukalkar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# we want 604 negatives, 800 positives, and 1000 neutrals
for s in set(td['sentiment']):
    print(s)
    print( sum(td['sentiment'] == s),  round( sum(td['sentiment'] == s) / len(td) * 100, 1), '%', '\n' )

positive
1363 28.1 % 

neutral
2879 59.4 % 

negative
604 12.5 % 



In [3]:
desired_positive_rows = 800
desired_neutral_rows = 1000

# Filter the DataFrame to include only positive rows
positive_rows = td[td['sentiment'] == 'positive']
neutral_rows = td[td['sentiment'] == 'neutral']

# If there are more than 800 positive rows, randomly select 800
if len(positive_rows) > desired_positive_rows:
    selected_positive_rows = positive_rows.sample(n=desired_positive_rows, random_state=42)  # You can choose a different random_state for reproducibility
    # Update the original DataFrame with the selected positive rows
    td = pd.concat([td[td['sentiment'] != 'positive'], selected_positive_rows])
else:
    selected_positive_rows = positive_rows
    
# If there are more than 1000 neutral rows, randomly select 1000
if len(neutral_rows) > desired_neutral_rows:
    selected_neutral_rows = neutral_rows.sample(n=desired_neutral_rows, random_state=42)  # You can choose a different random_state for reproducibility
    # Update the original DataFrame with the selected positive rows
    td = pd.concat([td[td['sentiment'] != 'neutral'], selected_neutral_rows])
else:
    selected_neutral_rows = neutral_rows

In [4]:
# make all text lowercase
td['headline'] = td['headline'].str.lower()

# Create a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)
td['headline'] = td['headline'].str.translate(translator)

# Remove special characters
def remove_special_characters(sentence):
    # Use regex to remove non-alphanumeric characters
    cleaned_sentence = re.sub(r'[^A-Za-z0-9 ]+', '', sentence)
    return cleaned_sentence

# Apply the function to the 'text' column
td['headline'] = td['headline'].apply(remove_special_characters)

td['headline']

2       the international electronic industry company ...
415     a tinyurl link takes users to a scamming site ...
421     compared with the ftse 100 index  which rose 3...
423     compared with the ftse 100 index  which rose 9...
500     one of the challenges in the oil production in...
                              ...                        
3175    the casing comprises a first side casing membe...
1156    according to seikku  the retail sector in finl...
3037    mreal corporation stock exchange release 27 au...
1027    at capman haavisto will be responsible for gro...
3288    the name of the newspaper publishing and print...
Name: headline, Length: 2404, dtype: object

In [5]:
# Lemmatize
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

td['headline'] = td['headline'].str.split()

# Remove stopwords
stop_words = set(nltk.corpus.stopwords.words("english"))
td['headline'] = [[word for word in tokens if word.lower() not in stop_words] for tokens in td['headline']]

td['headline'] = td['headline'].apply( lambda x: [lemmatizer.lemmatize(word) for word in x] )

# Tokenize by letter
def letter_tokenize(word):
    x = []
    for i in word:
        x.append(list(i))
    return x
    
td['headline'] = td['headline'].apply(letter_tokenize)
td['headline']

2       [[i, n, t, e, r, n, a, t, i, o, n, a, l], [e, ...
415     [[t, i, n, y, u, r, l], [l, i, n, k], [t, a, k...
421     [[c, o, m, p, a, r, e, d], [f, t, s, e], [1, 0...
423     [[c, o, m, p, a, r, e, d], [f, t, s, e], [1, 0...
500     [[o, n, e], [c, h, a, l, l, e, n, g, e], [o, i...
                              ...                        
3175    [[c, a, s, i, n, g], [c, o, m, p, r, i, s, e, ...
1156    [[a, c, c, o, r, d, i, n, g], [s, e, i, k, k, ...
3037    [[m, r, e, a, l], [c, o, r, p, o, r, a, t, i, ...
1027    [[c, a, p, m, a, n], [h, a, a, v, i, s, t, o],...
3288    [[n, a, m, e], [n, e, w, s, p, a, p, e, r], [p...
Name: headline, Length: 2404, dtype: object

In [6]:
def convert_to_numbers(td):
    return td['headline'].apply(lambda sublist: [[ord(letter) - ord("0") for letter in word] for word in sublist])

td['headline'] = convert_to_numbers(td)
td['headline']

2       [[57, 62, 68, 53, 66, 62, 49, 68, 57, 63, 62, ...
415     [[68, 57, 62, 73, 69, 66, 60], [60, 57, 62, 59...
421     [[51, 63, 61, 64, 49, 66, 53, 52], [54, 68, 67...
423     [[51, 63, 61, 64, 49, 66, 53, 52], [54, 68, 67...
500     [[63, 62, 53], [51, 56, 49, 60, 60, 53, 62, 55...
                              ...                        
3175    [[51, 49, 67, 57, 62, 55], [51, 63, 61, 64, 66...
1156    [[49, 51, 51, 63, 66, 52, 57, 62, 55], [67, 53...
3037    [[61, 66, 53, 49, 60], [51, 63, 66, 64, 63, 66...
1027    [[51, 49, 64, 61, 49, 62], [56, 49, 49, 70, 57...
3288    [[62, 49, 61, 53], [62, 53, 71, 67, 64, 49, 64...
Name: headline, Length: 2404, dtype: object

In [7]:
# flatten the inner arrays
def combine_inner_arrays(row):
    return sum(row, [])

td['headline'] = td['headline'].apply(combine_inner_arrays)

print(td['headline'])

2       [57, 62, 68, 53, 66, 62, 49, 68, 57, 63, 62, 4...
415     [68, 57, 62, 73, 69, 66, 60, 60, 57, 62, 59, 6...
421     [51, 63, 61, 64, 49, 66, 53, 52, 54, 68, 67, 5...
423     [51, 63, 61, 64, 49, 66, 53, 52, 54, 68, 67, 5...
500     [63, 62, 53, 51, 56, 49, 60, 60, 53, 62, 55, 5...
                              ...                        
3175    [51, 49, 67, 57, 62, 55, 51, 63, 61, 64, 66, 5...
1156    [49, 51, 51, 63, 66, 52, 57, 62, 55, 67, 53, 5...
3037    [61, 66, 53, 49, 60, 51, 63, 66, 64, 63, 66, 4...
1027    [51, 49, 64, 61, 49, 62, 56, 49, 49, 70, 57, 6...
3288    [62, 49, 61, 53, 62, 53, 71, 67, 64, 49, 64, 5...
Name: headline, Length: 2404, dtype: object


In [8]:
# pad the inner array of each headline so they're all the same length

# first, lets find the max length of any of the inner arrays as a baseline
max_len = td['headline'].apply(len).max()
# pad the rest of the inner arrays with 0s
td['headline'] = td['headline'].apply(lambda x: x + [0] * (max_len - len(x)))

In [9]:
# # Remove max colwidth to verify if padding was successful
# pd.set_option('display.max_colwidth', None)

# # Print the DataFrame
# print(td.to_string(index=False))

# pd.reset_option('display.max_colwidth')

In [10]:
di = {"positive":1, "negative":0, "neutral":2}
td["sentiment"].replace(di, inplace=True)

td["sentiment"]

2       0
415     0
421     0
423     0
500     0
       ..
3175    2
1156    2
3037    2
1027    2
3288    2
Name: sentiment, Length: 2404, dtype: int64

In [11]:
td = td[td["sentiment"] != 2]

X = np.vstack( td['headline'] )
y = np.vstack( td['sentiment'] )
# X = np.expand_dims(X, 2)

X.shape, y.shape, type(X), type(y)

((1404, 231), (1404, 1), numpy.ndarray, numpy.ndarray)

In [12]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y)

y

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=17)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1333, 231), (71, 231), (1333, 2), (71, 2))

In [14]:
# import tensorflow as tf

# # Define LSTM model
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(231, 231),
#     tf.keras.layers.Conv1D(128, 5, activation='relu'),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Reshape((1, 128)),
#     tf.keras.layers.LSTM(128, activation='relu'),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Dense(3, activation='softmax')
# ])

# model.summary()

In [15]:
import tensorflow as tf

model = tf.keras.Sequential([
tf.keras.layers.Embedding(231, 231),
tf.keras.layers.Conv1D(512, 5, activation='relu'),
tf.keras.layers.Conv1D(256, 10, activation='relu'),
tf.keras.layers.Conv1D(128, 20, activation='relu'),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Reshape((1, 128)),
tf.keras.layers.LSTM(128),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(2, activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 231)         53361     
                                                                 
 conv1d (Conv1D)             (None, None, 512)         591872    
                                                                 
 conv1d_1 (Conv1D)           (None, None, 256)         1310976   
                                                                 
 conv1d_2 (Conv1D)           (None, None, 128)         655488    
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 reshape (Reshape)           (None, 1, 128)            0         
                                                        

In [16]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x296ebf810>

In [17]:
model.save("finance_sentiment.keras")

In [18]:
from collections import Counter

Counter( np.argmax( model.predict(X_test), axis=1 ) )



Counter({1: 43, 0: 28})

In [19]:
Counter( list( np.argmax( y_test , axis=1) ) )

Counter({1: 40, 0: 31})

In [20]:
model.predict(X_test[0:1]), y_test[0]



(array([[9.999099e-01, 9.014498e-05]], dtype=float32),
 array([1., 0.], dtype=float32))

In [21]:
model.evaluate(X_test, y_test)



[0.636885404586792, 0.7887324094772339]

In [22]:
prediction = model.predict(X_test[0:1])

max(prediction[0]), np.argmax(prediction, axis=1)



(0.9999099, array([0]))