In [1]:
import pandas as pd
import numpy as np

# Data preprocessing

In [2]:
df = pd.read_csv("all-data.csv", header=None, encoding='ISO-8859-1')
df.head()

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [3]:
column_mapping = {0: "Sentiment", 1: "News"}
df = df.rename(columns=column_mapping)

In [4]:
df.head()

Unnamed: 0,Sentiment,News
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


# Get embeddings for news with SentenceTransformer

In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
sentences = df["News"]

In [7]:
embeddings = model.encode(sentences)

In [8]:
print(sentences[1])
print(embeddings[1].shape)

Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
(384,)


In [9]:
embedding_df = pd.DataFrame(embeddings, columns=[f'embedding_{i+1}' for i in range(embeddings.shape[1])])

In [10]:
embedding_df

Unnamed: 0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,...,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383,embedding_384
0,0.033103,-0.100423,-0.005485,-0.066260,0.018836,-0.031824,-0.035340,-0.077344,-0.009411,-0.007487,...,-0.022584,0.069428,0.011990,-0.089813,0.018476,-0.019637,-0.018754,-0.127873,0.009407,-0.022025
1,0.084485,0.030830,-0.002329,-0.056259,-0.113815,-0.121623,-0.061220,0.055847,-0.064939,0.021643,...,0.060081,0.006366,-0.040411,-0.021518,-0.055036,0.037341,-0.034406,0.047268,0.056127,-0.001998
2,0.043609,0.004894,0.004905,0.040851,-0.046114,0.062376,-0.028048,-0.046857,-0.038051,-0.068427,...,0.015348,-0.097364,0.010314,0.016644,0.062900,-0.012047,-0.049860,-0.020716,0.020335,0.038261
3,-0.067166,-0.029139,-0.037931,-0.010632,0.047564,-0.011515,-0.073900,0.040631,-0.057157,0.001864,...,0.068182,0.043284,-0.001184,0.050133,-0.001276,0.048531,-0.034414,-0.071167,0.079625,-0.017603
4,-0.021902,-0.022826,0.011935,-0.058745,-0.037734,0.007767,-0.022633,0.066558,-0.011591,0.004221,...,0.011690,-0.047741,-0.072852,-0.009161,0.004916,0.062899,0.002259,-0.126716,-0.077100,0.065531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4841,0.037514,-0.031628,0.114416,0.060084,0.015975,0.017323,-0.006961,-0.023695,-0.019650,-0.032931,...,-0.047153,-0.036253,-0.089335,0.063155,-0.114799,-0.068349,-0.031357,-0.059850,-0.199711,0.050612
4842,-0.010111,0.053529,-0.115659,-0.004243,-0.012975,-0.027198,0.014351,0.099848,0.016810,-0.004626,...,0.019655,-0.046397,-0.043783,-0.015527,-0.046047,-0.086482,-0.066512,-0.062369,-0.044434,0.001449
4843,0.044477,0.030813,0.068884,0.047986,-0.002873,-0.025707,-0.078903,0.112590,-0.039120,0.016103,...,0.064868,-0.009705,-0.028710,0.066628,-0.104506,-0.023398,-0.102364,-0.062642,-0.026184,0.003788
4844,-0.040445,-0.003198,0.065040,0.009511,0.013660,0.024086,-0.078426,0.112104,0.038359,-0.006200,...,0.021573,-0.046804,-0.040203,0.002362,-0.084330,0.068363,-0.033828,-0.067488,-0.045671,-0.021801


In [11]:
embedding_df["Sentiment"] = df["Sentiment"]

In [12]:
embedding_df.head()

Unnamed: 0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,embedding_10,...,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383,embedding_384,Sentiment
0,0.033103,-0.100423,-0.005485,-0.06626,0.018836,-0.031824,-0.03534,-0.077344,-0.009411,-0.007487,...,0.069428,0.01199,-0.089813,0.018476,-0.019637,-0.018754,-0.127873,0.009407,-0.022025,neutral
1,0.084485,0.03083,-0.002329,-0.056259,-0.113815,-0.121623,-0.06122,0.055847,-0.064939,0.021643,...,0.006366,-0.040411,-0.021518,-0.055036,0.037341,-0.034406,0.047268,0.056127,-0.001998,neutral
2,0.043609,0.004894,0.004905,0.040851,-0.046114,0.062376,-0.028048,-0.046857,-0.038051,-0.068427,...,-0.097364,0.010314,0.016644,0.0629,-0.012047,-0.04986,-0.020716,0.020335,0.038261,negative
3,-0.067166,-0.029139,-0.037931,-0.010632,0.047564,-0.011515,-0.0739,0.040631,-0.057157,0.001864,...,0.043284,-0.001184,0.050133,-0.001276,0.048531,-0.034414,-0.071167,0.079625,-0.017603,positive
4,-0.021902,-0.022826,0.011935,-0.058745,-0.037734,0.007767,-0.022633,0.066558,-0.011591,0.004221,...,-0.047741,-0.072852,-0.009161,0.004916,0.062899,0.002259,-0.126716,-0.0771,0.065531,positive


In [13]:
X = embedding_df.drop(["Sentiment"], axis = 1)

In [14]:
y = embedding_df["Sentiment"]

# Label encoding predictor variable (sentiment)

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
lbe = LabelEncoder()

In [17]:
y_encoded = lbe.fit_transform(y)

In [18]:
y_encoded[:5]

array([1, 1, 0, 2, 2])

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_recall_curve

In [20]:
sum(y_encoded == 2)

1363

In [21]:
sum(y_encoded == 1)

2879

In [22]:
sum(y_encoded == 0)

604

# Stratified train test split

In [23]:
#handlingimbalanced dataset (Ex: churn prediction, device failure, cancer prediction)
#https://www.youtube.com/watch?v=JnlM4yLFNuo
# 1. undersampling majority class
# 2. oversampling minority class by duplication
# 3. oversampling minority class by SMOTE (synthetic examples using k nearest neighbors)
# 4. ensemble (3000 in class 1 and 1000 in class 2, create three models with 1000 in each class and then 
#take majority vote of predictions made by the 3 models)
# 5. focal loss (penalize majority class during loss calculation and give more weight to minority class samples)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [24]:
y_train

132     positive
1189     neutral
798     positive
2000    positive
3488     neutral
          ...   
1840     neutral
1129    positive
1682     neutral
1016     neutral
3477     neutral
Name: Sentiment, Length: 3876, dtype: object

# Decision Tree, Random Forest, Finetuning with Random Search

In [25]:
clf = DecisionTreeClassifier(class_weight='balanced')
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Accuracy: 0.5639175257731959
Classification Report
              precision    recall  f1-score   support

    negative       0.43      0.45      0.44       121
     neutral       0.68      0.67      0.67       576
    positive       0.39      0.39      0.39       273

    accuracy                           0.56       970
   macro avg       0.50      0.50      0.50       970
weighted avg       0.57      0.56      0.56       970



In [26]:
clf = RandomForestClassifier(class_weight='balanced', random_state=1)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Accuracy: 0.7103092783505155
Classification Report
              precision    recall  f1-score   support

    negative       0.81      0.46      0.59       121
     neutral       0.69      0.98      0.81       576
    positive       0.85      0.25      0.39       273

    accuracy                           0.71       970
   macro avg       0.78      0.56      0.59       970
weighted avg       0.75      0.71      0.66       970



In [27]:
text = "This is amazing"
emb = model.encode(text)
clf.predict(emb.reshape(1,-1))[0]
#In summary, (1, -1) is used when you want to reshape your array to have one row 
#and the number of columns inferred based on the size of the original array. 
#It doesn't transpose the array; it just changes its shape.



'neutral'

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer

In [29]:
rf_classifier = RandomForestClassifier()
param_grid = {
    "random_state": [5,10,20,30,40,50],
    "max_depth": [2,5,10,20,50],
    "min_samples_split": [2,5,10],
    "min_samples_leaf": [1,2,4],
    'n_estimators': [50, 100, 200],
    'class_weight': ['balanced', None]
}

In [30]:
custom_scorer = make_scorer(accuracy_score)
random_search = RandomizedSearchCV(estimator = rf_classifier, param_distributions = param_grid, cv = 3, scoring = custom_scorer)

In [31]:
random_search.fit(X_train, y_train)

In [32]:
best_params = random_search.best_params_

In [33]:
best_model = random_search.best_estimator_

In [34]:
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7381443298969073


In [35]:
best_params

{'random_state': 20,
 'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_depth': 10,
 'class_weight': 'balanced'}

In [36]:
rf_classifier_2 = RandomForestClassifier()
param_grid_2 = {
    "random_state": [20,25,30,35],
    "max_depth": [15,20,25],
    "min_samples_split": [5,10, 15],
    "min_samples_leaf": [1,2,3],
    'n_estimators': [150, 200, 250],
    'class_weight': ['balanced', None]
}

In [37]:
random_search_2 = RandomizedSearchCV(estimator = rf_classifier_2, param_distributions = param_grid_2, cv = 3, scoring = custom_scorer)

In [38]:
random_search_2.fit(X_train, y_train)

In [39]:
best_params_2 = random_search_2.best_params_
best_model_2 = random_search_2.best_estimator_
y_pred_2 = best_model_2.predict(X_test)
accuracy_2 = accuracy_score(y_test, y_pred_2)
print("Accuracy:", accuracy_2)

Accuracy: 0.7309278350515463


In [40]:
best_params_2

{'random_state': 25,
 'n_estimators': 200,
 'min_samples_split': 15,
 'min_samples_leaf': 3,
 'max_depth': 25,
 'class_weight': 'balanced'}

In [41]:
positive = embedding_df[embedding_df["Sentiment"] == "positive"]
neutral = embedding_df[embedding_df["Sentiment"] == "neutral"]
negative = embedding_df[embedding_df["Sentiment"] == "negative"]

In [42]:
positive.shape

(1363, 385)

In [43]:
negative.shape

(604, 385)

In [44]:
neutral.shape

(2879, 385)

In [45]:
embedding_df.shape

(4846, 385)

In [46]:
X_train.shape

(3876, 384)

In [47]:
y_train.value_counts()

neutral     2303
positive    1090
negative     483
Name: Sentiment, dtype: int64

In [48]:
X_test.shape

(970, 384)

In [49]:
unique_rows_embedding_df = embedding_df.drop_duplicates()
unique_rows_embedding_df.shape

(4840, 385)

# Downsampling

In [50]:
train_rows_each_class = int(min(positive.shape[0], neutral.shape[0], negative.shape[0])*0.8)

In [51]:
train_positive = positive.sample(train_rows_each_class)
train_neutral = neutral.sample(train_rows_each_class)
train_negative = negative.sample(train_rows_each_class)

In [52]:
train = pd.concat([train_positive, train_neutral, train_negative], axis = 0)
train.shape

(1449, 385)

In [53]:
#test = pd.concat([embedding_df, train]).drop_duplicates(keep=False)
test = pd.merge(embedding_df, train, how='left', indicator=True)
test = test[test['_merge'] == 'left_only'].drop(columns='_merge')

In [54]:
test.shape

(3397, 385)

In [55]:
#downsampling to understand class imbalance
x_train = train.drop(["Sentiment"], axis = 1)
Y_train = train["Sentiment"]
x_test = test.drop(["Sentiment"], axis = 1)
Y_test = test["Sentiment"]

In [56]:
rf_classifier_3 = RandomForestClassifier()
param_grid_3 = {
    "random_state": [20,25,30,35],
    "max_depth": [15,20,25],
    "min_samples_split": [5,10, 15],
    "min_samples_leaf": [1,2,3],
    'n_estimators': [150, 200, 250],
    'class_weight': ['balanced', None]
}

In [57]:
random_search_3 = RandomizedSearchCV(estimator = rf_classifier_3, param_distributions = param_grid_3, cv = 5, scoring = custom_scorer)

In [58]:
random_search_3.fit(x_train, Y_train)

In [59]:
best_params_3 = random_search_3.best_params_
best_model_3 = random_search_3.best_estimator_
y_pred_3 = best_model_3.predict(x_test)
accuracy_3 = accuracy_score(Y_test, y_pred_3)
print("Accuracy:", accuracy_3)

Accuracy: 0.7282896673535473


In [60]:
best_params_3

{'random_state': 30,
 'n_estimators': 250,
 'min_samples_split': 5,
 'min_samples_leaf': 3,
 'max_depth': 15,
 'class_weight': None}

# Oversampling using SMOTE

In [61]:
#over_sampling using SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)

In [62]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [63]:
rf_classifier_4 = RandomForestClassifier(random_state = 20, n_estimators = 200, min_samples_split = 15, min_samples_leaf = 1, max_depth = 20,class_weight = 'balanced')

In [64]:
X_train_resampled.shape

(6909, 384)

In [65]:
y_train_resampled.value_counts()

positive    2303
neutral     2303
negative    2303
Name: Sentiment, dtype: int64

In [66]:
rf_classifier_4.fit(X_train_resampled, y_train_resampled)
y_pred_4 = rf_classifier_4.predict(X_test)
accuracy_4 = accuracy_score(y_test, y_pred_4)
print("Accuracy:", accuracy_4)

Accuracy: 0.7577319587628866


In [67]:
print("Classification Report SMOTE")
print(classification_report(y_test, y_pred_4))

Classification Report SMOTE
              precision    recall  f1-score   support

    negative       0.73      0.68      0.70       121
     neutral       0.77      0.91      0.83       576
    positive       0.75      0.48      0.58       273

    accuracy                           0.76       970
   macro avg       0.75      0.69      0.71       970
weighted avg       0.76      0.76      0.74       970



In [68]:
print("Classification Report Downsampling")
print(classification_report(Y_test, y_pred_3))

Classification Report Downsampling
              precision    recall  f1-score   support

    negative       0.28      0.76      0.41       121
     neutral       0.86      0.79      0.82      2396
    positive       0.57      0.56      0.57       880

    accuracy                           0.73      3397
   macro avg       0.57      0.70      0.60      3397
weighted avg       0.76      0.73      0.74      3397



In [69]:
print("Classification Report (Random Forest with RandomizedSearchCV)")
print(classification_report(y_test, y_pred_2))

Classification Report (Random Forest with RandomizedSearchCV)
              precision    recall  f1-score   support

    negative       0.73      0.56      0.64       121
     neutral       0.72      0.95      0.82       576
    positive       0.78      0.34      0.48       273

    accuracy                           0.73       970
   macro avg       0.75      0.62      0.64       970
weighted avg       0.74      0.73      0.70       970



In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
confusion_matrix(y_test, y_pred_2)

array([[ 68,  47,   6],
       [  9, 547,  20],
       [ 16, 163,  94]])

In [72]:
confusion_matrix(y_test, y_pred_4)

array([[ 82,  33,   6],
       [ 16, 522,  38],
       [ 15, 127, 131]])

In [73]:
accuracy_score(y_test, y_pred_4)

0.7577319587628866

# RNN (specifically LSTM)

In [74]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample data (X: text sequences, y: sentiment labels)
y_RNN = lbe.fit_transform(embedding_df['Sentiment'])
X_RNN = df['News']

X_train_RNN, X_test_RNN, y_train_RNN, y_test_RNN = train_test_split(X_RNN, y_RNN, test_size=0.2, random_state=42, stratify = y)


# Tokenize words and convert to numerical sequences
tokenizer = Tokenizer() #initialize tokenizer
tokenizer.fit_on_texts(X_train_RNN) # processes the input text data (X_train_RNN) and updates the internal vocabulary based on the unique words present in the text.
X_train_seq = tokenizer.texts_to_sequences(X_train_RNN) # convert text to sequences

# Pad sequences to fix input length to length 50
X_train_pad = pad_sequences(X_train_seq, maxlen=50, padding='post')

# Simple model

In [75]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=64, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
# (Assuming you have test data X_test and y_test prepared in a similar way as training data)
# X_test_seq = tokenizer.texts_to_sequences(X_test)
# X_test_pad = pad_sequences(X_test_seq, maxlen=50, padding='post')
# model.evaluate(X_test_pad, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x17ddf0150>

In [76]:
X_train_pad.shape

(3876, 50)

In [77]:
X_train_RNN.shape

(3876,)

In [78]:
X_train_pad[0]

array([   1,  331, 2430,  112,  122,  807,  576,   25,   19,   48,   38,
         95,  417,  147,   32,    5, 1777, 1395,   27, 1778,   13, 2431,
       2023,   27, 1778,    3,  295,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0], dtype=int32)

In [79]:
X_train_RNN[0]

'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'

In [80]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'in': 3,
 'and': 4,
 'to': 5,
 'a': 6,
 'for': 7,
 "'s": 8,
 'eur': 9,
 'is': 10,
 'company': 11,
 'will': 12,
 'from': 13,
 'on': 14,
 'its': 15,
 'with': 16,
 'as': 17,
 'has': 18,
 'by': 19,
 'be': 20,
 'said': 21,
 'mn': 22,
 'finnish': 23,
 '1': 24,
 'sales': 25,
 'at': 26,
 'million': 27,
 'it': 28,
 'that': 29,
 'net': 30,
 'profit': 31,
 'year': 32,
 'was': 33,
 'm': 34,
 'finland': 35,
 '2': 36,
 'group': 37,
 '5': 38,
 '3': 39,
 'an': 40,
 '2009': 41,
 'operating': 42,
 '2008': 43,
 'are': 44,
 '0': 45,
 'business': 46,
 'new': 47,
 '4': 48,
 'period': 49,
 'mln': 50,
 'quarter': 51,
 '2010': 52,
 '6': 53,
 'oyj': 54,
 '2007': 55,
 '7': 56,
 'which': 57,
 'have': 58,
 'services': 59,
 "''": 60,
 'market': 61,
 'also': 62,
 '8': 63,
 '000': 64,
 'share': 65,
 '9': 66,
 'first': 67,
 'this': 68,
 '2006': 69,
 'up': 70,
 'shares': 71,
 'helsinki': 72,
 'euro': 73,
 'today': 74,
 'been': 75,
 'about': 76,
 'mobile': 77,
 'loss': 78,
 'operations': 79,
 'comp

In [81]:
len(tokenizer.word_index)

9020

# Deeper model v2

In [84]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model2 = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=256, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model2.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2c83af010>

In [85]:
# Evaluate the model
X_test_seq = tokenizer.texts_to_sequences(X_test_RNN)
X_test_pad = pad_sequences(X_test_seq, maxlen=50, padding='post')
model2.evaluate(X_test_pad, y_test_RNN)



[-480226560.0, 0.5938144326210022]

# Increased output_dim from 100 to 150

In [88]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model3 = Sequential([
    #reducing vocab by reducing input_dim parameter
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=150, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=256, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model3.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [103]:
model3.evaluate(X_test_pad, y_test_RNN)



[-1789672320.0, 0.5938144326210022]

# Increased output_dim from 100 to 300

In [98]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model4 = Sequential([
    #reducing vocab by reducing input_dim parameter
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=256, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model4.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x30633af10>

In [102]:
model4.evaluate(X_test_pad, y_test_RNN)



[-264134672.0, 0.6092783212661743]

# Deeper model with output_dim set at 300

In [100]:
#RNN model
model5 = Sequential([
    #reducing vocab by reducing input_dim parameter
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=1024, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=512, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=256, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model5.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2d30e44d0>

In [101]:
model5.evaluate(X_test_pad, y_test_RNN)



[-4824268865011712.0, 0.5649484395980835]

# Increased output_dim from 100 to 400

In [106]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model6 = Sequential([
    #reducing vocab by reducing input_dim parameter
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=400, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=256, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.2),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model6.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model6.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2de84c590>

In [107]:
model6.evaluate(X_test_pad, y_test_RNN)



[-1827852288.0, 0.5938144326210022]

# Increase dropout rate from 0.2 to 0.3

In [108]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model7 = Sequential([
    #reducing vocab by reducing input_dim parameter
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=256, activation='relu'),
    Dropout(0.3),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.3),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.3),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model7.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model7.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2cc54dd10>

In [109]:
model7.evaluate(X_test_pad, y_test_RNN)



[-1389985920.0, 0.5938144326210022]

# Reduce batch size from 32 to 16

In [110]:
from tensorflow.keras.layers import Dropout #import dropout layer
#RNN model
model8 = Sequential([
    #reducing vocab by reducing input_dim parameter
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=50),
    #input_dim = specifies the size of the vocabulary, i.e., the total number of unique words in the input text data.
    #output_dim = 100, each word will be embedded into a dense vector of size 100.
    #input_length = input sequences should have a length of 50 tokens after padding.
    #tokenizer.word_index is a dictionary that maps words (tokens) to their corresponding integer indices.
    LSTM(units=64),# LSTM layer processes sequential data by selectively retaining and updating information over time
    Dense(units=256, activation='relu'),
    Dropout(0.3),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=128, activation='relu'),
    Dropout(0.3),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=64, activation='relu'),
    Dropout(0.3),#20% of the neurons in the dropout layer will be randomly set to zero
    Dense(units=16, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model8.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model8.fit(X_train_pad, y_train_RNN, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x323b31390>

In [114]:
model8.evaluate(X_test_pad, y_test_RNN)



[-68898283520.0, 0.5938144326210022]