# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

In [2]:
# Load the pre-processed dataset from a CSV located one directory above the working directory.
# NOTE(review): the frame displayed below has an 'Unnamed: 0' column, which suggests the CSV
# was written together with its index; passing index_col=0 would absorb it — confirm that
# nothing downstream relies on that column before changing the call.
data = pd.read_csv('../processed_data.csv')

In [3]:
# Bare expression: shows the loaded DataFrame as the cell's output.
data

Unnamed: 0,label,full_content,processed_full_content
0,1,No comment is expected from Barack Obama Membe...,no comment expect barack obama member fyf911 f...
1,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,1,"Now, most of the demonstrators gathered last n...",demonstr gather last night exercis constitut p...
3,0,A dozen politically active pastors came here f...,dozen polit activ pastor came privat dinner fr...
4,1,"The RS-28 Sarmat missile, dubbed Satan 2, will...",rs-28 sarmat missil dub satan 2 replac ss-18 f...
...,...,...,...
63855,0,WASHINGTON (Reuters) - Hackers believed to be ...,washington reuter hacker believ work russian g...
63856,1,"You know, because in fantasyland Republicans n...",know fantasyland republican never question cit...
63857,0,Migrants Refuse To Leave Train At Refugee Camp...,migrant refus leav train refuge camp hungari t...
63858,0,MEXICO CITY (Reuters) - Donald Trump’s combati...,mexico citi reuter donald trump ’ comb style b...


# Basic RNN

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Parameters
vocab_size = 5000          # Tokenizer keeps only the most frequent words (indices below this value)
embedding_dim = 128        # Length of the learned vector for each word
max_sequence_length = 300  # Every document is padded/truncated to this many tokens

# Step 1: Tokenize and Pad the Text
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
# With its default settings pad_sequences left-pads short documents with 0 and
# keeps the LAST max_sequence_length tokens of longer ones.
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
y = data['label'].values

# Split data into train and test sets (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 2: Define a Simple RNN Model
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates the argument and
# the layer takes the sequence length from the data it is fed.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, trainable=True))
model.add(SimpleRNN(64, activation='tanh'))
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(Dense(1, activation='sigmoid'))   # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Train the Model
# validation_split=0.2 holds out the last 20% of the training rows for per-epoch validation.
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Step 4: Evaluate the Model
y_pred_prob = model.predict(X_test)
# Sigmoid outputs above 0.5 are labelled 1; flatten turns the (n, 1) column into a 1-D vector.
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Calculate and print metrics
# zero_division=0 reports 0 instead of raising a warning when a metric's denominator is 0.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print('Performance Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Epoch 1/10




[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - accuracy: 0.8581 - loss: 0.3208 - val_accuracy: 0.9448 - val_loss: 0.1551
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 40ms/step - accuracy: 0.9581 - loss: 0.1228 - val_accuracy: 0.9481 - val_loss: 0.1469
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.9652 - loss: 0.0979 - val_accuracy: 0.9476 - val_loss: 0.1523
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 40ms/step - accuracy: 0.9873 - loss: 0.0393 - val_accuracy: 0.9516 - val_loss: 0.1623
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 40ms/step - accuracy: 0.9952 - loss: 0.0192 - val_accuracy: 0.9573 - val_loss: 0.2017
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 41ms/step - accuracy: 0.9952 - loss: 0.0159 - val_accuracy: 0.9493 - val_loss: 0.2355
Epoch 7/10
[1m639/639[0m 

# RNN + Count Vectoriser

### Loss of Sequential Information
Performance is poor because RNNs are not well-suited to the bag-of-words representation generated by `CountVectorizer`. `CountVectorizer` treats each document as an unordered collection of words, so words are represented only by their counts, not by their position in the text. RNNs are designed to work with ordered sequences, where the position and context of words matter; without word order, the RNN cannot capture dependencies between words over time.

### Sparse, non-contextual input
`CountVectorizer` produces a sparse representation where each word is treated as an independent feature based on its frequency. There is no semantic or contextual relationship between words, so the word counts lack the dense, meaningful relationships that an RNN could leverage. RNNs perform best with dense, continuous data that represents meaningful relationships between words, which is typically achieved with word embeddings.

In [12]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Parameters
max_features = 5000        # Vocabulary size kept by CountVectorizer
max_sequence_length = 300  # Number of count columns fed to the RNN as "timesteps"

# Step 1: Text Vectorization using CountVectorizer
vectorizer = CountVectorizer(max_features=max_features)
# Keep the document-term matrix sparse: only max_sequence_length of its columns
# are used below, so densifying every column for every document wastes memory.
X_counts = vectorizer.fit_transform(data['processed_full_content'])

# Convert Counts to Sequences
# In this case, we're treating each word count as a sequence "step," although this is not a true sequence.
# NOTE: pad_sequences truncates from the front by default, so when the vocabulary has more
# than max_sequence_length columns only the LAST max_sequence_length columns (in the
# vectorizer's column order) reach the model; the other counts are dropped.
# Slicing those columns before densifying yields the same array as padding the full dense matrix.
X_padded = pad_sequences(X_counts[:, -max_sequence_length:].toarray(), maxlen=max_sequence_length)

# Labels
y = data['label'].values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Reshape input to 3D for RNN (samples, timesteps, features)
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Step 2: Define a Simple RNN Model (without Embedding layer)
model = Sequential()
# Declare the input with an explicit Input layer; passing `input_shape` to the
# first layer of a Sequential model triggers a Keras 3 warning.
model.add(Input(shape=(X_train_reshaped.shape[1], 1)))
model.add(SimpleRNN(64, activation='tanh'))
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Train the Model
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Step 4: Evaluate the Model
y_pred_prob = model.predict(X_test_reshaped)
# Sigmoid outputs above 0.5 are labelled 1; flatten turns the (n, 1) column into a 1-D vector.
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print('Performance Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Epoch 1/10


  super().__init__(**kwargs)


[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 47ms/step - accuracy: 0.5719 - loss: 0.6763 - val_accuracy: 0.5407 - val_loss: 0.6947
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.5184 - loss: 0.7027 - val_accuracy: 0.5329 - val_loss: 0.6882
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.5335 - loss: 0.6914 - val_accuracy: 0.5619 - val_loss: 0.6772
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 45ms/step - accuracy: 0.5354 - loss: 0.6891 - val_accuracy: 0.5839 - val_loss: 0.6753
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 45ms/step - accuracy: 0.5428 - loss: 0.6893 - val_accuracy: 0.5741 - val_loss: 0.6751
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 56ms/step - accuracy: 0.5499 - loss: 0.6852 - val_accuracy: 0.5666 - val_loss: 0.6800
Epoch 7/10
[1m639/639[0m 

# RNN + Count Vectoriser + Conversion to pseudo-sequences with word indices

Performance is poorer than Basic RNN.

Here, we transform the `CountVectorizer` output into integer sequences, which are compatible with the embedding layer. Unlike `Tokenizer`, which retains the natural order of words in the text by converting each word to an index that aligns with its position in the original sentence, `CountVectorizer` yields pseudo-sequences that lose the actual word order. This means that the RNN may miss out on valuable sequential information.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hyperparameters for this experiment
MAX_FEATURES = 5000        # Vocabulary size kept by CountVectorizer
EMBEDDING_DIM = 128        # Width of each embedding vector
MAX_SEQUENCE_LENGTH = 300  # Length every pseudo-sequence is padded/truncated to

# Step 1: Build the document-term count matrix
vectorizer = CountVectorizer(max_features=MAX_FEATURES)
X_counts = vectorizer.fit_transform(data['processed_full_content'])
word_index = vectorizer.vocabulary_  # term -> column index

# Reverse lookup (column index -> term) used when the sequences are built
index_to_word = dict(zip(word_index.values(), word_index.keys()))

def counts_to_sequences(X_counts):
    """Turn a sparse document-term count matrix into per-document index lists.

    For every row, the column indices that hold a non-zero count are collected
    in the order the sparse matrix reports them (not the order in which the
    words appeared in the text) and shifted up by one so that 0 stays free for
    padding.

    The shifted column index is used directly: mapping a column index to its
    word and then back to the word's index returns the same number, so no
    vocabulary lookup is needed.

    Args:
        X_counts: 2-D sparse count matrix with one row per document, such as
            the result of ``CountVectorizer.fit_transform``.

    Returns:
        A list with one entry per document; each entry is a list of 1-based
        column indices (empty when the document has no non-zero counts).
    """
    return [
        (X_counts[row].nonzero()[1] + 1).tolist()  # +1 because 0 is reserved for padding
        for row in range(X_counts.shape[0])
    ]

# Build one pseudo-sequence per document and bring them all to the same length.
# With its default settings pad_sequences left-pads short rows with 0 and keeps
# the last MAX_SEQUENCE_LENGTH entries of longer ones.
sequences = counts_to_sequences(X_counts)
X_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Labels
y = data['label'].values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 2: Define the RNN Model (simplified parameters)
model = Sequential()
# input_dim is the vocabulary size + 1 because index 0 is the padding value.
# The previous `input_length=max_sequence_length` argument was dropped: it referred to a
# lowercase name this cell never defines (only MAX_SEQUENCE_LENGTH is), and Keras 3
# deprecates the argument anyway.
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM, trainable=True))
model.add(SimpleRNN(64, activation='tanh'))
model.add(Dropout(0.5))    # Add dropout for regularization
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Train the Model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Step 4: Evaluate the Model
y_pred_prob = model.predict(X_test)
# Sigmoid outputs above 0.5 are labelled 1; flatten turns the (n, 1) column into a 1-D vector.
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print('Performance Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Epoch 1/10




[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 50ms/step - accuracy: 0.7264 - loss: 0.5322 - val_accuracy: 0.9081 - val_loss: 0.2577
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.8909 - loss: 0.2908 - val_accuracy: 0.7479 - val_loss: 0.5116
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 40ms/step - accuracy: 0.8147 - loss: 0.4266 - val_accuracy: 0.7207 - val_loss: 0.5568
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 43ms/step - accuracy: 0.7374 - loss: 0.5270 - val_accuracy: 0.8506 - val_loss: 0.3633
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - accuracy: 0.8686 - loss: 0.3406 - val_accuracy: 0.8819 - val_loss: 0.3087
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 43ms/step - accuracy: 0.8965 - loss: 0.2913 - val_accuracy: 0.8801 - val_loss: 0.3062
Epoch 7/10
[1m639/639[0m 

# RNN + Word2vec

We would expect Word2Vec to perform better, since Word2Vec embeddings are generally more informative and training Word2Vec on our own data ensures the embeddings are relevant to our specific dataset. In practice, however, performance is basically the same as the Basic RNN.

### Task-specific patterns VS. General semantic relationships
A possible reason why Word2Vec performance is basically the same is that Word2Vec embeddings are trained to capture general semantic relationships between words, not task-specific patterns. For fake news, Word2Vec might not directly learn contextual patterns, as it captures relationships based on co-occurrence rather than the sequence dependencies relevant to fake news classification.

### Dataset Size
While 64k rows is reasonably large, it could still be insufficient for training high-quality Word2Vec embeddings that generalise well, since Word2Vec embeddings usually benefit from massive diverse datasets (millions of news articles). 

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
import numpy as np

# Parameters
vocab_size = 5000          # Tokenizer keeps only the most frequent words (indices below this value)
embedding_dim = 128        # Embedding dimensions for each word (shared by Word2Vec and the Embedding layer)
max_sequence_length = 300  # Max number of words in each sequence

# Step 1: Tokenize the Text Data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
y = data['label'].values

# Step 2: Prepare Sentences for Word2Vec Training
# Split each document with the same rules the Tokenizer applies (lower-casing,
# punctuation filtering). A plain str.split() keeps tokens such as "rs-28" intact
# while the Tokenizer indexes "rs" and "28", so the lookups in Step 4 would miss
# and leave those embedding rows at zero.
sentences = [
    text_to_word_sequence(text, filters=tokenizer.filters, lower=tokenizer.lower, split=tokenizer.split)
    for text in data['processed_full_content']
]

# Step 3: Train a Word2Vec Model
word2vec_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4, sg=1)
# Note: sg=1 uses skip-gram, which is effective for smaller datasets

# Step 4: Create Embedding Matrix from Trained Word2Vec Model
# Row i holds the vector of the word with Tokenizer index i; row 0 (padding) and
# any word without a Word2Vec vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
word_index = tokenizer.word_index
word_vectors = word2vec_model.wv

for word, i in word_index.items():
    if i >= vocab_size:
        continue  # outside the vocabulary the Tokenizer actually emits
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 6: Define the RNN Model with Word2Vec Embeddings
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates the argument.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                    trainable=True))  # Use trainable=False if no fine-tuning is needed
model.add(SimpleRNN(64, activation='tanh'))
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train the Model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Step 8: Evaluate the Model
y_pred_prob = model.predict(X_test)
# Sigmoid outputs above 0.5 are labelled 1; flatten turns the (n, 1) column into a 1-D vector.
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print('Performance Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Epoch 1/10




[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 42ms/step - accuracy: 0.8250 - loss: 0.3798 - val_accuracy: 0.9423 - val_loss: 0.1738
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 42ms/step - accuracy: 0.9398 - loss: 0.1815 - val_accuracy: 0.9469 - val_loss: 0.1631
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - accuracy: 0.9556 - loss: 0.1380 - val_accuracy: 0.9442 - val_loss: 0.1598
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - accuracy: 0.9641 - loss: 0.1108 - val_accuracy: 0.9310 - val_loss: 0.2007
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 43ms/step - accuracy: 0.9589 - loss: 0.1194 - val_accuracy: 0.9471 - val_loss: 0.1589
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 42ms/step - accuracy: 0.9562 - loss: 0.1235 - val_accuracy: 0.9425 - val_loss: 0.1845
Epoch 7/10
[1m639/639[0m 

# RNN + Word2vec + Stratified K-fold Cross Validation

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
import numpy as np

# Parameters
vocab_size = 5000          # Tokenizer keeps only the most frequent words (indices below this value)
embedding_dim = 128        # Embedding dimensions for each word (shared by Word2Vec and the Embedding layer)
max_sequence_length = 300  # Max number of words in each sequence
n_splits = 5               # Number of folds

# Step 1: Tokenize the Text Data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
y = data['label'].values

# Step 2: Prepare Sentences for Word2Vec Training
# Split each document with the same rules the Tokenizer applies (lower-casing,
# punctuation filtering). A plain str.split() keeps tokens such as "rs-28" intact
# while the Tokenizer indexes "rs" and "28", so the embedding lookups below would
# miss and leave those rows at zero.
sentences = [
    text_to_word_sequence(text, filters=tokenizer.filters, lower=tokenizer.lower, split=tokenizer.split)
    for text in data['processed_full_content']
]

# Step 3: Train a Word2Vec Model and copy its vectors into the embedding matrix
word2vec_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4, sg=1)
# Row i holds the vector of the word with Tokenizer index i; row 0 (padding) and
# any word without a Word2Vec vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
word_index = tokenizer.word_index
word_vectors = word2vec_model.wv

for word, i in word_index.items():
    if i >= vocab_size:
        continue  # outside the vocabulary the Tokenizer actually emits
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Step 4: Stratified K-Fold Cross-Validation
# Imported here so this step does not depend on any other cell's imports.
from tensorflow.keras import backend as K

# Stratification keeps the label ratio the same in every fold; the fixed seed makes the folds repeatable.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

for fold, (train_index, test_index) in enumerate(kf.split(X_padded, y), 1):
    print(f"Fold {fold}")

    # A new model is built every fold; clear Keras' global state first so the
    # models from earlier folds do not keep accumulating in memory.
    K.clear_session()

    # Split data into train and test for this fold
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Step 5: Define and Compile the RNN Model with Word2Vec Embeddings
    model = Sequential()
    # `input_length` is intentionally not passed: Keras 3 deprecates the argument.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                        trainable=True))
    model.add(SimpleRNN(64, activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Step 6: Train the Model
    # validation_split carves the validation rows out of this fold's training portion only.
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

    # Step 7: Evaluate the Model
    y_pred_prob = model.predict(X_test)
    # Sigmoid outputs above 0.5 are labelled 1; flatten turns the (n, 1) column into a 1-D vector.
    y_pred = (y_pred_prob > 0.5).astype(int).flatten()

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Fold {fold} Metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Step 8: Average Metrics Across All Folds
print("\nAverage Performance Metrics across all folds:")
print(f"Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f}")
print(f"F1 Score: {np.mean(f1_scores):.4f}")

Fold 1




Epoch 1/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 78ms/step - accuracy: 0.8302 - loss: 0.3689 - val_accuracy: 0.9291 - val_loss: 0.1902
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 80ms/step - accuracy: 0.9426 - loss: 0.1704 - val_accuracy: 0.9262 - val_loss: 0.1989
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 81ms/step - accuracy: 0.9462 - loss: 0.1652 - val_accuracy: 0.9461 - val_loss: 0.1549
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 77ms/step - accuracy: 0.9609 - loss: 0.1166 - val_accuracy: 0.9439 - val_loss: 0.1622
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 75ms/step - accuracy: 0.9686 - loss: 0.0982 - val_accuracy: 0.9376 - val_loss: 0.1835
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 75ms/step - accuracy: 0.9739 - loss: 0.0831 - val_accuracy: 0.9519 - val_loss: 0.1654
Epoch 7/10
[1m6



[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 93ms/step - accuracy: 0.8410 - loss: 0.3530 - val_accuracy: 0.9347 - val_loss: 0.1821
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 91ms/step - accuracy: 0.9395 - loss: 0.1736 - val_accuracy: 0.9494 - val_loss: 0.1390
Epoch 3/10
[1m569/639[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m6s[0m 95ms/step - accuracy: 0.9586 - loss: 0.1265

KeyboardInterrupt: 

# Count Vectoriser + Recurrent Neural network + K-Fold Cross Validation + L2 Regularization

L2 is preferred because it penalises large weights, shrinking them towards zero without zeroing them out. This is useful in RNNs, where all the weights contribute to the learning process.

On the other hand, L1 forces some weights to become exactly 0, effectively performing feature selection by eliminating certain weights entirely which can disrupt sequential learning in RNNs where continuous, context-dependent features are essential.

Also, RNNs are more stable with L2 because it reduces weight magnitudes evenly, which helps with gradient stability across time steps. This is particularly important in sequential tasks, where large gradients can destabilise training. L1 can create sparse weights, potentially leading to instability if certain pathways are zeroed out, which can affect sequential dependencies.

L1 is more useful for feature selection in high-dimensional data where many features are irrelevant. L1's ability to zero out weights can also make models more interpretable, as it helps identify which features are most influential in classification. This is more common in sparse models like logistic regression, where interpreting individual features is more practical.

# Count Vectoriser + Recurrent Neural network + GridSearch CV

The purpose of using Grid Search is hyperparameter tuning, i.e. finding the best combination of hyperparameters for the model.

EarlyStopping is also used to prevent overfitting by stopping training when the monitored metric no longer improves. In this case, training stops if the validation loss does not improve for `patience=3` consecutive epochs.

L2 Regularization also prevents overfitting by ensuring the model doesn't rely too heavily on any one feature. It forces the model to distribute the 'learning' across multiple features.