# From Detection to Credibility: A Machine Learning Framework for Assessing News Source Reliability



In [2]:
# !pip3 install -r ../requirements.txt
!pip3 install tensorflow




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm


# Text Preprocessing and NLP
import nltk
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


ModuleNotFoundError: No module named 'pandas'

# Data Preparation (Loading CSV)

Load the processed_data `csv` file into pandas DataFrames
- `processed_data.csv` is loaded into `data` DataFrame (stemming has been performed to reduce processing time.)

In [4]:
data = pd.read_csv('../processed_data.csv')

In [5]:
data.info()
print("Dataframe Shape:", data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63860 entries, 0 to 63859
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   label                   63860 non-null  int64 
 1   full_content            63860 non-null  object
 2   processed_full_content  63860 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.5+ MB
Dataframe Shape: (63860, 3)


In [6]:
data.head()

Unnamed: 0,label,full_content,processed_full_content
0,1,No comment is expected from Barack Obama Membe...,no comment expect barack obama member fyf911 f...
1,1,Did they post their votes for Hillary already?,post vote hillari alreadi
2,1,"Now, most of the demonstrators gathered last n...",demonstr gather last night exercis constitut p...
3,0,A dozen politically active pastors came here f...,dozen polit activ pastor came privat dinner fr...
4,1,"The RS-28 Sarmat missile, dubbed Satan 2, will...",rs-28 sarmat missil dub satan 2 replac ss-18 f...


In [7]:
data.isnull().sum()

label                     0
full_content              0
processed_full_content    0
dtype: int64

In [8]:
data['label'].value_counts()

label
0    34779
1    29081
Name: count, dtype: int64

In [9]:
# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('all')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roamin

True

### Basic Convolutional Neural Network

In [11]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load your dataset here
# Assuming 'data' is a DataFrame with 'processed_full_content' and 'label' columns
# data = pd.read_csv('your_dataset.csv')

# Step 1: Tokenization and Padding
max_words = 10000  # Max vocabulary size
max_sequence_length = 300  # Max length of sequences

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])

# Pad sequences to ensure uniform length
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Step 2: Train-Validation-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Define the Basic CNN Model
def create_basic_cnn():
    model = Sequential()
    
    # Embedding layer with random initialization (no pre-trained embeddings)
    model.add(Embedding(input_dim=max_words, output_dim=128))
    
    # Convolutional layer
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    
    # Fully connected layer
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))  # Add dropout for regularization
    model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification, using the sigmoid activation function
    
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Train the Model
model = create_basic_cnn()
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Step 5: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Epoch 1/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - accuracy: 0.8543 - loss: 0.2973 - val_accuracy: 0.9653 - val_loss: 0.0887
Epoch 2/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9859 - loss: 0.0452 - val_accuracy: 0.9746 - val_loss: 0.0692
Epoch 3/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9978 - loss: 0.0102 - val_accuracy: 0.9780 - val_loss: 0.0761
Epoch 4/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9996 - loss: 0.0028 - val_accuracy: 0.9770 - val_loss: 0.0914
Epoch 5/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9995 - loss: 0.0017 - val_accuracy: 0.9768 - val_loss: 0.1083
Epoch 6/10
[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step - accuracy: 0.9993 - loss: 0.0024 - val_accuracy: 0.9762 - val_loss: 0.1103
Epoch 7/10
[1m6

### Convolutional Neural Network + TF-IDF Vectorizer

Using TF-IDF vectorizer along with CNN led to a drastic fall in performance. Below are some reasons why we should not use TF-IDF vectorizer along with a CNN or other neural networks.

#### Lack of Spatial Structure:

TF-IDF vectors are sparse and non-sequential representations where each position in the vector represents a word, not a spatial pattern.
CNNs are designed to detect patterns in sequential or spatially structured data (e.g., images or sentences), so they might struggle to find meaningful patterns in TF-IDF vectors.

#### High-Dimensional Sparse Data:

TF-IDF vectors, especially with a high max_features value (like 10,000), result in a high-dimensional but sparse input.
CNNs are generally not well-suited for such high-dimensional sparse data; they perform better with dense embeddings where words have contextually meaningful dimensions.

#### Mismatch Between Input Type and CNN Architecture:

CNNs are typically effective when applied to word embeddings (like GloVe or Word2Vec) because embeddings maintain semantic relationships and neighborhood structures.
TF-IDF, however, does not capture word order or semantic relationships, which means the convolution operation might not yield meaningful feature maps.


In [17]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Reshape, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Apply TF-IDF Vectorization
max_features = 10000  # Limit TF-IDF to top 10,000 features
tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
X_tfidf = tfidf_vectorizer.fit_transform(data['processed_full_content']).toarray()

# Convert the labels
y = data['label'].values  # Target labels

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Step 3: Define the CNN Model for TF-IDF Input
def create_cnn_with_tfidf():
    inputs = Input(shape=(max_features,))
    x = Reshape((max_features, 1))(inputs)  # Reshape TF-IDF output to be compatible with Conv1D

    # Convolutional layer
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    
    # Fully connected layer
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)  # Dropout for regularization
    outputs = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 4: Train the Model
model = create_cnn_with_tfidf()
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Step 5: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Epoch 1/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 153ms/step - accuracy: 0.5428 - loss: 0.6880 - val_accuracy: 0.5503 - val_loss: 0.6820
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 155ms/step - accuracy: 0.5517 - loss: 0.6841 - val_accuracy: 0.5727 - val_loss: 0.6793
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 154ms/step - accuracy: 0.5553 - loss: 0.6809 - val_accuracy: 0.5732 - val_loss: 0.6765
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 153ms/step - accuracy: 0.5578 - loss: 0.6801 - val_accuracy: 0.5700 - val_loss: 0.6758
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 151ms/step - accuracy: 0.5579 - loss: 0.6807 - val_accuracy: 0.5729 - val_loss: 0.6750
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 152ms/step - accuracy: 0.5592 - loss: 0.6795 - val_accuracy: 0.5709 - val_loss: 0.6757
Epoc

In [None]:
# pip install gensim

### Convolutional Neural Network + Custom-trained Word2Vec embeddings (on our dataset)

In [10]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import Word2Vec
import pandas as pd

# Tokenization parameters
max_words = 10000  # Maximum number of words to keep in the vocabulary
max_sequence_length = 300  # Maximum length of sequences

# Tokenize and create sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Step 2: Train Word2Vec Embeddings
# Prepare sentences as lists of words for Word2Vec training
sentences = [text.split() for text in data['processed_full_content']]

# Train custom Word2Vec model
embedding_dim = 200  # Set embedding dimension (try 100-200)
custom_word2vec = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=2, workers=4)

# Step 3: Create Embedding Matrix from Custom Word2Vec
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for the padding token
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Map words in tokenizer's vocabulary to the Word2Vec vectors
for word, i in tokenizer.word_index.items():
    if i < max_words:  # Limit to top max_words
        if word in custom_word2vec.wv:
            embedding_matrix[i] = custom_word2vec.wv[word]
        else:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))  # Random init for OOV words

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Define CNN Model with Custom Word2Vec Embeddings
def create_cnn_with_custom_word2vec():
    input_layer = Input(shape=(max_sequence_length,))
    
    # Embedding layer with custom Word2Vec embeddings
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=False)(input_layer)  # Set to non-trainable

    # Convolutional and pooling layers
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    
    # Fully connected layer with Dropout
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 6: Train the CNN Model
model = create_cnn_with_custom_word2vec()
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Step 7: Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

: 

### Convolutional Neural Network + Pre-Trained Word2Vec Embeddings


In [None]:
# import gensim.downloader as api
# word2vec_model = api.load('word2vec-google-news-300')

### Convolutional Neural Network + Word2Vec Embeddings + 5-Fold Cross Validation

We use Stratified K-Fold Cross-Validation with n_splits=5 to evaluate the model on different splits of the data. 
For each fold, we store the metrics (accuracy, precision, recall, and F1 score) and then calculate the average metrics across all folds for a robust evaluation.

Cross-validation helps us understand the model’s performance more robustly by testing it on multiple splits of the data. This approach gives a more reliable estimate of model performance and helps reduce the risk of overfitting to any single train-test split.

In [12]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import Word2Vec

# Tokenization parameters
max_words = 10000  # Maximum number of words to keep in the vocabulary
max_sequence_length = 300  # Maximum length of sequences

# Tokenize and create sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Target labels

# Step 2: Train Custom Word2Vec Embeddings
sentences = [text.split() for text in data['processed_full_content']]

embedding_dim = 200  # Set embedding dimension, can set 100-300
custom_word2vec = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=2, workers=4)

# Step 3: Create Embedding Matrix from Custom Word2Vec
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for the padding token
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Map words in tokenizer's vocabulary to the Word2Vec vectors
for word, i in tokenizer.word_index.items():
    if i < max_words:  # Limit to top max_words
        if word in custom_word2vec.wv:
            embedding_matrix[i] = custom_word2vec.wv[word]
        else:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))  # Random init for OOV words

# Step 4: Define CNN Model with Custom Word2Vec Embeddings
def create_cnn_with_custom_word2vec():
    input_layer = Input(shape=(max_sequence_length,))
    
    # Embedding layer with custom Word2Vec embeddings
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=False)(input_layer)  # Set to non-trainable

    # Convolutional and pooling layers
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
    x = GlobalMaxPooling1D()(x)
    
    # Fully connected layer with Dropout
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    output_layer = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

    # Compile model
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 5: Implement 5-Fold Cross-Validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies, precisions, recalls, f1_scores = [], [], [], []  # To store metrics

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and train the CNN model
    model = create_cnn_with_custom_word2vec()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1)
    
    # Predict and calculate metrics for this fold
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

    # Store metrics for this fold
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Step 6: Calculate Average Metrics Across Folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1_score = np.mean(f1_scores)

# Print the aggregated report
print("\nAggregated Report:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1_score:.4f}")

Epoch 1/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 16ms/step - accuracy: 0.8739 - loss: 0.3295 - val_accuracy: 0.9507 - val_loss: 0.1198
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.9630 - loss: 0.1010 - val_accuracy: 0.9645 - val_loss: 0.0934
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.9766 - loss: 0.0634 - val_accuracy: 0.9670 - val_loss: 0.0866
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.9848 - loss: 0.0437 - val_accuracy: 0.9703 - val_loss: 0.0966
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 17ms/step - accuracy: 0.9899 - loss: 0.0282 - val_accuracy: 0.9730 - val_loss: 0.1004
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 17ms/step - accuracy: 0.9923 - loss: 0.0209 - val_accuracy: 0.9701 - val_loss: 0.1098
Epoch 7/10
[1m7

### Convolutional Neural network + GloVe word embeddings + 5-Fold Cross Validation

#### Why do we use word embedding over other preprocessing techniques (eg. tf-idf, count vectorizer), for our task of fake news classification?


##### 1. Word embeddings capture the semantic relationships between words in a dense, low-dimensional space.
Fake news often uses subtle language, and word embeddings like GloVe can capture the semantic context of words, allowing the model to understand relationships between words that simple vectorizers would miss. This helps in detecting nuanced differences in language use between real and fake news.

##### 2. Word embeddings produce dense, low-dimensional vectors (e.g., 100-300 dimensions) that capture rich word information.
Pre-trained embeddings are built on large corpora like Wikipedia and news articles, giving our model external knowledge that’s useful for distinguishing between real news and fake news. This boosts the model's ability to generalize on unseen test data from our web scraping.

##### 3. Efficient Representation of Semantics
Words in fake news can appear in different contexts, but with similar underlying meanings (e.g., "hoax" and "lie"). GloVe embeddings represent these similar words in close proximity in the vector space, helping the model recognize fake news patterns more effectively than TF-IDF or Count Vectorizer.

##### 4. Handling Synonyms and Rare Words:
Fake news often uses alternative phrases or rare terminology. Pre-trained embeddings like GloVe can handle these rare words because they’ve seen a broad variety of language during training, making our model more robust against unusual vocabulary choices in fake news.

In [11]:
## Takes 15mins 53 seconds to run on my PC (4070)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Tokenize the already preprocessed text in 'processed_full_content' column
tokenizer = Tokenizer(num_words=10000)  # Limit to top 10,000 words
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])

# Padding sequences to ensure uniform length
max_sequence_length = 300
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Labels for fake news (0 for real, 1 for fake)

# Load pre-trained GloVe embeddings
embeddings_index = {}
with open('../glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Prepare embedding matrix for the words in your dataset
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define CNN model with pre-trained embeddings using the functional API
def create_model():
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=False)(input_layer)

    convs = []
    for kernel_size in [3, 4, 5]:
        conv = Conv1D(filters=128, kernel_size=kernel_size, activation='relu')(embedding_layer)
        pool = GlobalMaxPooling1D()(conv)
        convs.append(pool)

    merged = Concatenate()(convs)
    dense = Dense(128, activation='relu')(merged)
    drop = Dropout(0.5)(dense)
    output = Dense(1, activation='sigmoid')(drop)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Perform Stratified K-Fold Cross Validation with 5 folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = dict(enumerate(class_weights))

    # Create and train the CNN model
    model = create_model()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), class_weight=class_weights_dict, verbose=1)

    # Predict and calculate metrics for this fold
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

    # Store metrics for this fold
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1_score = np.mean(f1_scores)

# Print the classification report
print("\nAggregated Report:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1_score:.4f}")

Epoch 1/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.8395 - loss: 0.3473 - val_accuracy: 0.9598 - val_loss: 0.1032
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9630 - loss: 0.0980 - val_accuracy: 0.9665 - val_loss: 0.0863
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9817 - loss: 0.0527 - val_accuracy: 0.9688 - val_loss: 0.0843
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9881 - loss: 0.0352 - val_accuracy: 0.9619 - val_loss: 0.1110
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.9922 - loss: 0.0226 - val_accuracy: 0.9702 - val_loss: 0.0965
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.9953 - loss: 0.0146 - val_accuracy: 0.9630 - val_loss: 0.1486
Epoch 7/10
[1m7

### Convolutional Neural network + GloVe word embeddings + 5-Fold Cross Validation + L2 Regularization + Other Hyperparameter Tuning

- Batch Normalization: Added after each convolutional layer to stabilize and speed up training
- Increase dropout rate from 0.5 to 0.6, which adds additional regularization.
- Early stopping when validation loss does not improve, to prevent overfitting.

L2 Regularization discourages large weights by adding a penalty to the loss function. It encourages the model to find simpler solutions by minimizing both the original loss and the weight magnitudes.

*Benefit*: Reduces overfitting by making the model less sensitive to noise and preventing it from memorizing specific data points.


In [13]:
## Takes 15mins 53 seconds to run on my PC (4070)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Tokenize the already preprocessed text in 'processed_full_content' column
tokenizer = Tokenizer(num_words=10000)  # Limit to top 10,000 words
tokenizer.fit_on_texts(data['processed_full_content'])
sequences = tokenizer.texts_to_sequences(data['processed_full_content'])

# Padding sequences to ensure uniform length
max_sequence_length = 300
X = pad_sequences(sequences, maxlen=max_sequence_length)
y = data['label'].values  # Labels for fake news (0 for real, 1 for fake)

# Load pre-trained GloVe embeddings
embeddings_index = {}
with open('../glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Prepare embedding matrix for the words in your dataset
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define CNN model with pre-trained embeddings using the functional API
def create_model():
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocab_size,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                trainable=False)(input_layer)

    convs = []
    for kernel_size in [3, 4, 5]:
        conv = Conv1D(filters=128, kernel_size=kernel_size, activation='relu', kernel_regularizer=regularizers.l2(0.01))(embedding_layer) ## Adding L2 regularization
        conv = BatchNormalization()(conv) ## Adding batch normalization
        pool = GlobalMaxPooling1D()(conv)
        convs.append(pool)

    merged = Concatenate()(convs)
    dense = Dense(128, activation='relu')(merged)
    drop = Dropout(0.6)(dense) ## Change dropout from 0.5 to 0.6 for further regularization
    output = Dense(1, activation='sigmoid')(drop)

    model = Model(inputs=input_layer, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Perform Stratified K-Fold Cross Validation with 5 folds, with early stopping 
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
accuracies, precisions, recalls, f1_scores = [], [], [], []

for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = dict(enumerate(class_weights))

    # Create and train the CNN model
    model = create_model()
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test),class_weight=class_weights_dict, verbose=1, callbacks=[early_stopping])

    # Predict and calculate metrics for this fold
    y_pred = (model.predict(X_test) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

    # Store metrics for this fold
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1_score = np.mean(f1_scores)

# Print the classification report
print("\nAggregated Report:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1_score:.4f}")

Epoch 1/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 38ms/step - accuracy: 0.8032 - loss: 2.1591 - val_accuracy: 0.9544 - val_loss: 0.3253
Epoch 2/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 37ms/step - accuracy: 0.9398 - loss: 0.3465 - val_accuracy: 0.9491 - val_loss: 0.2843
Epoch 3/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.9495 - loss: 0.2957 - val_accuracy: 0.9553 - val_loss: 0.2700
Epoch 4/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 37ms/step - accuracy: 0.9550 - loss: 0.2799 - val_accuracy: 0.9549 - val_loss: 0.2620
Epoch 5/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 38ms/step - accuracy: 0.9565 - loss: 0.2669 - val_accuracy: 0.9384 - val_loss: 0.2786
Epoch 6/10
[1m799/799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 38ms/step - accuracy: 0.9558 - loss: 0.2567 - val_accuracy: 0.9542 - val_loss: 0.2492
Epoch 7/10
[1m7

In [2]:
print('gay')

gay
