# Import necessary libraries

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.linear_model import SGDClassifier

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud
import tensorflow as tf
import numpy as np
import random

seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Data Preparation (Loading CSV)

Load the processed_data `csv` file into pandas DataFrames
- `processed_data.csv` is loaded into `data` DataFrame (stemming has been performed to reduce processing time.)

In [3]:
import pandas as pd
data = pd.read_csv('../processed_data.csv')
print(data.head())  # Shows the first 5 rows

   label                                       full_content  \
0      1  No comment is expected from Barack Obama Membe...   
1      1     Did they post their votes for Hillary already?   
2      1  Now, most of the demonstrators gathered last n...   
3      0  A dozen politically active pastors came here f...   
4      1  The RS-28 Sarmat missile, dubbed Satan 2, will...   

                              processed_full_content  
0  no comment expect barack obama member fyf911 f...  
1                          post vote hillari alreadi  
2  demonstr gather last night exercis constitut p...  
3  dozen polit activ pastor came privat dinner fr...  
4  rs-28 sarmat missil dub satan 2 replac ss-18 f...  


# SVM with countvectorizer


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, precision_score, recall_score
import numpy as np
import random

seed = 42
np.random.seed(seed)
random.seed(seed)

# Step 1: Vectorization using CountVectorizer
MAX_FEATURES = 5000  # Number of features to consider
vectorizer = CountVectorizer(max_features=MAX_FEATURES)

# Convert text to feature vectors
X = vectorizer.fit_transform(data['processed_full_content']).toarray()
y = data['label']  # Target labels (binary classification: 0 or 1)

# Step 2: Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
macro_f1_scores = []
precision_scores = []
recall_scores = []
accuracy_scores = []
confusion_matrices = []
classification_reports = []

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Step 3: Train the SVM Model with L2 Regularization
    svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.01, random_state=42)
    svm.fit(X_train, y_train)
    
    # Step 4: Make Predictions and Evaluate
    y_pred = svm.predict(X_test)
    
    # Collect evaluation metrics for each fold
    macro_f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    precision_scores.append(precision_score(y_test, y_pred, average='macro'))
    recall_scores.append(recall_score(y_test, y_pred, average='macro'))
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    
    confusion_matrices.append(confusion_matrix(y_test, y_pred))
    classification_reports.append(classification_report(y_test, y_pred, digits=4))

# Calculate and print average metrics
print("Stratified 5-Fold Cross-Validation Performance Metrics:")
print(f"\nAverage Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average Macro F1-Score: {np.mean(macro_f1_scores):.4f}")

for i, (conf_matrix, class_report) in enumerate(zip(confusion_matrices, classification_reports), 1):
    print(f"\nFold {i} - Confusion Matrix:\n", conf_matrix)
    print(f"\nFold {i} - Classification Report:\n", class_report)


Stratified 5-Fold Cross-Validation Performance Metrics:

Average Accuracy: 0.9559
Average Precision: 0.9551
Average Recall: 0.9564
Average Macro F1-Score: 0.9557

Fold 1 - Confusion Matrix:
 [[6594  361]
 [ 222 5595]]

Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9674    0.9481    0.9577      6955
           1     0.9394    0.9618    0.9505      5817

    accuracy                         0.9544     12772
   macro avg     0.9534    0.9550    0.9541     12772
weighted avg     0.9547    0.9544    0.9544     12772


Fold 2 - Confusion Matrix:
 [[6622  334]
 [ 236 5580]]

Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9656    0.9520    0.9587      6956
           1     0.9435    0.9594    0.9514      5816

    accuracy                         0.9554     12772
   macro avg     0.9546    0.9557    0.9551     12772
weighted avg     0.9555    0.9554    0.9554     12772


Fol

# SVM with tf-idf instead of CountVectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF instead of CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X = tfidf_vectorizer.fit_transform(data['processed_full_content']).toarray()
y = data['label']  # Target labels (binary classification: 0 or 1)

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the SVM Model
# svm = SVC(kernel='linear', random_state=42)
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)

# Step 4: Make Predictions and Evaluate
y_pred = svm.predict(X_test)

# Evaluate model performance
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6784  220]
 [ 403 5365]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9439    0.9686    0.9561      7004
           1     0.9606    0.9301    0.9451      5768

    accuracy                         0.9512     12772
   macro avg     0.9523    0.9494    0.9506     12772
weighted avg     0.9515    0.9512    0.9511     12772


Macro Average F1-Score: 0.9506117863026156


# SVM with glove

In [6]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

# Load GloVe vectors
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Convert text data to GloVe embeddings
def get_embedding_matrix(data, embeddings_index, embed_dim=100):
    embedding_matrix = []
    for text in data:
        words = text.split()
        embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
        if embeddings:
            document_embedding = np.mean(embeddings, axis=0)  # Average embeddings for the document
        else:
            document_embedding = np.zeros(embed_dim)  # Zero vector if no embeddings found
        embedding_matrix.append(document_embedding)
    return np.array(embedding_matrix)

# Assume `data` is a DataFrame with `processed_full_content` and `label` columns
glove_path = "glove.6B.100d.txt"
embeddings_index = load_glove(glove_path)
X = get_embedding_matrix(data['processed_full_content'], embeddings_index)
y = data['label']

# Initialize classifier and StratifiedKFold
svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.01, random_state=42)
skf = StratifiedKFold(n_splits=5)

# Cross-validation loop with tqdm progress bar
accuracy_scores = []
precision_scores = []
recall_scores = []
macro_f1_scores = []

for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits(), desc="Cross-Validation Progress"):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train and predict
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    
    # Evaluate metrics for each fold
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    
    # Append each metric
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    macro_f1_scores.append(macro_f1)
    
    # Print metrics for each fold
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("\nAccuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Macro Average F1-Score:", macro_f1)

# Calculate average metrics across all folds
print("\nAverage Metrics across 5 folds:")
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average Macro F1-Score:", np.mean(macro_f1_scores))


Cross-Validation Progress:  40%|████      | 2/5 [00:00<00:00,  5.85it/s]

Confusion Matrix:
 [[6035  920]
 [1503 4314]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8006    0.8677    0.8328      6955
           1     0.8242    0.7416    0.7807      5817

    accuracy                         0.8103     12772
   macro avg     0.8124    0.8047    0.8068     12772
weighted avg     0.8114    0.8103    0.8091     12772


Accuracy: 0.8102881302849985
Precision: 0.8124182273322997
Recall: 0.8046702277108161
Macro Average F1-Score: 0.8067798331095009
Confusion Matrix:
 [[6153  803]
 [1593 4223]]

Classification Report:
               precision    recall  f1-score   support

           0     0.7943    0.8846    0.8370      6956
           1     0.8402    0.7261    0.7790      5816

    accuracy                         0.8124     12772
   macro avg     0.8173    0.8053    0.8080     12772
weighted avg     0.8152    0.8124    0.8106     12772


Accuracy: 0.8124021296586282
Precision: 0.8172881342348988
Recall: 0.8053

Cross-Validation Progress:  60%|██████    | 3/5 [00:00<00:00,  5.76it/s]

Confusion Matrix:
 [[6039  917]
 [1395 4421]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8123    0.8682    0.8393      6956
           1     0.8282    0.7601    0.7927      5816

    accuracy                         0.8190     12772
   macro avg     0.8203    0.8142    0.8160     12772
weighted avg     0.8196    0.8190    0.8181     12772


Accuracy: 0.8189790165988099
Precision: 0.8202807410344036
Recall: 0.8141578960065747
Macro Average F1-Score: 0.8160264852305266
Confusion Matrix:
 [[6125  831]
 [1372 4444]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8170    0.8805    0.8476      6956
           1     0.8425    0.7641    0.8014      5816

    accuracy                         0.8275     12772
   macro avg     0.8297    0.8223    0.8245     12772
weighted avg     0.8286    0.8275    0.8265     12772


Accuracy: 0.8275133103664266
Precision: 0.8297289595142954
Recall: 0.8223

Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00,  5.59it/s]

Confusion Matrix:
 [[6110  846]
 [1334 4482]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8208    0.8784    0.8486      6956
           1     0.8412    0.7706    0.8044      5816

    accuracy                         0.8293     12772
   macro avg     0.8310    0.8245    0.8265     12772
weighted avg     0.8301    0.8293    0.8285     12772


Accuracy: 0.8293141246476667
Precision: 0.8310057437878502
Recall: 0.8245055578274285
Macro Average F1-Score: 0.8264950745792454

Average Metrics across 5 folds:
Average Accuracy: 0.819699342311306
Average Precision: 0.8221443611807494
Average Recall: 0.814196169499947
Average Macro F1-Score: 0.8163584703183238





# SVM with TF-IDF and glove

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Get TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data['processed_full_content'])

# Get GloVe embeddings
X_glove = get_embedding_matrix(data['processed_full_content'], embeddings_index)

# Combine GloVe and TF-IDF features
X_combined = hstack((X_tfidf, StandardScaler().fit_transform(X_glove))).toarray()

# Split and train model
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6669  335]
 [ 346 5422]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9507    0.9522    0.9514      7004
           1     0.9418    0.9400    0.9409      5768

    accuracy                         0.9467     12772
   macro avg     0.9462    0.9461    0.9462     12772
weighted avg     0.9467    0.9467    0.9467     12772


Macro Average F1-Score: 0.9461670657996066


# SVM + Pre-trained Word2Vec

In [None]:
import numpy as np
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack

# Step 1: Load pre-trained Word2Vec embeddings
def load_word2vec(file_path):
    return KeyedVectors.load_word2vec_format(file_path, binary=True)

# Load Word2Vec vectors
word2vec_path = r"c:\Users\Admin\Downloads\GoogleNews-vectors-negative300.bin"  # Example path
word2vec = load_word2vec(word2vec_path)

# Step 2: Convert each document to Word2Vec embeddings by averaging word vectors
def get_word2vec_embedding(text, word2vec, embed_dim=300):
    words = text.split()
    embeddings = [word2vec[word] for word in words if word in word2vec.key_to_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate Word2Vec embeddings for the entire dataset
X_word2vec = np.array([get_word2vec_embedding(text, word2vec) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Combine Word2Vec embeddings and CountVectorizer features
X_combined = hstack((X_count, X_word2vec))

# Step 5: Train-test split
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Step 6: Train a classifier (e.g., SVM) on the combined features
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Step 7: Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


# SVM + Word2Vec

In [9]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

# Train or load Word2Vec model (assuming `data` is available and each entry is a list of words)
def train_word2vec_model(data, embed_dim=100):
    # Tokenize each document into a list of words if not already tokenized
    tokenized_data = [text.split() for text in data]  # Split each text into words
    # Train Word2Vec model
    model = Word2Vec(sentences=tokenized_data, vector_size=embed_dim, window=5, min_count=1, workers=4)
    return model

# Convert text data to Word2Vec embeddings
def get_embedding_matrix(data, model, embed_dim=100):
    embedding_matrix = []
    for text in data:
        words = text.split()
        embeddings = [model.wv[word] for word in words if word in model.wv]
        if embeddings:
            document_embedding = np.mean(embeddings, axis=0)  # Average embeddings for the document
        else:
            document_embedding = np.zeros(embed_dim)  # Zero vector if no embeddings found
        embedding_matrix.append(document_embedding)
    return np.array(embedding_matrix)

# Assume `data` is a DataFrame with `processed_full_content` and `label` columns
word2vec_model = train_word2vec_model(data['processed_full_content'])  # Train Word2Vec model on text data
X = get_embedding_matrix(data['processed_full_content'], word2vec_model)
y = data['label']

# Initialize classifier and StratifiedKFold
svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.01, random_state=42)
skf = StratifiedKFold(n_splits=5)

# Cross-validation loop with tqdm progress bar
accuracy_scores = []
precision_scores = []
recall_scores = []
macro_f1_scores = []

for train_index, test_index in tqdm(skf.split(X, y), total=skf.get_n_splits(), desc="Cross-Validation Progress"):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train and predict
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    
    # Evaluate metrics for each fold
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    
    # Append each metric
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)
    macro_f1_scores.append(macro_f1)
    
    # Print metrics for each fold
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("\nAccuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Macro Average F1-Score:", macro_f1)

# Calculate average metrics across all folds
print("\nAverage Metrics across 5 folds:")
print("Average Accuracy:", np.mean(accuracy_scores))
print("Average Precision:", np.mean(precision_scores))
print("Average Recall:", np.mean(recall_scores))
print("Average Macro F1-Score:", np.mean(macro_f1_scores))


Cross-Validation Progress:  20%|██        | 1/5 [00:00<00:00,  5.99it/s]

Confusion Matrix:
 [[6351  604]
 [ 606 5211]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9129    0.9132    0.9130      6955
           1     0.8961    0.8958    0.8960      5817

    accuracy                         0.9053     12772
   macro avg     0.9045    0.9045    0.9045     12772
weighted avg     0.9053    0.9053    0.9053     12772


Accuracy: 0.9052615095521453
Precision: 0.9045120925236476
Recall: 0.9044892959195061
Macro Average F1-Score: 0.9045006715428003
Confusion Matrix:
 [[6399  557]
 [ 562 5254]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9193    0.9199    0.9196      6956
           1     0.9041    0.9034    0.9038      5816

    accuracy                         0.9124     12772
   macro avg     0.9117    0.9116    0.9117     12772
weighted avg     0.9124    0.9124    0.9124     12772


Accuracy: 0.9123864704040088
Precision: 0.9117058901635291
Recall: 0.9116

Cross-Validation Progress:  80%|████████  | 4/5 [00:00<00:00,  6.53it/s]

Confusion Matrix:
 [[6336  620]
 [ 542 5274]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9212    0.9109    0.9160      6956
           1     0.8948    0.9068    0.9008      5816

    accuracy                         0.9090     12772
   macro avg     0.9080    0.9088    0.9084     12772
weighted avg     0.9092    0.9090    0.9091     12772


Accuracy: 0.9090197306608205
Precision: 0.9080031511436957
Recall: 0.9088385592124362
Macro Average F1-Score: 0.9083863109330876
Confusion Matrix:
 [[6349  607]
 [ 537 5279]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9220    0.9127    0.9174      6956
           1     0.8969    0.9077    0.9022      5816

    accuracy                         0.9104     12772
   macro avg     0.9094    0.9102    0.9098     12772
weighted avg     0.9106    0.9104    0.9105     12772


Accuracy: 0.9104290635765737
Precision: 0.9094448110774283
Recall: 0.9102

Cross-Validation Progress: 100%|██████████| 5/5 [00:00<00:00,  6.46it/s]

Confusion Matrix:
 [[6373  583]
 [ 534 5282]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9227    0.9162    0.9194      6956
           1     0.9006    0.9082    0.9044      5816

    accuracy                         0.9125     12772
   macro avg     0.9116    0.9122    0.9119     12772
weighted avg     0.9126    0.9125    0.9126     12772


Accuracy: 0.9125430629502036
Precision: 0.9116419447214367
Recall: 0.9121858915897372
Macro Average F1-Score: 0.9119002175847235

Average Metrics across 5 folds:
Average Accuracy: 0.9099279674287504
Average Precision: 0.9090615779259474
Average Recall: 0.9094728457570002
Average Macro F1-Score: 0.909251955071795





# SVM with glove + countVectoriser

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = "glove.6B.100d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=100):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Combine GloVe embeddings and CountVectorizer features
# Use hstack to combine sparse CountVectorizer matrix with dense GloVe embeddings
X_combined = hstack((X_count, X_glove))

# Step 5: Train-test split
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Step 6: Train a classifier (e.g., SVM) on the combined features
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Step 7: Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6734  270]
 [ 306 5462]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9565    0.9615    0.9590      7004
           1     0.9529    0.9469    0.9499      5768

    accuracy                         0.9549     12772
   macro avg     0.9547    0.9542    0.9544     12772
weighted avg     0.9549    0.9549    0.9549     12772


Macro Average F1-Score: 0.9544495436702043


# SVM with glove, count vectoriser and Stratified K-fold Cross Validation

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = "glove.6B.100d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=100):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Combine GloVe embeddings and CountVectorizer features
# Use hstack to combine sparse CountVectorizer matrix with dense GloVe embeddings
X_combined = hstack((X_count, X_glove))
X_combined = csr_matrix(X_combined)  # Convert to csr_matrix for subscriptable indexing

# Define labels
y = data['label']

# Step 5: Set up Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
fold = 1
f1_scores = []
all_classification_reports = []

for train_index, test_index in kf.split(X_combined, y):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the SVM model on the current fold
    svm = SGDClassifier(loss='hinge', random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    
    # Calculate F1-score for the current fold
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores.append(f1)
    classification_rep = classification_report(y_test, y_pred, digits=4, output_dict=True)
    all_classification_reports.append(classification_rep)
    
    # Print F1-score and classification report for the current fold
    print(f"Fold {fold} - F1 Score (weighted): {f1}")
    print(f"Fold {fold} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Fold {fold} - Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    fold += 1

# Step 6: Calculate and display average F1-score across all folds
avg_f1_score = np.mean(f1_scores)
print("\nAverage F1 Score (weighted) across all folds:", avg_f1_score)

# Calculate the average classification report across folds
avg_classification_report = {}
for key in all_classification_reports[0].keys():
    if isinstance(all_classification_reports[0][key], dict):
        avg_classification_report[key] = {metric: np.mean([report[key][metric] for report in all_classification_reports]) 
                                          for metric in all_classification_reports[0][key]}
    else:
        avg_classification_report[key] = np.mean([report[key] for report in all_classification_reports])

print("\nAverage Classification Report across all folds:")
for label, metrics in avg_classification_report.items():
    if isinstance(metrics, dict):
        print(f"{label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
    else:
        print(f"{label}: {metrics:.4f}")


Fold 1 - F1 Score (weighted): 0.9540576271559605
Fold 1 - Confusion Matrix:
 [[11101   492]
 [  486  9208]]
Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9581    0.9576    0.9578     11593
           1     0.9493    0.9499    0.9496      9694

    accuracy                         0.9541     21287
   macro avg     0.9537    0.9537    0.9537     21287
weighted avg     0.9541    0.9541    0.9541     21287

Fold 2 - F1 Score (weighted): 0.9518794502764808
Fold 2 - Confusion Matrix:
 [[11119   474]
 [  550  9144]]
Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9529    0.9591    0.9560     11593
           1     0.9507    0.9433    0.9470      9694

    accuracy                         0.9519     21287
   macro avg     0.9518    0.9512    0.9515     21287
weighted avg     0.9519    0.9519    0.9519     21287

Fold 3 - F1 Score (weighted): 0.9522081012321895
Fold 3 - Confus

In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = r"c:\Users\Admin\Downloads\glove.6B\glove.6B.300d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=300):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Standardize GloVe embeddings and combine with CountVectorizer features
scaler = StandardScaler()
X_glove_scaled = scaler.fit_transform(X_glove)

# Use hstack to combine sparse CountVectorizer matrix with dense (scaled) GloVe embeddings
X_combined = hstack((X_count, X_glove_scaled))
X_combined = csr_matrix(X_combined)  # Convert to csr_matrix for subscriptable indexing

# Define labels
y = data['label']

# Step 5: Set up Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
f1_scores = []
all_classification_reports = []

for train_index, test_index in kf.split(X_combined, y):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the SVM model with L2 regularization (penalty='l2')
    svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    
    # Calculate F1-score for the current fold
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores.append(f1)
    classification_rep = classification_report(y_test, y_pred, digits=4, output_dict=True)
    all_classification_reports.append(classification_rep)
    
    # Print F1-score and classification report for the current fold
    print(f"Fold {fold} - F1 Score (weighted): {f1}")
    print(f"Fold {fold} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Fold {fold} - Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    fold += 1

# Step 6: Calculate and display average F1-score across all folds
avg_f1_score = np.mean(f1_scores)
print("\nAverage F1 Score (weighted) across all folds:", avg_f1_score)

# Calculate the average classification report across folds
avg_classification_report = {}
for key in all_classification_reports[0].keys():
    if isinstance(all_classification_reports[0][key], dict):
        avg_classification_report[key] = {metric: np.mean([report[key][metric] for report in all_classification_reports]) 
                                          for metric in all_classification_reports[0][key]}
    else:
        avg_classification_report[key] = np.mean([report[key] for report in all_classification_reports])

print("\nAverage Classification Report across all folds:")
for label, metrics in avg_classification_report.items():
    if isinstance(metrics, dict):
        print(f"{label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
    else:
        print(f"{label}: {metrics:.4f}")


Fold 1 - F1 Score (weighted): 0.954830207313335
Fold 1 - Confusion Matrix:
 [[6655  300]
 [ 277 5540]]
Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9600    0.9569    0.9585      6955
           1     0.9486    0.9524    0.9505      5817

    accuracy                         0.9548     12772
   macro avg     0.9543    0.9546    0.9545     12772
weighted avg     0.9548    0.9548    0.9548     12772

Fold 2 - F1 Score (weighted): 0.9519500861173588
Fold 2 - Confusion Matrix:
 [[6611  345]
 [ 269 5547]]
Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9609    0.9504    0.9556      6956
           1     0.9414    0.9537    0.9476      5816

    accuracy                         0.9519     12772
   macro avg     0.9512    0.9521    0.9516     12772
weighted avg     0.9520    0.9519    0.9520     12772

Fold 3 - F1 Score (weighted): 0.9556160302718006
Fold 3 - Confusion Matri

# SVM with Count Vectoriser, Glove, Stratified K-fold Cross-validation, L2 Regularisation, Gridsearch hyperparameter tuning

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = "glove.6B.100d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=100):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Standardize GloVe embeddings and combine with CountVectorizer features
scaler = StandardScaler()
X_glove_scaled = scaler.fit_transform(X_glove)

# Use hstack to combine sparse CountVectorizer matrix with dense (scaled) GloVe embeddings
X_combined = hstack((X_count, X_glove_scaled))
X_combined = csr_matrix(X_combined)  # Convert to csr_matrix for subscriptable indexing

# Define labels
y = data['label']

# Step 5: Set up Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
f1_scores = []
all_classification_reports = []

# Step 6: Define parameter grid for GridSearchCV
param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l2'],
    'loss': ['hinge', 'log'],  # 'hinge' for SVM, 'log' for logistic regression
    'max_iter': [1000, 2000]
}

# Step 7: Iterate over each fold in cross-validation
for train_index, test_index in kf.split(X_combined, y):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Set up GridSearchCV with the SGDClassifier and parameter grid
    svm = SGDClassifier(random_state=42)
    grid_search = GridSearchCV(svm, param_grid, scoring='f1_weighted', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model from GridSearchCV
    best_svm = grid_search.best_estimator_
    y_pred = best_svm.predict(X_test)
    
    # Calculate F1-score for the current fold
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores.append(f1)
    classification_rep = classification_report(y_test, y_pred, digits=4, output_dict=True)
    all_classification_reports.append(classification_rep)
    
    # Print F1-score, best parameters, and classification report for the current fold
    print(f"Fold {fold} - F1 Score (weighted): {f1}")
    print(f"Best Parameters for Fold {fold}: {grid_search.best_params_}")
    print(f"Fold {fold} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Fold {fold} - Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    fold += 1

# Step 8: Calculate and display average F1-score across all folds
avg_f1_score = np.mean(f1_scores)
print("\nAverage F1 Score (weighted) across all folds:", avg_f1_score)

# Calculate the average classification report across folds
avg_classification_report = {}
for key in all_classification_reports[0].keys():
    if isinstance(all_classification_reports[0][key], dict):
        avg_classification_report[key] = {metric: np.mean([report[key][metric] for report in all_classification_reports]) 
                                          for metric in all_classification_reports[0][key]}
    else:
        avg_classification_report[key] = np.mean([report[key] for report in all_classification_reports])

print("\nAverage Classification Report across all folds:")
for label, metrics in avg_classification_report.items():
    if isinstance(metrics, dict):
        print(f"{label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
    else:
        print(f"{label}: {metrics:.4f}")


18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-

Fold 1 - F1 Score (weighted): 0.9615324260866738
Best Parameters for Fold 1: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 1 - Confusion Matrix:
 [[6751  204]
 [ 287 5530]]
Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9592    0.9707    0.9649      6955
           1     0.9644    0.9507    0.9575      5817

    accuracy                         0.9616     12772
   macro avg     0.9618    0.9607    0.9612     12772
weighted avg     0.9616    0.9616    0.9615     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-

Fold 2 - F1 Score (weighted): 0.9580712079690743
Best Parameters for Fold 2: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 2 - Confusion Matrix:
 [[6750  206]
 [ 329 5487]]
Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9535    0.9704    0.9619      6956
           1     0.9638    0.9434    0.9535      5816

    accuracy                         0.9581     12772
   macro avg     0.9587    0.9569    0.9577     12772
weighted avg     0.9582    0.9581    0.9581     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-

Fold 3 - F1 Score (weighted): 0.9619548195119899
Best Parameters for Fold 3: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 3 - Confusion Matrix:
 [[6700  256]
 [ 230 5586]]
Fold 3 - Classification Report:
               precision    recall  f1-score   support

           0     0.9668    0.9632    0.9650      6956
           1     0.9562    0.9605    0.9583      5816

    accuracy                         0.9619     12772
   macro avg     0.9615    0.9618    0.9617     12772
weighted avg     0.9620    0.9619    0.9620     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-

Fold 4 - F1 Score (weighted): 0.9614710143364869
Best Parameters for Fold 4: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 4 - Confusion Matrix:
 [[6723  233]
 [ 259 5557]]
Fold 4 - Classification Report:
               precision    recall  f1-score   support

           0     0.9629    0.9665    0.9647      6956
           1     0.9598    0.9555    0.9576      5816

    accuracy                         0.9615     12772
   macro avg     0.9613    0.9610    0.9612     12772
weighted avg     0.9615    0.9615    0.9615     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\User\AppData\Local\Programs\Python\Python310\lib\site-

Fold 5 - F1 Score (weighted): 0.9617039597687295
Best Parameters for Fold 5: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 5 - Confusion Matrix:
 [[6728  228]
 [ 261 5555]]
Fold 5 - Classification Report:
               precision    recall  f1-score   support

           0     0.9627    0.9672    0.9649      6956
           1     0.9606    0.9551    0.9578      5816

    accuracy                         0.9617     12772
   macro avg     0.9616    0.9612    0.9614     12772
weighted avg     0.9617    0.9617    0.9617     12772


Average F1 Score (weighted) across all folds: 0.960946685534591

Average Classification Report across all folds:
0:
  precision: 0.9610
  recall: 0.9676
  f1-score: 0.9643
  support: 6955.8000
1:
  precision: 0.9609
  recall: 0.9530
  f1-score: 0.9570
  support: 5816.2000
accuracy: 0.9610
macro avg:
  precision: 0.9610
  recall: 0.9603
  f1-score: 0.9606
  support: 12772.0000
weighted avg:
  precision: 0.9610
  recall: 0.9610
  f1-scor