# Import necessary libraries

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.linear_model import SGDClassifier

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

# Data Preparation (Loading CSV)

Load the processed_data `csv` file into pandas DataFrames
- `processed_data.csv` is loaded into `data` DataFrame (stemming has been performed to reduce processing time.)

In [5]:
import pandas as pd
data = pd.read_csv('../processed_data.csv')
print(data.head())  # Shows the first 5 rows

   label                                       full_content  \
0      1  No comment is expected from Barack Obama Membe...   
1      1     Did they post their votes for Hillary already?   
2      1  Now, most of the demonstrators gathered last n...   
3      0  A dozen politically active pastors came here f...   
4      1  The RS-28 Sarmat missile, dubbed Satan 2, will...   

                              processed_full_content  
0  no comment expect barack obama member fyf911 f...  
1                          post vote hillari alreadi  
2  demonstr gather last night exercis constitut p...  
3  dozen polit activ pastor came privat dinner fr...  
4  rs-28 sarmat missil dub satan 2 replac ss-18 f...  


# SVM with countvectorizer


In [16]:
# Step 1: Vectorization using CountVectorizer
MAX_FEATURES = 5000  # Number of features to consider
vectorizer = CountVectorizer(max_features=MAX_FEATURES)

# Convert text to feature vectors
X = vectorizer.fit_transform(data['processed_full_content']).toarray()
y = data['label']  # Target labels (binary classification: 0 or 1)

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the SVM Model
# svm = SVC(kernel='linear', random_state=42)
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)

# Step 4: Make Predictions and Evaluate
y_pred = svm.predict(X_test)

# Evaluate model performance
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)

Confusion Matrix:
 [[6705  299]
 [ 303 5465]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9568    0.9573    0.9570      7004
           1     0.9481    0.9475    0.9478      5768

    accuracy                         0.9529     12772
   macro avg     0.9524    0.9524    0.9524     12772
weighted avg     0.9529    0.9529    0.9529     12772


Macro Average F1-Score: 0.9524171294036754


# SVM with tf-idf instead of CountVectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF instead of CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X = tfidf_vectorizer.fit_transform(data['processed_full_content']).toarray()
y = data['label']  # Target labels (binary classification: 0 or 1)

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the SVM Model
# svm = SVC(kernel='linear', random_state=42)
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)

# Step 4: Make Predictions and Evaluate
y_pred = svm.predict(X_test)

# Evaluate model performance
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6784  220]
 [ 403 5365]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9439    0.9686    0.9561      7004
           1     0.9606    0.9301    0.9451      5768

    accuracy                         0.9512     12772
   macro avg     0.9523    0.9494    0.9506     12772
weighted avg     0.9515    0.9512    0.9511     12772


Macro Average F1-Score: 0.9506117863026156


# SVM with glove

In [21]:
import numpy as np

def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load pre-trained GloVe vectors
def get_embedding_matrix(data, embeddings_index, embed_dim=300):
    embedding_matrix = []
    for text in data:
        words = text.split()
        embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
        if embeddings:
            document_embedding = np.mean(embeddings, axis=0)  # Average embeddings for the document
        else:
            document_embedding = np.zeros(embed_dim)  # Zero vector if no embeddings found
        embedding_matrix.append(document_embedding)
    return np.array(embedding_matrix)

# Load GloVe vectors
glove_path = r"c:\Users\Admin\Downloads\glove.6B\glove.6B.300d.txt"
embeddings_index = load_glove(glove_path)

# Convert text data to GloVe embeddings
X_train = get_embedding_matrix(data['processed_full_content'], embeddings_index)
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2, random_state=42)

svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Evaluate model performance
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6180  824]
 [ 593 5175]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9124    0.8824    0.8971      7004
           1     0.8626    0.8972    0.8796      5768

    accuracy                         0.8891     12772
   macro avg     0.8875    0.8898    0.8884     12772
weighted avg     0.8900    0.8891    0.8892     12772


Macro Average F1-Score: 0.8883629509041846


# SVM with TF-IDF and glove

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Get TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data['processed_full_content'])

# Get GloVe embeddings
X_glove = get_embedding_matrix(data['processed_full_content'], embeddings_index)

# Combine GloVe and TF-IDF features
X_combined = hstack((X_tfidf, StandardScaler().fit_transform(X_glove))).toarray()

# Split and train model
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6725  279]
 [ 461 5307]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9358    0.9602    0.9479      7004
           1     0.9501    0.9201    0.9348      5768

    accuracy                         0.9421     12772
   macro avg     0.9430    0.9401    0.9413     12772
weighted avg     0.9423    0.9421    0.9420     12772


Macro Average F1-Score: 0.9413376651927967


In [None]:
import numpy as np
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack

# Step 1: Load pre-trained Word2Vec embeddings
def load_word2vec(file_path):
    return KeyedVectors.load_word2vec_format(file_path, binary=True)

# Load Word2Vec vectors
word2vec_path = r"c:\Users\Admin\Downloads\GoogleNews-vectors-negative300.bin"  # Example path
word2vec = load_word2vec(word2vec_path)

# Step 2: Convert each document to Word2Vec embeddings by averaging word vectors
def get_word2vec_embedding(text, word2vec, embed_dim=300):
    words = text.split()
    embeddings = [word2vec[word] for word in words if word in word2vec.key_to_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate Word2Vec embeddings for the entire dataset
X_word2vec = np.array([get_word2vec_embedding(text, word2vec) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Combine Word2Vec embeddings and CountVectorizer features
X_combined = hstack((X_count, X_word2vec))

# Step 5: Train-test split
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Step 6: Train a classifier (e.g., SVM) on the combined features
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Step 7: Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


# SVM with glove + countVectoriser

In [23]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import hstack

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = r"c:\Users\Admin\Downloads\glove.6B\glove.6B.300d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=300):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Combine GloVe embeddings and CountVectorizer features
# Use hstack to combine sparse CountVectorizer matrix with dense GloVe embeddings
X_combined = hstack((X_count, X_glove))

# Step 5: Train-test split
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Step 6: Train a classifier (e.g., SVM) on the combined features
svm = SGDClassifier(loss='hinge', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Step 7: Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("\nMacro Average F1-Score:", macro_f1)


Confusion Matrix:
 [[6688  316]
 [ 251 5517]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9638    0.9549    0.9593      7004
           1     0.9458    0.9565    0.9511      5768

    accuracy                         0.9556     12772
   macro avg     0.9548    0.9557    0.9552     12772
weighted avg     0.9557    0.9556    0.9556     12772


Macro Average F1-Score: 0.9552296680372214


# SVM with glove, count vectoriser and Stratified K-fold Cross Validation

In [28]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = r"c:\Users\Admin\Downloads\glove.6B\glove.6B.300d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=300):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Combine GloVe embeddings and CountVectorizer features
# Use hstack to combine sparse CountVectorizer matrix with dense GloVe embeddings
X_combined = hstack((X_count, X_glove))
X_combined = csr_matrix(X_combined)  # Convert to csr_matrix for subscriptable indexing

# Define labels
y = data['label']

# Step 5: Set up Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
fold = 1
f1_scores = []
all_classification_reports = []

for train_index, test_index in kf.split(X_combined, y):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the SVM model on the current fold
    svm = SGDClassifier(loss='hinge', random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    
    # Calculate F1-score for the current fold
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores.append(f1)
    classification_rep = classification_report(y_test, y_pred, digits=4, output_dict=True)
    all_classification_reports.append(classification_rep)
    
    # Print F1-score and classification report for the current fold
    print(f"Fold {fold} - F1 Score (weighted): {f1}")
    print(f"Fold {fold} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Fold {fold} - Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    fold += 1

# Step 6: Calculate and display average F1-score across all folds
avg_f1_score = np.mean(f1_scores)
print("\nAverage F1 Score (weighted) across all folds:", avg_f1_score)

# Calculate the average classification report across folds
avg_classification_report = {}
for key in all_classification_reports[0].keys():
    if isinstance(all_classification_reports[0][key], dict):
        avg_classification_report[key] = {metric: np.mean([report[key][metric] for report in all_classification_reports]) 
                                          for metric in all_classification_reports[0][key]}
    else:
        avg_classification_report[key] = np.mean([report[key] for report in all_classification_reports])

print("\nAverage Classification Report across all folds:")
for label, metrics in avg_classification_report.items():
    if isinstance(metrics, dict):
        print(f"{label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
    else:
        print(f"{label}: {metrics:.4f}")


Fold 1 - F1 Score (weighted): 0.9540576271559605
Fold 1 - Confusion Matrix:
 [[11101   492]
 [  486  9208]]
Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9581    0.9576    0.9578     11593
           1     0.9493    0.9499    0.9496      9694

    accuracy                         0.9541     21287
   macro avg     0.9537    0.9537    0.9537     21287
weighted avg     0.9541    0.9541    0.9541     21287

Fold 2 - F1 Score (weighted): 0.9518794502764808
Fold 2 - Confusion Matrix:
 [[11119   474]
 [  550  9144]]
Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9529    0.9591    0.9560     11593
           1     0.9507    0.9433    0.9470      9694

    accuracy                         0.9519     21287
   macro avg     0.9518    0.9512    0.9515     21287
weighted avg     0.9519    0.9519    0.9519     21287

Fold 3 - F1 Score (weighted): 0.9522081012321895
Fold 3 - Confus

In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = r"c:\Users\Admin\Downloads\glove.6B\glove.6B.300d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=300):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Standardize GloVe embeddings and combine with CountVectorizer features
scaler = StandardScaler()
X_glove_scaled = scaler.fit_transform(X_glove)

# Use hstack to combine sparse CountVectorizer matrix with dense (scaled) GloVe embeddings
X_combined = hstack((X_count, X_glove_scaled))
X_combined = csr_matrix(X_combined)  # Convert to csr_matrix for subscriptable indexing

# Define labels
y = data['label']

# Step 5: Set up Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
f1_scores = []
all_classification_reports = []

for train_index, test_index in kf.split(X_combined, y):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the SVM model with L2 regularization (penalty='l2')
    svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    
    # Calculate F1-score for the current fold
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores.append(f1)
    classification_rep = classification_report(y_test, y_pred, digits=4, output_dict=True)
    all_classification_reports.append(classification_rep)
    
    # Print F1-score and classification report for the current fold
    print(f"Fold {fold} - F1 Score (weighted): {f1}")
    print(f"Fold {fold} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Fold {fold} - Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    fold += 1

# Step 6: Calculate and display average F1-score across all folds
avg_f1_score = np.mean(f1_scores)
print("\nAverage F1 Score (weighted) across all folds:", avg_f1_score)

# Calculate the average classification report across folds
avg_classification_report = {}
for key in all_classification_reports[0].keys():
    if isinstance(all_classification_reports[0][key], dict):
        avg_classification_report[key] = {metric: np.mean([report[key][metric] for report in all_classification_reports]) 
                                          for metric in all_classification_reports[0][key]}
    else:
        avg_classification_report[key] = np.mean([report[key] for report in all_classification_reports])

print("\nAverage Classification Report across all folds:")
for label, metrics in avg_classification_report.items():
    if isinstance(metrics, dict):
        print(f"{label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
    else:
        print(f"{label}: {metrics:.4f}")


Fold 1 - F1 Score (weighted): 0.954830207313335
Fold 1 - Confusion Matrix:
 [[6655  300]
 [ 277 5540]]
Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9600    0.9569    0.9585      6955
           1     0.9486    0.9524    0.9505      5817

    accuracy                         0.9548     12772
   macro avg     0.9543    0.9546    0.9545     12772
weighted avg     0.9548    0.9548    0.9548     12772

Fold 2 - F1 Score (weighted): 0.9519500861173588
Fold 2 - Confusion Matrix:
 [[6611  345]
 [ 269 5547]]
Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9609    0.9504    0.9556      6956
           1     0.9414    0.9537    0.9476      5816

    accuracy                         0.9519     12772
   macro avg     0.9512    0.9521    0.9516     12772
weighted avg     0.9520    0.9519    0.9520     12772

Fold 3 - F1 Score (weighted): 0.9556160302718006
Fold 3 - Confusion Matri

# SVM with Count Vectoriser, Glove, Stratified K-fold Cross-validation, L2 Regularisation, Gridsearch hyperparameter tuning

In [30]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

# Step 1: Load pre-trained GloVe embeddings
def load_glove(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load GloVe vectors
glove_path = r"c:\Users\Admin\Downloads\glove.6B\glove.6B.300d.txt"
embeddings_index = load_glove(glove_path)

# Step 2: Convert each document to GloVe embeddings by averaging word vectors
def get_glove_embedding(text, embeddings_index, embed_dim=300):
    words = text.split()
    embeddings = [embeddings_index[word] for word in words if word in embeddings_index]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embed_dim)

# Generate GloVe embeddings for the entire dataset
X_glove = np.array([get_glove_embedding(text, embeddings_index) for text in data['processed_full_content']])

# Step 3: Generate CountVectorizer features
vectorizer = CountVectorizer(max_features=5000)
X_count = vectorizer.fit_transform(data['processed_full_content'])

# Step 4: Standardize GloVe embeddings and combine with CountVectorizer features
scaler = StandardScaler()
X_glove_scaled = scaler.fit_transform(X_glove)

# Use hstack to combine sparse CountVectorizer matrix with dense (scaled) GloVe embeddings
X_combined = hstack((X_count, X_glove_scaled))
X_combined = csr_matrix(X_combined)  # Convert to csr_matrix for subscriptable indexing

# Define labels
y = data['label']

# Step 5: Set up Stratified K-Fold Cross-Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
f1_scores = []
all_classification_reports = []

# Step 6: Define parameter grid for GridSearchCV
param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l2'],
    'loss': ['hinge', 'log'],  # 'hinge' for SVM, 'log' for logistic regression
    'max_iter': [1000, 2000]
}

# Step 7: Iterate over each fold in cross-validation
for train_index, test_index in kf.split(X_combined, y):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X_combined[train_index], X_combined[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Set up GridSearchCV with the SGDClassifier and parameter grid
    svm = SGDClassifier(random_state=42)
    grid_search = GridSearchCV(svm, param_grid, scoring='f1_weighted', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model from GridSearchCV
    best_svm = grid_search.best_estimator_
    y_pred = best_svm.predict(X_test)
    
    # Calculate F1-score for the current fold
    f1 = f1_score(y_test, y_pred, average='weighted')
    f1_scores.append(f1)
    classification_rep = classification_report(y_test, y_pred, digits=4, output_dict=True)
    all_classification_reports.append(classification_rep)
    
    # Print F1-score, best parameters, and classification report for the current fold
    print(f"Fold {fold} - F1 Score (weighted): {f1}")
    print(f"Best Parameters for Fold {fold}: {grid_search.best_params_}")
    print(f"Fold {fold} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(f"Fold {fold} - Classification Report:\n", classification_report(y_test, y_pred, digits=4))
    fold += 1

# Step 8: Calculate and display average F1-score across all folds
avg_f1_score = np.mean(f1_scores)
print("\nAverage F1 Score (weighted) across all folds:", avg_f1_score)

# Calculate the average classification report across folds
avg_classification_report = {}
for key in all_classification_reports[0].keys():
    if isinstance(all_classification_reports[0][key], dict):
        avg_classification_report[key] = {metric: np.mean([report[key][metric] for report in all_classification_reports]) 
                                          for metric in all_classification_reports[0][key]}
    else:
        avg_classification_report[key] = np.mean([report[key] for report in all_classification_reports])

print("\nAverage Classification Report across all folds:")
for label, metrics in avg_classification_report.items():
    if isinstance(metrics, dict):
        print(f"{label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")
    else:
        print(f"{label}: {metrics:.4f}")


18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\s

Fold 1 - F1 Score (weighted): 0.9617665341623912
Best Parameters for Fold 1: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 1 - Confusion Matrix:
 [[6754  201]
 [ 287 5530]]
Fold 1 - Classification Report:
               precision    recall  f1-score   support

           0     0.9592    0.9711    0.9651      6955
           1     0.9649    0.9507    0.9577      5817

    accuracy                         0.9618     12772
   macro avg     0.9621    0.9609    0.9614     12772
weighted avg     0.9618    0.9618    0.9618     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\s

Fold 2 - F1 Score (weighted): 0.9575828834614227
Best Parameters for Fold 2: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 2 - Confusion Matrix:
 [[6771  185]
 [ 356 5460]]
Fold 2 - Classification Report:
               precision    recall  f1-score   support

           0     0.9500    0.9734    0.9616      6956
           1     0.9672    0.9388    0.9528      5816

    accuracy                         0.9576     12772
   macro avg     0.9586    0.9561    0.9572     12772
weighted avg     0.9579    0.9576    0.9576     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\s

Fold 3 - F1 Score (weighted): 0.9631268049126205
Best Parameters for Fold 3: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 3 - Confusion Matrix:
 [[6712  244]
 [ 227 5589]]
Fold 3 - Classification Report:
               precision    recall  f1-score   support

           0     0.9673    0.9649    0.9661      6956
           1     0.9582    0.9610    0.9596      5816

    accuracy                         0.9631     12772
   macro avg     0.9627    0.9629    0.9628     12772
weighted avg     0.9631    0.9631    0.9631     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\s

Fold 4 - F1 Score (weighted): 0.9633483014265164
Best Parameters for Fold 4: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 4 - Confusion Matrix:
 [[6739  217]
 [ 251 5565]]
Fold 4 - Classification Report:
               precision    recall  f1-score   support

           0     0.9641    0.9688    0.9664      6956
           1     0.9625    0.9568    0.9596      5816

    accuracy                         0.9634     12772
   macro avg     0.9633    0.9628    0.9630     12772
weighted avg     0.9634    0.9634    0.9633     12772



18 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\s

Fold 5 - F1 Score (weighted): 0.9624713138481806
Best Parameters for Fold 5: {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Fold 5 - Confusion Matrix:
 [[6760  196]
 [ 283 5533]]
Fold 5 - Classification Report:
               precision    recall  f1-score   support

           0     0.9598    0.9718    0.9658      6956
           1     0.9658    0.9513    0.9585      5816

    accuracy                         0.9625     12772
   macro avg     0.9628    0.9616    0.9621     12772
weighted avg     0.9625    0.9625    0.9625     12772


Average F1 Score (weighted) across all folds: 0.9616591675622264

Average Classification Report across all folds:
0:
  precision: 0.9601
  recall: 0.9700
  f1-score: 0.9650
  support: 6955.8000
1:
  precision: 0.9637
  recall: 0.9517
  f1-score: 0.9577
  support: 5816.2000
accuracy: 0.9617
macro avg:
  precision: 0.9619
  recall: 0.9609
  f1-score: 0.9613
  support: 12772.0000
weighted avg:
  precision: 0.9617
  recall: 0.9617
  f1-sco