# GloVe Embedding
- Uses the pre-trained `glove.6B.100d.txt` vectors (100 dimensions)

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load the cleaned data

In [2]:
# Load the pre-cleaned tweets.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will only run where this exact location exists.
df = pd.read_csv('C:\\group-1-main\\Data-Preprocessing\\cleaned_data.csv')

# Fill missing tweets BEFORE casting to str: astype(str) turns NaN into the
# literal text 'nan', after which fillna('') has nothing left to replace and
# the word "nan" would be tokenized as if it were tweet content.
df['tweet'] = df['tweet'].fillna('').astype(str)

### Filter and Encode Labels

In [3]:
# Single source of truth for the class -> binary label encoding:
# classes 0 and 1 collapse to label 0, class 2 becomes label 1.
label_mapping = {0: 0, 1: 0, 2: 1}

# Keep only rows whose class has a mapping. The explicit .copy() makes the
# filtered frame independent of the original, so adding the 'label' column
# below does not assign onto a slice (no SettingWithCopyWarning).
df = df[df['class'].isin(list(label_mapping))].copy()
df['label'] = df['class'].map(label_mapping)

In [4]:
# Model inputs (tweet text) and targets (binary label) as plain arrays
X, y = df['tweet'].values, df['label'].values

### Tokenization

In [5]:
# Vocabulary cap for the tokenizer and the fixed length every sequence is padded to
max_words, max_length = 10000, 100

# Learn the vocabulary from the tweets
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Encode each tweet as a list of integer token ids, then pad to max_length
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(sequences=X_seq, maxlen=max_length)

### Load GloVe Embeddings

In [6]:
def load_glove_embeddings(filepath, word_index, embedding_dim=100, max_words=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each non-blank line of the file is expected to hold a word followed by
    ``embedding_dim`` whitespace-separated numbers.

    Parameters
    ----------
    filepath : str
        Path to the GloVe vectors file (read as UTF-8 text).
    word_index : dict
        Mapping of word -> integer row index. The matrix gets
        ``len(word_index) + 1`` rows, so indices are expected to lie in
        ``1..len(word_index)``; row 0 is never written by this function.
    embedding_dim : int, default 100
        Number of values per vector, i.e. the number of matrix columns.
    max_words : int or None, default None
        When given, only words whose index is strictly below this value
        receive a vector; all other rows stay zero. ``None`` applies no cap.
        This is an explicit argument so the function does not depend on a
        module-level variable being defined.

    Returns
    -------
    numpy.ndarray of shape ``(len(word_index) + 1, embedding_dim)``
        Row ``i`` holds the vector of the word mapped to ``i``; rows for words
        not found in the file (or excluded by ``max_words``) are all zeros.

    Raises
    ------
    ValueError
        If a line for a word in ``word_index`` does not carry exactly
        ``embedding_dim`` values, or its values are not numeric.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    with open(filepath, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            # Fill the matrix while streaming the file: only vectors for words
            # that are actually in the vocabulary are parsed and kept, so the
            # whole file never has to be held in memory.
            i = word_index.get(values[0])
            if i is None or (max_words is not None and i >= max_words):
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"{filepath}, line {line_no}: expected {embedding_dim} "
                    f"values for word {values[0]!r}, found {len(values) - 1}"
                )
            # If a word appears more than once in the file, the last line wins.
            embedding_matrix[i] = np.asarray(values[1:], dtype='float32')
    return embedding_matrix

In [7]:
# Location of the pre-trained vectors and the number of values per vector
glove_filepath = 'glove.6B.100d.txt'
embedding_dim = 100

# Build the lookup table for the fitted tokenizer's vocabulary
embedding_matrix = load_glove_embeddings(
    filepath=glove_filepath,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

### Transform padded sequences into average GloVe embeddings

In [8]:
def get_average_glove_embeddings(X_pad, embedding_matrix):
    """Average the embedding rows of the non-padding tokens in each sequence.

    Parameters
    ----------
    X_pad : array-like of int, shape (n_samples, sequence_length)
        Token-index sequences. Index 0 is treated as padding and ignored.
    embedding_matrix : array-like, shape (n_rows, embedding_dim)
        Lookup table; row ``i`` is the vector for token index ``i``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, embedding_dim)
        Row ``i`` is the mean vector of sequence ``i``'s non-zero tokens.
        Sequences made up entirely of padding yield an all-zero row.
        Every non-zero token counts toward the mean, including tokens whose
        matrix row happens to be all zeros.
    """
    X_pad = np.asarray(X_pad)
    embedding_matrix = np.asarray(embedding_matrix)
    # Take the output width from the matrix itself rather than from a
    # module-level variable, so the function works with any matrix it is given.
    X_embeddings = np.zeros((X_pad.shape[0], embedding_matrix.shape[1]))
    for i, seq in enumerate(X_pad):
        token_ids = seq[seq != 0]
        if token_ids.size:
            # One fancy-index lookup per row instead of a Python list of vectors.
            X_embeddings[i] = embedding_matrix[token_ids].mean(axis=0)
    return X_embeddings

In [9]:
# Collapse every padded sequence into a single averaged GloVe vector
X_embeddings = get_average_glove_embeddings(X_pad, embedding_matrix)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y, test_size=0.2, random_state=42
)

### Sample data modeling (Random Forest Classifier)

In [10]:
# Build and fit a 100-tree random forest on the training embeddings (seeded)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(
    y_test,
    y_pred,
    target_names=['Hate Speech', 'Non-Hate Speech'],
    digits=4,
)

# Emit accuracy, weighted F1 and the per-class report, one item per line
print(
    f"Accuracy: {accuracy}",
    f"F1 Score: {f1}",
    f"Classification Report:\n{report}",
    sep="\n",
)


Accuracy: 0.9017550938067379
F1 Score: 0.8916686415906198
Classification Report:
                 precision    recall  f1-score   support

    Hate Speech     0.9085    0.9806    0.9432      4122
Non-Hate Speech     0.8425    0.5126    0.6374       835

       accuracy                         0.9018      4957
      macro avg     0.8755    0.7466    0.7903      4957
   weighted avg     0.8974    0.9018    0.8917      4957

