In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
import spacy
from tqdm import tqdm  # For progress bar
from multiprocessing import Pool, cpu_count

In [2]:
# Read the raw comment data from CSV, decoding the file as Latin-1.
file_path = 'hatspeech dataset.csv'
dataset = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')

In [3]:
# Instantiate the spaCy pipeline that turns each comment into a Doc.
nlp = spacy.load(name='en_core_web_md')

In [4]:
# Step 1: Preprocess the Text Data
def preprocess_text(text):
    """Lowercase and tokenize a comment, keeping only alphabetic tokens.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for an empty CSV cell)
            is treated as an empty comment.

    Returns:
        list[str]: The lowercased tokens for which ``str.isalpha()`` is true;
        tokens containing digits or punctuation are dropped. An empty list is
        returned for non-string input.
    """
    # Missing values have no ``.lower()``; without this guard a single empty
    # cell would abort the whole ``Series.apply`` call with AttributeError.
    if not isinstance(text, str):
        return []
    return [token for token in word_tokenize(text.lower()) if token.isalpha()]

In [5]:
# Tokenize every comment and keep the result in a 'tokens' column.
dataset['tokens'] = dataset['comment'].map(preprocess_text)

In [6]:
# Step 2: Collect the vector of each spaCy document
def encode_comments_spacy(docs):
    """Return the ``vector`` attribute of every item in ``docs``, in order.

    Args:
        docs: An iterable of objects exposing a ``.vector`` attribute
            (spaCy ``Doc`` objects in this notebook).

    Returns:
        list: One vector per input document; empty when ``docs`` is empty.
    """
    return [document.vector for document in docs]

In [None]:
# Run every comment through the spaCy pipeline, producing one Doc per row.
# Missing comments (NaN) are replaced by empty strings first, because nlp.pipe
# expects text; this keeps `docs` the same length and order as `dataset`.
# The pipe is wrapped in tqdm so this long-running cell reports its progress.
docs = list(
    tqdm(
        nlp.pipe(dataset['comment'].fillna('').astype(str)),
        total=len(dataset),
        desc='spaCy pipeline',
    )
)

In [8]:
# Extract the vector of every Doc in `docs` with a single call
# (no batching or caching is performed here).
# NOTE(review): `vector_cache` is not read by any cell visible in this
# notebook excerpt -- confirm it is unused before removing it.
vector_cache = {}
encoded_vectors = encode_comments_spacy(docs)

In [9]:
# Store each comment's vector in a 'vector' column. `encoded_vectors` is a
# plain list, so it is assigned by position and must contain exactly one
# entry per row of `dataset`.
dataset['vector'] = encoded_vectors

In [10]:
# Preview the first rows: original text next to the derived columns.
print(dataset.loc[:, ['comment', 'tokens', 'vector']].head())

                                             comment  \
0                                dalits are lowlives   
1             gay people are a burden to our society   
2                              Arabs are not welcome   
3  I'm not saying we should actually eliminate he...   
4                       bananas are for black people   

                                              tokens  \
0                            [dalits, are, lowlives]   
1    [gay, people, are, a, burden, to, our, society]   
2                         [arabs, are, not, welcome]   
3  [i, not, saying, we, should, actually, elimina...   
4                 [bananas, are, for, black, people]   

                                              vector  
0  [-3.6505, 0.021166643, 0.48216668, 3.0177667, ...  
1  [-1.6461616, 4.3466787, -5.2230763, 0.6333113,...  
2  [-3.1942499, 2.0221825, -1.8137375, 2.40047, 1...  
3  [-1.0117264, 1.699589, -2.4988325, -2.4018066,...  
4  [-4.457378, -0.899094, -4.03162, 2.803622, 2.2...  

In [11]:
# Write the whole frame to CSV without the index column, then confirm on stdout.
# NOTE(review): list/array-valued cells are written in their string form --
# confirm that downstream readers of this file parse them back as intended.
dataset.to_csv(path_or_buf='word2vec encoded.csv', index=False)
print("File saved")

File saved
