In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Reddit_final.csv')
df.head()

Unnamed: 0,comment,hate_speech
0,A subsection of retarded Hungarians? Oh boy. b...,1
1,Iii. Just got off work. Foundation and groundi...,0
2,wow i guess cowboys are the same in every country,0
3,Owen Benjamin's cowboy song goes for every cou...,0
4,"> ""y'all hear sun?"" by all means I live in a s...",0


## Tokenization

In [3]:
#Word Tokenization with NLTK - Handles punctuation and contractions and Suitable for general text processing.

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')

# Define tokenization function
def tokenize_text(text):
    if isinstance(text, str):  # Check if the input is a string
        return word_tokenize(text.lower())
    else:
        return []  # Return an empty list for NaN or non-string inputs


# Apply tokenization to the DataFrame
df['tokens'] = df['comment'].apply(tokenize_text)
print(df[['comment', 'tokens']])

[nltk_data] Downloading package punkt to C:\Users\Aman Shekhar
[nltk_data]     Sachan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                 comment  \
0      A subsection of retarded Hungarians? Oh boy. b...   
1      Iii. Just got off work. Foundation and groundi...   
2      wow i guess cowboys are the same in every country   
3      Owen Benjamin's cowboy song goes for every cou...   
4      > "y'all hear sun?" by all means I live in a s...   
...                                                  ...   
22241  Of, stop being a forgot and post videos next t...   
22242  In this minute long video, Top Hate and Champa...   
22243  No clue whos these e-celebs are, but at this p...   
22244      I didn’t insult you, why would you insult me?   
22245                      Because you are living a lie.   

                                                  tokens  
0      [a, subsection, of, retarded, hungarians, ?, o...  
1      [iii, ., just, got, off, work, ., foundation, ...  
2      [wow, i, guess, cowboys, are, the, same, in, e...  
3      [owen, benjamin, 's, cowboy, song, g

In [4]:
#SpaCy Tokenization - A robust tokenizer that handles punctuation, contractions, and multi-word expressions.
#(Handles a wide variety of text and Good for syntactic and semantic analysis.)

# import spacy

# nlp = spacy.load("en_core_web_sm")
# df['tokens_spacy'] = df['comment'].apply(lambda x: [token.text.lower() for token in nlp(x)])
# print(df[['comment', 'tokens_spacy']])


In [5]:
df.head()

Unnamed: 0,comment,hate_speech,tokens
0,A subsection of retarded Hungarians? Oh boy. b...,1,"[a, subsection, of, retarded, hungarians, ?, o..."
1,Iii. Just got off work. Foundation and groundi...,0,"[iii, ., just, got, off, work, ., foundation, ..."
2,wow i guess cowboys are the same in every country,0,"[wow, i, guess, cowboys, are, the, same, in, e..."
3,Owen Benjamin's cowboy song goes for every cou...,0,"[owen, benjamin, 's, cowboy, song, goes, for, ..."
4,"> ""y'all hear sun?"" by all means I live in a s...",0,"[>, ``, y'all, hear, sun, ?, '', by, all, mean..."


In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# Train a Word2Vec model
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get Word2Vec vectors for each text
def get_word2vec_vectors(tokens, model, vector_size):
    vector = np.zeros(vector_size)
    valid_words = 0
    for word in tokens:
        if word in model.wv:
            vector += model.wv[word]
            valid_words += 1
    if valid_words > 0:
        vector /= valid_words
    return vector

# Apply Word2Vec to the dataset
df['word2vec_vector'] = df['tokens'].apply(lambda x: get_word2vec_vectors(x, word2vec_model, 100))

# Convert Word2Vec features to a DataFrame
word2vec_df = pd.DataFrame(df['word2vec_vector'].tolist())

print(word2vec_df.head())


  "class": algorithms.Blowfish,


         0         1         2         3         4         5         6   \
0  0.024228  0.034538  0.151526 -0.717572  0.319267 -0.452068  0.599014   
1  0.018594 -0.081036  0.187420 -0.153866  0.022831 -0.505655  0.717692   
2 -0.219919  0.477358  0.167302 -0.117339 -0.040331 -0.375824  0.592844   
3  0.040825  0.386046 -0.061500 -0.695316 -0.329573 -0.827267  0.688353   
4  0.034304 -0.045680  0.143956 -0.466570  0.125248 -0.486454  0.548949   

         7         8         9   ...        90        91        92        93  \
0  0.984125  0.147259 -0.725343  ...  0.189234  0.317745  0.492391 -0.221271   
1  1.110163 -0.101841 -0.857093  ...  0.437212  0.499844  0.348613  0.057537   
2  1.092306  0.042508 -0.698886  ...  0.390546  0.403835  0.225338 -0.839457   
3  0.763487 -0.125009 -0.865352  ...  0.544859  0.823666  0.188625 -0.394568   
4  0.957444 -0.202432 -0.814244  ...  0.673100  0.399568  0.155341 -0.135279   

         94        95        96        97        98        99  
0  0

In [7]:
df.head()

Unnamed: 0,comment,hate_speech,tokens,word2vec_vector
0,A subsection of retarded Hungarians? Oh boy. b...,1,"[a, subsection, of, retarded, hungarians, ?, o...","[0.024227744409942936, 0.03453819876825758, 0...."
1,Iii. Just got off work. Foundation and groundi...,0,"[iii, ., just, got, off, work, ., foundation, ...","[0.018594301187099434, -0.08103627108810645, 0..."
2,wow i guess cowboys are the same in every country,0,"[wow, i, guess, cowboys, are, the, same, in, e...","[-0.21991944704204797, 0.4773578126973007, 0.1..."
3,Owen Benjamin's cowboy song goes for every cou...,0,"[owen, benjamin, 's, cowboy, song, goes, for, ...","[0.04082503143904938, 0.38604630845495397, -0...."
4,"> ""y'all hear sun?"" by all means I live in a s...",0,"[>, ``, y'all, hear, sun, ?, '', by, all, mean...","[0.03430420139797924, -0.04568030201488922, 0...."


In [8]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Join tokens into space-separated strings
df['text'] = df['tokens'].apply(lambda x: ' '.join(x))

# Prefix labels with '__label__'
df['label'] = '__label__' + df['hate_speech'].astype(str)

# Split data into training and testing sets
train_data, test_data = train_test_split(df[['label', 'text']], test_size=0.2, random_state=42)

# Save to files
train_data[['label', 'text']].to_csv('train.txt', sep=' ', header=None, index=None)
test_data[['label', 'text']].to_csv('test.txt', sep=' ', header=None, index=None)

# Train a FastText supervised model
model = fasttext.train_supervised(input='train.txt', epoch=10, lr=0.1, wordNgrams=2, bucket=200000, dim=100, loss='ova')



In [9]:
df

Unnamed: 0,comment,hate_speech,tokens,word2vec_vector,text,label
0,A subsection of retarded Hungarians? Oh boy. b...,1,"[a, subsection, of, retarded, hungarians, ?, o...","[0.024227744409942936, 0.03453819876825758, 0....",a subsection of retarded hungarians ? oh boy ....,__label__1
1,Iii. Just got off work. Foundation and groundi...,0,"[iii, ., just, got, off, work, ., foundation, ...","[0.018594301187099434, -0.08103627108810645, 0...",iii . just got off work . foundation and groun...,__label__0
2,wow i guess cowboys are the same in every country,0,"[wow, i, guess, cowboys, are, the, same, in, e...","[-0.21991944704204797, 0.4773578126973007, 0.1...",wow i guess cowboys are the same in every country,__label__0
3,Owen Benjamin's cowboy song goes for every cou...,0,"[owen, benjamin, 's, cowboy, song, goes, for, ...","[0.04082503143904938, 0.38604630845495397, -0....",owen benjamin 's cowboy song goes for every co...,__label__0
4,"> ""y'all hear sun?"" by all means I live in a s...",0,"[>, ``, y'all, hear, sun, ?, '', by, all, mean...","[0.03430420139797924, -0.04568030201488922, 0....",> `` y'all hear sun ? '' by all means i live i...,__label__0
...,...,...,...,...,...,...
22241,"Of, stop being a forgot and post videos next t...",1,"[of, ,, stop, being, a, forgot, and, post, vid...","[0.0032317287781659294, 0.018679809044389165, ...","of , stop being a forgot and post videos next ...",__label__1
22242,"In this minute long video, Top Hate and Champa...",0,"[in, this, minute, long, video, ,, top, hate, ...","[-0.15791856837940627, 0.2616135396127557, 0.1...","in this minute long video , top hate and champ...",__label__0
22243,"No clue whos these e-celebs are, but at this p...",1,"[no, clue, whos, these, e-celebs, are, ,, but,...","[-0.00767811543376344, 0.16418187655281366, -0...","no clue whos these e-celebs are , but at this ...",__label__1
22244,"I didn’t insult you, why would you insult me?",0,"[i, didn, ’, t, insult, you, ,, why, would, yo...","[0.26191420738513654, 0.5319719864771917, 0.13...","i didn ’ t insult you , why would you insult me ?",__label__0


In [10]:
# Evaluate the model
def evaluate_model(model, test_file):
    with open(test_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    y_true = []
    y_pred = []
    for line in lines:
        label, text = line.strip().split(' ', 1)
        y_true.append(int(label.replace('__label__', '')))
        pred_label, _ = model.predict(text)
        y_pred.append(int(pred_label[0].replace('__label__', '')))

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')

# Call the evaluate_model function
evaluate_model(model, 'test.txt')


Accuracy: 0.8733
Precision: 0.8590
Recall: 0.5879
F1-score: 0.6981
