In [1]:
import pandas as pd

In [2]:
# Read the labelled Reddit comments and preview the first rows
source_csv = 'Reddit_final.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,comment,hate_speech
0,A subsection of retarded Hungarians? Oh boy. b...,1
1,Iii. Just got off work. Foundation and groundi...,0
2,wow i guess cowboys are the same in every country,0
3,Owen Benjamin's cowboy song goes for every cou...,0
4,"> ""y'all hear sun?"" by all means I live in a s...",0


## Tokenization

In [3]:
# Word tokenization with NLTK: handles punctuation and contractions, and is suitable for general text processing.

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK data: the 'punkt' resource
# (presumably what word_tokenize relies on — confirm for the installed NLTK version)
nltk.download('punkt')

# Define tokenization function
def tokenize_text(text):
    """Lower-case ``text`` and split it into NLTK word tokens.

    Any value that is not a ``str`` (for example a NaN standing in for a
    missing comment) produces an empty list instead of being tokenized.
    """
    # Guard clause: bail out early for NaN or other non-string inputs
    if not isinstance(text, str):
        return []
    lowered = text.lower()
    return word_tokenize(lowered)


# Tokenize every comment and show the raw text next to its token list
df['tokens'] = df['comment'].map(tokenize_text)
print(df.loc[:, ['comment', 'tokens']])

[nltk_data] Downloading package punkt to C:\Users\Aman Shekhar
[nltk_data]     Sachan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                 comment  \
0      A subsection of retarded Hungarians? Oh boy. b...   
1      Iii. Just got off work. Foundation and groundi...   
2      wow i guess cowboys are the same in every country   
3      Owen Benjamin's cowboy song goes for every cou...   
4      > "y'all hear sun?" by all means I live in a s...   
...                                                  ...   
22241  Of, stop being a forgot and post videos next t...   
22242  In this minute long video, Top Hate and Champa...   
22243  No clue whos these e-celebs are, but at this p...   
22244      I didn’t insult you, why would you insult me?   
22245                      Because you are living a lie.   

                                                  tokens  
0      [a, subsection, of, retarded, hungarians, ?, o...  
1      [iii, ., just, got, off, work, ., foundation, ...  
2      [wow, i, guess, cowboys, are, the, same, in, e...  
3      [owen, benjamin, 's, cowboy, song, g

In [4]:
# spaCy tokenization: a robust tokenizer that handles punctuation, contractions, and multi-word expressions.
# (Handles a wide variety of text and is well suited to syntactic and semantic analysis.)

import spacy

# Load the small English pipeline once; only its tokenizer is needed below.
nlp = spacy.load("en_core_web_sm")


def tokenize_spacy(text):
    """Return the lower-cased spaCy token texts of ``text``.

    Args:
        text: Raw comment. Non-string values (e.g. NaN for a missing comment)
            are treated as having no tokens.

    Returns:
        list[str]: Lower-cased token texts, or ``[]`` for non-string input.
    """
    if not isinstance(text, str):
        # Same contract as tokenize_text: a missing comment yields no tokens
        # instead of making spaCy raise on a float NaN.
        return []
    # make_doc runs only the tokenizer. The tagger/parser/NER components are
    # skipped because just the token text is used here.
    return [token.text.lower() for token in nlp.make_doc(text)]


df['tokens_spacy'] = df['comment'].apply(tokenize_spacy)
print(df[['comment', 'tokens_spacy']])


                                                 comment  \
0      A subsection of retarded Hungarians? Oh boy. b...   
1      Iii. Just got off work. Foundation and groundi...   
2      wow i guess cowboys are the same in every country   
3      Owen Benjamin's cowboy song goes for every cou...   
4      > "y'all hear sun?" by all means I live in a s...   
...                                                  ...   
22241  Of, stop being a forgot and post videos next t...   
22242  In this minute long video, Top Hate and Champa...   
22243  No clue whos these e-celebs are, but at this p...   
22244      I didn’t insult you, why would you insult me?   
22245                      Because you are living a lie.   

                                            tokens_spacy  
0      [a, subsection, of, retarded, hungarians, ?, o...  
1      [iii, ., just, got, off, work, ., foundation, ...  
2      [wow, i, guess, cowboys, are, the, same, in, e...  
3      [owen, benjamin, 's, cowboy, song, g

In [5]:
# Preview the first rows now that both token columns have been added
df.head()

Unnamed: 0,comment,hate_speech,tokens,tokens_spacy
0,A subsection of retarded Hungarians? Oh boy. b...,1,"[a, subsection, of, retarded, hungarians, ?, o...","[a, subsection, of, retarded, hungarians, ?, o..."
1,Iii. Just got off work. Foundation and groundi...,0,"[iii, ., just, got, off, work, ., foundation, ...","[iii, ., just, got, off, work, ., foundation, ..."
2,wow i guess cowboys are the same in every country,0,"[wow, i, guess, cowboys, are, the, same, in, e...","[wow, i, guess, cowboys, are, the, same, in, e..."
3,Owen Benjamin's cowboy song goes for every cou...,0,"[owen, benjamin, 's, cowboy, song, goes, for, ...","[owen, benjamin, 's, cowboy, song, goes, for, ..."
4,"> ""y'all hear sun?"" by all means I live in a s...",0,"[>, ``, y'all, hear, sun, ?, '', by, all, mean...","[>, "", y', all, hear, sun, ?, "", by, all, mean..."


## Encoding

In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# Fit a Word2Vec model on the NLTK token lists; hyper-parameters are grouped in one place
w2v_settings = {'vector_size': 100, 'window': 5, 'min_count': 1, 'workers': 4}
word2vec_model = Word2Vec(sentences=df['tokens'], **w2v_settings)

# Function to get Word2Vec vectors for each text
def get_word2vec_vectors(tokens, model, vector_size):
    """Average the embeddings of the tokens that the model knows.

    Args:
        tokens: Iterable of token strings for one text.
        model: Object exposing ``.wv``, which supports ``in`` and ``[]`` lookups.
        vector_size: Length of the returned vector.

    Returns:
        numpy.ndarray: Mean of the vectors of all tokens found in ``model.wv``.
        Stays all zeros when no token is found.
    """
    embeddings = model.wv
    total = np.zeros(vector_size)
    hits = 0
    for token in tokens:
        # Tokens missing from the vocabulary do not contribute to the mean
        if token not in embeddings:
            continue
        total += embeddings[token]
        hits += 1
    if hits == 0:
        return total
    total /= hits
    return total

# Average the word vectors of each comment. The dimensionality is read from the
# trained model rather than repeated as a literal, so it cannot drift from the
# vector_size the model was trained with.
embedding_dim = word2vec_model.wv.vector_size
df['word2vec_vector'] = df['tokens'].apply(
    lambda x: get_word2vec_vectors(x, word2vec_model, embedding_dim)
)

# Expand the per-comment vectors into one numeric column per dimension
word2vec_df = pd.DataFrame(df['word2vec_vector'].tolist())

print(word2vec_df.head())


  "class": algorithms.Blowfish,


         0         1         2         3         4         5         6   \
0  0.363497 -0.220945  0.010517 -0.602767  0.172479 -0.504409  0.794537   
1  0.065042 -0.122197  0.100385 -0.005709  0.070777 -0.620618  0.681243   
2  0.292166  0.385734 -0.048033 -0.141083  0.212870 -0.512191  0.579481   
3  0.074742  0.083152 -0.081609 -0.397689 -0.137089 -1.063069  0.531439   
4  0.192634 -0.107493 -0.089676 -0.271427  0.183969 -0.576812  0.581196   

         7         8         9   ...        90        91        92        93  \
0  0.956250  0.095910 -0.555402  ...  0.367883  0.511329  0.360923  0.054399   
1  1.025234 -0.097676 -0.647134  ...  0.431777  0.739346  0.304107  0.122711   
2  0.932011 -0.035010 -0.343719  ...  0.612062  1.063374 -0.273927 -0.392330   
3  0.918282 -0.469783 -0.672046  ...  0.474360  0.966216  0.182148 -0.093319   
4  0.974797 -0.254152 -0.619714  ...  0.617659  0.673389 -0.008742  0.090960   

         94        95        96        97        98        99  
0  0

In [7]:
# Preview the first rows, now including the averaged Word2Vec vector column
df.head()

Unnamed: 0,comment,hate_speech,tokens,tokens_spacy,word2vec_vector
0,A subsection of retarded Hungarians? Oh boy. b...,1,"[a, subsection, of, retarded, hungarians, ?, o...","[a, subsection, of, retarded, hungarians, ?, o...","[0.3634971592671655, -0.2209453577143622, 0.01..."
1,Iii. Just got off work. Foundation and groundi...,0,"[iii, ., just, got, off, work, ., foundation, ...","[iii, ., just, got, off, work, ., foundation, ...","[0.06504157067821273, -0.1221966900651888, 0.1..."
2,wow i guess cowboys are the same in every country,0,"[wow, i, guess, cowboys, are, the, same, in, e...","[wow, i, guess, cowboys, are, the, same, in, e...","[0.29216628866270183, 0.3857342477887869, -0.0..."
3,Owen Benjamin's cowboy song goes for every cou...,0,"[owen, benjamin, 's, cowboy, song, goes, for, ...","[owen, benjamin, 's, cowboy, song, goes, for, ...","[0.07474208818489893, 0.08315160974032348, -0...."
4,"> ""y'all hear sun?"" by all means I live in a s...",0,"[>, ``, y'all, hear, sun, ?, '', by, all, mean...","[>, "", y', all, hear, sun, ?, "", by, all, mean...","[0.19263355600529572, -0.10749349491605141, -0..."


In [8]:
import pandas as pd
import re
import fasttext


# Preprocessing function
def preprocess_text(text):
    """Normalise a comment to lower-case ``a-z`` words separated by single spaces.

    Args:
        text: Raw comment. Non-string values (e.g. NaN for a missing comment)
            are treated as empty text.

    Returns:
        str: The lower-cased text with every character other than ``a-z`` and
        whitespace removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped. ``''`` for non-string input.
    """
    if not isinstance(text, str):
        # Same guard as tokenize_text: a missing comment becomes empty text
        # instead of raising AttributeError on .lower().
        return ''
    text = text.lower()
    # Removes digits, punctuation and any non-ASCII characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse tabs/newlines/repeated spaces into single spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Build the normalised text column from the raw comments
df['cleaned_comment'] = df['comment'].map(preprocess_text)


In [9]:
# Load the FastText model binary (path is relative to the working directory)
fasttext_model_file = 'cc.en.300.bin'
ft_model = fasttext.load_model(fasttext_model_file)

In [10]:
# Function to get FastText vector
def get_fasttext_vector(text, model):
    """Return ``model``'s sentence vector for ``text``.

    The text is split on whitespace and re-joined with single spaces before
    it is handed to ``model.get_sentence_vector``.
    """
    return model.get_sentence_vector(' '.join(text.split()))

In [11]:

# Encode each cleaned comment with FastText, storing the vector as a plain list
df['fasttext_vector'] = df['cleaned_comment'].apply(
    lambda x: get_fasttext_vector(x, ft_model).tolist()
)

# Show the cleaned text next to its FastText vector
print(df.loc[:, ['cleaned_comment', 'fasttext_vector']])


                                         cleaned_comment  \
0      a subsection of retarded hungarians oh boy bra...   
1      iii just got off work foundation and grounding...   
2      wow i guess cowboys are the same in every country   
3      owen benjamins cowboy song goes for every coun...   
4      yall hear sun by all means i live in a small t...   
...                                                  ...   
22241  of stop being a forgot and post videos next ti...   
22242  in this minute long video top hate and champag...   
22243  no clue whos these ecelebs are but at this poi...   
22244         i didnt insult you why would you insult me   
22245                       because you are living a lie   

                                         fasttext_vector  
0      [0.004770048428326845, -0.03953176364302635, -...  
1      [-0.012044071219861507, -0.011684201657772064,...  
2      [-0.0008766286191530526, 0.0192383024841547, 0...  
3      [0.00418263953179121, -0.00297893979

In [12]:
# Display the frame with every derived column (the notebook elides the middle rows)
df

Unnamed: 0,comment,hate_speech,tokens,tokens_spacy,word2vec_vector,cleaned_comment,fasttext_vector
0,A subsection of retarded Hungarians? Oh boy. b...,1,"[a, subsection, of, retarded, hungarians, ?, o...","[a, subsection, of, retarded, hungarians, ?, o...","[0.3634971592671655, -0.2209453577143622, 0.01...",a subsection of retarded hungarians oh boy bra...,"[0.004770048428326845, -0.03953176364302635, -..."
1,Iii. Just got off work. Foundation and groundi...,0,"[iii, ., just, got, off, work, ., foundation, ...","[iii, ., just, got, off, work, ., foundation, ...","[0.06504157067821273, -0.1221966900651888, 0.1...",iii just got off work foundation and grounding...,"[-0.012044071219861507, -0.011684201657772064,..."
2,wow i guess cowboys are the same in every country,0,"[wow, i, guess, cowboys, are, the, same, in, e...","[wow, i, guess, cowboys, are, the, same, in, e...","[0.29216628866270183, 0.3857342477887869, -0.0...",wow i guess cowboys are the same in every country,"[-0.0008766286191530526, 0.0192383024841547, 0..."
3,Owen Benjamin's cowboy song goes for every cou...,0,"[owen, benjamin, 's, cowboy, song, goes, for, ...","[owen, benjamin, 's, cowboy, song, goes, for, ...","[0.07474208818489893, 0.08315160974032348, -0....",owen benjamins cowboy song goes for every coun...,"[0.00418263953179121, -0.0029789397958666086, ..."
4,"> ""y'all hear sun?"" by all means I live in a s...",0,"[>, ``, y'all, hear, sun, ?, '', by, all, mean...","[>, "", y', all, hear, sun, ?, "", by, all, mean...","[0.19263355600529572, -0.10749349491605141, -0...",yall hear sun by all means i live in a small t...,"[-6.847345503047109e-05, 0.0004209601611364633..."
...,...,...,...,...,...,...,...
22241,"Of, stop being a forgot and post videos next t...",1,"[of, ,, stop, being, a, forgot, and, post, vid...","[of, ,, stop, being, a, forgot, and, post, vid...","[0.18222494916442564, -0.2874639831921634, 0.0...",of stop being a forgot and post videos next ti...,"[0.010218881070613861, 0.015606718137860298, 0..."
22242,"In this minute long video, Top Hate and Champa...",0,"[in, this, minute, long, video, ,, top, hate, ...","[in, this, minute, long, video, ,, top, hate, ...","[0.0843904949122526, 0.06925528072591486, -0.0...",in this minute long video top hate and champag...,"[0.004358708392828703, -0.006413696799427271, ..."
22243,"No clue whos these e-celebs are, but at this p...",1,"[no, clue, whos, these, e-celebs, are, ,, but,...","[no, clue, who, s, these, e, -, celebs, are, ,...","[0.26610068939683007, -0.041094351843817205, -...",no clue whos these ecelebs are but at this poi...,"[-0.0025172580499202013, -0.005129380617290735..."
22244,"I didn’t insult you, why would you insult me?",0,"[i, didn, ’, t, insult, you, ,, why, would, yo...","[i, did, n’t, insult, you, ,, why, would, you,...","[0.706557840681993, 0.4792756805053124, -0.129...",i didnt insult you why would you insult me,"[0.02669641375541687, -0.015254346653819084, 0..."


In [13]:
# Persist the enriched frame without writing the index.
# NOTE(review): several columns hold lists/arrays, which CSV stores as text —
# confirm downstream readers expect to parse them back.
tokenized_csv = 'RedditTokenized.csv'
df.to_csv(tokenized_csv, index=False)
print("File saved")

File saved
