In [44]:
import pandas as pd
import numpy as np
import nltk


In [45]:
df=pd.read_csv('Reddit_final.csv')
df

Unnamed: 0,comment,hate_speech
0,subject regard hungarian oh boy brace lived bu...,1
1,iii just got work found and ground main type g...,0
2,wow guess cowboy every country,0
3,owen benjamin cowboy song go every country ama,0
4,all hear sun mean live small town rn for work ...,0
...,...,...
22241,op stop forgot post video next time hard,1
22242,minut long video top hate champagne go need ge...,0
22243,clue whos celeb are point time need get along ...,1
22244,didn ’ t insult you insult me,0


### Lemmatization

In [46]:
df['comment']=df['comment'].astype('str')

#### Lemmatization unlike stemming reduces inflected words properly ensuring that the root word belongs to the language. In the lemmatization root word is called lemma.

In [47]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
df['lemmatized_comment'] = df['comment'].apply(lambda x: ''.join([lemmatizer.lemmatize(word) for word in x]))

df.head()

Unnamed: 0,comment,hate_speech,lemmatized_comment
0,subject regard hungarian oh boy brace lived bu...,1,subject regard hungarian oh boy brace lived bu...
1,iii just got work found and ground main type g...,0,iii just got work found and ground main type g...
2,wow guess cowboy every country,0,wow guess cowboy every country
3,owen benjamin cowboy song go every country ama,0,owen benjamin cowboy song go every country ama
4,all hear sun mean live small town rn for work ...,0,all hear sun mean live small town rn for work ...


### Tokenization

#### Breaking the text into sentences and words, we understand context of the text and we can also the find the topic of the text and many more.

### various tokenization techniques

### Using NLTK

##### 1.Applying word tokenization

In [48]:
#Word Tokenization with NLTK - Handles punctuation and contractions and Suitable for general text processing.
from nltk.tokenize import word_tokenize

# Define tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())

# Apply tokenization to the DataFrame
df['words'] = df['lemmatized_comment'].apply(tokenize_text)
df

Unnamed: 0,comment,hate_speech,lemmatized_comment,words
0,subject regard hungarian oh boy brace lived bu...,1,subject regard hungarian oh boy brace lived bu...,"[subject, regard, hungarian, oh, boy, brace, l..."
1,iii just got work found and ground main type g...,0,iii just got work found and ground main type g...,"[iii, just, got, work, found, and, ground, mai..."
2,wow guess cowboy every country,0,wow guess cowboy every country,"[wow, guess, cowboy, every, country]"
3,owen benjamin cowboy song go every country ama,0,owen benjamin cowboy song go every country ama,"[owen, benjamin, cowboy, song, go, every, coun..."
4,all hear sun mean live small town rn for work ...,0,all hear sun mean live small town rn for work ...,"[all, hear, sun, mean, live, small, town, rn, ..."
...,...,...,...,...
22241,op stop forgot post video next time hard,1,op stop forgot post video next time hard,"[op, stop, forgot, post, video, next, time, hard]"
22242,minut long video top hate champagne go need ge...,0,minut long video top hate champagne go need ge...,"[minut, long, video, top, hate, champagne, go,..."
22243,clue whos celeb are point time need get along ...,1,clue whos celeb are point time need get along ...,"[clue, whos, celeb, are, point, time, need, ge..."
22244,didn ’ t insult you insult me,0,didn ’ t insult you insult me,"[didn, ’, t, insult, you, insult, me]"


##### 2.Applying sentence tokenization

In [49]:
from nltk import sent_tokenize
def tokenize_text(text):
    return sent_tokenize(text.lower())

# Apply tokenization to the DataFrame
df['sentences'] = df['lemmatized_comment'].apply(tokenize_text)
df

Unnamed: 0,comment,hate_speech,lemmatized_comment,words,sentences
0,subject regard hungarian oh boy brace lived bu...,1,subject regard hungarian oh boy brace lived bu...,"[subject, regard, hungarian, oh, boy, brace, l...",[subject regard hungarian oh boy brace lived b...
1,iii just got work found and ground main type g...,0,iii just got work found and ground main type g...,"[iii, just, got, work, found, and, ground, mai...",[iii just got work found and ground main type ...
2,wow guess cowboy every country,0,wow guess cowboy every country,"[wow, guess, cowboy, every, country]",[wow guess cowboy every country]
3,owen benjamin cowboy song go every country ama,0,owen benjamin cowboy song go every country ama,"[owen, benjamin, cowboy, song, go, every, coun...",[owen benjamin cowboy song go every country ama]
4,all hear sun mean live small town rn for work ...,0,all hear sun mean live small town rn for work ...,"[all, hear, sun, mean, live, small, town, rn, ...",[all hear sun mean live small town rn for work...
...,...,...,...,...,...
22241,op stop forgot post video next time hard,1,op stop forgot post video next time hard,"[op, stop, forgot, post, video, next, time, hard]",[op stop forgot post video next time hard]
22242,minut long video top hate champagne go need ge...,0,minut long video top hate champagne go need ge...,"[minut, long, video, top, hate, champagne, go,...",[minut long video top hate champagne go need g...
22243,clue whos celeb are point time need get along ...,1,clue whos celeb are point time need get along ...,"[clue, whos, celeb, are, point, time, need, ge...",[clue whos celeb are point time need get along...
22244,didn ’ t insult you insult me,0,didn ’ t insult you insult me,"[didn, ’, t, insult, you, insult, me]",[didn ’ t insult you insult me]


### Using Spacy

In [50]:
#SpaCy Tokenization - A robust tokenizer that handles punctuation, contractions, and multi-word expressions.
#(Handles a wide variety of text and Good for syntactic and semantic analysis.)

import spacy

nlp = spacy.blank("en")
df['tokens'] = df['lemmatized_comment'].apply(lambda x: [token.text.lower() for token in nlp(x)])
df

Unnamed: 0,comment,hate_speech,lemmatized_comment,words,sentences,tokens
0,subject regard hungarian oh boy brace lived bu...,1,subject regard hungarian oh boy brace lived bu...,"[subject, regard, hungarian, oh, boy, brace, l...",[subject regard hungarian oh boy brace lived b...,"[subject, regard, hungarian, oh, boy, brace, l..."
1,iii just got work found and ground main type g...,0,iii just got work found and ground main type g...,"[iii, just, got, work, found, and, ground, mai...",[iii just got work found and ground main type ...,"[iii, just, got, work, found, and, ground, mai..."
2,wow guess cowboy every country,0,wow guess cowboy every country,"[wow, guess, cowboy, every, country]",[wow guess cowboy every country],"[wow, guess, cowboy, every, country]"
3,owen benjamin cowboy song go every country ama,0,owen benjamin cowboy song go every country ama,"[owen, benjamin, cowboy, song, go, every, coun...",[owen benjamin cowboy song go every country ama],"[owen, benjamin, cowboy, song, go, every, coun..."
4,all hear sun mean live small town rn for work ...,0,all hear sun mean live small town rn for work ...,"[all, hear, sun, mean, live, small, town, rn, ...",[all hear sun mean live small town rn for work...,"[all, hear, sun, mean, live, small, town, rn, ..."
...,...,...,...,...,...,...
22241,op stop forgot post video next time hard,1,op stop forgot post video next time hard,"[op, stop, forgot, post, video, next, time, hard]",[op stop forgot post video next time hard],"[op, stop, forgot, post, video, next, time, hard]"
22242,minut long video top hate champagne go need ge...,0,minut long video top hate champagne go need ge...,"[minut, long, video, top, hate, champagne, go,...",[minut long video top hate champagne go need g...,"[minut, long, video, top, hate, champagne, go,..."
22243,clue whos celeb are point time need get along ...,1,clue whos celeb are point time need get along ...,"[clue, whos, celeb, are, point, time, need, ge...",[clue whos celeb are point time need get along...,"[clue, who, s, celeb, are, point, time, need, ..."
22244,didn ’ t insult you insult me,0,didn ’ t insult you insult me,"[didn, ’, t, insult, you, insult, me]",[didn ’ t insult you insult me],"[didn, ’, t, insult, you, insult, me]"


In [51]:
df=df.drop(['sentences','words','comment'],axis=1)

In [52]:
df

Unnamed: 0,hate_speech,lemmatized_comment,tokens
0,1,subject regard hungarian oh boy brace lived bu...,"[subject, regard, hungarian, oh, boy, brace, l..."
1,0,iii just got work found and ground main type g...,"[iii, just, got, work, found, and, ground, mai..."
2,0,wow guess cowboy every country,"[wow, guess, cowboy, every, country]"
3,0,owen benjamin cowboy song go every country ama,"[owen, benjamin, cowboy, song, go, every, coun..."
4,0,all hear sun mean live small town rn for work ...,"[all, hear, sun, mean, live, small, town, rn, ..."
...,...,...,...
22241,1,op stop forgot post video next time hard,"[op, stop, forgot, post, video, next, time, hard]"
22242,0,minut long video top hate champagne go need ge...,"[minut, long, video, top, hate, champagne, go,..."
22243,1,clue whos celeb are point time need get along ...,"[clue, who, s, celeb, are, point, time, need, ..."
22244,0,didn ’ t insult you insult me,"[didn, ’, t, insult, you, insult, me]"


In [53]:
df.to_csv('Reddit_Tokenization.csv', index=False)
print("File saved")

File saved
