# Importing Libraries

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading the Dataset

In [None]:
# Read the raw hate-speech CSV; latin1 decoding is requested explicitly.
data = pd.read_csv(
    'hatspeech dataset.csv',
    encoding='latin1',
)

In [None]:
# Bare expression: in a notebook cell this renders the loaded DataFrame as the cell output.
data

# Exploring the data

In [None]:
# Preview the first five rows of the dataset.
print(data.head(5))

In [None]:
# Summarize column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a trailing "None" line).
data.info()

# Checking the distribution of labels

In [None]:
# Count how many rows carry each label value.
label_counts = data['label'].value_counts()
print(label_counts)

In [None]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

# Checking for Duplicates

In [None]:
# Boolean mask marking rows whose 'comment' repeats an earlier row
# (the first occurrence of each comment is not flagged).
duplicates = data.duplicated(subset=['comment'], keep='first')

In [None]:
# Bare expression: in a notebook cell this renders the duplicate mask as the cell output.
duplicates

# Importing necessary nltk packages

In [None]:
import nltk

# Fetch the NLTK resources used below:
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases look up 'punkt_tab'
#                        instead of 'punkt'; older releases only know 'punkt')
#   stopwords         -> English stop-word list
#   wordnet / omw-1.4 -> WordNetLemmatizer (some NLTK versions also need omw-1.4)
# nltk.download() reports an unknown resource id and returns False rather than
# raising, so requesting both old and new names is safe across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Text Cleaning

In [None]:
def clean_text(text):
    """Normalize a raw comment for tokenization.

    The text is lowercased, every run of non-word characters (punctuation,
    symbols, whitespace) is collapsed into a single space, and leading/trailing
    spaces are removed.

    Args:
        text: The raw comment. Missing values (``None`` or float NaN) yield
            an empty string; other non-string values are converted with
            ``str()`` first.

    Returns:
        The cleaned, lowercase string.
    """
    if not isinstance(text, str):
        # Missing cells read by pandas arrive as float NaN, which has no
        # .lower(); NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Whitespace is itself a non-word character, so a single `\W+` pass both
    # replaces punctuation and collapses repeated spaces.
    text = re.sub(r'\W+', ' ', text.lower())
    # Drop the space left behind by leading/trailing punctuation.
    return text.strip()

In [None]:
# Clean every comment element-wise and store the result in a new column.
data['clean_text'] = data['comment'].map(clean_text)

In [None]:
# Bare expression: in a notebook cell this renders the cleaned-text column as the cell output.
data['clean_text']

# Tokenization

Tokenization is a way of separating a piece of text into smaller units called tokens. Here, tokens can be words, characters, or subwords. For example, word-tokenizing the sentence “I love ice cream” results in four tokens: “I”, “love”, “ice”, and “cream”. It is a fundamental step in natural language processing and text analysis tasks.

In [None]:
from nltk.tokenize import word_tokenize

# Split each cleaned comment into a list of word tokens.
data['tokens'] = data['clean_text'].map(word_tokenize)

In [None]:
# Bare expression: in a notebook cell this renders the token lists as the cell output.
data['tokens']

# Removing Stopwords

Stop words are words that occur very frequently in a document, such as ‘a’, ‘an’, ‘the’, ‘is’, ‘was’, ‘will’, ‘would’, etc. They provide little meaningful information, especially when building a text classification model. Therefore, we remove stop words from our dataset.

In [None]:
# Keep the English stop words in a set so membership tests are O(1).
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The order of the remaining tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


In [None]:
# Replace each token list with its stop-word-free version.
data['tokens'] = data['tokens'].map(remove_stopwords)

In [None]:
# Show the token lists after stop-word removal.
filtered_tokens = data['tokens']
print(filtered_tokens)

# Lemmatization

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item. Lemmatization is similar to stemming, but it brings context to the words, so it links words with similar meanings to one word. The practical distinction between stemming and lemmatization is that, where stemming merely removes common suffixes from the end of word tokens, lemmatization ensures the output word is an existing normalized form of the word (i.e., a lemma) that can be found in the dictionary.

In [None]:
# One shared lemmatizer instance is reused for every row.
lemmatizer = WordNetLemmatizer()


def lemmatize_words(tokens):
    """Return a list with each token replaced by its lemma.

    Each token is passed to ``lemmatizer.lemmatize`` with that method's
    default arguments; order and length of the list are preserved.
    """
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatize every token list and keep the result in its own column.
data['lemmatized_tokens'] = data['tokens'].map(lemmatize_words)

In [None]:
# Bare expression: in a notebook cell this renders the lemmatized token lists as the cell output.
data['lemmatized_tokens']

In [None]:
# Persist the DataFrame (including the derived text/token columns) without the index.
preprocessed_path = 'hatespeech_Preprocessed.csv'
data.to_csv(preprocessed_path, index=False)
print("File saved")

# Vectorization (using TF-IDF)

TF-IDF (term frequency–inverse document frequency) scores a term by how often it appears in a document, discounted by how common the term is across all documents. TF gives us information on how often a term appears in a document, and IDF gives us information about the relative rarity of a term in the collection of documents. By multiplying these values together we get the final TF-IDF value. The higher the TF-IDF score, the more important or relevant the term is; as a term gets less relevant, its TF-IDF score approaches 0.

In [None]:
# Keep only the 1000 highest-ranked terms as features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Vectorize the fully preprocessed text. The earlier version fed
# data['clean_text'] to the vectorizer, which silently discarded the stop-word
# removal and lemmatization computed above. Re-join each lemmatized token list
# into a space-separated string so the vectorizer sees the preprocessed words.
preprocessed_text = data['lemmatized_tokens'].apply(' '.join)
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so test documents influence the vocabulary and IDF
# weights; consider fitting on the training split only.
X = tfidf_vectorizer.fit_transform(preprocessed_text).toarray()
y = data['label']

# Splitting Data

In [None]:
# train_test_split was called without ever being imported, which raised a
# NameError; import it here so the cell is runnable on its own.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Class Balancing using SMOTE

In [None]:
# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled

# Checking the class distribution after balancing

In [None]:
# Compare label counts of the training split before and after resampling.
counts_before = y_train.value_counts()
counts_after = y_train_balanced.value_counts()
print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)

# Saving the balanced data

In [None]:
# Write the resampled training features (one column per TF-IDF term) together
# with their labels to disk.
feature_names = tfidf_vectorizer.get_feature_names_out()
balanced_data = pd.DataFrame(X_train_balanced, columns=feature_names)
balanced_data['label'] = y_train_balanced
balanced_data.to_csv('balanced_hate_comment.csv', index=False)