In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


In [2]:
# Read the preprocessed comment dataset (file is latin-1 encoded) and preview the first rows.
data = pd.read_csv(
    'Preprocessed_Final_dataset.csv',
    encoding='latin1',
)
data.head(n=5)

Unnamed: 0,comment,label,clean_text,tokens,text
0,dalits are lowlives,N,dalits are lowlives,"['dalits', 'lowlives']",dalits lowlives
1,gay people are a burden to our society,N,gay people are a burden to our society,"['gay', 'people', 'burden', 'society']",gay people burden society
2,Arabs are not welcome,N,arabs are not welcome,"['arab', 'welcome']",arab welcome
3,I'm not saying we should actually eliminate he...,N,i am not saying we should actually eliminate h...,"['say', 'actually', 'eliminate', 'heebs', 'wis...",say actually eliminate heebs wish naturally be...
4,bananas are for black people,N,bananas are for black people,"['bananas', 'black', 'people']",bananas black people


# TF-IDF Vectorizer

### TF-IDF (Term Frequency–Inverse Document Frequency) scores a term so that its importance is inversely related to its frequency across documents. TF gives us information on how often a term appears in a document, and IDF gives us information about the relative rarity of a term in the collection of documents. By multiplying these two values together we get the final TF-IDF value. The higher the TF-IDF score, the more important or relevant the term is; as a term gets less relevant, its TF-IDF score approaches 0.

In [4]:
# Fit a TF-IDF model on the cleaned `text` column and vectorise it in one pass;
# the vocabulary is capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(data.loc[:, 'text'])

# Label Encoding:

### LabelEncoder converts the categorical labels into numeric labels. This is useful for classification tasks where the target variable needs to be in numerical format.

In [5]:
# Turn the categorical `label` column into integer class codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

# Densify the TF-IDF matrix into a DataFrame (one column per vocabulary term)
# and attach the encoded target as a `label` column.
tfidf_df = pd.DataFrame(
    data=X.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_df['label'] = y

print(tfidf_df.head())

In [7]:
# Persist the TF-IDF features together with the encoded `label` column.
# The frame to export is `tfidf_df`; writing `data` here would only re-save the
# preprocessed input under a new name and drop the computed features entirely.
tfidf_df.to_csv('tf_idf.csv', index=False)

# Random Undersampling

### Random Undersampling is a technique used to address class imbalance by randomly removing samples from the majority class to achieve a balanced dataset. This method is simple and effective but may result in the loss of important information from the majority class.

In [15]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the TF-IDF matrix and encoded labels; the seed is fixed
# so the selection is reproducible.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Rebuild labelled frames from the resampled arrays.
feature_names = tfidf_vectorizer.get_feature_names_out()
X_resampled_df = pd.DataFrame(X_resampled.toarray(), columns=feature_names)
y_resampled_df = pd.DataFrame({'label': y_resampled})

# Put features and target side by side, then map the integer codes back to the
# original label names.
balanced_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)
balanced_df['label'] = label_encoder.inverse_transform(balanced_df['label'])

### Balanced Class Distribution Using Random Undersampling

In [16]:
# Count rows per class label in the balanced frame and print the tally.
class_counts = balanced_df['label'].value_counts()
print(class_counts)

label
N    18944
P    18944
Name: count, dtype: int64
