### Data cleaning and preprocessing are done directly here because each step is already covered separately, line by line, in
### the DataCleaning and EDA notebook

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Function for preprocessing text data
def preprocess_text(df, text_column):
    """Clean a text column and derive token, stem and lemma columns.

    The frame is modified in place and also returned. ``text_column`` is
    overwritten with the lowercased text after removing every character that
    is not an ASCII letter, digit or whitespace, then standalone numbers,
    then English stopwords. Four columns are added:

    * ``token`` -- ``word_tokenize`` output for the cleaned text
    * ``stemmed_token`` -- Porter stems of ``token``
    * ``lemmatized_token`` -- WordNet lemmas of ``stemmed_token``
    * ``cleaned_text`` -- ``lemmatized_token`` joined with single spaces

    Missing values in ``text_column`` are treated as empty strings and
    non-string values are converted with ``str()``, so a stray NaN no longer
    raises ``TypeError`` inside the regex step.

    Args:
        df: DataFrame holding the text to clean.
        text_column: Name of the column in ``df`` that contains the text.

    Returns:
        The same ``df`` object, with ``text_column`` rewritten and the four
        columns above added.
    """
    special_chars = re.compile(r'[^a-zA-Z0-9\s]')
    standalone_numbers = re.compile(r'\b\d+\b')

    # Fetch the NLTK resources up front (same order as before) so a missing
    # resource fails before any per-row work is done.
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def clean(text):
        # Order matters: punctuation is stripped before the number pass, so
        # "3.5" becomes "35" and is then removed as a standalone number.
        text = special_chars.sub('', text.lower())
        text = standalone_numbers.sub('', text)
        return ' '.join(word for word in text.split() if word not in stop_words)

    # fillna/astype guard against NaN and non-string cells, which would
    # otherwise reach the regex calls as floats.
    df[text_column] = df[text_column].fillna('').astype(str).apply(clean)

    df['token'] = df[text_column].apply(word_tokenize)
    df['stemmed_token'] = df['token'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )
    # Lemmatization runs on the stems, not on the raw tokens.
    df['lemmatized_token'] = df['stemmed_token'].apply(
        lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
    )
    df['cleaned_text'] = df['lemmatized_token'].apply(' '.join)

    return df

# Read the raw CSV (decoded as latin1) and run the cleaning pipeline on its
# 'comment' column in one step.
data = preprocess_text(pd.read_csv("hate.csv", encoding='latin1'), 'comment')


[nltk_data] Downloading package stopwords to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Prayas
[nltk_data]     jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Train-Test Split

In [2]:
# Display the preprocessed DataFrame as the cell's output.
data

Unnamed: 0.1,Unnamed: 0,comment,label,token,stemmed_token,lemmatized_token,cleaned_text
0,1,dalits lowlives,N,"[dalits, lowlives]","[dalit, lowliv]","[dalit, lowliv]",dalit lowliv
1,2,gay people burden society,N,"[gay, people, burden, society]","[gay, peopl, burden, societi]","[gay, peopl, burden, societi]",gay peopl burden societi
2,3,arabs welcome,N,"[arabs, welcome]","[arab, welcom]","[arab, welcom]",arab welcom
3,4,im saying actually eliminate heebs wish natura...,N,"[im, saying, actually, eliminate, heebs, wish,...","[im, say, actual, elimin, heeb, wish, natur, b...","[im, say, actual, elimin, heeb, wish, natur, b...",im say actual elimin heeb wish natur becam ext...
4,5,bananas black people,N,"[bananas, black, people]","[banana, black, peopl]","[banana, black, peopl]",banana black peopl
...,...,...,...,...,...,...,...
41139,117100,indian culture backwards,N,"[indian, culture, backwards]","[indian, cultur, backward]","[indian, cultur, backward]",indian cultur backward
41140,118100,women dont want equality want charge,N,"[women, dont, want, equality, want, charge]","[women, dont, want, equal, want, charg]","[woman, dont, want, equal, want, charg]",woman dont want equal want charg
41141,119100,fuck covid,P,"[fuck, covid]","[fuck, covid]","[fuck, covid]",fuck covid
41142,1205,computer garbage,P,"[computer, garbage]","[comput, garbag]","[comput, garbag]",comput garbag


In [3]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned text; the target is the label column.
X, y = data['cleaned_text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
for _caption, _split in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_caption, _split.shape)


X_train shape: (32915,)
X_test shape: (8229,)
y_train shape: (32915,)
y_test shape: (8229,)
