In [1]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``, failing fast if absent.

    A bare ``os.getenv`` returns ``None`` for an unset variable; wrapping that
    in ``str(...)`` at the read/write sites would silently produce the literal
    path ``"None"``. Raising here surfaces the misconfiguration before any
    expensive work starts.

    Args:
        name: Name of the environment variable to look up.

    Returns:
        str: The non-empty value of the variable.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Locations of the raw CSVs, the preprocessed output CSVs, and the test labels.
TRAIN_DATA_PATH = _require_env("TRAIN_DATA_PATH")
TEST_DATA_PATH = _require_env("TEST_DATA_PATH")
PRE_TRAIN_DATA_PATH = _require_env("PRE_TRAIN_DATA_PATH")
PRE_TEST_DATA_PATH = _require_env("PRE_TEST_DATA_PATH")
TEST_LABELS_PATH = _require_env("TEST_LABELS_PATH")


In [11]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import spacy
from tqdm import tqdm

In [12]:
# Fetch the NLTK resources used below: the punkt tokenizer data (both the
# 'punkt_tab' and 'punkt' packages) and the stopword lists.
for _resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(_resource)

# spaCy's small English pipeline.
# NOTE(review): in the visible cells only `token.lemma_` is read from this
# pipeline — confirm whether every loaded component is actually needed.
nlp = spacy.load('en_core_web_sm')

# English stopwords held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ronakpanchal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ronakpanchal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ronakpanchal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def clean_text(text):
    """Normalise raw text into lowercase, letters-only, single-spaced form.

    Steps, in order: lowercase; remove URLs (runs starting with ``http`` or
    ``www``); remove e-mail-like tokens; remove HTML tags; remove every
    character that is not an ASCII letter or whitespace; collapse whitespace
    runs and trim the ends.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            how pandas represents empty CSV cells) yield ``''``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # Missing values would otherwise crash on ``.lower()``; ``x != x`` is the
    # dependency-free NaN test.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    # Characters are deleted, not replaced by a space, so "a,b" becomes "ab".
    text = re.sub(r"[^a-zA-Z\s]", '', text)  # Remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [14]:
def tokenize_lemmatize(text):
    """Tokenise ``text``, drop stopwords, and return the space-joined lemmas.

    The text is split with NLTK's ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are discarded; the survivors are re-joined
    with single spaces and run through the module-level spaCy pipeline
    ``nlp``, whose per-token ``lemma_`` values are joined into the result.

    Args:
        text: The text to process.

    Returns:
        str: Lemmas separated by single spaces.
    """
    kept = []
    for candidate in word_tokenize(text):
        if candidate not in stop_words:
            kept.append(candidate)

    parsed = nlp(' '.join(kept))
    lemmas = (tok.lemma_ for tok in parsed)
    return ' '.join(lemmas)

In [15]:
def preprocess_dataframe(df, text_column='comment_text'):
    """Return a new frame with ``clean_text`` and ``lemmatized`` columns added.

    ``clean_text`` holds ``clean_text(value)`` for each value of
    ``text_column``; ``lemmatized`` holds ``tokenize_lemmatize`` applied to
    each cleaned value. Both passes display a tqdm progress bar.

    The input frame is not modified: the result is built with
    ``DataFrame.assign``, so the caller's raw frame stays intact and re-running
    the calling cell gives the same result.

    Args:
        df: Source DataFrame containing ``text_column``.
        text_column: Name of the column holding the raw text.

    Returns:
        pandas.DataFrame: A new frame with all columns of ``df`` plus
        ``clean_text`` and ``lemmatized``.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    tqdm.pandas()  # registers ``progress_apply`` on pandas objects
    cleaned = df[text_column].progress_apply(clean_text)
    lemmatized = cleaned.progress_apply(tokenize_lemmatize)
    # ``assign`` returns a new DataFrame rather than writing into ``df``.
    return df.assign(clean_text=cleaned, lemmatized=lemmatized)

In [16]:
# Read the raw train and test CSVs from their configured locations
# (train first, then test).
train_df, test_df = (
    pd.read_csv(str(csv_path)) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

# Run the cleaning + lemmatisation pass over each frame, in the same order.
preprocessed_train_df, preprocessed_test_df = map(
    preprocess_dataframe, (train_df, test_df)
)

100%|██████████| 159571/159571 [00:04<00:00, 32678.36it/s]
100%|██████████| 159571/159571 [05:40<00:00, 468.20it/s]
100%|██████████| 153164/153164 [00:04<00:00, 35779.50it/s]
100%|██████████| 153164/153164 [05:06<00:00, 500.23it/s]


In [17]:
# Load the test labels and keep only rows whose 'toxic' value is not -1
# (presumably -1 marks rows without a usable label — confirm against the data source).
test_labels = pd.read_csv(str(TEST_LABELS_PATH))
test_labels = test_labels.loc[test_labels['toxic'].ne(-1)]

# Join the labels onto the preprocessed test rows by 'id' (pandas' default
# inner join, so test rows without a retained label are dropped).
full_test = pd.merge(preprocessed_test_df, test_labels, on='id')

In [18]:
# Write the preprocessed train frame and the labelled test frame to their
# configured output paths, leaving the DataFrame index out of the CSV.
for _frame, _destination in (
    (preprocessed_train_df, PRE_TRAIN_DATA_PATH),
    (full_test, PRE_TEST_DATA_PATH),
):
    _frame.to_csv(str(_destination), index=False)