#### Data Preprocessing (continues from 02_eda_v1.ipynb)

In [29]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [30]:
# Load the interim (v1) preprocessed Reddit comments into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../data/interim/reddit_preprocessed_v1.csv')

In [31]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them th...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [32]:
# Row-level cleaning written as a chain that returns a fresh frame, so the cell
# can be re-run safely instead of mutating `df` in place.
# NOTE(review): drop_duplicates() compares every column. If the CSV carries a
# saved index column (e.g. 'Unnamed: 0'), duplicate comments will never be
# detected — confirm the column set and pass `subset=` if needed.
df = df.dropna().drop_duplicates()

# Discard comments that are empty or whitespace-only. The explicit .copy()
# makes the result an independent frame, so the later assignment to
# df['clean_comment'] does not trigger pandas' SettingWithCopyWarning.
df = df[df['clean_comment'].str.strip() != ''].copy()

In [33]:
# Define the preprocessing function
# Words that are kept even when they appear in the English stop-word list,
# because they carry negation/contrast signal for sentiment analysis.
_SENTIMENT_STOPWORD_EXCEPTIONS = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Patterns are compiled once instead of being looked up on every call.
_NEWLINE_RE = re.compile(r'\n')
_DISALLOWED_CHARS_RE = re.compile(r'[^A-Za-z0-9\s!?.,]')

# Lazily-filled cache for the stop-word set and the lemmatizer. Building them
# per call means reloading the stop-word list once per row under `.apply`.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use only."""
    if not _NLP_RESOURCES:
        stop_words = (
            frozenset(stopwords.words('english')) - _SENTIMENT_STOPWORD_EXCEPTIONS
        )
        lemmatizer = WordNetLemmatizer()
        # Populate in one step so a failure above never leaves a half-filled cache.
        _NLP_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_comment(comment):
    """Normalise a single comment string for sentiment analysis.

    Steps, in order:
      1. lowercase the text and strip leading/trailing whitespace;
      2. replace newline characters with spaces;
      3. delete every character that is not an ASCII letter, a digit,
         whitespace, or one of ``! ? . ,``;
      4. drop stop words, except the sentiment-bearing ones listed in
         ``_SENTIMENT_STOPWORD_EXCEPTIONS``;
      5. lemmatize each remaining token.

    Tokens are produced by splitting on whitespace only, so punctuation stays
    attached to its word (``"good,"``) and such a token is compared against the
    stop-word set as-is.

    Args:
        comment: The comment text (``str``).

    Returns:
        The cleaned comment, tokens joined by single spaces. This is an empty
        string when no token survives stop-word removal.
    """
    text = comment.lower().strip()
    text = _NEWLINE_RE.sub(' ', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)

    stop_words, lemmatizer = _get_nlp_resources()

    # Single pass: filter stop words and lemmatize the survivors in order.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )

In [34]:
# Apply the preprocessing function to the 'clean_comment' column
df['clean_comment'] = df['clean_comment'].apply(preprocess_comment)

# Stop-word removal can reduce a comment to an empty string (every word was a
# stop word). Such rows carry no text and an empty CSV field is read back as
# NaN by default, so re-apply the empty-comment filter after preprocessing.
df = df[df['clean_comment'].str.strip() != ''].copy()

In [35]:
# Sanity check: number of missing values in the cleaned text column.
df['clean_comment'].isna().sum()

np.int64(0)

In [36]:
# Write the cleaned frame to the processed-data folder, omitting the row index.
# Alternative interim target (disabled):
# df.to_csv('../data/interim/reddit_preprocessed_v2.csv', index=False)
df.to_csv(path_or_buf='../../data/processed/reddit_clean_final.csv', index=False)

In [38]:
# Read the saved file back with NA detection switched off, so text that looks
# like a missing-value marker is loaded as the literal string that was saved.
df1 = pd.read_csv(
    filepath_or_buffer='../../data/processed/reddit_clean_final.csv',
    na_filter=False,
    keep_default_na=False,
)


In [39]:
# Sanity check: number of missing values in the text column after the reload.
df1['clean_comment'].isna().sum()

np.int64(0)

"""
IMPORTANT NOTE — WHY WE USE keep_default_na=False & na_filter=False WHEN LOADING CSV

Problem:
--------
After preprocessing, df['clean_comment'] had 0 nulls.
However, after saving to CSV and reloading, exactly 123 nulls appeared again.

Root Cause:
-----------
Pandas automatically interprets certain strings as missing values
(even if they are legitimate text in our dataset).

By default, these strings (among others) turn INTO NaN when reading CSV.
Matching is case-sensitive and applies to the whole field:

    "", "NA", "N/A", "n/a", "NULL", "null", "NaN", "nan", "None", "#N/A", "<NA>"

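In our cleaned dataset (lower-cased by preprocess_comment), a whole comment can
end up being exactly one of these values, for example:
    "nan" or "null"
A comment reduced to an empty string "" (every word removed as a stop word)
is read back as NaN as well.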

Even though the cleaned DataFrame had no NaNs before saving, 
Pandas converted these values into NaN on reload.

Solution:
---------
Disable Pandas’ default NA detection when loading the CSV:

    keep_default_na=False  → don't convert special strings to NaN
    na_filter=False        → don't detect missing values at all

This ensures the text is preserved EXACTLY as saved.

Usage:
------
df.to_csv(..., quoting=1, encoding='utf-8')   # save safely
df = pd.read_csv(..., keep_default_na=False, na_filter=False)  # load safely

This guarantees that NO unintended NaNs appear again after reloading.
"""


Next, we will perform EDA on the cleaned data in 04_eda_v2.ipynb.