In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# One-time setup: fetch the NLTK corpora that the stop-word list and the
# lemmatizer rely on
nltk.download('stopwords')
nltk.download('wordnet')

# Read the input CSV into a DataFrame
df = pd.read_csv("/content/final_preprocessed_dataset.csv")

# Contact-center filler words to discard in addition to NLTK's English stop words
extra_stops = {"please", "kindly", "hello", "hi", "thank", "thanks", "sir", "madam", ""}

# NLP tools shared by the preprocessing function below
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) | extra_stops

# Matches every character that is neither a lowercase ASCII letter nor
# whitespace. Compiled once because preprocess_summary runs once per row.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


def preprocess_summary(text: object) -> str:
    """Normalize one summary into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, replace every character other than ``a-z``
    and whitespace with a space, split on whitespace, drop tokens found in
    the module-level ``stop_words`` set, lemmatize the remaining tokens with
    the module-level ``lemmatizer`` and join them with single spaces.

    Args:
        text: The raw summary. Missing values (``None``/``NaN``) are
            accepted, and other non-string scalars are converted with
            ``str()`` instead of failing on ``.lower()``.

    Returns:
        The cleaned summary, or ``""`` when ``text`` is missing or no
        tokens survive the filtering.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Numbers and other scalars have no .lower(); work on their string form.
        text = str(text)

    letters_only = _NON_LETTER_RE.sub(' ', text.lower())
    # Stop words are removed before lemmatization, so they are matched
    # against the surface form of each token.
    kept = (word for word in letters_only.split() if word not in stop_words)
    return " ".join(lemmatizer.lemmatize(word) for word in kept)

# Apply preprocessing.
# Missing summaries are replaced with "" *before* the string cast: casting
# NaN with astype(str) yields the literal text "nan", which would then be
# tokenized and stored as a real word in the new column.
df["Summary_Preprocessed"] = (
    df["Summary"].fillna("").astype(str).apply(preprocess_summary)
)

# Save with new column (index=False keeps the row index out of the CSV)
df.to_csv("final_preprocessed_dataset_updated.csv", index=False)
print("✅ New file created: final_preprocessed_dataset_updated.csv with Summary_Preprocessed column")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


✅ New file created: final_preprocessed_dataset_updated.csv with Summary_Preprocessed column
