In [4]:
# =============================================================================
# Cell 1: All Imports and NLTK Downloads
# =============================================================================
import pandas as pd
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from transformers import pipeline
import torch
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download every NLTK resource the pipeline relies on.
# NLTK 3.9 and later look up the "*_tab" / "*_eng" variants of the tokenizer,
# tagger and chunker data instead of the legacy pickled packages, so both
# generations are requested to keep the notebook working across NLTK versions.
print("Downloading NLTK data... (This may take a moment)")
for nltk_resource in (
    # Tokenizer models used by word_tokenize
    'punkt',
    'punkt_tab',
    # Stop-word list and WordNet data for lemmatization
    'stopwords',
    'wordnet',
    # Part-of-speech tagger used by pos_tag
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    # Named-entity chunker used by ne_chunk, plus its word list
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(nltk_resource, quiet=True)
print("NLTK data download complete.")

# =============================================================================
# Cell 2: Load and Inspect Data
# =============================================================================
# Read the raw ticket export, then print a short data-quality report:
# a preview, per-column missing counts, duplicate row count and dtypes.
print("\nStep 1: Loading and Inspecting Data...")
df = pd.read_csv("../data/customer_support_tickets_1.csv")
print("Data loaded successfully. Here's a preview:")
print(df.head())

print("\nChecking for missing values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

print("\nChecking for duplicates:")
duplicate_count = df.duplicated().sum()
print(f"Found {duplicate_count} duplicate rows.")

print("\nChecking column types:")
column_types = df.dtypes
print(column_types)

# =============================================================================
# Cell 3: Deep Text Cleaning
# =============================================================================
print("\nStep 2: Performing Deep Text Cleaning...")

# Patterns are compiled once here rather than on every call of the cleaner.
_HTML_TAG_RE = re.compile(r'<.*?>')
# NOTE: the last range (U+24C2-U+1F251) is very wide and also covers
# non-emoji characters such as CJK ideographs.
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F" u"\U0001F300-\U0001F5FF" u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def deep_text_cleaning(text):
    """Normalise a raw message into lower-case ASCII letters, digits and spaces.

    Steps, in order: lower-case, drop HTML tags, drop emoji, drop every
    remaining character that is not an ASCII letter, digit or whitespace,
    then collapse runs of whitespace and trim the ends.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example a
            NaN from a missing cell) is treated as an empty message.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Tags and emoji are replaced by a space, not by nothing: otherwise words
    # separated only by a tag ("hello<br>world") would be fused together.
    # The extra spaces are collapsed again below.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _EMOJI_RE.sub(' ', text)
    # Punctuation is removed without a replacement so that contractions stay
    # a single token ("can't" -> "cant").
    text = _NON_ALNUM_RE.sub('', text)
    # NOTE: spelling correction via TextBlob(text).correct() is deliberately
    # not applied here because it can be slow.
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every raw message body element-wise and keep the result in its own
# column so the original text stays available.
df['cleaned_message_body'] = df['message_body'].map(deep_text_cleaning)
print("Deep text cleaning complete.")

# =============================================================================
# Cell 4: Text Preprocessing (Lemmatization, Stopword Removal)
# THIS CELL CREATES THE 'processed_message' COLUMN
# =============================================================================
print("\nStep 3: Preprocessing text for ML model...")

# Built once and shared by every call of preprocess_for_ml below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_for_ml(text):
    """Tokenize *text*, drop noise tokens and return the lemmas as one string.

    A token is kept only if it is purely alphabetic and is not an English
    stop word; each kept token is lemmatized with the lemmatizer's default
    settings. The surviving lemmas are joined with single spaces.
    """
    kept_lemmas = []
    for word in word_tokenize(text):
        if not word.isalpha() or word in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(kept_lemmas)

# Derive the model-ready text from the cleaned message bodies, one row at a time.
df['processed_message'] = df['cleaned_message_body'].map(preprocess_for_ml)
print("Text preprocessing for ML complete.")

# =============================================================================
# Cell 5: Named Entity Recognition (NER)
# =============================================================================
print("\nStep 4: Extracting Named Entities...")

def extract_entities_nltk(text):
    """Extract person names and geo-political entities from *text* with NLTK.

    The text is tokenized, part-of-speech tagged and passed through NLTK's
    named-entity chunker. Chunks labelled ``PERSON`` are collected under
    ``"user_names"`` and chunks labelled ``GPE`` under ``"locations"``; every
    other chunk label is ignored.

    Args:
        text: The message to analyse. It should keep its original casing and
            punctuation. Non-string or blank values yield an empty result.

    Returns:
        A dict ``{"user_names": [...], "locations": [...]}`` whose lists hold
        the matched entity strings in order of appearance.
    """
    entities = {"user_names": [], "locations": []}
    # Raw cells may be NaN/None; there is nothing to tag in that case.
    if not isinstance(text, str) or not text.strip():
        return entities
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(pos_tags, binary=False)
    for chunk in chunks:
        # Only entity subtrees expose label(); plain (word, tag) tuples do not.
        if hasattr(chunk, 'label'):
            if chunk.label() == 'PERSON':
                entities['user_names'].append(' '.join(c[0] for c in chunk))
            elif chunk.label() == 'GPE':
                entities['locations'].append(' '.join(c[0] for c in chunk))
    return entities

# Run NER on the raw message, not on 'cleaned_message_body': the cleaned text
# is lower-cased and stripped of punctuation, which removes the capitalisation
# cues the tagger and chunker use to recognise proper nouns.
df['key_entities'] = df['message_body'].apply(extract_entities_nltk)
print("Named Entity Recognition complete.")

# =============================================================================
# Cell 6: Message Classification
# =============================================================================
print("\nStep 5: Classifying messages...")

# Feature: the preprocessed message text. Target: the labelled category.
X, y = df['processed_message'], df['true_category']

# Hold out 20% of the rows for evaluation, stratified on the label,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The TF-IDF vocabulary is fitted on the training split only and then
# reused unchanged for the test split and for the full dataset.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict a category for every row (training rows included) to fill the
# 'predicted_category' column.
df_tfidf = tfidf_vectorizer.transform(X)
df['predicted_category'] = model.predict(df_tfidf)

# The reported accuracy is measured on the held-out split only.
held_out_accuracy = accuracy_score(y_test, model.predict(X_test_tfidf))
print(f"Message classification complete. Model Accuracy: {held_out_accuracy:.2f}")

# =============================================================================
# Cell 7: Text Summarization
# =============================================================================
print("\nStep 6: Summarizing long messages...")

# Choose the inference device: CUDA when torch reports one, otherwise CPU.
gpu_available = torch.cuda.is_available()
device = "cuda" if gpu_available else "cpu"
if gpu_available:
    print("GPU is available. Using CUDA for summarization.")
else:
    print("GPU not available. Using CPU for summarization.")

# Summarization pipeline backed by the "t5-small" checkpoint.
summarizer = pipeline("summarization", model="t5-small", device=device)

def summarize_text(text):
    """Return a short summary for long messages, or a placeholder otherwise.

    Messages of 100 whitespace-separated words or fewer are not summarized
    and get a fixed placeholder string. Longer messages are sent to the
    module-level ``summarizer`` with deterministic decoding and a summary
    length between 25 and 50. Any exception raised while summarizing is
    caught and reported in the returned string rather than propagated.
    """
    word_count = len(text.split())
    if word_count <= 100:
        return "Message is short, no summary needed."

    try:
        outputs = summarizer(text, max_length=50, min_length=25, do_sample=False)
        first_output = outputs[0]
        return first_output['summary_text']
    except Exception as e:
        # Best effort: one failing row must not abort the whole column.
        return f"Summarization failed: {e}"

# Summarize each cleaned message body; short messages receive the placeholder text.
df['summary'] = df['cleaned_message_body'].map(summarize_text)
print("Summarization complete.")

# =============================================================================
# Cell 8: Create Final CSV Output
# =============================================================================
print("\nStep 7: Generating final CSV file...")

output_file_path = "../data/processed_customer_support_tickets.csv"

# Columns written to the deliverable, in output order.
# NOTE(review): the 'summary' column computed in Step 6 is not listed here,
# so it is not exported - confirm that this is intended.
final_columns = ['message_id', 'cleaned_message_body', 'predicted_category', 'key_entities']

# Work on a copy so the conversion below does not touch the working frame.
output_df = df.loc[:, final_columns].copy()
# Entities are stored per row as Python objects; write them as plain strings.
output_df['key_entities'] = output_df['key_entities'].astype(str)
output_df.to_csv(output_file_path, index=False)

print(f"\nSUCCESS! Final CSV file generated successfully at: {output_file_path}")
print("Here is a preview of the final output:")
print(output_df.head())

Downloading NLTK data... (This may take a moment)
NLTK data download complete.

Step 1: Loading and Inspecting Data...
Data loaded successfully. Here's a preview:
  message_id          user_name     location  \
0      M0001        Ryan Obrien  North Billy   
1      M0002      Jamie Salazar   South Kari   
2      M0003    Clinton Wallace   Port Paige   
3      M0004  Christopher Olsen    Jeanville   
4      M0005     Alex Alexander    West Gail   

                                     email_subject  \
0                    Law from traditional now Mrs.   
1  Require billion probably cut raise include now.   
2                        South maintain year firm.   
3                              Fill personal fire.   
4               Term authority offer feeling than.   

                                        message_body           created_at  \
0  Reflect available century join outside. i cant...  2025-09-01 06:10:59   
1  Try cause behind single project. Sport sound c...  2025-06-16 04:5