In [None]:
import pandas as pd
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from transformers import pipeline
import torch
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the NLTK data packages this notebook relies on.
# Each package is listed exactly once (the previous version requested
# 'punkt', 'wordnet' and 'stopwords' twice).
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',  # This is the specific english model
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Read the ticket CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/customer_support_tickets_1.csv")
# Preview the first rows of the loaded frame.
df.head()

Load and inspect the dataset (CSV format). Identify missing values, duplicates, and column types.

In [None]:
df.isna().sum()  # Number of missing values in each column

In [None]:
# Count fully duplicated rows (one number instead of a row-by-row boolean Series)
df.duplicated().sum()

In [None]:
df.dtypes # dtype of each column in df

Perform deep text cleaning on 'message_body': remove emojis, HTML, repeated punctuation, and extra spaces, and fix common spelling errors.

In [None]:
#
# THIS IS THE CORRECTED DEEP TEXT CLEANING CELL
#

# Patterns are compiled once here instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_HTML_TAG_PATTERN = re.compile(r"<.*?>")


def deep_text_cleaning(text):
    """Clean one raw message and return the cleaned string.

    Steps, in order: remove emojis, replace HTML tags with a space, decode
    HTML entities, delete every character that is not an ASCII letter, digit
    or whitespace, collapse whitespace runs, then attempt spelling correction
    with TextBlob.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example a
            missing value read as NaN) is treated as an empty message.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    import html  # standard library; imported locally so the cell is self-contained

    if not isinstance(text, str):
        return ""

    # Remove emojis
    text = _EMOJI_PATTERN.sub("", text)

    # Replace HTML tags with a space so words separated only by a tag
    # (e.g. "line one<br>line two") are not fused together.
    text = _HTML_TAG_PATTERN.sub(" ", text)

    # Decode entities such as "&amp;" or "&nbsp;" so they do not survive as
    # junk words ("amp", "nbsp") once punctuation is stripped below.
    text = html.unescape(text)

    # Remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    if not text:
        # Nothing left to spell-check.
        return text

    # Correct spelling - this can be slow, apply with caution on large datasets.
    # Deliberately best-effort: if TextBlob fails on a specific text, the
    # uncorrected (but otherwise cleaned) text is returned.
    try:
        text = str(TextBlob(text).correct())
    except Exception:
        pass

    return text

# Create 'cleaned_message_body' by running deep_text_cleaning on every 'message_body'
# value; the NER, summarization and CSV-export cells below read this column.
df["cleaned_message_body"] = df["message_body"].apply(deep_text_cleaning)
# Show raw and cleaned text side by side for the first rows.
df[["message_body", "cleaned_message_body"]].head()

In [None]:
# Remove the HTML tags
def remove_html_tags(text):
    """Return ``text`` with every HTML tag replaced by a single space.

    A space (rather than an empty string) is substituted so that words
    separated only by a tag, e.g. ``"one<br>two"``, stay separate words.
    This can leave surplus spaces, which ``remove_extra_spaces`` collapses.
    """
    html_pattern = re.compile(r'<.*?>')
    return html_pattern.sub(' ', text)

# Overwrite 'message_body' with its tag-free version (the column is replaced in place).
df['message_body'] = df['message_body'].apply(remove_html_tags)
df['message_body'].head()

In [None]:
# Strip punctuation and other special characters
def remove_special_characters(text):
    """Return ``text`` keeping only ASCII letters, digits and whitespace."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    stripped = disallowed.sub('', text)
    return stripped

# Overwrite 'message_body' with the punctuation-free text (the column is replaced in place).
df['message_body'] = df['message_body'].apply(remove_special_characters)
df['message_body'].head()

In [None]:
# Collapse extra whitespace
def remove_extra_spaces(text):
    """Collapse each whitespace run to one space and trim both ends of ``text``."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()
# Overwrite 'message_body' with the whitespace-normalised text (the column is replaced in place).
df['message_body'] = df['message_body'].apply(remove_extra_spaces)
df['message_body'].head()

In [None]:
# Fix spelling mistakes
def correct_spelling(text):
    """Return ``text`` after TextBlob's spelling correction, as a plain ``str``."""
    blob = TextBlob(text)
    corrected = blob.correct()
    return str(corrected)
# Overwrite 'message_body' with the spell-corrected text (the column is replaced in place).
df['message_body'] = df['message_body'].apply(correct_spelling)
df['message_body'].head()


Perform text preprocessing — apply tokenization, lowercasing, lemmatization, and POS tagging using spaCy or NLTK.

In [None]:
# Tokenisation
def tokenize_text(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens
# Store the token list of each message in a new 'tokenized_message' column.
df['tokenized_message'] = df['message_body'].apply(tokenize_text)
# Compare the text with its tokens for the first rows.
df[['message_body', 'tokenized_message']].head()

In [None]:
# Lowercase the text
def lowercase_text(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
# Overwrite 'message_body' with its lower-cased version (the column is replaced in place).
df['message_body'] = df['message_body'].apply(lowercase_text)
df['message_body'].head()

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

# First letter of a Penn Treebank tag -> WordNet POS code accepted by lemmatize().
_TREEBANK_TO_WORDNET_POS = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}


def lemmatize_text(text):
    """Tokenize ``text`` and return the list of lemmatized tokens.

    ``WordNetLemmatizer.lemmatize`` treats every word as a noun unless told
    otherwise, which leaves verbs and adjectives unchanged. Each token is
    therefore POS-tagged first and lemmatized with the matching WordNet POS;
    tags with no mapping fall back to noun, the lemmatizer's own default.
    """
    tokens = word_tokenize(text)
    return [
        lemmatizer.lemmatize(token, _TREEBANK_TO_WORDNET_POS.get(tag[:1], 'n'))
        for token, tag in pos_tag(tokens)
    ]
# Store the lemma list of each message in a new 'lemmatized_message' column.
df['lemmatized_message'] = df['message_body'].apply(lemmatize_text)
# Compare the text with its lemmas for the first rows.
df[['message_body', 'lemmatized_message']].head()

In [None]:
# POS tagging
def pos_tagging(text):
    """Tokenize ``text`` and return the ``pos_tag`` result for its tokens."""
    return pos_tag(word_tokenize(text))
# Store the pos_tagging result of each message in a new 'pos_tagged_message' column.
df['pos_tagged_message'] = df['message_body'].apply(pos_tagging)
# Compare the text with its tagged tokens for the first rows.
df[['message_body', 'pos_tagged_message']].head()

Extract key entities using Named Entity Recognition (NER) to identify product names, user names, locations, and issue types.

In [None]:


def extract_entities_nltk(text):
    """Extract person and location entities from ``text`` with NLTK.

    Args:
        text: The message to analyse. Non-string or blank input yields empty
            entity lists without calling NLTK.

    Returns:
        A dict with two keys: ``"user_names"`` (chunks labelled ``PERSON``)
        and ``"locations"`` (chunks labelled ``GPE`` or ``LOCATION``). Each
        value is a list of entity strings in order of appearance.
    """
    entities = {
        "user_names": [],
        "locations": []
    }

    if not isinstance(text, str) or not text.strip():
        return entities

    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Apply Part-of-Speech (POS) tagging
    pos_tags = nltk.pos_tag(words)
    # Apply Named Entity Recognition (NER) chunking
    chunks = nltk.ne_chunk(pos_tags, binary=False)

    # NLTK labels geo-political entities (countries, cities) as GPE and other
    # places as LOCATION; both are reported under "locations".
    label_to_key = {
        'PERSON': 'user_names',
        'GPE': 'locations',
        'LOCATION': 'locations',
    }

    for chunk in chunks:
        # Only labelled subtrees are entities; plain (word, tag) pairs are skipped.
        if hasattr(chunk, 'label'):
            key = label_to_key.get(chunk.label())
            if key is not None:
                entities[key].append(' '.join(c[0] for c in chunk))

    return entities

# Run extract_entities_nltk on every cleaned message and keep the resulting dict
# in a new 'key_entities' column (exported by the final CSV cell).
df['key_entities'] = df['cleaned_message_body'].apply(extract_entities_nltk)
# Compare the cleaned text with its extracted entities for the first rows.
df[['cleaned_message_body', 'key_entities']].head()

Classify each message into categories such as Complaint, Bug Report, Feature Request, Praise, etc., using either traditional ML models or transformer-based models.

In [None]:
# Rule-based categorisation of each message
def classify_message(text):
    """Return the category of ``text`` based on case-insensitive keyword matches.

    Rules are checked in order and the first one with a matching keyword
    wins; a message matching no rule is labelled "Praise".
    """
    lowered = text.lower()
    keyword_rules = (
        ("Complaint", ("billing", "invoice", "payment")),
        ("Bug Report", ("technical", "error", "bug")),
        ("Feature Request", ("account", "login", "password")),
    )
    for category, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return "Praise"
    
# Store the keyword-based label of each message in a new 'message_category' column.
df['message_category'] = df['message_body'].apply(classify_message)
# Compare the text with its assigned category for the first rows.
df[['message_body', 'message_category']].head()

In [None]:
#
# THIS IS THE CORRECTED TRANSFORMER NER CELL
#

from transformers import pipeline

# NER pipeline backed by the 'dbmdz/bert-large-cased-finetuned-conll03-english' checkpoint
# (the 'dbmdz/' namespace prefix is part of the model name).
# aggregation_strategy="simple" merges word pieces into whole entity spans; it replaces
# the deprecated grouped_entities=True flag.
ner_pipeline = pipeline(
    "ner",
    model='dbmdz/bert-large-cased-finetuned-conll03-english',
    aggregation_strategy="simple",
)

def transformer_ner(text):
    """Run ``ner_pipeline`` on ``text``; non-string or blank input yields ``[]``."""
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        return ner_pipeline(text)
    return []

# Trial run: apply transformer_ner to a small sample and store the result in a new column.
# Note: the original author reports this is slow because it runs on the CPU by default.
# Only the rows returned by .head() receive a value; pandas aligns the assignment on the
# index, so every other row of 'transformer_ner_entities' is left as NaN.
df['transformer_ner_entities'] = df['cleaned_message_body'].head().apply(transformer_ner) # Using .head() to run on first 5 rows only

df[['cleaned_message_body', 'transformer_ner_entities']].head()

In [None]:
from transformers import pipeline

# NER pipeline backed by the CoNLL-03 fine-tuned BERT checkpoint.
# aggregation_strategy="simple" merges word pieces into whole entity spans; it replaces
# the deprecated grouped_entities=True flag.
ner_pipeline = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

def transformer_ner(text):
    """Return the entities ``ner_pipeline`` finds in ``text``.

    Anything that is not a string, and any string that is empty or only
    whitespace, short-circuits to an empty list.
    """
    if isinstance(text, str) and text.strip():
        return ner_pipeline(text)
    return []

# Trial run on the first 10 rows only, to save time during testing.
# pandas aligns the assignment on the index, so rows outside .head(10) are left as NaN
# in 'transformer_ner_entities'.
df['transformer_ner_entities'] = df['cleaned_message_body'].head(10).apply(transformer_ner)

df[['cleaned_message_body', 'transformer_ner_entities']].head(10)

Summarize lengthy messages — for messages exceeding 100 words, apply text summarization using pretrained models like T5 or BART.

In [None]:
#
# REPLACE YOUR ENTIRE TEXT SUMMARIZATION CELL WITH THIS
#

# 1. Pick the inference device: CUDA when torch reports a usable GPU, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("GPU is available. Using CUDA." if device == "cuda" else "GPU not available. Using CPU.")

# 2. Build the summarization pipeline and place it on the selected device
summarizer = pipeline("summarization", model="t5-small", device=device)

def summarize_text(text, min_words=100, max_length=50, min_length=25):
    """Summarize ``text`` when it is longer than ``min_words`` words.

    Args:
        text: The message to summarize. Non-string input (for example a
            missing value) is treated as a short message.
        min_words: Messages with at most this many whitespace-separated
            words are not summarized.
        max_length: Upper bound on the generated summary length, passed to
            the summarization pipeline.
        min_length: Lower bound on the generated summary length, passed to
            the summarization pipeline.

    Returns:
        The generated summary, or the fixed note
        ``"Message is short, no summary needed."`` when no summary is made.
    """
    # Only summarize longer messages to save time
    if not isinstance(text, str) or len(text.split()) <= min_words:
        return "Message is short, no summary needed."

    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Cut inputs that exceed the model's maximum input length instead of
        # feeding the model an over-long sequence.
        truncation=True,
    )
    return summary[0]['summary_text']

# Add a 'summary' column by applying summarize_text to every cleaned message.
df['summary'] = df['cleaned_message_body'].apply(summarize_text)

# Remove the column-width limit so the full summary text is shown in the output
pd.set_option('display.max_colwidth', None)

df[['cleaned_message_body', 'summary']].head()

Message Classification

In [24]:
#
# Message classification: TF-IDF features + logistic regression.
# This cell creates 'predicted_category', which the final CSV cell needs.
#

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Feature and target variables.
# The text feature is 'cleaned_message_body', the column produced by the deep text
# cleaning cell. The previously referenced 'processed_message' column is never created
# in this notebook, which made this cell fail with KeyError: 'processed_message'.
# NOTE(review): 'true_category' is assumed to be a label column of the source CSV — confirm.
TEXT_COLUMN = 'cleaned_message_body'
X = df[TEXT_COLUMN]
y = df['true_category']

# Split data into training and testing sets (stratified so both keep the class mix)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorizer: fitted on the training split only, then reused for the test split
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# (Optional) Check Model Accuracy on the held-out split
y_pred = model.predict(X_test_tfidf)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Predict a category for every row (training rows included) to fill the output column
df_tfidf = tfidf_vectorizer.transform(df[TEXT_COLUMN])
df['predicted_category'] = model.predict(df_tfidf)

df[['true_category', 'predicted_category']].head()

Generate an output CSV file containing the cleaned text, predicted category, extracted entities, and summary.

In [None]:
#
# Export the final deliverable as a CSV file
#

# Columns of the deliverable, in output order
final_columns = ['message_id', 'cleaned_message_body', 'predicted_category', 'key_entities', 'summary']

# Independent copy of df restricted to those columns
output_df = df.loc[:, final_columns].copy()

# Store each 'key_entities' value as its string form so it fits in a single CSV cell
output_df['key_entities'] = output_df['key_entities'].astype(str)

# Write the CSV (without the index) to the output path
output_file_path = "processed_customer_support_tickets.csv"
output_df.to_csv(path_or_buf=output_file_path, index=False)

print(f"Final CSV file generated successfully at: {output_file_path}")
output_df.head()

(Optional) Develop an interactive app using Streamlit or Gradio, with filters for category, city, or keyword-based search.