In [8]:
# Import necessary libraries
import pandas as pd
import re
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models
from IPython.display import display, HTML
from nltk.corpus import stopwords
import nltk

# Install the required libraries
!pip install pyLDAvis --upgrade
!pip install gensim pandas nltk

# Download NLTK stopwords
nltk.download('stopwords')  # Download the stopwords dataset

# Load your CSV containing the emails and issues
df = pd.read_csv('/content/customer_service_email_issues1.csv')  # Update with your dataset path

# --- The issue is that 'email' and 'issue' are likely in one column, separated by ';' ---
# Split the single column into two columns named 'email' and 'issue'
df[['email', 'issue']] = df.iloc[:, 0].str.split(';', expand=True)
# --- Now you should have separate 'email' and 'issue' columns ---

# Print the column names to verify the actual names in your CSV
print(df.columns)

# Function to clean text data and remove non-relevant issues
_STOP_WORDS_CACHE = None  # filled on first use with NLTK's English stopword list


def _default_stop_words():
    """Return NLTK's English stopwords, reading the corpus only on the first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_issue(issue, stop_words=None):
    """Normalize one raw issue string for topic modeling.

    Steps: strip special tokens of the form ``<|...|>``, delete every character
    that is not an ASCII letter or whitespace (digits and punctuation are
    removed, not replaced by a space), lowercase, and drop stopwords.

    Args:
        issue: Raw issue text. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN float from pandas) is treated as missing.
        stop_words: Optional collection of lowercase words to drop. Defaults to
            NLTK's English stopword list, which is loaded once and reused.

    Returns:
        The cleaned, space-joined text, or ``None`` when the input is missing
        or fewer than 2 words remain after cleaning.
    """
    # Missing values would otherwise raise TypeError inside re.sub.
    if not isinstance(issue, str):
        return None
    if stop_words is None:
        stop_words = _default_stop_words()

    # Remove special tokens and placeholders (e.g., "<|end_header_id|>")
    issue = re.sub(r'<\|.*?\|>', '', issue)
    # Remove everything except ASCII letters and whitespace
    issue = re.sub(r'[^a-zA-Z\s]', '', issue)
    # Lowercase and drop stopwords
    kept_words = [word for word in issue.lower().split() if word not in stop_words]
    # Discard very short issues: keep only those with at least 2 words
    if len(kept_words) < 2:
        return None
    return ' '.join(kept_words)

# Function to merge similar issues by matching known phrase variants
def merge_similar_issues(issue):
    """Collapse known phrasings of the same problem into one canonical label.

    Matching is plain substring containment: if any listed variant occurs
    anywhere inside ``issue``, the canonical label for that group is returned.
    Groups are checked in the order they are defined and the first hit wins.

    Args:
        issue: Cleaned issue text. Anything that is not a ``str`` (``None``, or
            the NaN floats pandas uses for missing rows) is treated as missing.

    Returns:
        The canonical label when a variant matches, the unchanged ``issue``
        when nothing matches, or ``None`` for missing input.
    """
    # Guard against None AND NaN: `similar in issue` raises TypeError on a float.
    if not isinstance(issue, str):
        return None

    issue_mapping = {
        "dropped calls": ["dropping calls", "dropped phone calls", "calls dropped", "calls dropping frequently"],
        "intermittent disconnections": ["frequent disconnections", "intermittent outages", "intermittent disconnects"],
        "phone ringing": ["phone connection drops", "phone ringing issues"],
        # Add more mappings as necessary for other similar issues
    }

    # Return the canonical label of the first group with a matching variant
    for main_issue, similar_issues in issue_mapping.items():
        if any(similar in issue for similar in similar_issues):
            return main_issue
    return issue

# Apply cleaning to the 'issue' column (rows with a missing issue are skipped)
cleaned_issues = df['issue'].dropna().apply(clean_issue)

# Merge similar issues. Rows that cleaning rejected (None) are dropped first so
# the merge step only ever receives strings; on assignment, pandas aligns on
# the index and leaves the skipped rows as NaN.
df['clean_issue'] = cleaned_issues.dropna().apply(merge_similar_issues)

# Drop rows with empty or None values after cleaning and merging
df = df.dropna(subset=['clean_issue'])

# Create a list of cleaned issues
issues = df['clean_issue'].tolist()
if not issues:
    # Fail early with a clear message instead of an opaque error from gensim.
    raise ValueError("No issues left after cleaning; cannot train an LDA model.")

# Create a list of documents, treating each issue as a single token (since you want one issue per document)
documents = [[issue] for issue in issues]

# Create the gensim dictionary and bag-of-words corpus
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# LDA topic modeling (gensim LdaModel)
num_topics = 5  # Define the number of topics
random_state = 42  # Fixed seed so the topics are reproducible across runs
lda_model = LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=random_state,
)

# Print the topics found by the LDA model
print("PyLDA Topics:")
for idx, topic in lda_model.print_topics(num_topics=num_topics):
    print(f"Topic {idx}: {topic}")

# Visualization of topics using pyLDAvis
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Save the visualization to an HTML file
output_html = '/content/lda_visualization.html'  # Specify your desired output file path
pyLDAvis.save_html(vis_data, output_html)

# The pyLDAvis visualization is written to the HTML file above for download;
# the topics it contains are the ones printed by this cell.
# To render the saved file inline instead, pass it as a file rather than as
# literal HTML text:
# display(HTML(filename=output_html))


  and should_run_async(code)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['email;issue', 'email', 'issue'], dtype='object')
PyLDA Topics:
Topic 0: 0.799*"dial tone" + 0.128*"intermittent disconnections" + 0.037*"dropped calls frequently" + 0.006*"cant make calls" + 0.006*"dropped calls" + 0.006*"phone ringing" + 0.006*"exact problem" + 0.006*"incoming calls" + 0.006*"poor signal strength"
Topic 1: 0.379*"incoming calls" + 0.379*"cant make calls" + 0.035*"dropped calls" + 0.035*"dial tone" + 0.035*"exact problem" + 0.035*"phone ringing" + 0.035*"dropped calls frequently" + 0.035*"intermittent disconnections" + 0.034*"poor signal strength"
Topic 2: 0.426*"poor signal strength" + 0.075*"dropped calls" + 0.072*"dial tone" + 0.071*"exact problem" + 0.071*"phone ringing" + 0.071*"dropped calls frequently" + 0.071*"intermittent disconnections" + 0.071*"incoming calls" + 0.071*"cant make calls"
Topic 3: 0.112*"dropped calls" + 0.112*"exact problem" + 0.111*"phone ringing" + 0.111*"dial tone" + 0.111*"dropped calls frequently" + 0.111*"intermittent disconnecti