In [0]:
#%pip install -r requirements.txt

In [0]:
#%pip install pandas fsspec adlfs

In [0]:
#dbutils.library.restartPython()

In [0]:
#%run ./_paramgplayscrapper

In [0]:
#%run ./_envsettings

In [0]:
#Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import nltk
import openai

# Fetch the NLTK stopwords corpus only when it cannot be found locally,
# so that re-running this cell does not trigger a download attempt every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [0]:
# Load the topic-modelling input CSV from Azure storage (abfss URL) into a DataFrame.
# NOTE(review): account_name, account_key and topic_modelling_input_file are not
# defined in any visible cell - presumably they come from the commented-out %run
# cells at the top of the notebook; confirm before a fresh-kernel run.
input_url = f"abfss://root@{account_name}.dfs.core.windows.net/{topic_modelling_input_file}"
input_credentials = {"account_name": account_name, "account_key": account_key}
data = pd.read_csv(input_url, storage_options=input_credentials)

In [0]:
# Build one space-joined keyword string per docID and broadcast it to every row
# of that docID. Missing keywords are dropped and the rest coerced to str,
# because str.join raises TypeError on NaN / non-string items.
data['keywords corpus'] = data.groupby('docID')['keyword'].transform(
    lambda x: ' '.join(x.dropna().astype(str))
)

# One row per (docID, review, keywords corpus). The explicit .copy() makes
# corpus_df an independent frame, so the column assignments below (and in later
# cells) do not write into a slice of `data`.
corpus_df = data[['docID', 'review', 'keywords corpus']].drop_duplicates().copy()

# Preprocess the keywords corpus: drop NLTK English stopwords (case-insensitive match).
stop_words = set(stopwords.words('english'))
corpus_df['keywords corpus'] = corpus_df['keywords corpus'].apply(
    lambda text: ' '.join([word for word in text.split() if word.lower() not in stop_words])
)

# Vectorize the text: bag-of-words counts, ignoring terms that appear in more
# than 90% of documents or in fewer than 2 documents.
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(corpus_df['keywords corpus'])

# Perform LDA for topic modeling (5 topics, fixed seed for reproducibility).
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(doc_term_matrix)

# Map each topic index to its 10 highest-weighted terms.
# The vocabulary is fetched once instead of once per keyword lookup.
# argsort() is ascending, so each list runs from the 10th-strongest term up to
# the strongest term (strongest term is LAST).
feature_names = vectorizer.get_feature_names_out()
topic_keywords = {
    idx: feature_names[topic.argsort()[-10:]].tolist()
    for idx, topic in enumerate(lda_model.components_)
}

In [0]:
# Show the mapping of topic index -> keyword list built by the LDA cell above
print(topic_keywords)

In [0]:
# Assign topics to documents
def get_topic_name(text):
    """Label ``text`` with its dominant LDA topic.

    Relies on the notebook-level ``vectorizer``, ``lda_model`` and
    ``topic_keywords`` objects.

    Args:
        text: A single document (string) to classify.

    Returns:
        A string of the form ``"Topic <idx>: kw1, kw2, ..."`` where ``<idx>`` is
        the argmax of the topic distribution and the keywords come from
        ``topic_keywords[<idx>]``.
    """
    # Only one document is passed in, so the flat argmax of the result is used
    # directly as the topic index.
    weights = lda_model.transform(vectorizer.transform([text]))
    best_topic = weights.argmax()
    keyword_list = ', '.join(topic_keywords[best_topic])
    return f"Topic {best_topic}: {keyword_list}"
 
# Label every document's keyword corpus with its dominant topic string.
corpus_df['topic'] = corpus_df['keywords corpus'].map(get_topic_name)

In [0]:
# Persist the LDA-labelled frame as an interim CSV (written before the
# AI-based topic naming step runs).
interim_output_url = (
    f"abfss://root@{account_name}.dfs.core.windows.net/{topic_modelling_interim_output_file}"
)
interim_credentials = {"account_name": account_name, "account_key": account_key}
corpus_df.to_csv(interim_output_url, index=False, storage_options=interim_credentials)
print(f"Intermediate output CSV saved at: {topic_modelling_interim_output_file}")

In [0]:
# --------------------------------------------------------------
# AI-based Topic Naming
# --------------------------------------------------------------
 
# Client object used by assign_topic_names_ai for chat-completion requests.
# NOTE(review): apiKey is not defined in any visible cell - presumably it is
# provided by the commented-out %run cells at the top; confirm.
client = openai.OpenAI(api_key=apiKey) 

# NOTE(review): `openai.debug` looks like a module flag from the pre-1.0 SDK;
# with the client-style API used above, verbose logging is presumably driven by
# the OPENAI_LOG environment variable instead - confirm this line has any effect.
openai.debug = True  # Enable debugging for OpenAI API

In [0]:
# Function to assign topic names using AI 
def assign_topic_names_ai(topic_keywords, model="gpt-3.5-turbo", temperature=0.7, max_tokens=20):
    """Ask the chat model for a short human-readable name for each topic.

    Uses the notebook-level ``client`` object; one chat-completion request is
    sent per topic.

    Args:
        topic_keywords: Mapping of topic id -> list of keyword strings.
        model: Chat model name passed to the API (default matches the
            previously hard-coded value).
        temperature: Sampling temperature passed to the API. Values above 0
            mean names can differ between runs.
        max_tokens: Upper bound on the length of the generated name.

    Returns:
        dict mapping each topic id to a cleaned topic name. Surrounding
        whitespace and one pair of matching surrounding quotes are removed;
        if the model returns no text, the fallback ``"Topic <id>"`` is used so
        downstream lookups never yield an empty/None name.
    """
    topic_names = {}
    for topic_id, keywords in topic_keywords.items():
        prompt = (
            f"The following keywords represent a topic: {', '.join(keywords)}.\n"
            "Generate a short, meaningful topic name (3–4 words max) that summarizes these keywords.\n"
            "Use the same concise style as the following examples:\n"
            "- Support/User Feedback\n"
            "- Login/Access Issue\n"
            "- Account Management\n"
            "- Delivery Experience\n\n"
            "Rules:\n"
            "1. Do not include quotation marks or punctuation around the output.\n"
            "2. Avoid generic words like 'topic', 'about', or 'related to'.\n"
            "3. Output only the topic name — nothing else."
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an assistant that assigns topic names."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        # The message content may be None or padded with whitespace/newlines.
        topic_name = (response.choices[0].message.content or "").strip()
        # Enforce prompt rule 1 in code: drop one pair of matching wrapping quotes.
        if len(topic_name) >= 2 and topic_name[0] == topic_name[-1] and topic_name[0] in "\"'":
            topic_name = topic_name[1:-1].strip()
        topic_names[topic_id] = topic_name or f"Topic {topic_id}"
    return topic_names

In [0]:
# Generate AI-based topic names: one name per entry in topic_keywords,
# stored as a dict keyed by the same topic ids

ai_topic_names = assign_topic_names_ai(topic_keywords)

In [0]:
# Show the topic id -> AI-generated name mapping for a quick manual check
print(ai_topic_names)

In [0]:
# Map AI topic names to the dataframe
def map_ai_topic_name(text):
    """Return the AI-generated name of the dominant LDA topic for ``text``.

    Relies on the notebook-level ``vectorizer``, ``lda_model`` and
    ``ai_topic_names`` objects.

    Args:
        text: A single document (string) to classify.

    Returns:
        The entry of ``ai_topic_names`` keyed by the argmax of the document's
        topic distribution.
    """
    vectorized = vectorizer.transform([text])
    dominant_topic = lda_model.transform(vectorized).argmax()
    return ai_topic_names[dominant_topic]
 
# Attach the AI-generated topic name for each document's keyword corpus.
corpus_df['AI Topic Name'] = corpus_df['keywords corpus'].map(map_ai_topic_name)

In [0]:
# Write the final frame, including the 'AI Topic Name' column, to Azure storage as CSV.
final_output_url = (
    f"abfss://root@{account_name}.dfs.core.windows.net/{topic_modelling_output_file}"
)
final_credentials = {"account_name": account_name, "account_key": account_key}
corpus_df.to_csv(final_output_url, index=False, storage_options=final_credentials)

print(f"Final output CSV with AI topic names saved at: {topic_modelling_output_file}")