In [1]:
import gradio as gr
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download required NLTK data (run once).
# Each corpus is looked up on its own so that one missing resource does not
# force a download attempt for the other one as well.
for _resource_path, _package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Preprocessing function
# Lazily filled cache for the NLTK objects used by preprocess(); kept empty
# until the first non-empty text is processed.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        # stopwords.words() returns a list; freezing it into a set once turns
        # the per-token membership test into a constant-time lookup.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess(text):
    """Normalise one document for vectorisation.

    The text is lower-cased, every run of non-word characters is replaced by
    a single space, English stop words are dropped and the remaining tokens
    are lemmatised.

    Args:
        text: The raw document. ``None`` and float NaN (a missing CSV cell)
            are treated as an empty document; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        is left.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    tokens = re.sub(r'\W+', ' ', text.lower()).split()  # Remove punctuation and lowercase
    if not tokens:
        return ''

    stop_words, lemmatizer = _preprocess_resources()
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens if t not in stop_words)

def process_topic_modeling(file, n_topics=5, n_top_words=10):
    """Run the CSV -> preprocess -> TF-IDF -> LDA pipeline and render it as HTML.

    Args:
        file: Value delivered by the file-upload component: ``None`` when
            nothing is uploaded, otherwise either a path string or an object
            whose ``name`` attribute holds the path of the uploaded CSV.
        n_topics: Number of LDA topics to fit.
        n_top_words: Number of highest-weighted words listed per topic.

    Returns:
        A 3-tuple of HTML strings ``(raw_data, preprocessed_data, topics)``.
        When something goes wrong the first element carries the message and
        the other two are empty strings.
    """
    # Local import: only this function needs it, and it keeps the block
    # self-contained.
    from html import escape

    if file is None:
        return "Please upload a CSV file.", "", ""

    try:
        # Depending on the Gradio version/configuration the upload arrives as
        # a plain path string or as a temp-file wrapper exposing ``.name``.
        csv_path = file if isinstance(file, str) else file.name
        df = pd.read_csv(csv_path)

        # Check if 'text' column exists
        if 'text' not in df.columns:
            # Column names come from the uploaded file and end up inside an
            # HTML component, so escape them.
            columns = escape(str(list(df.columns)), quote=False)
            return f"Error: 'text' column not found. Available columns: {columns}", "", ""

        # Show original data. head(10) really is the first 10 rows, whereas
        # to_html(max_rows=10) renders a truncated head-and-tail view.
        raw_data_html = df.head(10).to_html(classes='table table-striped')
        raw_data_display = f"<h3>📄 Raw Data (showing first 10 rows)</h3>{raw_data_html}"

        # Preprocess text; missing cells become empty documents instead of
        # reaching preprocess() as NaN.
        texts = df['text'].fillna('').astype(str)
        df['processed'] = texts.apply(preprocess)

        # Show processed data
        processed_data_html = df[['text', 'processed']].head(10).to_html(classes='table table-striped')
        processed_data_display = f"<h3>🔧 Preprocessed Data (showing first 10 rows)</h3>{processed_data_html}"

        # TF-IDF Vectorizer
        # NOTE(review): with fewer than 10 rows, max_df=0.1 corresponds to
        # less than one document and scikit-learn is expected to reject it
        # against min_df=1 -- confirm whether small files should be supported.
        vectorizer = TfidfVectorizer(max_df=0.1, min_df=1)
        X = vectorizer.fit_transform(df['processed'])

        # Topic modeling (fixed seed for reproducible topics)
        lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda_model.fit(X)

        # Show topics: look the vocabulary up once, then list each topic's
        # words from highest to lowest weight.
        feature_names = vectorizer.get_feature_names_out()
        topic_parts = ["<h3>💡 Top words per topic</h3>\n"]
        for idx, topic in enumerate(lda_model.components_):
            top_indices = topic.argsort()[::-1][:n_top_words]
            top_words = [feature_names[i] for i in top_indices]
            topic_parts.append(f"<p><strong>Topic {idx}:</strong> {', '.join(top_words)}</p>\n")
        topics_text = "".join(topic_parts)

        return raw_data_display, processed_data_display, topics_text

    except Exception as e:
        # UI boundary: report any failure in the first output rather than
        # letting the callback raise. The message is escaped because it is
        # rendered as HTML.
        error_msg = f"Error processing file: {escape(str(e), quote=False)}"
        return error_msg, "", ""

# Build the Gradio UI: an upload widget, a trigger button and three HTML panes.
with gr.Blocks(title="🧠 Topic Modeling Explorer") as iface:
    gr.Markdown("# 🧠 Topic Modeling Explorer")
    gr.Markdown("Upload a CSV file with a 'text' column containing support tickets or other text data to discover topics.")

    with gr.Row():
        file_input = gr.File(label="Upload your support ticket file (CSV)", file_types=[".csv"])

    with gr.Row():
        process_btn = gr.Button("Process File", variant="primary")

    with gr.Column():
        raw_data_output = gr.HTML(label="Raw Data")
        processed_data_output = gr.HTML(label="Preprocessed Data")
        topics_output = gr.HTML(label="Topics")

    # The button press and a change of the uploaded file run the same handler
    # with the same input and outputs, so the wiring is described once.
    pipeline_wiring = dict(
        fn=process_topic_modeling,
        inputs=[file_input],
        outputs=[raw_data_output, processed_data_output, topics_output],
    )
    process_btn.click(**pipeline_wiring)
    file_input.change(**pipeline_wiring)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    iface.launch()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\V16RKhalil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\V16RKhalil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
