In [None]:
# ==============================================================================
# 1. Setup and Imports

import pandas as pd
import re
import os
import joblib
import json
from collections import Counter

import spacy
import nltk
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

print("‚úÖ Libraries imported successfully.")

# Load the SpaCy model for English language processing. Later sections use the
# resulting `nlp` pipeline for lemmatisation, sentence splitting and NER.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError as err:
    print("‚ùå SpaCy model not found. Please run: python -m spacy download en_core_web_sm")
    # `exit()` is an interactive helper injected by the `site` module and
    # exits with status 0; raise SystemExit explicitly so the failure is
    # reported with a non-zero status and execution stops here.
    raise SystemExit(1) from err
else:
    print("‚úÖ SpaCy model 'en_core_web_sm' loaded successfully.")

# Download NLTK stopwords
nltk.download('stopwords')
# NOTE(review): `stop_words` is not referenced by the visible preprocessing
# code, which relies on spaCy's `token.is_stop` instead; kept in case other
# cells use it.
stop_words = set(stopwords.words('english'))

# ==============================================================================
# 2. Load and Parse Raw Data


def parse_meeting_transcripts(filepath):
    """Parse a raw transcript text file into a DataFrame of dialogue turns.

    The file content is split into meetings at every line that starts with
    ``[``. The first line of each chunk, with surrounding brackets removed,
    becomes the meeting title. Every following line shaped like
    ``Speaker: text`` becomes one row; lines without a colon are skipped.

    Args:
        filepath: Path to the UTF-8 encoded transcript file.

    Returns:
        A DataFrame with the columns ``meeting_title``, ``speaker`` and
        ``dialogue`` (empty, but still carrying those columns, when no
        dialogue turn is found), or ``None`` if the file does not exist.
    """
    columns = ['meeting_title', 'speaker', 'dialogue']
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"‚ùå Error: The file '{filepath}' was not found.")
        print("Please make sure you have uploaded data.txt to your environment.")
        return None

    # NOTE(review): any line starting with '[' opens a new "meeting", so a
    # bracketed stage direction inside a transcript would be treated as a
    # meeting title - verify against the data file.
    meetings = re.split(r'\n(?=\[)', content.strip())
    # Everything before the first colon is the speaker, the rest the dialogue.
    turn_pattern = re.compile(r'([^:]+):\s*(.*)')

    all_turns = []
    for meeting in meetings:
        lines = meeting.strip().split('\n')
        meeting_title = lines[0].strip().strip('[]')
        for line in lines[1:]:
            match = turn_pattern.match(line)
            if match:
                speaker, dialogue = match.groups()
                all_turns.append({
                    'meeting_title': meeting_title,
                    'speaker': speaker.strip(),
                    'dialogue': dialogue.strip()
                })

    # Pin the columns so an empty or turn-less file still yields a frame that
    # callers can index by column name instead of raising KeyError.
    return pd.DataFrame(all_turns, columns=columns)

print("\nüîÑ Loading and parsing raw data from data.txt...")
file_path = '/content/drive/MyDrive/data.txt'
df = parse_meeting_transcripts(file_path)

# The parser hands back None when the file cannot be found; only summarise
# the data when a frame was actually produced.
if df is not None:
    meeting_total = df['meeting_title'].nunique()
    turn_total = len(df)
    print(f"‚úÖ Loaded {meeting_total} meetings with {turn_total} dialogue turns.")
    print("--- Sample Raw Data ---")
    print(df.head())

# ==============================================================================
# 3. Comprehensive Text Preprocessing


def preprocess_for_lda(text):
    """Convert raw transcript text into a list of lemmas for topic modeling.

    The text is lower-cased and stripped, then run through the module-level
    spaCy pipeline ``nlp``. Stop words, punctuation and tokens that are not
    purely alphabetic are discarded; the lemma of every remaining token is
    returned in document order.
    """
    parsed = nlp(text.lower().strip())
    lemmas = []
    for tok in parsed:
        # Skip anything that carries no topical signal.
        if tok.is_stop or tok.is_punct or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

if df is not None:
    print("\nüîÑ Applying text preprocessing for topic modeling...")

    # Each meeting becomes one document: all of its dialogue turns are joined
    # with single spaces before tokenisation.
    dialogue_per_meeting = df.groupby('meeting_title')['dialogue']
    meeting_docs = dialogue_per_meeting.apply(' '.join).reset_index()
    meeting_docs['processed_tokens'] = meeting_docs['dialogue'].map(preprocess_for_lda)

    # Persist the per-meeting documents next to their token lists.
    processed_data_path = 'data/processed/cleaned_meetings_for_lda.csv'
    os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)
    meeting_docs.to_csv(processed_data_path, index=False)

    print(f"‚úÖ Text preprocessing complete. Cleaned data saved to '{processed_data_path}'.")
    print("--- Sample Cleaned Data ---")
    preview_columns = ['meeting_title', 'processed_tokens']
    print(meeting_docs[preview_columns].head())

# ==============================================================================
# 4. Train the Topic Model


if 'meeting_docs' in locals() and df is not None:
    print("\nü§ñ Starting Topic Modeling with Gensim LDA...")

    # Token lists -> vocabulary -> bag-of-words corpus.
    documents = meeting_docs['processed_tokens'].tolist()
    dictionary = Dictionary(documents)
    corpus = list(map(dictionary.doc2bow, documents))

    # Number of topics is the main knob to tune for this model.
    num_topics = 5
    lda_settings = {
        'corpus': corpus,
        'id2word': dictionary,
        'num_topics': num_topics,
        'random_state': 42,
        'update_every': 1,
        'chunksize': 10,
        'passes': 10,
        'alpha': 'auto',
    }
    lda_model = LdaModel(**lda_settings)

    print("\n‚úÖ LDA training complete.")
    print("--- Top Keywords for each Topic ---")
    # Topic ids from the model are zero-based; show them one-based.
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic {idx+1}: {topic}")

# ==============================================================================
# 5. Visualize and Analyze Topics


if 'lda_model' in locals():
    print("\nüìà Generating topic visualizations...")
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

    # Write the visualization out as a standalone HTML page.
    vis_dir = 'visualizations'
    os.makedirs(vis_dir, exist_ok=True)
    pyLDAvis.save_html(vis_data, f"{vis_dir}/lda_topics.html")
    print("‚úÖ Visualization generated and saved to 'visualizations/lda_topics.html'.")
    # Inside a notebook the same object can be shown with: display(vis_data)

# ==============================================================================
# 6. Information Extraction


def extract_action_items(dataframe):
    """Extract sentences that look like action items from meeting dialogue.

    Every row's ``dialogue`` is split into sentences with the module-level
    spaCy pipeline ``nlp``. A sentence is kept when it either

    * contains one of the trigger phrases (matched on the lower-cased
      sentence text), or
    * contains both a modal term and a responsibility verb (matched on the
      sentence's token lemmas, so multi-word terms such as "need to" and
      "follow up" are matched as consecutive lemmas).

    Args:
        dataframe: DataFrame with ``meeting_title``, ``speaker`` and
            ``dialogue`` columns.

    Returns:
        DataFrame with the columns ``meeting``, ``speaker`` and ``action``,
        de-duplicated on ``action`` (first occurrence kept). The frame is
        empty, but still has those columns, when nothing matches.
    """
    columns = ['meeting', 'speaker', 'action']
    action_keywords = ("i will", "we will", "we need to", "i'll", "let's", "next step", "action item", "to-do", "task is", "plan is", "agreed to")
    responsibility_verbs = ("send", "create", "complete", "organize", "schedule", "follow up", "prepare", "review")
    modal_verbs = ("should", "must", "will", "need to")

    def contains_term(lemma_text, terms):
        # `lemma_text` is space-padded, so a term only matches whole lemmas;
        # this also lets multi-word terms match runs of consecutive lemmas.
        return any(f" {term} " in lemma_text for term in terms)

    action_items = []
    for _, row in dataframe.iterrows():
        for sent in nlp(row['dialogue']).sents:
            # Dialogue may use typographic apostrophes (U+2019); normalise
            # them so keywords such as "let's" and "i'll" still match.
            sentence_text = sent.text.lower().replace("\u2019", "'")
            is_action = any(k in sentence_text for k in action_keywords)
            if not is_action:
                lemma_text = " " + " ".join(tok.lemma_ for tok in sent) + " "
                is_action = (
                    contains_term(lemma_text, modal_verbs)
                    and contains_term(lemma_text, responsibility_verbs)
                )
            if is_action:
                action_items.append({'meeting': row['meeting_title'], 'speaker': row['speaker'], 'action': sent.text.strip()})

    # Pin the columns so an empty result does not make drop_duplicates raise
    # KeyError on the missing 'action' column.
    return pd.DataFrame(action_items, columns=columns).drop_duplicates(subset=['action'])

if df is not None:
    # --- Action items: sentence-level extraction over every dialogue turn ---
    print("\nüîç Extracting Action Items...")
    action_items_df = extract_action_items(df)
    print(f"‚úÖ Found {len(action_items_df)} potential action items.")
    print(action_items_df.head())

    # --- Named entities: run the pipeline once over all dialogue joined ---
    print("\nüîç Extracting Named Entities...")
    full_text = " ".join(df['dialogue'])
    entities = []
    for ent in nlp(full_text).ents:
        entities.append((ent.text, ent.label_))
    # Count identical (text, label) pairs.
    entity_counts = Counter(entities)
    print(f"‚úÖ Found {len(entity_counts)} unique named entities.")
    print("--- Top 15 Most Common Entities ---")
    for entity_key, count in entity_counts.most_common(15):
        entity, label = entity_key
        print(f"{entity} ({label}): {count}")

# ==============================================================================
# 7. Save the Final Models and Supporting Files


if 'lda_model' in locals():
    print("\nüíæ Saving the final trained models and assets...")
    os.makedirs('models', exist_ok=True)

    # Trained topic model.
    lda_model_path = 'models/lda_model.joblib'
    joblib.dump(lda_model, lda_model_path)
    print(f"‚úÖ LDA model saved to '{lda_model_path}'.")

    # Vocabulary the model was trained with.
    dictionary_path = 'models/dictionary.joblib'
    joblib.dump(dictionary, dictionary_path)
    print(f"‚úÖ Dictionary saved to '{dictionary_path}'.")

    # Only the name of the spaCy pipeline is stored, not the pipeline itself.
    spacy_model_name = {"name": "en_core_web_sm"}
    with open('models/spacy_model.json', 'w') as f:
        f.write(json.dumps(spacy_model_name))
    print("‚úÖ SpaCy model reference saved.")

print("\nüéâ Script run completed successfully! üéâ")

‚úÖ Libraries imported successfully.


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
[nltk_data] Downloading package stopwords to /root/nltk_data...


‚úÖ SpaCy model 'en_core_web_sm' loaded successfully.

üîÑ Loading and parsing raw data from data.txt...


  return datetime.utcnow().replace(tzinfo=utc)
[nltk_data]   Unzipping corpora/stopwords.zip.
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


‚úÖ Loaded 11 meetings with 650 dialogue turns.
--- Sample Raw Data ---
          meeting_title                  speaker  \
0  Brainstorm meeting 1  Meeting Chairman (Mark)   
1  Brainstorm meeting 1              Tom Robbins   
2  Brainstorm meeting 1         Meeting Chairman   
3  Brainstorm meeting 1           Jennifer Miles   
4  Brainstorm meeting 1         Meeting Chairman   

                                            dialogue  
0  Good morning, everyone. Thank you for coming. ...  
1  Just a quick note, Mark. The adjustments to ou...  
2            That‚Äôs great to hear, Tom. Anyone else?  
3  Yes, Mark. I wanted to mention that I received...  
4  Perfect. Let‚Äôs make sure we address those poin...  

üîÑ Applying text preprocessing for topic modeling...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

‚úÖ Text preprocessing complete. Cleaned data saved to 'data/processed/cleaned_meetings_for_lda.csv'.
--- Sample Cleaned Data ---
                                       meeting_title  \
0                               Brainstorm meeting 1   
1                                 Complain dataset 1   
2                                     Latest meeting   
3  The phone rings for a few moments before Jamie...   
4                              Urban customer report   

                                    processed_tokens  
0  [good, morning, thank, come, pack, agenda, tod...  
1  [good, morning, jamie, appreciate, come, way, ...  
2  [sale, team, alice, team, handle, new, system,...  
3  [hello, hi, jamie, jack, peterson, southwest, ...  
4  [southwest, area, sale, november, jack, peters...  

ü§ñ Starting Topic Modeling with Gensim LDA...

‚úÖ LDA training complete.
--- Top Keywords for each Topic ---
Topic 1: 0.035*"customer" + 0.034*"rural" + 0.024*"sale" + 0.018*"area" + 0.014*"local" + 

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

‚úÖ Visualization generated and saved to 'visualizations/lda_topics.html'.

üîç Extracting Action Items...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

‚úÖ Found 75 potential action items.
                meeting                  speaker  \
0  Brainstorm meeting 1            Jack Peterson   
1  Brainstorm meeting 1              John Ruting   
2  Brainstorm meeting 1         Meeting Chairman   
3  Brainstorm meeting 1           Jennifer Miles   
4  Brainstorm meeting 1  Meeting Chairman (Mark)   

                                              action  
0  Perhaps we need to segment our customer base a...  
1  But I still think we need to address the issue...  
2  It‚Äôs clear that we need to explore both techno...  
3  We need to empower agents to make decisions, b...  
4  So, in addition to automating the first point ...  

üîç Extracting Named Entities...


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return date

‚úÖ Found 112 unique named entities.
--- Top 15 Most Common Entities ---
Jack (PERSON): 85
Alice (PERSON): 55
Jennifer (PERSON): 42
Mark (PERSON): 35
Jamie (PERSON): 33
Donald (PERSON): 26
today (DATE): 24
John (PERSON): 23
one (CARDINAL): 21
first (ORDINAL): 20
CRM (PRODUCT): 19
Tom (PERSON): 16
One (CARDINAL): 14
AI (GPE): 14
‚Äôs (GPE): 7

üíæ Saving the final trained models and assets...
‚úÖ LDA model saved to 'models/lda_model.joblib'.
‚úÖ Dictionary saved to 'models/dictionary.joblib'.
‚úÖ SpaCy model reference saved.

üéâ Script run completed successfully! üéâ


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
