In [None]:
import functools
import os
import pickle
import random

import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data.
# 'punkt' and 'stopwords' back word_tokenize() and stopwords.words().
# 'punkt_tab' is requested as well because recent NLTK releases load the
# tokenizer models from that resource instead of the pickled 'punkt' package;
# without it word_tokenize() raises LookupError on those versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once.

    ``stopwords.words`` is called on first use and the result is reused for
    every later title instead of being rebuilt per call.
    """
    return frozenset(stopwords.words('english'))


# Preprocess text
def preprocess_text(text):
    """Tokenise a title into lower-case, alphanumeric, non-stop-word tokens.

    Args:
        text: The raw title. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty title.

    Returns:
        A list of tokens. Tokens containing any non-alphanumeric character
        (punctuation, hyphenated words such as "k-pop") are dropped entirely,
        as are English stop words.
    """
    # Missing titles would otherwise fail on ``.lower()``; an empty token list
    # lets callers keep positional alignment with their DataFrame rows.
    if not isinstance(text, str):
        return []

    stop_words = _english_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

# Create LDA model and return the top keywords of every topic
def create_lda_model(texts, num_topics=10, *, topn=5, random_state=100):
    """Fit an LDA topic model on tokenised documents.

    Args:
        texts: Iterable of token lists, one per document. It is materialised
            into a list first because it has to be traversed twice (once to
            build the dictionary, once to build the bag-of-words corpus); a
            generator would otherwise produce an empty corpus.
        num_topics: Number of topics to fit.
        topn: How many of the highest-weighted words to report per topic.
        random_state: Seed forwarded to ``LdaModel`` so that runs are
            repeatable.

    Returns:
        A ``(lda_model, dictionary, top_words)`` tuple where ``top_words`` is
        a list with one entry per topic, each entry being the list of that
        topic's ``topn`` words.

    Note:
        gensim is expected to raise ``ValueError`` when ``texts`` contains no
        tokens at all; callers should check for that before calling.
    """
    texts = list(texts)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
    )

    # show_topic yields (word, weight) pairs; only the words are reported.
    top_words = [
        [word for word, _ in lda_model.show_topic(topic_id, topn=topn)]
        for topic_id in range(num_topics)
    ]

    return lda_model, dictionary, top_words

# Show titles for a specific topic
def show_topic_titles(topic_id, processed_titles, lda_model, dictionary, keyword_df, sample_size=None):
    """Print the titles whose most probable topic is ``topic_id``.

    Args:
        topic_id: Topic number to inspect.
        processed_titles: Token lists, positionally aligned with the rows of
            ``keyword_df``.
        lda_model: Object providing ``get_document_topics(bow)``.
        dictionary: Object providing ``doc2bow(tokens)``.
        keyword_df: DataFrame whose ``'title'`` column holds the raw titles.
        sample_size: When truthy and smaller than the number of matches, only
            a random sample of this many titles is printed; otherwise every
            match is printed.

    The total printed on the last line always counts all matches, not just
    the sampled ones. Nothing is returned.
    """
    def dominant_topic(tokens):
        # The distribution is a list of (topic, probability) pairs.
        scores = lda_model.get_document_topics(dictionary.doc2bow(tokens))
        return max(scores, key=lambda pair: pair[1])[0]

    matching = [
        keyword_df['title'].iloc[position]
        for position, tokens in enumerate(processed_titles)
        if dominant_topic(tokens) == topic_id
    ]

    wants_sample = sample_size and len(matching) > sample_size
    displayed = random.sample(matching, sample_size) if wants_sample else matching

    print(f"\nTitles for Topic {topic_id}:")
    for entry in displayed:
        print(f"- {entry}")

    print(f"Total titles for this topic: {len(matching)}")

def _dominant_topic(lda_model, dictionary, tokens):
    """Return the id of the most probable topic for ``tokens``, or None.

    None is returned when the model reports no topic for the document
    (``get_document_topics`` can discard low-probability topics), so callers
    never call ``max()`` on an empty sequence.
    """
    distribution = lda_model.get_document_topics(dictionary.doc2bow(tokens))
    if not distribution:
        return None
    return max(distribution, key=lambda pair: pair[1])[0]


def _parse_topic_selection(raw, num_topics):
    """Turn the user's removal answer into a de-duplicated list of topic ids.

    An empty answer means "remove nothing"; ``'all'`` selects every topic.
    Raises ValueError with a user-presentable message for tokens that are not
    integers or that name a topic outside ``0 .. num_topics - 1``.
    """
    answer = raw.strip().lower()
    if answer == 'all':
        return list(range(num_topics))

    selected = []
    for token in answer.split():
        try:
            topic = int(token)
        except ValueError:
            raise ValueError(f"'{token}' is not a topic number") from None
        if not 0 <= topic < num_topics:
            raise ValueError(f"topic {topic} does not exist (valid: 0-{num_topics - 1})")
        if topic not in selected:
            selected.append(topic)
    return selected


def _prompt_topics_to_remove(top_words, processed_titles, lda_model, dictionary, keyword_df):
    """Interactively ask which topics to drop and return the confirmed list.

    Loops until the user confirms a selection with 'y'. 'c<num>' previews 20
    random titles of a topic, 'cc<num>' previews all of them; malformed input
    re-prompts instead of raising.
    """
    num_topics = len(top_words)

    while True:
        # Print top keywords for each topic
        for topic_id, words in enumerate(top_words):
            print(f"Topic {topic_id}: {' '.join(words)}")

        # User input for topic removal
        answer = input("Enter the topic numbers to remove (space-separated), 'all' to remove all, 'c<num>' to check 20 random topic titles, 'cc<num> all' to check all topic titles, or press Enter to keep all: ").strip()
        command = answer.lower()

        if command.startswith('c'):
            show_everything = command.startswith('cc')
            prefix = 'cc' if show_everything else 'c'
            # 'cc<num> all' carries a trailing word, 'c<num>' does not.
            number_text = command.split()[0][2:] if show_everything else command[1:]
            try:
                topic_to_check = int(number_text)
            except ValueError:
                topic_to_check = None
            if topic_to_check is None or not 0 <= topic_to_check < num_topics:
                print(f"Invalid input. Please enter a valid topic number after '{prefix}'.")
                continue
            show_topic_titles(
                topic_to_check, processed_titles, lda_model, dictionary, keyword_df,
                sample_size=None if show_everything else 20,
            )
            continue

        try:
            topics_to_remove = _parse_topic_selection(answer, num_topics)
        except ValueError as err:
            print(f"Invalid input: {err}. Please try again.")
            continue

        # Show 10 random titles for each topic to be removed
        for topic in topics_to_remove:
            show_topic_titles(topic, processed_titles, lda_model, dictionary, keyword_df, sample_size=10)

        # Confirm deletion
        confirm = input("Do you want to proceed with this deletion? (y/n): ").strip().lower()
        if confirm == 'y':
            return topics_to_remove
        print("Deletion cancelled. Please select topics to remove again.")


# Main function
def process_keywords(df, key_list):
    """Fit one LDA model per keyword and interactively filter out topics.

    For every keyword the matching titles are tokenised, an LDA model is
    fitted, the user chooses which topics to discard, and rows whose dominant
    topic was discarded are removed.

    Args:
        df: DataFrame with ``'keyword'``, ``'title'`` and ``'storyId'``
            columns.
        key_list: Keywords to process, in order.

    Returns:
        Dict mapping each processed keyword to a dict with the keys
        ``'lda_model'``, ``'dictionary'``, ``'topics_to_remove'`` and
        ``'filtered_df'``. Keywords without any usable title are skipped and
        do not appear in the result.

    Side effects:
        Prompts on stdin, prints to stdout, and writes
        ``<keyword>_lda_model.pkl`` into the current working directory for
        every processed keyword.
    """
    results = {}

    # Remove duplicates based on 'storyId' and reset index.
    # NOTE(review): this runs before the per-keyword split, so a story listed
    # under several keywords survives only under its first row - confirm that
    # this is intended.
    df = df.drop_duplicates(subset='storyId').reset_index(drop=True)

    for keyword in key_list:
        print(f"\nProcessing keyword: {keyword}")

        # Get data for this keyword
        keyword_df = df[df['keyword'] == keyword]

        # Preprocess titles
        processed_titles = [preprocess_text(title) for title in keyword_df['title']]

        # An LDA model cannot be fitted on a corpus without any token, so a
        # keyword with no rows (or only empty titles) is skipped rather than
        # aborting the whole run.
        if not any(processed_titles):
            print(f"No usable titles for keyword '{keyword}'; skipping.")
            continue

        # Create LDA model
        lda_model, dictionary, top_words = create_lda_model(processed_titles)

        topics_to_remove = _prompt_topics_to_remove(
            top_words, processed_titles, lda_model, dictionary, keyword_df
        )

        removed = set(topics_to_remove)
        if not removed:
            # Nothing to drop: no need to run inference on every title.
            filtered_df = keyword_df.copy()
        elif removed >= set(range(len(top_words))):
            filtered_df = keyword_df.iloc[[]].copy()  # Create an empty DataFrame
        else:
            # Keep the rows whose dominant topic was not selected for removal.
            kept_positions = [
                position
                for position, tokens in enumerate(processed_titles)
                if _dominant_topic(lda_model, dictionary, tokens) not in removed
            ]
            filtered_df = keyword_df.iloc[kept_positions]

        # Save model and results
        results[keyword] = {
            'lda_model': lda_model,
            'dictionary': dictionary,
            'topics_to_remove': topics_to_remove,
            'filtered_df': filtered_df
        }

        # Save model to file. Pickle files must only ever be loaded back from
        # a trusted location.
        with open(f"{keyword}_lda_model.pkl", "wb") as f:
            pickle.dump((lda_model, dictionary, topics_to_remove), f)

    return results

# Function to process new data
def process_new_data(new_df, results):
    """Filter new rows with the stored per-keyword models and append them.

    Args:
        new_df: DataFrame with ``'keyword'`` and ``'title'`` columns.
        results: Mapping as produced by ``process_keywords``; each value holds
            ``'lda_model'``, ``'dictionary'``, ``'topics_to_remove'`` and
            ``'filtered_df'``.

    Returns:
        The same ``results`` object. It is updated in place: for every
        keyword, the new rows whose dominant topic is not in
        ``'topics_to_remove'`` are concatenated onto ``'filtered_df'``.
    """
    for keyword, entry in results.items():
        keyword_df = new_df[new_df['keyword'] == keyword]
        if keyword_df.empty:
            continue

        lda_model = entry['lda_model']
        dictionary = entry['dictionary']
        removed = set(entry['topics_to_remove'])

        # If all topics are removed, skip this keyword. Coverage is checked on
        # the set of ids: comparing list length would be fooled by duplicate
        # entries (e.g. [0, 0, 1] for a three-topic model).
        if removed >= set(range(lda_model.num_topics)):
            continue

        kept_positions = []
        for position, title in enumerate(keyword_df['title']):
            bow = dictionary.doc2bow(preprocess_text(title))
            topic_distribution = lda_model.get_document_topics(bow)
            if topic_distribution:
                main_topic = max(topic_distribution, key=lambda pair: pair[1])[0]
            else:
                # No topic reported for this title: it cannot be attributed to
                # a removed topic, so it is kept.
                main_topic = None
            if main_topic not in removed:
                kept_positions.append(position)

        entry['filtered_df'] = pd.concat(
            [entry['filtered_df'], keyword_df.iloc[kept_positions]]
        )

    return results

# Example usage
if __name__ == "__main__":
    # Both demo frames share the 'keyword', 'title' and 'storyId' columns.
    columns = ['keyword', 'title', 'storyId']

    # Initial corpus: two stories per keyword.
    seed_rows = [
        ('tech', 'New AI breakthrough in natural language processing', 1),
        ('tech', 'The future of quantum computing', 2),
        ('sports', 'World Cup 2022: Argentina wins the final', 3),
        ('sports', 'NBA playoffs: Golden State Warriors advance to finals', 4),
        ('music', 'Taylor Swift announces world tour', 5),
        ('music', 'Classical music in the digital age', 6),
    ]
    df = pd.DataFrame(seed_rows, columns=columns)

    key_list = ['tech', 'sports', 'music']

    # Fit the per-keyword models; this step prompts the user interactively.
    results = process_keywords(df, key_list)

    # Stories arriving later, one per keyword.
    incoming_rows = [
        ('tech', 'Advancements in robotics and automation', 7),
        ('sports', 'Olympic Games 2024: Preview of upcoming events', 8),
        ('music', 'The rise of K-pop in global music scene', 9),
    ]
    new_df = pd.DataFrame(incoming_rows, columns=columns)

    # Run the new stories through the stored models.
    updated_results = process_new_data(new_df, results)

    # Show what survived the filtering for each keyword.
    for keyword, entry in updated_results.items():
        print(f"\nFiltered data for {keyword}:")
        print(entry['filtered_df'])