In [None]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pickle
import os
import random
import json
from datetime import datetime

# Preprocess text
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        # stopwords.words() is called once and the result reused by every
        # later preprocess_text() call instead of being rebuilt per title.
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Tokenise ``text`` into lower-cased alphanumeric, non-stop-word tokens.

    Args:
        text: The raw text to tokenise. Values that are not ``str`` (for
            example ``None`` or the float NaN pandas uses for missing cells)
            are treated as empty text.

    Returns:
        A list of tokens produced by ``word_tokenize`` on the lower-cased
        text, keeping only tokens for which ``str.isalnum()`` is true and
        that are not in the NLTK English stop-word list. Returns ``[]`` for
        non-string input.
    """
    # Guard first: calling .lower() on a non-string would raise AttributeError.
    if not isinstance(text, str):
        return []

    stop_words = _english_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

# Create LDA model and return top keywords
def create_lda_model(texts, num_topics, num_words, random_state):
    """Fit an LDA model on ``texts`` and collect the top words of each topic.

    Args:
        texts: Collection of documents, each passed to ``dictionary.doc2bow``
            (presumably a list of token lists, as produced by
            ``preprocess_text``). It is iterated twice.
        num_topics: Number of topics requested from ``LdaModel``.
        num_words: Number of top words to collect per topic.
        random_state: Seed forwarded to ``LdaModel``.

    Returns:
        A tuple ``(lda_model, dictionary, top_words)`` where ``top_words`` is
        a list with one entry per topic id in ``range(num_topics)``, each a
        list of the words returned by ``lda_model.show_topic``.
    """
    dictionary = corpora.Dictionary(texts)
    bag_of_words_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    lda_model = LdaModel(
        corpus=bag_of_words_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
    )

    # show_topic yields (word, weight) pairs; only the words are kept.
    top_words = [
        [word for word, _ in lda_model.show_topic(topic_id, topn=num_words)]
        for topic_id in range(num_topics)
    ]

    return lda_model, dictionary, top_words

# Show titles for a specific topic
def show_topic_titles(topic_id, processed_titles, lda_model, dictionary, keyword_df, sample_size=None):
    """Print the titles whose highest-probability topic is ``topic_id``.

    Args:
        topic_id: Topic id to report on.
        processed_titles: Tokenised titles, positionally aligned with the
            rows of ``keyword_df``.
        lda_model: Object providing ``get_document_topics(bow)``.
        dictionary: Object providing ``doc2bow(tokens)``.
        keyword_df: DataFrame whose ``'title'`` column holds the raw titles.
        sample_size: When truthy and smaller than the number of matching
            titles, only a random sample of that size is printed; otherwise
            every matching title is printed.

    Returns:
        None. The report is written to standard output; the final line
        always states the total number of matching titles.
    """
    matching_titles = []
    for position, tokens in enumerate(processed_titles):
        distribution = lda_model.get_document_topics(dictionary.doc2bow(tokens))
        # Each entry is a (topic id, probability) pair; pick the most probable.
        dominant_topic = max(distribution, key=lambda pair: pair[1])[0]
        if dominant_topic != topic_id:
            continue
        matching_titles.append(keyword_df['title'].iloc[position])

    wants_sample = bool(sample_size) and len(matching_titles) > sample_size
    displayed = random.sample(matching_titles, sample_size) if wants_sample else matching_titles

    report = [f"\nTitles for Topic {topic_id}:"]
    report.extend(f"- {title}" for title in displayed)
    report.append(f"Total titles for this topic: {len(matching_titles)}")
    print("\n".join(report))

def save_results(results, filename):
    """Persist per-keyword LDA results as pickles plus a JSON manifest.

    For every keyword in ``results`` the ``'lda_model'`` and ``'dictionary'``
    entries are pickled to ``<keyword>_lda_model.pkl`` and
    ``<keyword>_dictionary.pkl`` (paths relative to the current working
    directory). The manifest written to ``filename`` maps each keyword to its
    ``'topics_to_remove'`` value and the two pickle file names.

    Args:
        results: Mapping of keyword to a dict with at least the keys
            ``'lda_model'``, ``'dictionary'`` and ``'topics_to_remove'``.
        filename: Path of the JSON manifest to write.
    """
    manifest = {}
    for keyword, data in results.items():
        model_file = f"{keyword}_lda_model.pkl"
        dictionary_file = f"{keyword}_dictionary.pkl"

        manifest[keyword] = {
            'topics_to_remove': data['topics_to_remove'],
            'lda_model_file': model_file,
            'dictionary_file': dictionary_file
        }

        # The LDA model and the dictionary are stored in separate pickle files.
        for path, key in ((model_file, 'lda_model'), (dictionary_file, 'dictionary')):
            with open(path, "wb") as handle:
                pickle.dump(data[key], handle)

    with open(filename, 'w') as manifest_handle:
        json.dump(manifest, manifest_handle)

def load_results(filename):
    """Rebuild the per-keyword results from a JSON manifest and its pickles.

    Args:
        filename: Path of a JSON manifest mapping each keyword to a dict with
            the keys ``'lda_model_file'``, ``'dictionary_file'`` and
            ``'topics_to_remove'``.

    Returns:
        A dict mapping each keyword to
        ``{'lda_model', 'dictionary', 'topics_to_remove'}``, where the first
        two values are the objects unpickled from the referenced files.

    Note:
        The referenced files are read with ``pickle.load``, which can execute
        arbitrary code; only load manifests and pickles from a trusted source.
    """
    with open(filename, 'r') as manifest_handle:
        manifest = json.load(manifest_handle)

    def _unpickle(path):
        with open(path, "rb") as handle:
            return pickle.load(handle)

    return {
        keyword: {
            'lda_model': _unpickle(entry['lda_model_file']),
            'dictionary': _unpickle(entry['dictionary_file']),
            'topics_to_remove': entry['topics_to_remove'],
        }
        for keyword, entry in manifest.items()
    }

def _parse_topic_id(text, num_topics):
    """Return ``text`` as a topic id within ``range(num_topics)``, or None if invalid."""
    try:
        topic_id = int(text)
    except ValueError:
        return None
    return topic_id if 0 <= topic_id < num_topics else None


def _dominant_topic(lda_model, dictionary, tokens):
    """Return the most probable topic id for ``tokens``, or None if the model reports none."""
    topic_distribution = lda_model.get_document_topics(dictionary.doc2bow(tokens))
    if not topic_distribution:
        # max() on an empty sequence would raise ValueError.
        return None
    return max(topic_distribution, key=lambda pair: pair[1])[0]


def process_keywords(df, key_list, num_topics=10, num_words=5, random_state=100):
    """Interactively filter stories per keyword by removing whole LDA topics.

    Rows of ``df`` are first de-duplicated on ``'storyId'``. For each keyword
    in ``key_list`` an LDA model is fitted on that keyword's titles, the top
    words per topic are printed, and the user is prompted for the topics to
    discard. Rows whose most probable topic was discarded are dropped. The
    surviving rows of all keywords are sorted by ``'createDate'`` and written
    to ``filtered_data.xlsx``; models, dictionaries and the removed topics
    are stored through ``save_results`` in ``lda_results.json``.

    Prompt commands (surrounding whitespace and letter case are ignored):
        ``c`` / ``cc``            show 10 random / all titles of every topic
        ``c<num>`` / ``cc<num>``  show 20 random / all titles of one topic
        ``all``                   remove every topic
        empty input               keep every topic (no confirmation asked)
        space-separated numbers   remove those topics

    Invalid or out-of-range topic numbers are reported and the prompt is
    shown again instead of raising. A keyword that has no rows, or whose
    titles yield no tokens at all, is skipped with a message (an LDA model
    cannot be fitted on an empty vocabulary) and does not appear in the
    returned results.

    Args:
        df: DataFrame with the columns ``'keyword'``, ``'title'``,
            ``'storyId'`` and ``'createDate'``.
        key_list: Keywords to process, in order.
        num_topics: Number of LDA topics per keyword.
        num_words: Number of top words printed per topic.
        random_state: Seed passed on to the LDA model.

    Returns:
        A tuple ``(results, all_filtered_df)``. ``results`` maps each
        processed keyword to a dict with ``'lda_model'``, ``'dictionary'``,
        ``'topics_to_remove'`` (sorted, unique topic ids) and
        ``'filtered_df'``. ``all_filtered_df`` holds the kept rows of all
        keywords sorted by ``'createDate'``.
    """
    results = {}
    filtered_frames = []
    all_topics = list(range(num_topics))

    print(f"Length of df before removing duplicates: {len(df)}")
    df = df.drop_duplicates(subset='storyId').reset_index(drop=True)
    print(f"Length of df after removing duplicates: {len(df)}")

    for keyword in key_list:
        print(f"\nProcessing keyword: {keyword}")

        keyword_df = df[df['keyword'] == keyword]
        processed_titles = [preprocess_text(title) for title in keyword_df['title']]

        # Without a single token there is no vocabulary to fit a model on.
        if not any(processed_titles):
            print(f"No usable titles for keyword '{keyword}'; skipping.")
            continue

        lda_model, dictionary, top_words = create_lda_model(processed_titles, num_topics, num_words, random_state)

        while True:
            # Print top keywords for each topic
            for i, words in enumerate(top_words):
                print(f"Topic {i}: {' '.join(words)}")

            # User input for topic removal
            topics_to_remove_input = input("Enter the topic numbers to remove (space-separated), 'all' to remove all, 'c' to check 10 random titles for each topic, 'cc' to check all titles for each topic, 'c<num>' to check 20 random titles for a specific topic, 'cc<num>' to check all titles for a specific topic, or press Enter to keep all: ")
            command = topics_to_remove_input.strip().lower()

            # 'c' / 'cc': inspect every topic, then prompt again.
            if command in ('c', 'cc'):
                sample_size = 10 if command == 'c' else None
                for topic_id in all_topics:
                    show_topic_titles(topic_id, processed_titles, lda_model, dictionary, keyword_df, sample_size=sample_size)
                continue

            # 'c<num>' / 'cc<num>': inspect a single topic, then prompt again.
            if command.startswith('c'):
                prefix = 'cc' if command.startswith('cc') else 'c'
                topic_to_check = _parse_topic_id(command[len(prefix):], num_topics)
                if topic_to_check is None:
                    print(f"Invalid input. Please enter a valid topic number after '{prefix}'.")
                else:
                    sample_size = None if prefix == 'cc' else 20
                    show_topic_titles(topic_to_check, processed_titles, lda_model, dictionary, keyword_df, sample_size=sample_size)
                continue

            if command == 'all':
                topics_to_remove = list(all_topics)
            elif command == '':
                topics_to_remove = []
                break  # No confirmation needed, proceed to next keyword
            else:
                parsed = [_parse_topic_id(part, num_topics) for part in command.split()]
                if None in parsed:
                    print(f"Invalid input. Please enter topic numbers between 0 and {num_topics - 1}, separated by spaces.")
                    continue
                # Sorted and de-duplicated so the stored list compares cleanly
                # against the full topic range.
                topics_to_remove = sorted(set(parsed))

            # Show 10 random titles for each topic to be removed
            for topic in topics_to_remove:
                show_topic_titles(topic, processed_titles, lda_model, dictionary, keyword_df, sample_size=10)

            # Confirm deletion
            confirm = input("Do you want to proceed with this deletion? (y/n): ").strip().lower()
            if confirm == 'y':
                break
            print("Deletion cancelled. Please select topics to remove again.")

        if topics_to_remove == all_topics:
            # Every topic is removed, so no row can survive.
            filtered_df = keyword_df.iloc[[]].copy()
        else:
            removed = set(topics_to_remove)
            kept_positions = [
                position
                for position, tokens in enumerate(processed_titles)
                if _dominant_topic(lda_model, dictionary, tokens) not in removed
            ]
            filtered_df = keyword_df.iloc[kept_positions]

        results[keyword] = {
            'lda_model': lda_model,
            'dictionary': dictionary,
            'topics_to_remove': topics_to_remove,
            'filtered_df': filtered_df
        }
        filtered_frames.append(filtered_df)

    # Concatenate once; fall back to an empty frame that still has the input
    # columns so the sort below works when no keyword was processed.
    if filtered_frames:
        all_filtered_df = pd.concat(filtered_frames)
    else:
        all_filtered_df = df.iloc[0:0].copy()

    # Sort by 'createDate' and save to Excel
    all_filtered_df = all_filtered_df.sort_values('createDate')
    all_filtered_df.to_excel('filtered_data.xlsx', index=False)

    # Save results to a JSON file
    save_results(results, 'lda_results.json')

    return results, all_filtered_df

def process_new_data(new_df, results_file, existing_data_file):
    """Filter new stories with the saved LDA models and append them to the workbook.

    The stored models, dictionaries and removed-topic lists are loaded via
    ``load_results``. For each stored keyword, the matching rows of
    ``new_df`` are tokenised and assigned their most probable topic; rows
    whose topic is in that keyword's removed list are dropped. The remaining
    rows are appended to the data read from ``existing_data_file``, the
    result is sorted by ``'createDate'`` and written back to the same file.

    Rows of ``new_df`` whose keyword has no stored model are ignored. When
    both frames have a ``'storyId'`` column, new rows that repeat a
    ``storyId`` (within ``new_df`` or already present in the existing data)
    are skipped so that re-running on overlapping data does not create
    duplicates. ``new_df`` itself is not modified.

    Args:
        new_df: DataFrame with at least the columns ``'keyword'`` and
            ``'title'``, plus the columns of the existing data.
        results_file: Path of the JSON manifest written by ``save_results``.
        existing_data_file: Path of the Excel file holding the previously
            filtered data; it is overwritten with the updated data.

    Returns:
        The updated DataFrame sorted by ``'createDate'``.
    """
    # Load the existing filtered data
    existing_df = pd.read_excel(existing_data_file)

    # Load the LDA models and results
    results = load_results(results_file)

    # Skip stories that are already stored or repeated in the new batch.
    if 'storyId' in new_df.columns and 'storyId' in existing_df.columns:
        new_df = new_df.drop_duplicates(subset='storyId')
        new_df = new_df[~new_df['storyId'].isin(existing_df['storyId'])]

    frames = [existing_df]
    for keyword, entry in results.items():
        keyword_df = new_df[new_df['keyword'] == keyword]
        if keyword_df.empty:
            continue

        lda_model = entry['lda_model']
        dictionary = entry['dictionary']
        topics_to_remove = set(entry['topics_to_remove'])

        # Every topic of this keyword was rejected, so nothing can be kept.
        # Comparing against the actual topic ids (instead of the list length)
        # is not fooled by duplicate entries in the stored list.
        if topics_to_remove.issuperset(range(lda_model.num_topics)):
            continue

        kept_positions = []
        for position, title in enumerate(keyword_df['title']):
            bow = dictionary.doc2bow(preprocess_text(title))
            topic_distribution = lda_model.get_document_topics(bow)
            # An empty distribution has no dominant topic; such rows are kept
            # rather than letting max() raise on an empty sequence.
            if topic_distribution:
                main_topic = max(topic_distribution, key=lambda pair: pair[1])[0]
            else:
                main_topic = None
            if main_topic not in topics_to_remove:
                kept_positions.append(position)

        if kept_positions:
            frames.append(keyword_df.iloc[kept_positions])

    # Concatenate once instead of once per keyword.
    existing_df = pd.concat(frames)

    # Sort by 'createDate' and save the updated data
    existing_df = existing_df.sort_values('createDate')
    existing_df.to_excel(existing_data_file, index=False)

    return existing_df

In [None]:
# Example usage
if __name__ == "__main__":
    # Sample DataFrame with the 'keyword', 'title', 'storyId' and
    # 'createDate' columns expected by process_keywords.
    initial_titles = [
        'New AI breakthrough in natural language processing',
        'The future of quantum computing',
        'World Cup 2022: Argentina wins the final',
        'NBA playoffs: Golden State Warriors advance to finals',
        'Taylor Swift announces world tour',
        'Classical music in the digital age',
    ]
    df = pd.DataFrame({
        'keyword': ['tech', 'tech', 'sports', 'sports', 'music', 'music'],
        'title': initial_titles,
        'storyId': list(range(1, 7)),
        'createDate': [datetime(2023, 1, day) for day in range(1, 7)],
    })

    key_list = ['tech', 'sports', 'music']

    # Process the existing data (interactive: prompts for topics to remove)
    results, all_filtered_df = process_keywords(df, key_list)

    # Simulate a batch of new data, one story per keyword
    new_titles = [
        'Advancements in robotics and automation',
        'Olympic Games 2024: Preview of upcoming events',
        'The rise of K-pop in global music scene',
    ]
    new_df = pd.DataFrame({
        'keyword': list(key_list),
        'title': new_titles,
        'storyId': list(range(7, 10)),
        'createDate': [datetime(2023, 1, day) for day in range(7, 10)],
    })

    # Process the new data with the models saved by process_keywords
    updated_df = process_new_data(new_df, 'lda_results.json', 'filtered_data.xlsx')

    # Print the updated data
    print("\nUpdated filtered data:")
    print(updated_df)