# In [None]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import pickle
import os

# Download the NLTK resources that the preprocessing step relies on: the Punkt
# tokenizer models (used by word_tokenize) and the English stop-word list.
# Recent NLTK releases (3.8.2 and later) load the tokenizer from 'punkt_tab'
# instead of the pickled 'punkt' package, so both are requested to keep
# word_tokenize working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Text preprocessing
_ENGLISH_STOP_WORDS = None  # filled lazily on first use by _english_stop_words()


def _english_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Tokenize *text* and drop stop words and non-alphanumeric tokens.

    The text is lower-cased and split with ``word_tokenize``; a token is kept
    only if ``str.isalnum()`` is true for it and it is not an English stop
    word.

    Args:
        text: The raw string to process. Values that are not strings (for
            example ``None`` or a float NaN from a DataFrame column) and
            blank strings produce an empty token list instead of raising.

    Returns:
        A list of the surviving lower-cased tokens, in their original order.
    """
    # Missing values in a title column are not strings; treat them as having
    # no tokens rather than failing on ``.lower()``.
    if not isinstance(text, str) or not text.strip():
        return []

    stop_words = _english_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

# Build an LDA model and return the top keywords of every topic
def create_lda_model(texts, num_topics=10, topn=5, random_state=100):
    """Train an LDA model on tokenized documents and list each topic's top words.

    Args:
        texts: An iterable of documents, each a list of string tokens.
        num_topics: Number of topics to fit.
        topn: How many of the highest-weighted words to report per topic.
        random_state: Seed passed to ``LdaModel``.

    Returns:
        A tuple ``(lda_model, dictionary, top_words)`` where ``top_words`` is
        a list with one entry per topic id, each entry being the list of that
        topic's ``topn`` words.

    Raises:
        ValueError: If the documents contain no tokens at all, in which case
            there is no vocabulary to train on.
    """
    texts = list(texts)
    # An empty vocabulary cannot be modelled; fail early with a clear message
    # instead of deep inside the LDA constructor.
    if not any(len(doc) for doc in texts):
        raise ValueError("cannot build an LDA model: the texts contain no tokens")

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
    )

    top_words = [
        [word for word, _ in lda_model.show_topic(topic_id, topn=topn)]
        for topic_id in range(num_topics)
    ]

    return lda_model, dictionary, top_words

# Main routine
def _parse_topic_numbers(raw):
    """Parse a comma-separated string of topic numbers into a list of ints.

    Blank entries are ignored, so an empty answer (or a trailing comma) means
    "remove nothing" instead of failing on ``int('')``. Entries that are not
    integers still raise ``ValueError``.
    """
    pieces = (piece.strip() for piece in raw.split(','))
    return [int(piece) for piece in pieces if piece]


def _dominant_topic(lda_model, dictionary, tokens):
    """Return the id of the topic with the highest probability for *tokens*."""
    bow = dictionary.doc2bow(tokens)
    topic_distribution = lda_model.get_document_topics(bow)
    return max(topic_distribution, key=lambda pair: pair[1])[0]


def process_keywords(df, key_list):
    """Fit one LDA model per keyword and interactively filter out topics.

    For every keyword in *key_list* the matching rows of *df* are selected,
    their titles are preprocessed, an LDA model is trained, and the top words
    of each topic are printed. The user is then prompted for the topic
    numbers to discard; rows whose most probable topic is one of those are
    dropped. The model, dictionary and discarded topic numbers are also
    pickled to ``"<keyword>_lda_model.pkl"`` in the current directory.

    Keywords that have no rows in *df*, or whose titles yield no tokens after
    preprocessing, are reported and skipped because no model can be trained
    for them.

    Args:
        df: DataFrame with at least the columns ``'keyword'`` and ``'title'``.
        key_list: Iterable of keyword values to process.

    Returns:
        A dict mapping each processed keyword to a dict with the keys
        ``'lda_model'``, ``'dictionary'``, ``'topics_to_remove'`` and
        ``'filtered_df'``.

    Raises:
        ValueError: If the user enters something that is not an integer.
    """
    results = {}

    for keyword in key_list:
        print(f"Processing keyword: {keyword}")

        # Rows belonging to this keyword
        keyword_df = df[df['keyword'] == keyword]
        if keyword_df.empty:
            print(f"No rows found for keyword: {keyword}, skipping")
            continue

        # Tokenized titles, one token list per row
        processed_titles = [preprocess_text(title) for title in keyword_df['title']]
        if not any(processed_titles):
            print(f"No usable tokens for keyword: {keyword}, skipping")
            continue

        lda_model, dictionary, top_words = create_lda_model(processed_titles)

        # Show the top words of every topic so the user can judge them
        for i, words in enumerate(top_words):
            print(f"Topic {i}: {', '.join(words)}")

        # Ask which topics should be discarded
        raw_answer = input("Enter the topic numbers to remove (comma-separated): ")
        topics_to_remove = _parse_topic_numbers(raw_answer)

        # Keep the positions of rows whose dominant topic was not discarded.
        # Positions (not index labels) are collected, hence .iloc below.
        kept_positions = [
            position
            for position, tokens in enumerate(processed_titles)
            if _dominant_topic(lda_model, dictionary, tokens) not in topics_to_remove
        ]
        filtered_df = keyword_df.iloc[kept_positions]

        results[keyword] = {
            'lda_model': lda_model,
            'dictionary': dictionary,
            'topics_to_remove': topics_to_remove,
            'filtered_df': filtered_df
        }

        # Persist the model so it can be reused later
        with open(f"{keyword}_lda_model.pkl", "wb") as f:
            pickle.dump((lda_model, dictionary, topics_to_remove), f)

    return results

# Apply the stored per-keyword models to new data
def process_new_data(new_df, results):
    """Filter *new_df* with the stored models and append the surviving rows.

    For each keyword in *results* that has rows in *new_df*, every title is
    preprocessed and assigned its most probable topic using that keyword's
    stored model and dictionary. Rows whose topic is not listed in the
    keyword's ``'topics_to_remove'`` are concatenated onto its
    ``'filtered_df'``.

    Args:
        new_df: DataFrame with the columns ``'keyword'`` and ``'title'``.
        results: Dict in the shape returned by ``process_keywords``; it is
            updated in place.

    Returns:
        The same *results* dict that was passed in.
    """
    for keyword, entry in results.items():
        rows = new_df[new_df['keyword'] == keyword]
        if rows.empty:
            # Nothing new for this keyword
            continue

        model = entry['lda_model']
        vocab = entry['dictionary']
        excluded = entry['topics_to_remove']

        # Positions (within ``rows``) of the titles that should be kept
        kept = []
        for position, raw_title in enumerate(rows['title']):
            bag = vocab.doc2bow(preprocess_text(raw_title))
            weights = model.get_document_topics(bag)
            dominant = max(weights, key=lambda pair: pair[1])[0]
            if dominant in excluded:
                continue
            kept.append(position)

        survivors = rows.iloc[kept]
        entry['filtered_df'] = pd.concat([entry['filtered_df'], survivors])

    return results

# Example usage
if __name__ == "__main__":
    key_list = ['tech', 'sports', 'music']

    # Initial corpus: a DataFrame with 'keyword' and 'title' columns,
    # two titles per keyword.
    initial_titles = [
        'New AI breakthrough in natural language processing',
        'The future of quantum computing',
        'World Cup 2022: Argentina wins the final',
        'NBA playoffs: Golden State Warriors advance to finals',
        'Taylor Swift announces world tour',
        'Classical music in the digital age',
    ]
    df = pd.DataFrame({
        'keyword': ['tech', 'tech', 'sports', 'sports', 'music', 'music'],
        'title': initial_titles,
    })

    # Fit the per-keyword models on the initial corpus (interactive)
    results = process_keywords(df, key_list)

    # Simulated batch of newly arrived data, one title per keyword
    incoming_titles = [
        'Advancements in robotics and automation',
        'Olympic Games 2024: Preview of upcoming events',
        'The rise of K-pop in global music scene',
    ]
    new_df = pd.DataFrame({
        'keyword': ['tech', 'sports', 'music'],
        'title': incoming_titles,
    })

    # Run the new batch through the stored models
    updated_results = process_new_data(new_df, results)

    # Show what survived the filtering for each keyword
    for keyword, entry in updated_results.items():
        print(f"\nFiltered data for {keyword}:")
        print(entry['filtered_df'])