In [3]:
import requests
import praw
import spacy
import re
from transformers import pipeline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from top2vec import Top2Vec
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
from datetime import datetime

# Set up API keys (secure these in a real application)
NEWS_API_KEY = 'your_news_api_key'
reddit = praw.Reddit(client_id='your_client_id',
                     client_secret='your_client_secret',
                     user_agent='event_summary_app')

# Function to fetch news from NewsAPI
def get_news(keyword, page_size=20):
    url = f'https://newsapi.org/v2/everything?q={keyword}&apiKey={NEWS_API_KEY}&pageSize={page_size}'
    response = requests.get(url)
    articles = response.json().get('articles', [])
    
    news = []
    for article in articles:
        news.append({
            'title': article['title'],
            'description': article['description'],
            'url': article['url'],
            'publishedAt': article['publishedAt']
        })
    return news

# Function to fetch top Reddit posts
def fetch_top_posts(search_query, subreddit_name='all', limit=10):
    subreddit = reddit.subreddit(subreddit_name)
    top_posts = subreddit.search(search_query, sort='top', limit=limit)
    
    post_data = []
    for post in top_posts:
        post_datetime = datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S')
        upvotes = post.ups
        downvotes = upvotes - post.score
        awards = post.all_awards if hasattr(post, 'all_awards') else []
        award_names = [award['name'] for award in awards]
        flair = post.link_flair_text if post.link_flair_text else "No Flair"
        
        post_data.append({
            'Title': post.title,
            'Body': post.selftext,
            'Date': post_datetime,
            'Upvotes': upvotes,
            'Downvotes': downvotes,
            'Comments': post.num_comments,
            'Link': post.url,
            'Awards': award_names,
            'Flair': flair
        })
    
    return post_data

# Function to clean and preprocess text
nlp = spacy.load("en_core_web_sm")
def preprocess_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Named Entity Recognition
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Sentiment analysis
sentiment_analyzer = pipeline('sentiment-analysis')
def analyze_sentiment(text):
    return sentiment_analyzer(text)

# Topic Modeling with Top2Vec
def get_topics(documents):
    # Using Doc2Vec for initial processing
    tagged_data = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(vector_size=100, min_count=2, epochs=40, workers=4)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    # Use Top2Vec for topic modeling
    top2vec_model = Top2Vec(documents, embedding_model='doc2vec')
    topics, topic_words, word_scores, topic_scores = top2vec_model.get_topics()
    return topics, topic_words, word_scores, topic_scores

# Event Timeline Plotting
def plot_timeline(events):
    event_dates = [datetime.strptime(event['date'], "%Y-%m-%dT%H:%M:%SZ") for event in events]
    event_texts = [event['text'] for event in events]
    
    plt.figure(figsize=(10, 5))
    plt.plot(event_dates, [i for i in range(len(event_dates))], 'bo-')
    
    for i, text in enumerate(event_texts):
        plt.text(event_dates[i], i, text, fontsize=9, verticalalignment='bottom', horizontalalignment='right')
    
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))
    plt.gca().xaxis.set_major_locator(mdates.HourLocator(interval=2))
    plt.gcf().autofmt_xdate()
    plt.title("Event Timeline")
    plt.xlabel("Date and Time")
    plt.ylabel("Events")
    plt.grid(True)
    plt.show()

# Main function to combine everything
def summarize_event(topic):
    # Fetch news and reddit data
    news_articles = get_news(topic)
    # reddit_posts = fetch_top_posts(topic)
    
    # Combine and preprocess data
    combined_data = [article['description'] for article in news_articles if article['description']] 
    # + \
                    # [post['Body'] for post in reddit_posts if post['Body']]

    cleaned_data = [preprocess_text(text) for text in combined_data]
    
    # Perform topic modeling
    topics, topic_words, _, _ = get_topics(cleaned_data)
    
    # Extract entities and sentiment for each news and reddit post
    events = []
    for article in news_articles:
        sentiment = analyze_sentiment(article['description'])
        entities = extract_entities(article['description'])
        events.append({
            'text': article['title'],
            'sentiment': sentiment,
            'entities': entities,
            'source': article['url'],
            'date': article['publishedAt']
        })
    
    for post in reddit_posts:
        sentiment = analyze_sentiment(post['Body'])
        entities = extract_entities(post['Body'])
        events.append({
            'text': post['Title'],
            'sentiment': sentiment,
            'entities': entities,
            'source': post['Link'],
            'date': post['Date']
        })
    
    # Plot timeline
    plot_timeline(events)
    
    # Print structured summary
    print("Summary of Events:")
    for event in events:
        print(f"- {event['text']} (Source: {event['source']}, Date: {event['date']})")
        print(f"  Sentiment: {event['sentiment'][0]['label']} ({event['sentiment'][0]['score']:.2f})")
        print(f"  Key Entities: {', '.join([ent[0] for ent in event['entities']])}")
        print()

if __name__ == "__main__":
    topic = input("Enter the event/topic: ")
    summarize_event(topic)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


RuntimeError: you must first build vocabulary before training the model