In [5]:
!pip install flask requests beautifulsoup4 nltk scikit-learn transformers tensorflow rouge-score python-dotenv schedule sqlalchemy




In [8]:
# Import necessary libraries
import os
import requests
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from transformers import pipeline, TFAutoModelForSeq2SeqLM, AutoTokenizer
from rouge_score import rouge_scorer
from dotenv import load_dotenv
import schedule
import time
from datetime import datetime
from flask import Flask, jsonify

# Make sure the NLTK resources requested by this notebook are available locally
nltk.download('punkt')
nltk.download('stopwords')

# Load environment variables (python-dotenv)
load_dotenv()

# Credentials and endpoints are read from the environment; each one is None when unset
NEWS_API_KEY, WORDPRESS_URL, WORDPRESS_USERNAME, WORDPRESS_PASSWORD = (
    os.environ.get(variable_name)
    for variable_name in (
        'NEWS_API_KEY',
        'WORDPRESS_URL',
        'WORDPRESS_USERNAME',
        'WORDPRESS_PASSWORD',
    )
)

# Fetch articles from NewsAPI
def fetch_articles_from_newsapi(query, api_key, timeout=10):
    """Query the NewsAPI ``/v2/everything`` endpoint and return its articles.

    Args:
        query: Search expression sent as the ``q`` parameter.
        api_key: Key sent as the ``apiKey`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list stored under the ``articles`` key of the JSON response, or an
        empty list when that key is missing or null.
    """
    # Passing the values through `params` lets requests URL-encode them, so a
    # query containing '&', '#' or spaces cannot corrupt the request URL.
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params={'q': query, 'apiKey': api_key},
        timeout=timeout,
    )
    # `or []` also covers a response where "articles" is present but null.
    return response.json().get('articles') or []

# Fetch articles from Reuters
def fetch_articles_from_reuters(timeout=10):
    """Scrape the Reuters world-news archive page and the articles it links to.

    Every ``.story-content a`` link on the archive page is followed once; the
    article's ``<h1>`` text becomes the title and the text of all ``<p>``
    elements, joined by spaces, becomes the content.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``{'title': ..., 'content': ...}`` dicts. Links without an
        ``href`` and pages without an ``<h1>`` are skipped.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative and absolute hrefs alike

    base_url = 'https://www.reuters.com'
    listing = requests.get(base_url + '/news/archive/worldNews', timeout=timeout)
    soup = BeautifulSoup(listing.content, 'html.parser')

    articles = []
    seen_urls = set()
    for link in soup.select('.story-content a'):
        href = link.get('href')
        if not href:
            continue
        # urljoin leaves an already-absolute href untouched instead of
        # gluing it onto the site root.
        article_url = urljoin(base_url, href)
        if article_url in seen_urls:
            # The same story linked more than once is fetched only once.
            continue
        seen_urls.add(article_url)

        article_response = requests.get(article_url, timeout=timeout)
        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        heading = article_soup.find('h1')
        if heading is None:
            # No headline found: skip rather than fail the whole scrape.
            continue
        content = ' '.join(p.get_text() for p in article_soup.find_all('p'))
        articles.append({'title': heading.get_text(), 'content': content})
    return articles

# Preprocess text data
def preprocess_text(text):
    """Normalize text to a space-separated string of lowercase alphabetic tokens.

    The text is split into sentences and then into word tokens with NLTK; tokens
    that are not purely alphabetic (numbers, punctuation, mixed tokens) are dropped.
    """
    kept_tokens = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            if token.isalpha():
                kept_tokens.append(token.lower())
    return ' '.join(kept_tokens)

# Extractive summarization
def extractive_summary(text, num_sentences=5):
    """Pick up to ``num_sentences`` representative sentences from ``text``.

    Sentences are embedded with TF-IDF and grouped with k-means; for each
    cluster the sentence closest to the cluster centre is kept. The chosen
    sentences are returned in their original order, joined by spaces.

    Args:
        text: Raw text that still contains sentence punctuation.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary string. Empty input (or ``num_sentences <= 0``) gives ``''``;
        text with no more than ``num_sentences`` sentences is returned whole.
    """
    sentences = nltk.sent_tokenize(text) if text else []
    if num_sentences <= 0 or not sentences:
        return ''
    if len(sentences) <= num_sentences:
        # KMeans cannot form more clusters than there are samples, and there is
        # nothing to shorten anyway.
        return ' '.join(sentences)

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised by scikit-learn when no vocabulary is left (e.g. only stop
        # words); fall back to the leading sentences.
        return ' '.join(sentences[:num_sentences])

    # A fixed seed makes the summary reproducible between runs.
    kmeans = KMeans(n_clusters=num_sentences, random_state=0)
    kmeans.fit(X)

    # transform() gives each sentence's distance to each cluster centre, shaped
    # (n_sentences, n_clusters); the column-wise argmin is therefore a *sentence*
    # index per cluster. (Sorting cluster_centers_ would yield vocabulary indices.)
    distances = kmeans.transform(X)
    representative = sorted({int(index) for index in distances.argmin(axis=0)})
    return ' '.join(sentences[index] for index in representative)

# Abstractive summarization
# Pipelines already built, keyed by model name, so weights are loaded once per kernel.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model_name):
    """Return the summarization pipeline for ``model_name``, building it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _SUMMARIZER_CACHE[model_name] = pipeline(
            'summarization', model=model, tokenizer=tokenizer, framework='tf'
        )
    return _SUMMARIZER_CACHE[model_name]


def abstractive_summary(text, max_length=130, min_length=30):
    """Generate an abstractive summary of ``text`` with the ``t5-small`` model.

    Args:
        text: Text to summarize.
        max_length: Forwarded to the pipeline call as ``max_length``.
        min_length: Forwarded to the pipeline call as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or ``''`` when
        ``text`` is empty or whitespace only.
    """
    if not text or not text.strip():
        return ''
    # Reuse the cached pipeline instead of reloading model and tokenizer for
    # every article.
    summarizer = _get_summarizer("t5-small")
    # truncation=True asks the pipeline to cut over-long inputs down to the
    # model's maximum input length rather than passing them through whole.
    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

# Evaluate summary using ROUGE
def evaluate_summary(reference, summary):
    """Score ``summary`` against ``reference`` with ROUGE-1 and ROUGE-L (stemming on).

    Returns whatever ``RougeScorer.score`` produces for the two texts.
    """
    metrics = ['rouge1', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, summary)

# Post summary to WordPress
def post_to_wordpress(title, content, url, username, password):
    """POST a post with status ``publish`` to ``url`` using basic auth.

    The title and content are sent as a JSON body; the decoded JSON body of the
    HTTP response is returned.
    """
    payload = dict(title=title, content=content, status='publish')
    credentials = (username, password)
    return requests.post(url, json=payload, auth=credentials).json()

# Fetch, summarize, and post articles
def post_summaries():
    """Fetch articles from NewsAPI and Reuters, summarize each, and post to WordPress.

    For every article the ``content`` field is used, falling back to
    ``description``; articles with no text are skipped. Both an extractive and an
    abstractive summary are generated and posted together as one WordPress post.

    Returns:
        A list with the value returned by ``post_to_wordpress`` for each article
        that was posted, in processing order.
    """
    newsapi_articles = fetch_articles_from_newsapi('latest news', NEWS_API_KEY)
    reuters_articles = fetch_articles_from_reuters()

    results = []
    for article in newsapi_articles + reuters_articles:
        # Either field may be present but null, so chain `or` instead of relying
        # on the default argument of dict.get.
        content = article.get('content') or article.get('description') or ''
        if not content.strip():
            continue

        # The summarizers get the raw text: the extractive one needs sentence
        # punctuation to split on, which preprocess_text() strips out.
        extractive = extractive_summary(content)
        abstractive = abstractive_summary(content)

        summary = f"Extractive Summary: {extractive}\n\nAbstractive Summary: {abstractive}"

        title = article.get('title') or 'No Title'
        results.append(
            post_to_wordpress(title, summary, WORDPRESS_URL, WORDPRESS_USERNAME, WORDPRESS_PASSWORD)
        )
    return results

# Flask application exposing an endpoint that triggers a summarization run
app = Flask(__name__)


@app.route('/summarize', methods=['POST'])
def summarize():
    """Handle POST /summarize: run post_summaries() and report its result as JSON."""
    payload = {"status": "success", "response": post_summaries()}
    return jsonify(payload), 200

# Schedule the job
schedule.every().day.at("08:00").do(post_summaries)


def _run_scheduler(poll_seconds=60):
    """Poll the schedule forever, running any due job every ``poll_seconds`` seconds."""
    while True:
        schedule.run_pending()
        time.sleep(poll_seconds)


if __name__ == '__main__':
    import threading

    # app.run() blocks until the server stops, so a polling loop placed after it
    # would never execute while the server is up. Run the scheduler in a daemon
    # thread that is started first and dies with the process.
    threading.Thread(target=_run_scheduler, daemon=True).start()

    # The reloader re-executes the program in a child process, which would start
    # a second scheduler and does not work from a notebook kernel.
    # NOTE(review): debug=True enables the interactive debugger; do not expose
    # this server beyond local development.
    app.run(debug=True, use_reloader=False)


ModuleNotFoundError: No module named 'transformers'

In [7]:
# Diagnostic cell: report whether the transformers package can be imported
try:
    from transformers import pipeline, TFAutoModelForSeq2SeqLM, AutoTokenizer
except ModuleNotFoundError as import_error:
    print("Error importing transformers:", import_error)
else:
    print("Transformers library imported successfully.")


Error importing transformers: No module named 'transformers'
