In [47]:
%%writefile app.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from transformers import pipeline
import torch
from urllib.parse import urlparse

# Page configuration comes first: Streamlit requires set_page_config() to be
# the first Streamlit command, and the cached loader below may render a
# spinner while the models are loading.
st.set_page_config(page_title="News Q&A", page_icon="📰", layout="wide")


@st.cache_resource
def _load_models():
    """Load the QA pipeline and the sentence encoder once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    the loader keeps both models in memory instead of rebuilding them on
    each rerun.

    Returns:
        A ``(qa_pipeline, sentence_encoder)`` tuple.
    """
    qa = pipeline(
        "question-answering",
        model="distilbert-base-cased-distilled-squad",
        # Run extractive QA on CUDA device 0 when available, otherwise on CPU.
        device=0 if torch.cuda.is_available() else -1,
    )
    encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    return qa, encoder


# Module-level names used by the rest of the script.
qa_pipeline, model = _load_models()

st.title('📰 News Q&A')
st.write('Enter URLs of news articles and questions to get answers.')

def is_valid_url(url):
    """Return True when *url* has both a scheme and a network location.

    Malformed input that ``urlparse`` rejects with ``ValueError`` (for
    example an unbalanced IPv6 bracket such as ``http://[``) is reported as
    invalid instead of letting the exception propagate to the caller.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    return bool(parsed.scheme and parsed.netloc)

def fetch_article(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled host.

    Returns:
        The non-empty paragraph texts, one per line, with whitespace inside
        each paragraph collapsed to single spaces. Returns ``""`` when the
        request fails; the failure is reported through ``st.error``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        st.error(f"Error fetching article from {url}: {e}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    # Collapse whitespace inside each paragraph so a newline only ever marks
    # a paragraph boundary; embed_articles splits the article text on '\n'.
    paragraphs = (' '.join(para.get_text().split()) for para in soup.find_all('p'))
    return '\n'.join(text for text in paragraphs if text)

def embed_articles(articles):
    """Split each article into paragraphs and embed them.

    Args:
        articles: Iterable of ``(url, text)`` pairs. Each text is split on
            newlines; blank lines are dropped so they never occupy a slot
            in the search index.

    Returns:
        A ``(paragraphs, embeddings, sources)`` tuple where ``paragraphs``
        and ``sources`` are parallel lists (``sources[i]`` is the URL that
        ``paragraphs[i]`` came from) and ``embeddings`` stacks the encoder
        output for every paragraph, in the same order. When no article has
        any content, the lists are empty and ``embeddings`` is an empty
        array.
    """
    all_paragraphs = []
    embedding_chunks = []
    article_sources = []

    for url, article in articles:
        paragraphs = [line.strip() for line in article.split('\n') if line.strip()]
        if not paragraphs:
            continue
        embedding_chunks.append(model.encode(paragraphs))
        all_paragraphs.extend(paragraphs)
        article_sources.extend([url] * len(paragraphs))

    if not embedding_chunks:
        return all_paragraphs, np.array([]), article_sources
    return all_paragraphs, np.vstack(embedding_chunks), article_sources

def create_index(embeddings):
    """Build a flat L2 FAISS index containing every row of *embeddings*.

    The index dimensionality is taken from the second axis of *embeddings*.
    """
    n_dims = embeddings.shape[1]
    flat_index = faiss.IndexFlatL2(n_dims)
    flat_index.add(embeddings)
    return flat_index

# Device ordinal: 0 when CUDA is available, -1 otherwise.
# NOTE(review): nothing in the visible code reads this value — confirm use.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

import re

def get_answer(question, all_paragraphs, index, article_sources, max_sentences=3, top_k=5):
    """Answer *question* from the paragraphs nearest to it in *index*.

    The question is embedded, its nearest paragraphs are retrieved, and the
    QA pipeline is run on each candidate; the highest-scoring answer wins.

    Args:
        question: Natural-language question.
        all_paragraphs: Paragraph texts, in the order they were indexed.
        index: Search index built over the paragraph embeddings.
        article_sources: URL for each paragraph, parallel to
            ``all_paragraphs``.
        max_sentences: Number of leading sentences of the winning paragraph
            to return as supporting context.
        top_k: Maximum number of nearest paragraphs handed to the QA model.

    Returns:
        ``(answer, source_url, context)``; all three are ``None`` when there
        are no paragraphs or no candidate scores above zero.
    """
    if not all_paragraphs:
        return None, None, None

    question_embedding = model.encode([question])
    # Never request more neighbours than exist: FAISS pads missing results
    # with -1, and a -1 list index would silently alias the last paragraph.
    k = max(1, min(top_k, len(all_paragraphs)))
    _, neighbours = index.search(np.array(question_embedding), k=k)

    best_answer = None
    best_source = None
    best_score = 0
    best_context = None

    for position in neighbours[0]:
        if position < 0:
            continue
        paragraph = all_paragraphs[position]
        if not paragraph.strip():
            continue
        result = qa_pipeline(question=question, context=paragraph)
        if result['score'] > best_score:
            best_score = result['score']
            best_answer = result['answer']
            best_source = article_sources[position]
            best_context = paragraph

    if best_context is None:
        return best_answer, best_source, None

    # Split only the winning paragraph into sentences and keep the first few.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', best_context)
    return best_answer, best_source, ' '.join(sentences[:max_sentences])

# Custom background colours for the main area and the sidebar, injected as
# raw HTML (hence unsafe_allow_html).
_PAGE_CSS = """
    <style>
    .main {
        background-color: #BB9AB1;
        padding: 20px;
    }
    .sidebar .sidebar-content {
        background-color: #987D9A;
    }
   
    </style>
    """
st.markdown(_PAGE_CSS, unsafe_allow_html=True)


# Sidebar layout: three article URL fields, the question field, and a
# checkbox that toggles whether the source paragraph is shown with the answer.
st.sidebar.title("User Input")
url1, url2, url3 = [
    st.sidebar.text_input(f'Enter news article URL {slot}') for slot in (1, 2, 3)
]
question = st.sidebar.text_input('Enter your question')
show_source_paragraph = st.sidebar.checkbox('Show Source Paragraph')

# Run the retrieval + QA pipeline when the button is pressed. Each missing
# precondition reports its own error and skips the remaining steps.
if st.sidebar.button('Get Answer'):
    if not question:
        st.error('Please provide a question.')
    else:
        valid_urls = [candidate for candidate in (url1, url2, url3) if is_valid_url(candidate)]
        if not valid_urls:
            st.error('Please provide valid URLs.')
        else:
            with st.spinner('Fetching articles and generating answer...'):
                # Pair every URL with its downloaded text, then embed.
                articles = [(link, fetch_article(link)) for link in valid_urls]
                all_paragraphs, all_embeddings, article_sources = embed_articles(articles)

                if not all_paragraphs:
                    st.error('No valid content was found in the provided articles.')
                else:
                    index = create_index(all_embeddings)
                    answer, source, best_paragraph = get_answer(
                        question, all_paragraphs, index, article_sources
                    )
                    if not answer:
                        st.error('No answer could be found.')
                    else:
                        st.success('Answer generated successfully!')
                        st.write('### Answer:', answer)
                        # The paragraph is shown only when the checkbox is ticked.
                        if show_source_paragraph:
                            st.write('### Source Paragraph:', best_paragraph)
                        st.write('### Source URL:', source)


Overwriting app.py


In [49]:
import subprocess
# Start `streamlit run app.py` as a child process without waiting for it.
launch_command = ['streamlit', 'run', 'app.py']
subprocess.Popen(launch_command)


<Popen: returncode: None args: ['streamlit', 'run', 'app.py']>