In [None]:
# 1. We could use NER for keyword extractions
# 2. We could use google search for evidence retrieval
# 3. Fine tuning and prompt engineering needed
# 4. Avoid posting to the duplicate info to Weaviate
# 5. Combine with Predictive AI results
# 6. 

In [3]:
# Starting of the pipeline, get a real time news
import requests
from bs4 import BeautifulSoup
import re

def scrape_site(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract header
        header = soup.find(['h1']).get_text().strip()

        # Extract content
        content_tags = soup.find_all(['p'])
        content = [tag.get_text().strip().replace('\xa0', ' ') for tag in content_tags]

        # Find the keyword 'By' to extract the author's name
        page_text = soup.get_text()
        match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', page_text)
        authors = match.group(1).strip().replace('and', ',') if match else 'Author not found'
        author_lst = [auth.strip() for auth in authors.split(',')]
        return header, content, author_lst
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

url = "https://www.cnn.com/2024/02/18/politics/emissions-rules-ev-growth-biden-administration/index.html"
header, content, authors = scrape_site(url)

In [4]:
# Can potentially conduct NER methods to extract keywords
article = " ".join(content)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

tfidf_vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
tfidf_matrix = tfidf_vectorizer.fit_transform([article])
feature_names = tfidf_vectorizer.get_feature_names()
scores = tfidf_matrix.toarray().flatten()
indices = scores.argsort()[::-1]
top_n = 10
top_features = [(feature_names[i], scores[i]) for i in indices[:top_n]]
keywords = " ".join([feature for feature, score in top_features])
for feature, score in top_features:
    print(f"{feature}: {score}")

emissions: 0.28867513459481287
epa: 0.24743582965269675
biden: 0.24743582965269675
president: 0.20619652471058064
evs: 0.20619652471058064
automakers: 0.20619652471058064
climate: 0.20619652471058064
rule: 0.20619652471058064
considering: 0.1649572197684645
said: 0.1649572197684645


In [5]:
# Can potentially use Google search API instead
# Advanced RAG
from gnews import GNews
import numpy as np
google_news = GNews()
max_results = 20
# google_news.period = '7d'
google_news.max_results = max_results 
# google_news.country = 'United States'
google_news.language = 'english'
# google_news.exclude_websites = ['yahoo.com', 'cnn.com'] 
google_news.start_date = (2020, 1, 1)
google_news.end_date = (2024, 2, 3)
articles = []
news = google_news.get_news(keywords)
for i in range(max_results):
    try:
        article = google_news.get_full_article(
            news[i]['url']
        )
    except:
        break
    articles.append(article)
title_text = [article.title for article in articles if article]
article_text = [article.text for article in articles if article]

An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/business/environment/us-proposes-56-vehicle-emissions-cut-by-2032-requiring-big-ev-jump-2023-04-12/ on URL https://news.google.com/rss/articles/CBMie2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL2J1c2luZXNzL2Vudmlyb25tZW50L3VzLXByb3Bvc2VzLTU2LXZlaGljbGUtZW1pc3Npb25zLWN1dC1ieS0yMDMyLXJlcXVpcmluZy1iaWctZXYtanVtcC0yMDIzLTA0LTEyL9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en
An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.wsj.com/articles/epa-seeks-to-boost-evs-with-toughest-ever-rules-on-tailpipe-emissions-5658217d on URL https://news.google.com/rss/articles/CBMia2h0dHBzOi8vd3d3Lndzai5jb20vYXJ0aWNsZXMvZXBhLXNlZWtzLXRvLWJvb3N0LWV2cy13aXRoLXRvdWdoZXN0LWV2ZXItcnVsZXMtb24tdGFpbHBpcGUtZW1pc3Npb25zLTU2NTgyMTdk0gEA?oc=5&hl=en-US&gl=US&ceid=US:en


In [6]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Weaviate
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/zhj003/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Split the articles into chunks before posting to Weaviate database
class Document:
    def __init__(self, text):
        self.page_content = text
        self.metadata = {'source': 'google news'}

documents = [Document(article) for article in article_text]
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0)
chunked_articles = text_splitter.split_documents(documents)
chunked_articles = [document.page_content for document in chunked_articles]

02/19/2024 06:13:49 AM - Created a chunk of size 414, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 374, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 326, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 409, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 590, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 315, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 310, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 520, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 417, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 320, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 326, which is longer 

02/19/2024 06:13:49 AM - Created a chunk of size 485, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 367, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 361, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 416, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 432, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 563, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 456, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 509, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 585, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 383, which is longer than the specified 300
02/19/2024 06:13:49 AM - Created a chunk of size 432, which is longer 

In [8]:
# Our tokenized method
import torch
from transformers import BertTokenizer, BertModel, pipeline

def text_embedding(data):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    
    def get_bert_embeddings(data):
        tokens = tokenizer(data.tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
        return embeddings

    batch_size = 128
    num_samples = len(data)
    num_batches = (num_samples + batch_size - 1) // batch_size

    embeddings_list = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        batch_data = data.iloc[start_idx:end_idx]
        batch_embeddings = get_bert_embeddings(batch_data)
        embeddings_list.append(batch_embeddings)

    embeddings = torch.cat(embeddings_list, dim=0).cpu().numpy()
    return embeddings

2024-02-19 06:13:52.415472: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
02/19/2024 06:13:54 AM - Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
02/19/2024 06:13:54 AM - NumExpr defaulting to 8 threads.


In [9]:
import weaviate
import json
import pandas as pd

client = weaviate.Client(
    url = "https://testing-cluster-2qgcoz4q.weaviate.network",  # Replace with your endpoint
    auth_client_secret=weaviate.auth.AuthApiKey(api_key="qRarwGLC0CwrpQsSpK64E1V0c3HajFoAy893"),  # Replace w/ your Weaviate instance API key
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [10]:
# Post data to weaviate cloud db 
for article in chunked_articles:
    properties = {"context": article}
    vector = text_embedding(pd.Series(article)).tolist()[0]
    client.data_object.create(properties, "test_dataset_1", vector=vector)

In [11]:
evidence = []
for text_query in content:
    query_vector = {"vector" : text_embedding(pd.Series(text_query)).tolist()[0],
                "distance" : 1.0
    }
    results = client.query.get("test_dataset_1", ["context"]).with_additional("distance"
                ).with_near_vector(query_vector).do()
    evidence.append([result["context"] for result in results['data']['Get']['Test_dataset_1'][:10]])

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key="AIzaSyClyO_P1azrly9sScfVL3dJnKy8q7HtayU")

In [18]:
def evaluate_claim(claim, evidence):
    prompt = f"""please rate the veracity of the following claim on a scale from 0 to 5,
    with 0 being completely true and 5 being entirely false.
    Please ensure that the first character in your response is a single integer between 0 and 5,
    and explain your reasoning with the evidence we provided below the claim:
    {claim} \n Here are the evidence of this claim"""

    response = llm.invoke(prompt).content

    rating = response[0]

    return response, int(rating)

for i in range (len(content)):
    print(evaluate_claim(content[i], evidence[i]))

('5\nEvidence not provided.', 5)
('2\n\nThe claim is partially true. The Biden administration is considering relaxing some of the stringent vehicle emissions rules it proposed last year, but it is not clear how much time automakers would be given to meet the requirements. The administration is still working on the details of the plan, and it is possible that the final version will be different from what was initially proposed.', 2)
("1\n\nThe claim is mostly true. The Environmental Protection Agency's vehicle emissions rule is a key plank of President Joe Biden's climate agenda. Biden has made the transition to EVs a signature issue of his presidency, stressing the economic impacts in addition to the boost for the climate. The rule will require new vehicles to emit significantly less greenhouse gas emissions, and it is expected to help the United States meet its emissions reduction targets under the Paris Agreement.", 1)
('4\n\nThe claim is that the EPA is considering delaying strict e