In [None]:
# 1. We could use NER for keyword extraction
# 2. We could use Google search for evidence retrieval
# 3. Fine-tuning and prompt engineering needed
# 4. Avoid posting duplicate info to Weaviate
# 5. Combine with Predictive AI results
# 6. 

In [1]:
# Start of the pipeline: fetch a real-time news article
import requests
from bs4 import BeautifulSoup
import re

def scrape_site(url, timeout=10):
    """Download a news page and pull out its headline, paragraphs and byline.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``(header, content, author_lst)`` tuple:
          * ``header`` - stripped text of the first ``<h1>``, or ``None`` when
            the page has no ``<h1>``.
          * ``content`` - list with the stripped text of every ``<p>`` tag
            (non-breaking spaces replaced by regular spaces).
          * ``author_lst`` - names found after the first standalone "By",
            or ``['Author not found']`` when no byline is detected.
        ``(None, None, None)`` is returned when the request fails or the
        status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport errors the same way as a bad status code.
        print(f"Failed to retrieve the webpage. Error: {exc}")
        return None, None, None

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract header; a page without <h1> must not crash the scraper.
    header_tag = soup.find('h1')
    header = header_tag.get_text().strip() if header_tag is not None else None

    # Extract content
    content = [tag.get_text().strip().replace('\xa0', ' ') for tag in soup.find_all('p')]

    # Find the keyword 'By' to extract the author's name
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', soup.get_text())
    if match is None:
        return header, content, ['Author not found']

    # Split on commas and on the standalone word "and" only, so names that
    # merely contain the letters "and" (Sandra, Anderson, ...) stay intact.
    pieces = re.split(r'\s*(?:,|\band\b)\s*', match.group(1).strip())
    author_lst = [name.strip() for name in pieces if name.strip()]
    return header, content, author_lst or ['Author not found']

# Article to fact-check: `content` (its paragraphs) is what the later cells
# embed, look up evidence for, and rate.
url = "https://www.cnn.com/2024/02/27/entertainment/jam-master-jay-murder-verdict/index.html"
header, content, authors = scrape_site(url=url)

In [2]:
# Can potentially conduct NER methods to extract keywords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

article = " ".join(content)

# 'english' selects scikit-learn's built-in English stop-word list (the same
# words as ENGLISH_STOP_WORDS); recent releases reject a frozenset here.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Only one document is fitted, so the IDF term is identical for every word and
# the ranking below is effectively by normalised term frequency.
tfidf_matrix = tfidf_vectorizer.fit_transform([article])

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# old installations that predate get_feature_names_out().
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

scores = tfidf_matrix.toarray().flatten()
indices = scores.argsort()[::-1]  # highest score first
top_n = 10
top_features = [(feature_names[i], scores[i]) for i in indices[:top_n]]

# Space-separated top terms; used as the news search query in the next cell.
keywords = " ".join(feature for feature, _ in top_features)
for feature, score in top_features:
    print(f"{feature}: {score}")

jay: 0.44561177832918486
said: 0.25798576640110704
jam: 0.21107926341908756
jordan: 0.21107926341908756
master: 0.21107926341908756
washington: 0.18762601192807785
bryant: 0.18762601192807785
run: 0.1641727604370681
dmc: 0.1407195089460584
murder: 0.1407195089460584


In [3]:
# Can potentially use Google search API instead
# Advanced RAG
from gnews import GNews
import numpy as np
google_news = GNews()
max_results = 20
# google_news.period = '7d'
google_news.max_results = max_results
# google_news.country = 'United States'
google_news.language = 'english'
# google_news.exclude_websites = ['yahoo.com', 'cnn.com']
google_news.start_date = (2020, 1, 1)
google_news.end_date = (2024, 2, 3)

articles = []
news = google_news.get_news(keywords) or []
# Iterate over what was actually returned instead of indexing up to
# max_results and relying on an exception to stop the loop.
for news_item in news[:max_results]:
    try:
        full_article = google_news.get_full_article(news_item['url'])
    except Exception as exc:
        # One unreadable article should not discard every remaining result.
        print(f"Skipping article that could not be fetched: {exc}")
        continue
    articles.append(full_article)

# Entries may be falsy when an article could not be retrieved; drop those.
title_text = [article.title for article in articles if article]
article_text = [article.text for article in articles if article]

In [8]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Weaviate
import nltk
# Download the 'punkt' data package for NLTK.
# NOTE(review): no visible cell calls nltk directly - presumably a dependency
# needs it; confirm whether this download is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/zhj003/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Split the articles into chunks before posting to Weaviate database
class Document:
    """Lightweight record holding one article's text and its provenance.

    Exposes the two attributes the text splitter reads in this notebook:
    ``page_content`` (the text) and ``metadata`` (a fresh dict per instance).
    """

    def __init__(self, text):
        # Each instance gets its own metadata dict, tagged with the news source.
        self.metadata = dict(source='google news')
        self.page_content = text

# Wrap every article text in a Document, then cut the documents into chunks
# (target size 300 characters, no overlap) and keep only the chunk text.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0)
documents = list(map(Document, article_text))
chunked_articles = [
    chunk.page_content for chunk in text_splitter.split_documents(documents)
]

02/28/2024 01:37:11 AM - Created a chunk of size 362, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 321, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 355, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 350, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 324, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 378, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 342, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 327, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 344, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 368, which is longer than the specified 300
02/28/2024 01:37:11 AM - Created a chunk of size 362, which is longer 

In [10]:
# Our text-embedding method (BERT tokenizer + mean-pooled BERT hidden states)
import torch
from transformers import BertTokenizer, BertModel, pipeline

_BERT_MODEL_NAME = 'bert-base-uncased'
_BERT_HIDDEN_SIZE = 768  # width of bert-base-uncased's hidden states
# (tokenizer, model) per device, so the weights are loaded once per kernel
# instead of on every text_embedding() call.
_BERT_CACHE = {}


def _load_bert(device):
    """Return the cached (tokenizer, model) pair for `device`, loading it on first use."""
    key = str(device)
    if key not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(_BERT_MODEL_NAME)
        bert_model = BertModel.from_pretrained(_BERT_MODEL_NAME).to(device)
        bert_model.eval()
        _BERT_CACHE[key] = (tokenizer, bert_model)
    return _BERT_CACHE[key]


def text_embedding(data, batch_size=128):
    """Embed texts with BERT by mean-pooling the last hidden layer.

    Args:
        data: The texts to embed. A pandas Series (as before), any object with
            ``tolist()``, any iterable of strings, or a single string.
        batch_size: Number of texts sent through the model at once.

    Returns:
        A NumPy array with one row per input text. Empty input yields an
        array of shape ``(0, 768)`` instead of failing inside ``torch.cat``.

    Notes:
        Pooling is weighted by the attention mask so padding tokens do not
        contribute. For a single text (no padding) this equals the plain mean
        over tokens; for batches it makes each embedding independent of the
        other texts that happen to share its batch.
    """
    if isinstance(data, str):
        texts = [data]
    elif hasattr(data, "tolist"):
        texts = data.tolist()
    else:
        texts = list(data)

    if not texts:
        # Nothing to embed: skip loading the model altogether.
        return torch.empty((0, _BERT_HIDDEN_SIZE)).numpy()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, bert_model = _load_bert(device)

    embeddings_list = []
    for start_idx in range(0, len(texts), batch_size):
        batch_texts = texts[start_idx:start_idx + batch_size]
        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            hidden = bert_model(**tokens).last_hidden_state
        # Zero out padded positions, then divide by the real token count.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        embeddings_list.append(pooled)

    return torch.cat(embeddings_list, dim=0).cpu().numpy()

2024-02-28 01:37:28.399610: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-28 01:37:28.433413: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [28]:
import json
import os

import pandas as pd
import weaviate

# Credentials are read from the environment so no secret lives in the notebook.
# NOTE: the API key that was previously committed here must be treated as
# leaked and revoked in the Weaviate console.
WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL", "https://testing-cluster-2qgcoz4q.weaviate.network"
)
client = weaviate.Client(
    url=WEAVIATE_URL,
    # Raises KeyError when WEAVIATE_API_KEY is not set.
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
)

In [20]:
import json

# Template for the duplicate lookup. The chunk text is inserted as a
# JSON-encoded string literal, which escapes quotes, backslashes, newlines and
# control characters; escaping only '"' produced malformed queries for chunks
# containing any of the others.
duplicate_query_template = """
    {
        Get {
            Test_dataset_1(where: {
                operator: Equal
                path: ["context"]
                valueString: %s
            }) {
                _additional {
                    id
                }
            }
        }
    }
    """

for article in chunked_articles:
    # Check for duplicate before posting
    query = duplicate_query_template % json.dumps(article, ensure_ascii=False)
    result = client.query.raw(query)

    try:
        existing = result['data']['Get']['Test_dataset_1']
    except (KeyError, TypeError):
        # The response has no data section; report it instead of skipping silently.
        reason = result.get('errors') if isinstance(result, dict) else result
        print(f"Duplicate check failed, chunk skipped: {reason}")
        continue

    if existing:
        print("Article already exists in the database.")
        continue

    try:
        properties = {"context": article}
        vector = text_embedding(pd.Series(article)).tolist()[0]
        client.data_object.create(properties, "test_dataset_1", vector=vector)
    except Exception as exc:
        # Best effort: keep posting the remaining chunks, but say what failed.
        print(f"Failed to post chunk: {exc}")

Article already exists in the database.
Article already exists in the database.
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
Article already exists in the database.
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found
not found


In [10]:
# Post data to weaviate cloud db 
# for article in chunked_articles:
#     properties = {"context": article}
#     vector = text_embedding(pd.Series(article)).tolist()[0]
#     client.data_object.create(properties, "test_dataset_1", vector=vector)

In [29]:
# For every paragraph of the scraped article, fetch the closest stored chunks.
# `evidence[i]` always corresponds to `content[i]`, even when a lookup fails.
top_k = 10
evidence = []
for text_query in content:
    query_vector = {
        "vector": text_embedding(pd.Series(text_query)).tolist()[0],
        "distance": 1.0,
    }
    results = (
        client.query.get("test_dataset_1", ["context"])
        .with_additional("distance")
        .with_near_vector(query_vector)
        .with_limit(top_k)  # ask for top_k explicitly rather than relying on the server default
        .do()
    )
    try:
        hits = results['data']['Get']['Test_dataset_1'] or []
    except (KeyError, TypeError):
        # Response carried no data section: record "no evidence" for this
        # paragraph instead of aborting the whole loop.
        reason = results.get('errors') if isinstance(results, dict) else results
        print(f"Evidence lookup failed: {reason}")
        hits = []
    evidence.append([hit["context"] for hit in hits[:top_k]])

In [30]:
import os

from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

# The key is read from the environment so no secret lives in the notebook.
# NOTE: the key that was previously committed here must be treated as leaked
# and revoked in the Google Cloud console.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=os.environ["GOOGLE_API_KEY"],  # KeyError when the variable is unset
    # Every listed harm category is set to BLOCK_NONE.
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    },
)

In [31]:
def _parse_rating(response):
    """Pull the veracity rating digit out of the model's reply.

    The reply is expected to start with the rating. Leading whitespace, quotes
    or markdown markers are tolerated; failing that, the first standalone digit
    0-5 anywhere in the reply is used (a heuristic). Raises ValueError when no
    rating can be found.
    """
    import re

    stripped = response.lstrip(" \t\r\n'\"*#(`")
    if stripped[:1].isdecimal():
        return int(stripped[0])
    match = re.search(r'(?<!\d)[0-5](?!\d)', stripped)
    if match:
        return int(match.group())
    raise ValueError(f"No veracity rating found in model response: {response[:80]!r}")


def evaluate_claim(claim, evidence):
    """Ask the LLM to rate a claim's veracity given retrieved evidence.

    Args:
        claim: Text of the claim to rate.
        evidence: Retrieved evidence for the claim; interpolated into the
            prompt as-is.

    Returns:
        ``(response, rating)`` - the model's full reply and the rating parsed
        from it as an int.

    Raises:
        ValueError: if the reply contains no recognisable rating (previously an
            empty reply raised IndexError and a non-digit first character
            raised an uninformative ValueError).

    The few-shot examples are unchanged except that each "Classification N"
    label now matches its "Claim N", and the example for claim 9 starts with
    its rating digit like every other example, as the instructions demand.
    """
    prompt = f"""
    
    Firstly, please consider and understand the scales below:
    Here are some examples of some claims and their veracity classifications on a scale of from 0 to 5,
    with 0 = completely true, 1 = mostly true, 2 = half true, 3 = barely true, 
    4 = false and 5 = entirely false or pants on fire.
    
    Secondly, please consider several examples of justification to understand the standard:
    
    Claim 1: 'Hillary Clinton in 2005 co-sponsored legislation that would jail flag burners.'
    Classification 1: '0. This claim is 0 (True) because Hillary Clinton co-sponsored legislation in 2005 that would jail flag burners.'
    
    Claim 2: ''On military recruiters at Harvard, Elena Kagan took a position and the Supreme Court ruled unanimously that she was wrong.'
    Classification 2: '0. This claim is 0 (True) because the court did  reject the arguments put forward by the law schools, 
    which included FAIR  and the brief filed by the Harvard professors. All of the justices who voted opposed the law schools arguments'
    
    Claim 3: 'Have the suburbs been inundated with former residents of Atlanta housing projects? Absolutely not.'
    Classification 3: ''0. This claim is 0 (True) because DeKalb Countys population is slightly more than 747,000, the Census Bureau data show.
    The data from the AHA and Georgia State both reach similar conclusions that the percentage of tenants moving out of the city has been small.'
    
    Claim 4: 'Under legislation that has cleared the Georgia House, some children who are legal refugees 
    could obtain state scholarships to attend private schools.'
    Classification 4: '0. This claim is 0(True) because legislation has cleared the Georgia House that would expand the list of students eligible for a 
    private school scholarship program created in 2007. The scholarships are now offered in varying amounts to students with disabilities.
    The bill would open the program to about 700 legal refugees who are not proficient in English.'
    
    Claim 5: 'Hillary Clinton agrees with John McCain by voting to give George Bush the benefit of the doubt on Iran.'
    Classification 5: '1. This claim is 1 (mostly true). Although Clinton may have "agreed" with McCain on the issue, they did not technically vote the same way on it. 
    To say that voting for Kyl-Lieberman is giving George Bush the benefit of the doubt on Iran remains a contentious issue. But Obamas main point is that Clinton and McCain 
    were on the same side, and that is correct.'
    
    Claim 6: 'Mark Pryor votes with Obama 93 percent of the time.'
    Classification 6: '1. This claim is 1 (mostly true) because Since Obama became president, Pryor has voted in line with the 
    presidents positions between 90 and 95 percent of the time, with 92. 6 percent -- basically 93 percent -- as the average, 
    according to the best rating system at our disposal. Pryor doesnt vote with the Democratic Party quite as often, though, 
    and in 2013 his presidential support votes were lower than every other Senate Democrat. Cottons number is on target based on 
    the data, but Pryor has also opposed Obama on a few key issues.'
    
    Claim 7: 'Numerous studies have shown that these so-called right-to-work laws do not generate jobs and economic growth.'
    Classification 7: '2. This claim is 2 (half true) because Right-to-work states have seen greater job increases, but one economist pointed out that such a dynamic doesnt prove
    right-to-work laws were the cause. Another professor who believes job growth results from right-to-work laws acknowledged that many other factors also could be responsible.'
    
    Claim 8: 'When did the decline of coal start? It started when natural gas took off that started to begin in President George W. Bushs administration.'
    Classification 8: '2. This claim is 2 (half true) because there is no doubt, natural gas has been gaining ground on coal in generating electricity. 
    The trend started in the 1990s but clearly gained speed during the Bush administration when the production of natural gas -- a competitor of coal -- picked up. 
    But analysts give little credit or blame to Bush for that trend. They note that other factors, such as technological innovation, entrepreneurship and
    policies of previous administrations, had more to do with laying the groundwork for the natural gas boom.'

    Claim 9: 'Hillary Clinton said the Veterans Affairs scandal is over-exaggerated. She said she was satisfied with what was going on.'
    Classification 9: '3. This claim is 3 (barely true) because while Clinton has said problems at the VA have not been as widespread as it has been made out to be,
    she has also acknowledged systemic problems within the system and repeatedly urged reform so veterans can get care quickly.'
    
    Claim 10: 'the paperback edition of Mitt Romneys book deleted line that Massachusetts individual mandate should be the model for the country'
    Classification 10: '3. This claim is 3 (barely true) because Perry's right that Romney's comments about health care were edited between editions. Among other things, 
    a line that advocated the Massachusetts model as a strong option for other states was replaced by a shorter, more generic sentence. But that line was preceded
    by an argument for state-level solutions, exactly the argument Romney extends now. That's not how Perry characterized it.'
    
    Claim 11: 'This year in Congress Connie Mack IV has missed almost half of his votes.'
    Classification 11: '3. This claim is 3 (barely true) because Mack has more votes than the average member of
    the U. S.  House of Representatives, and hes missed high-profile votes on health care and the budget. He hasnt missed almost half of his votes, though.'
    
    Claim 12: 'Of Virginias 98,000 teachers who are K-12, over 53,000 of those teachers today are over 50 years old.'
    Classification 12: '4. This claim is 4 (false) because Virginia does not keep data that is solely focused on the age of teachers. The best statistics available show that the states 
    instructional staff -- including teachers, librarians, guidance counselors and technology instructors -- was 98,792 during the 2010-11 school year and, of them, 33,462 were 50 or older.
    The teaching corps is not moving toward retirement with anything close to the speed described.'
    
    Claim 13: 'What the Obama administration is going to come out with in the next several months is youre not even going to be able to burn coal very 
    limitedly in the existing plants.'
    Classification 13: '4. This claim is 4 (false) because The proposal the claim is referring to is an EPA plan to cut carbon emissions in existing power plants. Those rules do not prohibit 
    current facilities from burning coal, and even Capitos spokeswoman said the rule doesnt mean that every plant has to close. Some facilities will close down within the next decade, 
    but many of those plants were scheduled to be retired anyway due to age and other factors. States and power companies have options to continue to utilize coal for energy, and experts said 
    they expect coal to remain part of the national portfolio for years to come.'
    
    Claim 14: 'Charlie Crist supports cuts to the Medicare Advantage program.'
    Classification 14: '4. This claim is 4 (false) because Crist has flip-flopped on a lot of issues, including the federal health care law. He used to oppose the Affordable Care Act, 
    but now he supports it. The law tries to bring down future health care costs by reducing Medicare Advantage payments. But on the issue of Medicare Advantage, Crist has actually been
    consistent: Hes been critical of the Medicare Advantage cuts for years. He specifically said he opposed the reductions in 2009 and 2010, and he 
    still opposes them today. Crist doesnt appear to have come up with other ways to save money on health care without reducing payments to Medicare Advantage.'
    
    Claim 15: 'Says Ohio budget item later signed into law by Gov. John Kasich requires women seeking an abortion to undergo a mandatory vaginal probe.'
    Classification 15: '5. This claim is 5 (entirely false/fabricated) because there should be no debate about what types of ultrasounds these new regulations require. there is a mandate 
    -- but for external detection methods.'
    
    Claim 16: 'Roughly 25% of RI has a criminal record'
    Classification 16: '5. This claim is 5 (entirely false/fabricated) because They cant be sure of the exact percentage because of the way the data is collected, but theyre certain its nowhere near 25 percent. 
    DAREs statement is based on old, flawed statistics that were tweaked and re-tweaked to make a point.'
    
    Claim 17: 'We spend less on defense today as percentage of GDP than at any time since Pearl Harbor.'
    Classification 17: '5. This claim is 5 (entirely false/fabricated) because since the Cold War ended, and in a few other years as well, the percent of GDP used for defense has been consistently lower 
    than current spending levels.'
    
    Thirdly, Please rate the veracity of the following claim on the same 0 to 5 scale. Ensure that the first character 
    in your response is a single integer between 0 and 5, and explain your reasoning like the examples above, citing
    any evidence in a chain of thought fashion. {claim} 
    
    Fourthly, please consider the the evidence of this claim: {evidence}
    
    Finally, if the evidence is inconclusive or does not directly address the claim, please base your rating on your knowledge and indicate the lack of direct evidence in your explanation.
    """

    response = llm.invoke(prompt).content

    # The model does not always honour "first character is the rating".
    rating = _parse_rating(response)

    return response, rating

# Rate every scraped paragraph against the evidence gathered for it
# (evidence[i] belongs to content[i]) and print the (response, rating) pair.
for i, claim in enumerate(content):
    print(evaluate_claim(claim, evidence[i]))

('3. This claim is 3 (barely true). The evidence provided does not directly address the claim, but there is some evidence that suggests that the claim may be true. For example, Figure 1 shows that the number of people in the United States who are incarcerated has increased significantly over the past few decades. Figure 2 shows that the United States has a higher rate of incarceration than any other country in the world. However, these figures do not provide any information about the reasons for the increase in incarceration rates. Therefore, it is difficult to say for sure whether or not the claim is true.', 3)
("0. This claim is 0 (True) because the claim is about two men were found guilty of murder Tuesday in the 2002 killing of Jam Master Jay, the pioneering DJ of the groundbreaking hip hop trio Run-DMC, in a case that for decades frustrated detectives and music fans alike. The evidence above supports this claim by providing information about the trial, the charges against the men,

('4. The claim is "Their song with Aerosmith was part of the triple-platinum selling album “Raising Hell,” which also included the hits “It’s Tricky,” “My Adidas” and “You Be Illin’.” The album was nominated for a Grammy Award for Best R&B Vocal Performance by a Duo or Group."\n\nThe evidence provided does not directly address whether the song with Aerosmith was part of the album "Raising Hell" or whether the album was nominated for a Grammy Award. However, the evidence does state that Run-DMC had a hit song with Aerosmith called "Walk This Way" and that the group was nominated for a Grammy Award in 1985. Therefore, it is likely that the claim is true.', 4)
('1. The claim states that Run-DMC released a total of six albums between 1984 and 1993, with “King of Rock” and “Tougher Than Leather” also reaching platinum status. \n\nThe evidence provided includes several references to Run-DMC\'s successful albums, including "King of Rock" and "Tougher Than Leather" reaching platinum status. Ho