In [14]:
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

In [2]:
import getpass
import os
import pandas as pd
import numpy as np
import wikipedia
from nltk.tokenize import sent_tokenize
#import tensorflow_hub as hub
import spacy
import nltk
from scipy import spatial


In [11]:
safety_settings = [
    {
        "category": "HARM_CATEGORY_DANGEROUS",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_NONE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_NONE",
    },
]

In [19]:
llm = ChatGoogleGenerativeAI(model="gemini-pro", 
google_api_key="removed", 
                            safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    })

# Evidence retrieval with Wikipedia

In [19]:
def wiki_search(text):
    try:
        page = wikipedia.page(text, auto_suggest=False)
    except wikipedia.DisambiguationError as e:
        sel = e.options[0]
        page = wikipedia.page(sel)
        
    return page.content

In [28]:
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/orenciolli/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1


[0m

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [26]:
import torch
from transformers import BertTokenizer, BertModel, pipeline

def text_embedding(data):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
    
    def get_bert_embeddings(data):
        tokens = tokenizer(data.tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            embeddings = bert_model(**tokens).last_hidden_state.mean(dim=1)
        return embeddings

    batch_size = 128
    num_samples = len(data)
    num_batches = (num_samples + batch_size - 1) // batch_size

    embeddings_list = []

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        batch_data = data.iloc[start_idx:end_idx]
        batch_embeddings = get_bert_embeddings(batch_data)
        embeddings_list.append(batch_embeddings)

    embeddings = torch.cat(embeddings_list, dim=0).cpu().numpy()
    return embeddings
    
    

In [31]:
def get_entities(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents]
    return entities
    
def retrieve_documents(claim):
    docs = []
    entities = get_entities(claim)
    
    for entity in entities:
        docs.append(wiki_search(entity))
    return docs

def compute_similarity(sentence1, sentence2):
    
    data = pd.Series([sentence1, sentence2])
    embed = text_embedding(data)
    
    embedding1 = embed[0]
    embedding2 = embed[1]

    # Compute the cosine similarity between the two sentence embeddings
    similarity = 1 - spatial.distance.cosine(embedding1, embedding2)

    return similarity

def top_k_sentences(claim, split_sentences, k):
    sims = []
    
    for sentence in split_sentences: 
        sims.append((sentence, compute_similarity(sentence, claim)))

        #also post to db?

    sims = sorted(sims, key=lambda x: x[1], reverse = True)
    return [elem[0] for elem in sims[:k]]

def get_wiki_evidence(claim, k):
    docs = retrieve_documents(claim)
    evidence = []
    
    for doc in docs:
        sentences = sent_tokenize(doc)
        evidence += top_k_sentences(claim, sentences, k)
        
    return evidence


In [17]:
from scipy import spatial

In [33]:
get_wiki_evidence('Joe Biden created COVID', 3)

Joe Biden
COVID


KeyboardInterrupt: 

# LLM

In [24]:
def evaluate_claim(claim):

    #articles = get_article_evidence(claim)
#     ground_truth = get_wiki_evidence(claim)

#todo: add ground truth


    prompt = f""" 
    Here are some examples of some claims and their veracity classifications on a scale of from 0 to 5,
    with 0 being completely true and 5 being entirely false, with brief justification:
    
    Claim 1: 'Hillary Clinton in 2005 co-sponsored legislation that would jail flag burners.'
    Classification 1: '0. This claim is 0 (True) because Hillary Clinton co-sponsored legislation in 2005 that would jail flag burners.'
    
    Claim 2: ''On military recruiters at Harvard, Elena Kagan took a position and the Supreme Court ruled unanimously that she was wrong.'
    Classification 2: '0. This claim is 0 (True) because the court did  reject the arguments put forward by the law schools, 
    which included FAIR  and the brief filed by the Harvard professors. All of the justices who voted opposed the law schools arguments'
    
    Claim 3: 'Have the suburbs been inundated with former residents of Atlanta housing projects? Absolutely not.'
    Classification 3: 'DeKalb Countys population is slightly more than 747,000, the Census Bureau data show.
    The data from the AHA and Georgia State both reach similar conclusions that the percentage of tenants moving out of the city has been small.'
    
    Claim 4: 'Under legislation that has cleared the Georgia House, some children who are legal refugees 
    could obtain state scholarships to attend private schools.'
    Classification 4: '0. This claim is 0(True) because legislation has cleared the Georgia House that would expand the list of students eligible for a 
    private school scholarship program created in 2007. The scholarships are now offered in varying amounts to students with disabilities.
    The bill would open the program to about 700 legal refugees who are not proficient in English.'
    
    
    Claim 5: 'Hillary Clinton agrees with John McCain by voting to give George Bush the benefit of the doubt on Iran.'
    Classification 5: '1. This claim is 1 (mostly true). Although Clinton may have "agreed" with McCain on the issue, they did not technically vote the same way on it. 
    To say that voting for Kyl-Lieberman is giving George Bush the benefit of the doubt on Iran remains a contentious issue. But Obamas main point is that Clinton and McCain 
    were on the same side, and that is correct.'
    
    Claim 6: 'Mark Pryor votes with Obama 93 percent of the time.'
    Classification 6: '1. This claim is 1 (mostly true) because Since Obama became president, Pryor has voted in line with the 
    presidents positions between 90 and 95 percent of the time, with 92. 6 percent -- basically 93 percent -- as the average, 
    according to the best rating system at our disposal. Pryor doesnt vote with the Democratic Party quite as often, though, 
    and in 2013 his presidential support votes were lower than every other Senate Democrat. Cottons number is on target based on 
    the data, but Pryor has also opposed Obama on a few key issues.'
    
    Claim 7: 'Hillary Clinton said the Veterans Affairs scandal is over-exaggerated. She said she was satisfied with what was going on.'
    Classification 7: 'This claim is 3 (barely true) because while Clinton has said problems at the VA have not been as widespread as it has been made out to be,
    she has also acknowledged systemic problems within the system and repeatedly urged reform so veterans can get care quickly.'
    
    
    Claim 8: 'the paperback edition of Mitt Romneys book deleted line that Massachusetts individual mandate should be the model for the country'
    Classification 8: '3. This claim is 3 (barely true) because Perry's right that Romney's comments about health care were edited between editions. Among other things, 
    a line that advocated the Massachusetts model as a strong option for other states was replaced by a shorter, more generic sentence. But that line was preceded
    by an argument for state-level solutions, exactly the argument Romney extends now. That's not how Perry characterized it.'
    
    Claim 9: 'This year in Congress Connie Mack IV has missed almost half of his votes.'
    Classification 9: '3. This claim is 3 (barely true) because Mack has more votes than the average member of
    the U. S.  House of Representatives, and hes missed high-profile votes on health care and the budget. He hasnt missed almost half of his votes, though.'
    
    Claim 10: 'Numerous studies have shown that these so-called right-to-work laws do not generate jobs and economic growth.'
    Classification 10: '2. This claim is 2 (half true) because Right-to-work states have seen greater job increases, but one economist pointed out that such a dynamic doesnt prove
    right-to-work laws were the cause. Another professor who believes job growth results from right-to-work laws acknowledged that many other factors also could be responsible.'
    
    Claim 11: 'When did the decline of coal start? It started when natural gas took off that started to begin in President George W. Bushs administration.'
    Classification 11: '2. This claim is 2 (half true) because there is no doubt, natural gas has been gaining ground on coal in generating electricity. 
    The trend started in the 1990s but clearly gained speed during the Bush administration when the production of natural gas -- a competitor of coal -- picked up. 
    But analysts give little credit or blame to Bush for that trend. They note that other factors, such as technological innovation, entrepreneurship and
    policies of previous administrations, had more to do with laying the groundwork for the natural gas boom.'
    
    Claim 12: 'Of Virginias 98,000 teachers who are K-12, over 53,000 of those teachers today are over 50 years old.'
    Classification 12: '4. This claim is 4 (false) because Virginia does not keep data that is solely focused on the age of teachers. The best statistics available show that the states 
    instructional staff -- including teachers, librarians, guidance counselors and technology instructors -- was 98,792 during the 2010-11 school year and, of them, 33,462 were 50 or older.
    The teaching corps is not moving toward retirement with anything close to the speed described.'
    
    Claim 13: 'What the Obama administration is going to come out with in the next several months is youre not even going to be able to burn coal very 
    limitedly in the existing plants.'
    Classification 13: '4. This claim is 4 (false) because The proposal the claim is referring to is an EPA plan to cut carbon emissions in existing power plants. Those rules do not prohibit 
    current facilities from burning coal, and even Capitos spokeswoman said the rule doesnt mean that every plant has to close. Some facilities will close down within the next decade, 
    but many of those plants were scheduled to be retired anyway due to age and other factors. States and power companies have options to continue to utilize coal for energy, and experts said 
    they expect coal to remain part of the national portfolio for years to come.'
    
    Claim 14: 'Charlie Crist supports cuts to the Medicare Advantage program.'
    Classification 14: '4. This claim is 4 (false) because Crist has flip-flopped on a lot of issues, including the federal health care law. He used to oppose the Affordable Care Act, 
    but now he supports it. The law tries to bring down future health care costs by reducing Medicare Advantage payments. But on the issue of Medicare Advantage, Crist has actually been
    consistent: Hes been critical of the Medicare Advantage cuts for years. He specifically said he opposed the reductions in 2009 and 2010, and he 
    still opposes them today. Crist doesnt appear to have come up with other ways to save money on health care without reducing payments to Medicare Advantage.'
    
    Claim 15: 'Says Ohio budget item later signed into law by Gov. John Kasich requires women seeking an abortion to undergo a mandatory vaginal probe.'
    Classification 15: '5. This claim is 5 (entirely false/fabricated) because there should be no debate about what types of ultrasounds these new regulations require. there is a mandate 
    -- but for external detection methods.'
    
    Claim 16: 'Roughly 25% of RI has a criminal record'
    Classification 16: '5. This claim is 5 (entirely false/fabricated) because They cant be sure of the exact percentage because of the way the data is collected, but theyre certain its nowhere near 25 percent. 
    DAREs statement is based on old, flawed statistics that were tweaked and re-tweaked to make a point.'
    
    Claim 17: 'We spend less on defense today as percentage of GDP than at any time since Pearl Harbor.'
    Classification 17: '5. This claim is 5 (entirely false/fabricated) because since the Cold War ended, and in a few other years as well, the percent of GDP used for defense has been consistently lower 
    than current spending levels.'
    
    Please rate the veracity of the following claim on the same 0 to 5 scale. Ensure that the first character 
    in your response is a single integer between 0 and 5, and explain your reasoning like the examples above, citing
    any evidence in a chain of thought fashion. {claim} 
   
    """

    response = llm.invoke(prompt).content

    rating = response[0]

    return response, int(rating)

## LIAR evaluation

In [6]:
df = pd.read_csv('liar_plus/test2.tsv', delimiter='\t', header = None)

df = df.drop(columns = [0])
df.rename({1: 'id', 2: 'label', 3: 'statement', 4: 'subject', 5: 'speaker', 6: 'job-title',
           7: 'state_info', 8: 'party_affiliation', 9: 'barely_true_counts', 10: 'false_counts',
           11: 'half_true_counts', 12: 'mostly_true_counts', 13: 'pants_on_fire_counts', 14: 'context',
           15: 'justification'
          }, axis = 1, inplace = True)

In [7]:
label_map = {'pants-fire': 5, 'false': 4, 'barely-true': 3, 
             'half-true': 2, 'mostly-true': 1, 'true': 0}

df['label'] = df['label'].replace(label_map)

In [15]:
#preds = df['statement'].apply(lambda x: evaluate_claim(x)[1] )

In [29]:
from tqdm import tqdm

In [32]:
preds = []
justs = []
labels = []

for index, row in tqdm(df.iterrows()):
    try:
        justification, result = evaluate_claim(row['statement'])

        preds.append(result)
        labels.append(row['label'])
        justs.append(justification)
        
    except:
        continue

1267it [1:51:53,  5.30s/it]


In [33]:
results_df = pd.DataFrame({'predicted': preds, 'label': labels, 'justification': justs})

results_df['predicted'] = results_df['predicted'].astype(int)
(results_df['predicted'] == results_df['label']).mean()

0.23052208835341365

In [45]:
df['statement'].iloc[0]

'Building a wall on the U.S.-Mexico border will take literally years.'

In [44]:
evaluate_claim(df['statement'].iloc[0])

('0\nBuilding a wall on the U.S.-Mexico border is a complex and time-consuming project that would take years to complete. According to experts, the construction of a wall along the entire U.S.-Mexico border would take at least 3.5 years and cost billions of dollars.',
 0)

In [40]:
results_df.dtypes

predicted    object
label         int64
dtype: object

In [51]:
(abs(results_df['predicted'].astype(int) - results_df['label'].astype(int)) <= 1).mean()

0.5124584717607974