In [1]:
import torch
from transformers import BertTokenizer, BertModel, pipeline

def text_embedding(data):
    """Embed texts with mean-pooled ``bert-base-uncased`` hidden states.

    Parameters
    ----------
    data : pandas.Series
        Texts to embed. The object must support ``len()``, ``.iloc`` slicing
        and ``.tolist()``.

    Returns
    -------
    numpy.ndarray
        One row per input text, holding the mean over the sequence axis of
        BERT's last hidden state. An empty input yields an array of shape
        ``(0, 768)`` instead of raising.
    """
    hidden_size = 768  # width of bert-base-uncased's hidden states
    batch_size = 128
    num_samples = len(data)
    if num_samples == 0:
        # torch.cat() rejects an empty list, so answer before loading any model.
        return torch.empty((0, hidden_size)).numpy()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Loading the tokenizer and weights is by far the slowest step; keep them on
    # the function object so repeated calls (e.g. one per query) reuse them.
    cached = getattr(text_embedding, "_bert_cache", None)
    if cached is None or cached[0] != device:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
        bert_model.eval()  # inference only: make sure dropout is disabled
        text_embedding._bert_cache = (device, tokenizer, bert_model)
    else:
        _, tokenizer, bert_model = cached

    def get_bert_embeddings(batch):
        tokens = tokenizer(batch.tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            # NOTE(review): the mean runs over every position, padding included
            # (no attention-mask weighting). Left unchanged so new vectors stay
            # comparable with any that were already computed this way.
            return bert_model(**tokens).last_hidden_state.mean(dim=1)

    # Move each batch to the CPU right away so device memory does not grow with
    # the number of batches.
    embeddings_list = [
        get_bert_embeddings(data.iloc[start:start + batch_size]).cpu()
        for start in range(0, num_samples, batch_size)
    ]
    return torch.cat(embeddings_list, dim=0).numpy()

2024-03-07 01:02:26.162351: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
import os

# The Gemini API key is read from the environment so that it never ends up in
# the notebook or its version history. Export GOOGLE_API_KEY before starting
# the kernel; a missing variable fails fast with a KeyError.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=os.environ["GOOGLE_API_KEY"],
    # Blocking is switched off for all four harm categories.
    # NOTE(review): presumably so that political claims are not refused - confirm.
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    },
)

The political bias score of this claim is {political_bias} ranging from 0 to 1, with 0 being not biased and 1 being biased.
    The credibility score of the author of this claim is {credibility} ranging from 0 to 1, with 0 being non-credible and 1 being credible.
    The text manipulation score of this claim is {style} ranging from 0 to 1, with 0 being not manipulated and 1 being manipulated.


    For your further reference, we built models to predict some factuality factor scores of 
    political bias (with 0 being not biased and 1 being biased),
    credibility (with 0 being non-credible and 1 being credible),
    and text manipulation (with 0 being not manipulated and 1 being manipulated).
        
    The overall score of all factuality scores is {factuality_score} from 0 to 1
        
    Please consider the evidence over the factuality score.

Prompt improvements to make:

1. The prompt has no in-context examples; show 3 to 4 examples.
2. Tell the model what not to do.
3. Ask it to explain its reasoning step by step (chain of thought).
4. Give an example; the prompt should be organized in sections (steps 1, 2, 3).

In [8]:
def evaluate_claim(claim, evidence, political_bias, credibility, text_manipulation, sentiment, source_reliability):
    """Ask the module-level ``llm`` to rate a claim on the 0-5 veracity scale.

    The prompt contains the scale definition, seventeen worked examples, the
    definitions of the five auxiliary factor scales, the factor values passed
    in, and finally the claim itself.

    Parameters
    ----------
    claim : str
        Text of the claim to rate.
    evidence
        Accepted for call compatibility; the prompt below does not interpolate
        it, so it currently has no effect on the result.
    political_bias, credibility, text_manipulation, sentiment, source_reliability
        Factor ratings that are inserted verbatim into the prompt.

    Returns
    -------
    tuple[str, int]
        The raw model response and the rating parsed from its start.

    Raises
    ------
    ValueError
        If the response does not begin with a single digit between 0 and 5
        (leading whitespace and punctuation such as markdown markers are
        tolerated).
    """
    import re  # local import so this cell does not depend on another cell's imports

    prompt = f""" I have a scale that I use to rate the veracity of a claim, please read it over:
    
    TRUE (0) – The statement is accurate and there's nothing significant missing. It aligns entirely with verified facts, 
    without any distortions.
    MOSTLY TRUE (1) – The statement is accurate but needs clarification or additional information.
    HALF TRUE (2)– The statement is partially accurate but leaves out important details or takes things out of context.
    MOSTLY FALSE/BARELY TRUE (3) – The statement contains an element of truth but ignores critical facts that would give 
    a different impression. The truthful part is minimal compared to the overall inaccuracies.
    FALSE (4) – The statement is not accurate.
    ENTIRELY FABRICATED/PANTS ON FIRE (5) – The statement is not accurate AND it makes a ridiculous claim. It's completely fabricated 
    and has no basis in reality, and is likely a deliberate distortion intended to deceive.
    
    Here are some examples of some claims and their veracity classifications on the same scale of from 0 to 5,
    with 0 being true and 5 being entirely fabricated, with brief explanations:
    
    Claim 1: 'Hillary Clinton in 2005 co-sponsored legislation that would jail flag burners.'
    Classification 1: '0. This claim is 0 (True) because Hillary Clinton co-sponsored legislation in 2005 that would jail flag burners.'
    
    Claim 2: ''On military recruiters at Harvard, Elena Kagan took a position and the Supreme Court ruled unanimously that she was wrong.'
    Classification 2: '0. This claim is 0 (True) because the court did  reject the arguments put forward by the law schools, 
    which included FAIR  and the brief filed by the Harvard professors. All of the justices who voted opposed the law schools arguments'
    
    Claim 3: 'Have the suburbs been inundated with former residents of Atlanta housing projects? Absolutely not.'
    Classification 3: '0. This claim is 0 (True) because DeKalb Countys population is slightly more than 747,000, the Census Bureau data show.
    The data from the AHA and Georgia State both reach similar conclusions that the percentage of tenants moving out of the city has been small.'
    
    Claim 4: 'Under legislation that has cleared the Georgia House, some children who are legal refugees 
    could obtain state scholarships to attend private schools.'
    Classification 4: '0. This claim is 0 (True) because legislation has cleared the Georgia House that would expand the list of students eligible for a 
    private school scholarship program created in 2007. The scholarships are now offered in varying amounts to students with disabilities.
    The bill would open the program to about 700 legal refugees who are not proficient in English.'
    
    Claim 5: 'Hillary Clinton agrees with John McCain by voting to give George Bush the benefit of the doubt on Iran.'
    Classification 5: '1. This claim is 1 (mostly true). Although Clinton may have "agreed" with McCain on the issue, they did not technically vote the same way on it. 
    To say that voting for Kyl-Lieberman is giving George Bush the benefit of the doubt on Iran remains a contentious issue. But Obamas main point is that Clinton and McCain 
    were on the same side, and that is correct.'
    
    Claim 6: 'Mark Pryor votes with Obama 93 percent of the time.'
    Classification 6: '1. This claim is 1 (mostly true) because Since Obama became president, Pryor has voted in line with the 
    presidents positions between 90 and 95 percent of the time, with 92. 6 percent -- basically 93 percent -- as the average, 
    according to the best rating system at our disposal. Pryor doesnt vote with the Democratic Party quite as often, though, 
    and in 2013 his presidential support votes were lower than every other Senate Democrat. Cottons number is on target based on 
    the data, but Pryor has also opposed Obama on a few key issues.'
    
    Claim 7: 'Hillary Clinton said the Veterans Affairs scandal is over-exaggerated. She said she was satisfied with what was going on.'
    Classification 7: 'This claim is 3 (barely true) because while Clinton has said problems at the VA have not been as widespread as it has been made out to be,
    she has also acknowledged systemic problems within the system and repeatedly urged reform so veterans can get care quickly.'
    
    Claim 8: 'the paperback edition of Mitt Romneys book deleted line that Massachusetts individual mandate should be the model for the country'
    Classification 8: '3. This claim is 3 (barely true) because Perry's right that Romney's comments about health care were edited between editions. Among other things, 
    a line that advocated the Massachusetts model as a strong option for other states was replaced by a shorter, more generic sentence. But that line was preceded
    by an argument for state-level solutions, exactly the argument Romney extends now. That's not how Perry characterized it.'
    
    Claim 9: 'This year in Congress Connie Mack IV has missed almost half of his votes.'
    Classification 9: '3. This claim is 3 (barely true) because Mack has more votes than the average member of
    the U. S.  House of Representatives, and hes missed high-profile votes on health care and the budget. He hasnt missed almost half of his votes, though.'
    
    Claim 10: 'Numerous studies have shown that these so-called right-to-work laws do not generate jobs and economic growth.'
    Classification 10: '2. This claim is 2 (half true) because Right-to-work states have seen greater job increases, but one economist pointed out that such a dynamic doesnt prove
    right-to-work laws were the cause. Another professor who believes job growth results from right-to-work laws acknowledged that many other factors also could be responsible.'
    
    Claim 11: 'When did the decline of coal start? It started when natural gas took off that started to begin in President George W. Bushs administration.'
    Classification 11: '2. This claim is 2 (half true) because there is no doubt, natural gas has been gaining ground on coal in generating electricity. 
    The trend started in the 1990s but clearly gained speed during the Bush administration when the production of natural gas -- a competitor of coal -- picked up. 
    But analysts give little credit or blame to Bush for that trend. They note that other factors, such as technological innovation, entrepreneurship and
    policies of previous administrations, had more to do with laying the groundwork for the natural gas boom.'
    
    Claim 12: 'Of Virginias 98,000 teachers who are K-12, over 53,000 of those teachers today are over 50 years old.'
    Classification 12: '4. This claim is 4 (false) because Virginia does not keep data that is solely focused on the age of teachers. The best statistics available show that the states 
    instructional staff -- including teachers, librarians, guidance counselors and technology instructors -- was 98,792 during the 2010-11 school year and, of them, 33,462 were 50 or older.
    The teaching corps is not moving toward retirement with anything close to the speed described.'
    
    Claim 13: 'What the Obama administration is going to come out with in the next several months is youre not even going to be able to burn coal very 
    limitedly in the existing plants.'
    Classification 13: '4. This claim is 4 (false) because The proposal the claim is referring to is an EPA plan to cut carbon emissions in existing power plants. Those rules do not prohibit 
    current facilities from burning coal, and even Capitos spokeswoman said the rule doesnt mean that every plant has to close. Some facilities will close down within the next decade, 
    but many of those plants were scheduled to be retired anyway due to age and other factors. States and power companies have options to continue to utilize coal for energy, and experts said 
    they expect coal to remain part of the national portfolio for years to come.'
    
    Claim 14: 'Charlie Crist supports cuts to the Medicare Advantage program.'
    Classification 14: '4. This claim is 4 (false) because Crist has flip-flopped on a lot of issues, including the federal health care law. He used to oppose the Affordable Care Act, 
    but now he supports it. The law tries to bring down future health care costs by reducing Medicare Advantage payments. But on the issue of Medicare Advantage, Crist has actually been
    consistent: Hes been critical of the Medicare Advantage cuts for years. He specifically said he opposed the reductions in 2009 and 2010, and he 
    still opposes them today. Crist doesnt appear to have come up with other ways to save money on health care without reducing payments to Medicare Advantage.'
    
    Claim 15: 'Says Ohio budget item later signed into law by Gov. John Kasich requires women seeking an abortion to undergo a mandatory vaginal probe.'
    Classification 15: '5. This claim is 5 (entirely false/fabricated) because there should be no debate about what types of ultrasounds these new regulations require. there is a mandate 
    -- but for external detection methods.'
    
    Claim 16: 'Roughly 25% of RI has a criminal record'
    Classification 16: '5. This claim is 5 (entirely false/fabricated) because They cant be sure of the exact percentage because of the way the data is collected, but theyre certain its nowhere near 25 percent. 
    DAREs statement is based on old, flawed statistics that were tweaked and re-tweaked to make a point.'
    
    Claim 17: 'We spend less on defense today as percentage of GDP than at any time since Pearl Harbor.'
    Classification 17: '5. This claim is 5 (entirely false/fabricated) because since the Cold War ended, and in a few other years as well, the percent of GDP used for defense has been consistently lower 
    than current spending levels.'
    
    Additionally, I've rated the claim based on the following three factors: political bias, credibility, and text manipulation, whose scales are explained below: 
    1. Political Bias: 
        - 0 (Left-leaning): The claim or content exhibits a bias towards left-wing ideologies, favoring liberal or progressive viewpoints.
        - 0.5 (Neutral): The claim or content shows no clear bias towards either left-wing or right-wing ideologies and presents information in an impartial manner.
        - 1 (Right-leaning): The claim or content exhibits a bias towards right-wing ideologies, favoring conservative or traditional viewpoints.
    
    2. Credibility: A float between 0 and 1, where 1 indicates high credibility and 0 indicates low credibility. High credibility means the information is reliable, well-sourced, and supported by evidence, while low credibility suggests the information may be unsubstantiated, misleading, or false.
    
    3. Text Manipulation: 
       - 0: No text manipulation
       - 1: Text manipulation present
    4. Sentiment:
        - 0 (Positive): The text expresses mostly positive emotions or opinions.
        - 1 (Neutral): The text expresses a mix of positive and negative emotions, or neither.
        - 2 (Negative): The text expresses mostly negative emotions or opinions.
    5. Source Reliability: 
        - 0 (Completely True): The information in the source is considered highly reliable and trustworthy, based on established facts and evidence.
        - 1 (Mostly True): The information in the source is generally accurate, with minor potential for errors or biases.
        - 2 (Half True): The information in the source is a mix of true and false claims, making it difficult to discern the accuracy without further verification.
        - 3 (Barely True): The information in the source has limited accuracy and may contain significant misleading or false information.
        - 4 (False): The information in the source is demonstrably false and inaccurate.
        - 5 (Entirely False or Pants on Fire): The information in the source is blatantly false and fabricated, often used to deceive or mislead.

    My Political Bias rating for this claim is: {political_bias}
    My Credibility rating for this claim is: {credibility}
    My Text Manipulation rating for this claim is: {text_manipulation}
    My Sentiment rating for this claim is: {sentiment}
    My Source Reliability rating fot this claim is: {source_reliability}
    
    Please rate the veracity of the following claim on the same 0 to 5 scale. Ensure that the first character 
    in your response is a single integer between 0 and 5, and explain your reasoning like the examples above, citing
    any evidence in a chain of thought fashion. {claim} 
   
    """
    response = llm.invoke(prompt).content

    # The model is asked to lead with the rating, but replies sometimes start
    # with whitespace or markdown ("**4**", " 3."). Skip leading non-word
    # characters, then require one digit in 0-5 that is not part of a longer
    # number. Anything else is rejected rather than silently mis-scored.
    leading_rating = re.match(r"\W*([0-5])(?!\d)", response)
    if leading_rating is None:
        raise ValueError(f"response does not start with a rating between 0 and 5: {response[:80]!r}")

    return response, int(leading_rating.group(1))

  5. Source Reliability:
        - 0 (Completely True): The information in the source is considered highly reliable and trustworthy, based on established facts and evidence.
        - 1 (Mostly True): The information in the source is generally accurate, with minor potential for errors or biases.
        - 2 (Half True): The information in the source is a mix of true and false claims, making it difficult to discern the accuracy without further verification.
        - 3 (Barely True): The information in the source has limited accuracy and may contain significant misleading or false information.
        - 4 (False): The information in the source is demonstrably false and inaccurate.
        - 5 (Entirely False or Pants on Fire): The information in the source is blatantly false and fabricated, often used to deceive or mislead.

    My Source Reliability rating for this claim is: {source_reliability}


    Additionally, please consider the evidence for this claim. If the evidence is inconclusive or does not directly address the claim, please base your rating on your own knowledge and indicate the lack of direct evidence in your explanation.
    Here is the evidence: {evidence}

    Fourthly, please consider the evidence for this claim: {evidence}
    
    Finally, if the evidence is inconclusive or does not directly address the claim, please base your rating on your knowledge and indicate the lack of direct evidence in your explanation.


In [5]:
import pandas as pd
# df = pd.read_csv('test2_score_1.tsv', delimiter='\t')

# df = df.drop(columns = [0])
# df.rename({1: 'id', 2: 'label', 3: 'statement', 4: 'subject', 5: 'speaker', 6: 'job-title',
#            7: 'state_info', 8: 'party_affiliation', 9: 'barely_true_counts', 10: 'false_counts',
#            11: 'half_true_counts', 12: 'mostly_true_counts', 13: 'pants_on_fire_counts', 14: 'context',
#            15: 'justification'
#           }, axis = 1, inplace = True)
# Veracity labels mapped onto the 0 (true) .. 5 (pants on fire) scale used by the prompt.
label_map = {'pants-fire': 5, 'false': 4, 'barely-true': 3,
             'half-true': 2, 'mostly-true': 1, 'true': 0}
# Labels without an entry in label_map; rows carrying them are dropped.
flip_labels = {'full-flop', 'half-flip', 'no-flip'}

df = pd.read_csv('../original_files/politifact_data_2022_score.csv')

df['documented_time'] = pd.to_datetime(df['documented_time'])

df = df[~df['label'].isin(flip_labels)]

# only rows from 2022 onwards (recent)
df = df[df['documented_time'].dt.year >= 2022]

# Map the textual labels first and cast afterwards: casting first raises on any
# label that is still a string. Labels that are already integers pass through
# replace() untouched. assign() returns a new frame, which avoids writing into a
# filtered slice of the original.
df = df.assign(label=df['label'].replace(label_map).astype(int))

In [6]:
# Observed (min, max) of each factor that gets min-max scaled below.
# Series.min()/max() skip missing values, whereas the builtin min()/max() give
# order-dependent answers when a NaN is present (and iterate in Python).
min_max_dict = {
    'Credibility': (df['Credibility'].min(), df['Credibility'].max()),
}

# Row-wise min-max scaling of the Credibility factor
def poly_score(row):
    """Rescale ``row['Credibility']`` with the bounds stored in ``min_max_dict``.

    The value becomes ``(value - min) / (max - min)``. When both bounds are
    equal the value is set to 0 so that no division by zero occurs. The row is
    modified in place and also returned.
    """
    factor = 'Credibility'
    lower, upper = min_max_dict[factor]
    span = upper - lower
    # A zero-width range cannot be scaled; 0 is the chosen fallback (1 would work too).
    row[factor] = 0 if span == 0 else (row[factor] - lower) / span
    return row

# Run poly_score on every row; axis="columns" hands each row to the function as a Series
normalized_df = df.apply(poly_score, axis="columns")

In [9]:
def get_ann(content, client, top_k=10):
    """Fetch nearest-neighbour evidence passages from Weaviate for each query text.

    Parameters
    ----------
    content : str or iterable of str
        One query text, or several. A single string is treated as one query.
    client
        Weaviate client exposing the ``client.query.get(...)`` builder used below.
    top_k : int, default 10
        Maximum number of passages kept per query.

    Returns
    -------
    list[list]
        For every query, the ``context`` field of up to ``top_k`` results from
        the ``test_dataset_1`` class, in the order Weaviate returned them.
    """
    # isinstance also recognises str subclasses, which `type(content) == str` misses.
    if isinstance(content, str):
        content = [content]

    evidence = []
    for text_query in content:
        # Each query is embedded on its own, exactly as before.
        query_vector = {
            "vector": text_embedding(pd.Series(text_query)).tolist()[0],
            "distance": 1.0,
        }
        results = (
            client.query.get("test_dataset_1", ["context"])
            .with_additional("distance")
            .with_near_vector(query_vector)
            .do()
        )
        # The response is keyed by the capitalised form of the class name.
        hits = results['data']['Get']['Test_dataset_1']
        evidence.append([hit["context"] for hit in hits[:top_k]])
    return evidence

In [13]:
# Add output for each stage, check the evidence from the weaviate vector search occasionally
import os

import weaviate
from IPython.display import clear_output
import numpy as np

# The Weaviate API key comes from the environment instead of the notebook source.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
client = weaviate.Client(
        url = "https://testing-cluster-2qgcoz4q.weaviate.network",  # Replace with your endpoint
        auth_client_secret=weaviate.auth.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
    )

preds = []
labels = []
failures = []  # (row index, error) for every claim that could not be scored

# Count iterations with enumerate: after filtering, the frame's index labels are
# no longer consecutive, so they are a poor progress counter.
for step, (index, row) in enumerate(normalized_df.iterrows()):
    if step % 50 == 0:
        clear_output(wait=True)
        print(f"Running at iteration {step}")
    try:
#         evidence = get_ann(row['content'], client)
        evidence = ""
        political_bias, credibility, style, sentiment, reliability = row['Political_Bias'], row['Credibility'], row['Style'], row['Sentiment'], row['Reliability']
        result = evaluate_claim(row['content'], evidence, political_bias, credibility, style, sentiment, reliability)[1]
        label = row['label']
    except Exception as exc:
        # Keep going on a failed row, but remember it: a bare `except:` would
        # also swallow KeyboardInterrupt and hide how many rows were dropped.
        failures.append((index, repr(exc)))
        continue
    # Append both values together so preds and labels can never get out of step.
    preds.append(result)
    labels.append(label)

print(f"Scored {len(preds)} claims; skipped {len(failures)} that raised an error")

Running at iteration 3000


In [14]:
from sklearn.metrics import f1_score
# Macro-averaged F1 of the predicted ratings against the gold labels
f1_score(y_true=labels, y_pred=preds, average='macro')

0.1940232562033842

In [15]:
import numpy as np
# Number of predictions at most one scale step away from the gold label
(np.abs(np.asarray(preds) - np.asarray(labels)) <= 1).sum()

2049

In [16]:
# Share of predictions at most one scale step away from the gold label
(np.abs(np.asarray(preds) - np.asarray(labels)) <= 1).sum() / len(preds)

0.6887394957983193

In [17]:
# Exact-match accuracy: fraction of rows where the predicted rating equals the label
results_df = pd.DataFrame({'predicted': preds, 'label': labels}).astype({'predicted': int})

results_df['predicted'].eq(results_df['label']).mean()

0.2507563025210084

In [86]:
# Show the 'statement' field of `row`.
# NOTE(review): `row` is whatever is left in the kernel at this point (presumably
# the last row from the evaluation loop); the loop itself reads row['content'],
# so confirm that a 'statement' column exists in the current data.
row['statement']

'Says the governor is going around the state talking about [how] we should fund an income tax cut that benefits higher income earners and not lower income earners'

In [87]:
# evaluate_claim requires the sentiment and source-reliability ratings as well;
# calling it with only five arguments raises TypeError.
evaluate_claim(row['statement'], evidence, political_bias, credibility, style, sentiment, reliability)

('1. This claim is 1 (partially true). The claim is ambiguous in that it does not specify what type of income tax cut the governor is proposing. However, the evidence suggests that the governor is proposing an income tax cut that would benefit higher income earners and not lower income earners. For example, the evidence states that "the governor wants to raise the earned income tax credit to 40 percent of the federal level, so that working families can lift themselves out of poverty." This suggests that the governor is proposing an income tax cut that would benefit lower income earners. However, the evidence also states that "the governor is going around the state talking about [how] we should fund an income tax cut that benefits higher income earners and not lower income earners." This suggests that the governor is proposing an income tax cut that would benefit higher income earners. Therefore, the claim is partially true.',
 1)