In [3]:
import torch
from transformers import BertTokenizer, BertModel, pipeline

# Cache of (tokenizer, model) pairs keyed by device string, so the BERT
# weights are loaded once per kernel session instead of on every call.
_BERT_CACHE = {}


def _load_bert(device):
    """Return a cached (tokenizer, model) pair for 'bert-base-uncased' on `device`."""
    key = str(device)
    if key not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
        _BERT_CACHE[key] = (tokenizer, bert_model)
    return _BERT_CACHE[key]


def text_embedding(data, batch_size=128):
    """Embed each text in `data` as the mean of its BERT token vectors.

    Args:
        data: pandas Series of strings (it is sliced with `.iloc` and
            converted with `.tolist()`).
        batch_size: number of texts sent through the model per forward
            pass; must be a positive integer. Defaults to 128.

    Returns:
        A numpy array with one row per input text and one column per hidden
        unit of the model. An empty input yields an array with zero rows.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, bert_model = _load_bert(device)

    def get_bert_embeddings(batch):
        tokens = tokenizer(batch.tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            hidden = bert_model(**tokens).last_hidden_state
        # Average over real tokens only. A plain mean over dim=1 would also
        # average the padding positions, making a text's embedding depend on
        # the longest text that happens to share its batch.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    num_samples = len(data)
    if num_samples == 0:
        # torch.cat() rejects an empty list, so answer the empty case directly.
        return torch.empty((0, bert_model.config.hidden_size)).numpy()

    # Move each batch to the CPU as soon as it is computed so that results
    # do not pile up in GPU memory.
    embeddings_list = [
        get_bert_embeddings(data.iloc[start:start + batch_size]).cpu()
        for start in range(0, num_samples, batch_size)
    ]
    return torch.cat(embeddings_list, dim=0).numpy()

In [2]:
import pandas as pd

# Load the PolitiFact export (path is relative to the notebook's working directory).
df = pd.read_csv('../original_files/politifact_data_2022_score.csv')

df['documented_time'] = pd.to_datetime(df['documented_time'])

# Drop the flip/flop rulings: they have no entry in label_map below.
df = df[~df['label'].isin({'full-flop', 'half-flip', 'no-flip'})]

# Only rows from 2022 onwards (recent). The .copy() detaches the filtered
# frame so the column assignment below does not hit a chained-assignment warning.
df = df[df['documented_time'].dt.year >= 2022].copy()

label_map = {'pants-fire': 5, 'false': 4, 'barely-true': 3,
             'half-true': 2, 'mostly-true': 1, 'true': 0}
# Map the textual rulings to scores first and cast afterwards: casting first
# raises ValueError on labels such as 'pants-fire'. Labels that are already
# numeric pass through replace() unchanged.
df['label'] = df['label'].replace(label_map).astype(int)

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from gnews import GNews
import numpy as np
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Weaviate
import nltk
import torch
from transformers import BertTokenizer, BertModel, pipeline
# Make sure the NLTK 'punkt' data package is available locally.
nltk.download('punkt')
def rag(content):
    """Fetch news evidence related to `content` and post it to the vector database.

    Steps:
      1. Use the 10 highest-scoring TF-IDF terms of `content` as a search query.
      2. Ask Google News (via GNews) for up to 4 English articles dated between
         2022-01-01 and 2023-12-31 and download their full text.
      3. Split the article texts with CharacterTextSplitter(chunk_size=300,
         chunk_overlap=0).
      4. Embed every chunk with `text_embedding` and create one object per
         chunk in the "test_dataset_1" Weaviate class.

    Relies on the module-level names `pd`, `client` and `text_embedding`.

    Args:
        content: article text (str). Empty or non-string values are skipped.

    Returns:
        None. The function is used for its side effects only.
    """
    if not isinstance(content, str) or not content.strip():
        print("RAG: skipping empty or non-text content")
        return

    # Advance RAG
    print("Retrieving keywords...")
    # Pass the stop words as a list: newer scikit-learn releases validate the
    # parameter type and reject a frozenset.
    tfidf_vectorizer = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([content])
    except ValueError as exc:
        # Raised when nothing is left after stop-word removal.
        print(f"RAG: no usable keywords ({exc})")
        return
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()
    scores = tfidf_matrix.toarray().flatten()
    top_n = 10
    top_indices = scores.argsort()[::-1][:top_n]
    top_features = [(feature_names[i], scores[i]) for i in top_indices]
    keywords = " ".join(feature for feature, _ in top_features)
    for feature, score in top_features:
        print(f"{feature}: {score}")

    print("RAG: Getting new evidence...")
    google_news = GNews()
    max_results = 4
    google_news.max_results = max_results
    google_news.language = 'english'
    google_news.start_date = (2022, 1, 1)
    google_news.end_date = (2023, 12, 31)
    news = google_news.get_news(keywords) or []

    articles = []
    # Slice rather than index by range(max_results): fewer results than
    # requested is normal, and one failed download should not discard the
    # articles that follow it.
    for item in news[:max_results]:
        try:
            article = google_news.get_full_article(item['url'])
        except Exception as exc:
            print(f"RAG: could not fetch article: {exc}")
            continue
        if article:
            articles.append(article)
    article_text = [article.text for article in articles if article.text]

    # Chunk the google news
    print("Chunking the articles")
    # The splitter may emit chunks longer than chunk_size; it logs a warning
    # when it does.
    text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0)
    chunked_articles = [
        chunk
        for text in article_text
        for chunk in text_splitter.split_text(text)
    ]

    # Advance RAG
    print("Posting new evidence to vector database...")
    for chunk in chunked_articles:
        # TODO: no duplicate check is performed before posting.
        try:
            properties = {"context": chunk}
            vector = text_embedding(pd.Series(chunk)).tolist()[0]
            client.data_object.create(properties, "test_dataset_1", vector=vector)
        except Exception as exc:
            # Best effort: report the failure and carry on with the next
            # chunk. Unlike a bare `except`, this lets KeyboardInterrupt stop
            # a long run.
            print(f"RAG: failed to post chunk: {exc}")
            continue

2024-03-06 06:41:45.688919: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
03/06/2024 06:41:47 AM - Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
03/06/2024 06:41:47 AM - NumExpr defaulting to 8 threads.
[nltk_data] Downloading package punkt to /home/zhj003/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import os

import weaviate

# Connection settings come from the environment so that the API key is never
# stored in the notebook.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be rotated.
WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL", "https://testing-cluster-2qgcoz4q.weaviate.network"
)
_weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
if not _weaviate_api_key:
    raise RuntimeError(
        "Set the WEAVIATE_API_KEY environment variable before connecting to Weaviate."
    )

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=_weaviate_api_key),
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [None]:
from IPython.display import clear_output

# Rows whose index label is below START_INDEX are skipped
# (presumably handled in an earlier run — confirm before changing).
START_INDEX = 50
# Clear the cell output and print a progress line at index labels divisible by this.
PROGRESS_EVERY = 50

# The module-level `client` is reused by rag(); building a new client (and
# repeating the API key) on every iteration is unnecessary.
# Note: `index` is the DataFrame's index label, not a running position; `df`
# was filtered without resetting its index, so the labels have gaps.
for index, article in df['article'].items():
    if index < START_INDEX:
        continue
    if index % PROGRESS_EVERY == 0:
        clear_output(wait=True)
        print(f"Running at iteration {index}")
    # Skip rows without usable text (e.g. NaN) instead of aborting the run.
    if not isinstance(article, str) or not article.strip():
        continue
    rag(article)

Running at iteration 700
Retrieving keywords...
pentagon: 0.3491282676376715
facebook: 0.2493773340269082
attack: 0.2493773340269082
explosion: 0.2493773340269082
news: 0.19950186722152657
post: 0.19950186722152657
22: 0.19950186722152657
near: 0.19950186722152657
said: 0.14962640041614492
smoke: 0.14962640041614492
RAG: Getting new evidence...


03/07/2024 05:34:57 AM - Created a chunk of size 326, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 361, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 366, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 332, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 376, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 320, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 364, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 305, which is longer than the specified 300
03/07/2024 05:34:57 AM - Created a chunk of size 340, which is longer than the specified 300


Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
pandemic: 0.3731033238480316
said: 0.3497843661075296
flu: 0.2798274928860237
children: 0.2798274928860237
bird: 0.1865516619240158
avian: 0.16323270418351382
health: 0.13991374644301185
positive: 0.13991374644301185
tested: 0.13991374644301185
morris: 0.13991374644301185
RAG: Getting new evidence...
Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
bowman: 0.44992127066584753
killed: 0.22496063533292376
dr: 0.179968508266339
burdock: 0.179968508266339
sebi: 0.179968508266339
facebook: 0.13497638119975425
false: 0.13497638119975425
video: 0.13497638119975425
died: 0.13497638119975425
prison: 0.13497638119975425
RAG: Getting new evidence...


03/07/2024 05:36:29 AM - Created a chunk of size 456, which is longer than the specified 300
03/07/2024 05:36:29 AM - Created a chunk of size 437, which is longer than the specified 300


Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
election: 0.3916103699798974
lake: 0.3916103699798974
signatures: 0.24920659907811654
ballots: 0.23140612771539393
maricopa: 0.1958051849899487
mail: 0.1780047136272261
county: 0.1780047136272261
arizona: 0.1780047136272261
signature: 0.1602042422645035
verification: 0.14240377090178089
RAG: Getting new evidence...


03/07/2024 05:36:59 AM - Created a chunk of size 444, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 336, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 344, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 445, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 331, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 678, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 447, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 394, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 412, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 302, which is longer than the specified 300
03/07/2024 05:36:59 AM - Created a chunk of size 488, which is longer 

Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
en: 0.4538485592273606
el: 0.41062488691999294
la: 0.3890130507663091
que: 0.30256570615157374
del: 0.2161183615368384
video: 0.1728946892294707
biden: 0.1728946892294707
peligro: 0.15128285307578687
discurso: 0.12967101692210303
publicación: 0.1080591807684192
RAG: Getting new evidence...


03/07/2024 05:39:42 AM - Created a chunk of size 367, which is longer than the specified 300


Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
men: 0.49761714291127057
labor: 0.29857028574676236
participation: 0.2587609143138607
force: 0.2587609143138607
age: 0.2587609143138607
working: 0.23885622859740988
scott: 0.1791421714480574
people: 0.1592374857316066
decline: 0.1592374857316066
rate: 0.13933280001515577
RAG: Getting new evidence...


03/07/2024 05:39:54 AM - Created a chunk of size 534, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 460, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 676, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 1368, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 341, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 370, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 477, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 387, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 1080, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 1003, which is longer than the specified 300
03/07/2024 05:39:54 AM - Created a chunk of size 841, which is long

Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
water: 0.49948427010514984
h3o2: 0.3178536264305499
video: 0.18163064367459994
fruits: 0.18163064367459994
said: 0.18163064367459994
facebook: 0.13622298275594996
body: 0.13622298275594996
drinking: 0.13622298275594996
human: 0.09081532183729997
plain: 0.09081532183729997
RAG: Getting new evidence...
Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
hiv: 0.37912988830561156
covid: 0.28434741622920867
aids: 0.260651798210108
vaccines: 0.260651798210108
said: 0.23695618019100723
19: 0.23695618019100723
cdc: 0.18956494415280578
post: 0.14217370811460434
increase: 0.14217370811460434
cancers: 0.14217370811460434
RAG: Getting new evidence...


03/07/2024 05:42:07 AM - Created a chunk of size 632, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 436, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 680, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 741, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 780, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 459, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 643, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 381, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 340, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 835, which is longer than the specified 300
03/07/2024 05:42:07 AM - Created a chunk of size 889, which is longer 

Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
students: 0.36628971787912823
school: 0.3174510888285778
000: 0.2686124597780274
year: 0.24419314525275215
miami: 0.24419314525275215
new: 0.21977383072747694
suarez: 0.19535451620220173
dade: 0.19535451620220173
public: 0.19535451620220173
number: 0.17093520167692652
RAG: Getting new evidence...
An error occurred while fetching the article: Article `download()` failed with HTTPSConnectionPool(host='www.miamiherald.com', port=443): Read timed out. (read timeout=7) on URL https://news.google.com/rss/articles/CBMiUWh0dHBzOi8vd3d3Lm1pYW1paGVyYWxkLmNvbS9uZXdzL2xvY2FsL2NvbW11bml0eS9taWFtaS1kYWRlL2FydGljbGUyNzY4NjA5MTMuaHRtbNIBUWh0dHBzOi8vYW1wLm1pYW1paGVyYWxkLmNvbS9uZXdzL2xvY2FsL2NvbW11bml0eS9taWFtaS1kYWRlL2FydGljbGUyNzY4NjA5MTMuaHRtbA?oc=5&hl=en-US&gl=US&ceid=US:en


03/07/2024 05:45:32 AM - Created a chunk of size 320, which is longer than the specified 300
03/07/2024 05:45:32 AM - Created a chunk of size 308, which is longer than the specified 300


Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
patients: 0.4607878789932069
pneumonia: 0.373018759184977
covid: 0.3071919193288046
19: 0.26330735942468964
associated: 0.19748051956851723
study: 0.19748051956851723
ventilator: 0.19748051956851723
ventilators: 0.17553823961645976
killed: 0.13165367971234482
nearly: 0.10971139976028735
RAG: Getting new evidence...
An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/brucelee/2023/06/11/claims-that-ventilators-killed-nearly-all-with-covid-19-are-unfounded/ on URL https://news.google.com/rss/articles/CBMid2h0dHBzOi8vd3d3LmZvcmJlcy5jb20vc2l0ZXMvYnJ1Y2VsZWUvMjAyMy8wNi8xMS9jbGFpbXMtdGhhdC12ZW50aWxhdG9ycy1raWxsZWQtbmVhcmx5LWFsbC13aXRoLWNvdmlkLTE5LWFyZS11bmZvdW5kZWQv0gF7aHR0cHM6Ly93d3cuZm9yYmVzLmNvbS9zaXRlcy9icnVjZWxlZS8yMDIzLzA2LzExL2NsYWltcy10aGF0LXZlbnRpbGF0b3JzLWtpbGxlZC1uZWFybHktYWxsLXdpdGgtY2

03/07/2024 05:47:27 AM - Created a chunk of size 361, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 373, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 387, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 486, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 389, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 340, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 397, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 528, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 433, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 373, which is longer than the specified 300
03/07/2024 05:47:27 AM - Created a chunk of size 457, which is longer 

Chunking the articles
Posting new evidence to vector database...
Retrieving keywords...
myocarditis: 0.39497143888904995
covid: 0.29622857916678746
19: 0.27154286423622187
cases: 0.27154286423622187
said: 0.2221714343750906
health: 0.2221714343750906
enterovirus: 0.19748571944452498
wales: 0.14811428958339373
infection: 0.14811428958339373
vaccines: 0.14811428958339373
RAG: Getting new evidence...


03/07/2024 05:49:07 AM - Created a chunk of size 428, which is longer than the specified 300
03/07/2024 05:49:07 AM - Created a chunk of size 370, which is longer than the specified 300
03/07/2024 05:49:07 AM - Created a chunk of size 312, which is longer than the specified 300
03/07/2024 05:49:07 AM - Created a chunk of size 331, which is longer than the specified 300
03/07/2024 05:49:07 AM - Created a chunk of size 337, which is longer than the specified 300
03/07/2024 05:49:07 AM - Created a chunk of size 349, which is longer than the specified 300
03/07/2024 05:49:07 AM - Created a chunk of size 460, which is longer than the specified 300


Chunking the articles
Posting new evidence to vector database...
