In [1]:
import json
import os
import re
import time

import pandas as pd
import requests
import torch
import weaviate
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Weaviate
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel, pipeline


  torch.utils._pytree._register_pytree_node(


In [2]:
# Tokenizer/model pairs that have already been loaded, keyed by device string.
# text_embedding is called once per text chunk, so reloading the weights on
# every call would dominate the runtime.
_DISTILBERT_CACHE = {}


def _load_distilbert(device):
    """Return the (tokenizer, model) pair for ``device``, loading it on first use."""
    key = str(device)
    if key not in _DISTILBERT_CACHE:
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
        _DISTILBERT_CACHE[key] = (tokenizer, bert_model)
    return _DISTILBERT_CACHE[key]


def text_embedding(data, batch_size=128):
    """Embed each text in ``data`` with DistilBERT (``distilbert-base-uncased``).

    Every text is tokenized (padded and truncated), passed through the model,
    and reduced to one vector by averaging ``last_hidden_state`` over its
    non-padding token positions.

    Args:
        data: pandas Series of strings (``.iloc`` and ``.tolist()`` are used).
        batch_size: number of texts sent through the model at once.

    Returns:
        numpy array with one row per element of ``data``; an array with zero
        rows when ``data`` is empty.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, bert_model = _load_distilbert(device)

    def get_bert_embeddings(batch):
        tokens = tokenizer(batch.tolist(), padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            hidden = bert_model(**tokens).last_hidden_state
        # Average only over real tokens. A plain mean over dim=1 also averages
        # the padding positions, which makes a text's embedding depend on the
        # length of the other texts in its batch.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    num_samples = len(data)
    if num_samples == 0:
        # torch.cat rejects an empty list; return a correctly shaped empty result.
        return torch.empty((0, bert_model.config.hidden_size)).numpy()

    embeddings_list = [
        get_bert_embeddings(data.iloc[start_idx:start_idx + batch_size])
        for start_idx in range(0, num_samples, batch_size)
    ]

    embeddings = torch.cat(embeddings_list, dim=0).cpu().numpy()
    return embeddings

In [3]:
# Connect to the hosted Weaviate cluster.
# SECURITY: the API key used to be hard-coded in this cell. It is now read from
# the WEAVIATE_API_KEY environment variable (a KeyError is raised if it is not
# set). The previously committed key should be treated as leaked and rotated.
# WEAVIATE_URL may be set to point at a different cluster.
client = weaviate.Client(
    url=os.environ.get("WEAVIATE_URL", "https://testing-cluster-2qgcoz4q.weaviate.network"),
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [4]:
# schema = {
#     "classes": [
#         {
#             "class": "npr_archive",
#             "vectorizer": None,
#             "properties": [
#                 {"name": "context", "dataType": ["string"], "index" : True},
#                 {"name": "vector", "dataType": ["int[]"], "index" : True}
#             ],
#         }
#     ],
# }

# client.schema.create(schema)

In [4]:
# Archive page to scrape. Earlier runs used the commented-out URLs below;
# swap the active `url` to re-scrape a different section or period.
#url = "https://www.npr.org/sections/news/archive?date=12-31-2020" #2020 URL
#url = "https://www.npr.org/sections/news/archive?date=2-29-2024" #2024 url
#url = "https://www.npr.org/sections/news/archive?date=12-31-2023" #2023 url
# url = "https://www.npr.org/sections/news/archive?date=12-31-2022" #2022
url = "https://www.npr.org/sections/politics/archive?date=11-30-2022" #nov-2022 politics

# requests has no default timeout; without one a stalled connection blocks the
# kernel indefinitely.
response = requests.get(url, timeout=30)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

class Document:
    """Lightweight document record handed to ``text_splitter.split_documents``.

    Attributes are set in ``__init__``:
        page_content: the raw text of the document.
        metadata: dict holding a single ``'source'`` label.
    """

    def __init__(self, text, source='NPR 2022'):
        """Store ``text`` and tag it with ``source`` (defaults to ``'NPR 2022'``)."""
        self.page_content = text
        self.metadata = {'source': source}
        
# Scrape every article listed on the archive page, split its text into chunks,
# embed each chunk and store it in the "npr_archive" Weaviate class.
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    archive_list = soup.find('div', class_='archivelist')
    if archive_list is None:
        # A layout change or an error page would otherwise raise AttributeError.
        print(f"No archive list found at {url}")
        articles = []
    else:
        articles = archive_list.find_all('article', recursive=False)

    # The splitter is stateless with respect to the articles; build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

    for article in articles:
        # NOTE(review): the first <a> is sometimes a section link rather than
        # the story itself (the printed log contains URLs such as
        # ".../sections/law/") - confirm the selector against the page markup.
        anchor = article.find('a')
        link = anchor.get('href') if anchor is not None else None
        if not link:
            continue

        try:
            # No default timeout exists in requests; a stalled server would hang the run.
            article_response = requests.get(link, timeout=30)
        except requests.RequestException as exc:
            # One unreachable article should not abort the rest of the archive.
            print(f"Failed to fetch {link}. Error: {exc}")
            continue

        if article_response.status_code != 200:
            print(f"Failed to fetch {link}. Status code: {article_response.status_code}")
            continue

        #scraping
        article_soup = BeautifulSoup(article_response.content, "html.parser")
        paragraphs = article_soup.find_all('p')
        text = '\n'.join(p.get_text() for p in paragraphs)

        #chunking
        chunked_documents = text_splitter.split_documents([Document(text)])
        chunked_text = [document.page_content for document in chunked_documents]

        #post to weaviate db
        for sentence in chunked_text:
            properties = {"context": sentence}
            vector = text_embedding(pd.Series(sentence)).tolist()[0]
            client.data_object.create(properties, "npr_archive", vector=vector)

        print(link)
else:
    # Previously a failed archive fetch was silently ignored.
    print(f"Failed to fetch {url}. Status code: {response.status_code}")

https://www.npr.org/2022/11/30/1139968573/nyc-mayor-adams-faces-backlash-for-move-to-involuntarily-hospitalize-homeless-pe
https://www.npr.org/2022/11/30/1139874678/can-newly-elected-lgbtq-lawmakers-shift-the-landscape-for-lgbtq-rights
https://www.npr.org/2022/11/30/1139971241/anti-semitism-is-on-the-rise-and-not-just-among-high-profile-figures
https://www.npr.org/2022/11/30/1139924914/twitters-chaos-could-make-political-violence-worse-outside-of-the-u-s
https://www.npr.org/2022/11/30/1139876084/congress-house-railroad-strike-bill
https://www.npr.org/2022/11/30/1139742011/jeffries-poised-to-make-history-as-first-black-person-to-lead-congressional-part
https://www.npr.org/2022/11/30/1139848199/lawmakers-in-congress-take-on-same-sex-marriage-and-a-potential-rail-strike
https://www.npr.org/sections/law/
https://www.npr.org/2022/11/30/1139744006/macron-state-visit-biden
https://www.npr.org/2022/11/29/1139772081/supreme-court-immigration-enforcement
https://www.npr.org/2022/11/29/1139454126

# Infinite scroll

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By

In [6]:
def extract_articles(soup):
    """Return the <article> elements that are direct children of the
    ``div`` with id ``infinitescroll``, or an empty list if that div is absent.
    """
    container = soup.find('div', {'id': 'infinitescroll'})
    if not container:
        return []
    return list(container.find_all('article', recursive=False))

def extract_text_from_article(article, timeout=30):
    """Download the page linked from ``article`` and return its paragraph text.

    Args:
        article: parsed element whose first ``<a>`` carries the article URL.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The text of every ``<p>`` on the linked page joined with newlines, or
        ``None`` when the element has no link, the request fails, or the
        server answers with a status other than 200.
    """
    anchor = article.find('a')
    link = anchor.get('href') if anchor is not None else None
    if not link:
        # Indexing a missing anchor would raise TypeError; report and skip instead.
        print("Skipping article element without a link")
        return None

    try:
        # requests has no default timeout; without one a stalled server hangs the scrape.
        article_response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network errors like any other failed fetch.
        print(f"Failed to fetch {link}. Error: {exc}")
        return None

    if article_response.status_code != 200:
        print(f"Failed to fetch {link}. Status code: {article_response.status_code}")
        return None

    article_soup = BeautifulSoup(article_response.content, "html.parser")
    paragraphs = article_soup.find_all('p')
    return '\n'.join(p.get_text() for p in paragraphs)

def simulate_scroll(driver):
    """Scroll the browser window to the bottom of the current document.

    The scroll is issued through ``driver.execute_script``; no wait is
    performed afterwards.
    """
    scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);"
    driver.execute_script(scroll_to_bottom_js)

def is_loading_complete(soup):
    """Report whether the infinite-scroll loading indicator is gone or hidden.

    Returns True when the ``div`` with id ``infscr-loading`` is absent, or when
    its inline ``style`` attribute contains ``display: none;``.
    """
    spinner = soup.find('div', id='infscr-loading')
    if spinner is None:
        return True
    # The div is hidden after roughly 4 seconds, but scraping takes longer than that.
    inline_style = spinner.get('style', '')
    return 'display: none;' in inline_style


# Article links that have already been handled; the scroll loop checks this set
# so each article is processed only once.
visited_links = set()

In [11]:
# Start a Selenium-controlled Chrome session.
driver = webdriver.Chrome()  

# Open the archive page held in `url` (set in the scraping cell above).
driver.get(url)

In [None]:
# Keep scrolling the archive page and ingest every article that appears,
# exactly once each, until scrolling stops revealing new articles.
#
# The 'infscr-loading' indicator checked by is_loading_complete is hidden well
# before scraping finishes, so it cannot be used to detect the end of the
# archive. Instead the loop stops after several consecutive scrolls that
# surface no unseen link.
MAX_IDLE_SCROLLS = 5      # consecutive scrolls with nothing new before stopping
SCROLL_PAUSE_SECONDS = 2  # time for the page to append the next batch of articles

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
complete = False
idle_scrolls = 0

try:
    while not complete:
        simulate_scroll(driver)
        time.sleep(SCROLL_PAUSE_SECONDS)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        found_new_article = False

        for article in extract_articles(soup):
            anchor = article.find('a')
            link = anchor.get('href') if anchor is not None else None
            if not link or link in visited_links:
                continue

            found_new_article = True
            # Mark the link before processing it: an article that fails must
            # not be retried on every later pass, which would stall the loop.
            visited_links.add(link)
            print(link)

            try:
                text = extract_text_from_article(article)
                if not text:
                    continue

                #chunking
                chunked_documents = text_splitter.split_documents([Document(text)])
                chunked_text = [document.page_content for document in chunked_documents]

                #post to weaviate db
                for sentence in chunked_text:
                    properties = {"context": sentence}
                    vector = text_embedding(pd.Series(sentence)).tolist()[0]
                    client.data_object.create(properties, "npr_archive", vector=vector)
            except Exception as e:
                # Was `except exception`, which itself raised NameError.
                # Report the failure and carry on with the next article.
                print(f"Failed to ingest {link}: {e}")

        idle_scrolls = 0 if found_new_article else idle_scrolls + 1
        complete = idle_scrolls >= MAX_IDLE_SCROLLS

    print('Scraping complete')
finally:
    # Always release the browser, including on interrupt or unexpected error.
    driver.quit()

https://www.npr.org/sections/national/
https://www.npr.org/2022/11/29/1139531185/immigration-supreme-court-biden-trump-priorities-arguments
https://www.npr.org/2022/11/28/1139537325/how-abortion-bans-even-with-medical-emergency-exemptions-impact-healthcare
https://www.npr.org/2022/11/28/1139544131/closing-the-gender-pay-gap-could-be-critical-in-reducing-california-homelessness
https://www.npr.org/2022/11/28/1139447507/arizona-midterm-election-results-cochise-county
https://www.npr.org/2022/11/28/1139417380/white-house-christmas-holiday-decor-theme
https://www.npr.org/2022/11/28/1139388814/ahead-of-next-years-divided-congress-democrats-have-much-work-to-do
https://www.npr.org/2022/11/28/1139388821/can-biden-push-gun-control-legislation-through-congress-before-the-end-of-the-ye
https://www.npr.org/sections/asia/
https://www.npr.org/2022/11/28/1139166191/dont-call-florida-a-red-state-yet-left-leaning-groups-say-their-voters-stayed-ho
https://www.npr.org/2022/11/28/1139127543/democrats-mad

https://www.npr.org/2022/11/15/1136918057/white-house-ukraine-funding-request-congress
https://www.npr.org/2022/11/15/1136641841/temporary-protected-status-extended-trump-biden
https://www.npr.org/2022/11/15/1136811629/mccarthy-nominated-house-speaker-gop-republicans
https://www.npr.org/2022/11/15/1133915672/oregon-midterm-results-gun-control-ballot-measure
https://www.npr.org/2022/11/15/1136563709/young-voters-helped-democrats-win-the-senate-and-other-midterm-elections
https://www.npr.org/2022/11/15/1136738594/if-the-gop-wins-the-house-kevin-mccarthy-will-make-a-bid-to-be-speaker
https://www.npr.org/2022/11/15/1136738587/with-some-elections-undecided-congress-returns-for-a-busy-lame-duck-session
https://www.npr.org/2022/11/15/1136738580/morning-news-brief
https://www.npr.org/2022/11/15/1136491096/kevin-mccarthy-faces-early-loyalty-test-in-his-bid-for-gop-speaker
https://www.npr.org/2022/11/14/1134151957/hobbs-lake-arizona-midterm-election-results-governor
https://www.npr.org/2022/11/1

https://www.npr.org/2022/11/09/1135479163/2022-election-results-history-making-winners
https://www.npr.org/2022/11/09/1135488184/republicans-were-adamant-about-overwhelming-victories-but-they-didnt-happen
https://www.npr.org/2022/11/09/1135496189/examining-the-impact-of-former-president-trump-on-the-midterms
https://www.npr.org/2022/11/09/1134835022/kentucky-abortion-amendment-midterms-results
https://www.npr.org/2022/11/09/1135488183/ahead-of-midterms-the-white-house-insisted-bidens-programs-were-popular
https://www.npr.org/2022/11/09/1135488182/a-ballot-measure-passed-in-missouri-requiring-kansas-city-to-spend-more-on-polic
https://www.npr.org/2022/11/09/1135485034/going-into-the-election-the-gop-controlled-about-two-thirds-of-state-legislature
https://www.npr.org/2022/11/09/1135484400/arizona-has-not-yet-completed-its-vote-counts-for-the-midterm-races
https://www.npr.org/2022/11/09/1135484484/the-party-out-of-power-usually-has-an-advantage-in-midterms-how-did-that-play-ou
https://ww

https://www.npr.org/2022/11/07/1134513927/many-midterm-voters-are-concerned-about-abortion-but-inflation-is-the-top-issue
https://www.npr.org/2022/11/07/1134121891/why-a-southern-california-congressional-race-hinges-on-asian-american-voters
https://www.npr.org/2022/11/07/1134688740/morning-news-brief
https://www.npr.org/2022/11/07/1134513959/as-democrats-try-to-hang-on-to-power-republicans-have-their-own-midterm-agenda
https://www.npr.org/2022/11/07/1134535372/abortion-midterm-election-michigan-kentucky-amendment-roe-dobbs
https://www.npr.org/2022/11/07/1134646535/election-results-delay
https://www.npr.org/2022/11/07/1134525422/detroit-black-representation-house-bivings-thanedar
https://www.npr.org/2022/11/07/1133129875/berlin-wall-npr
https://www.npr.org/2022/11/06/1134608104/politics-chat-biden-focuses-on-policies-in-final-stretch-before-polls-close
https://www.npr.org/2022/11/06/1134412184/election-day-is-coming-heres-what-to-think-about-before-sharing-news
https://www.npr.org/2022/

https://www.npr.org/2022/10/29/1131086240/inflation-biden-economy
https://www.npr.org/2022/10/28/1132554560/obama-says-democracy-on-the-ballot-in-georgia-early-voting-rally
