In [1]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_site(url, timeout=10):
    """Download an article page and pull out its headline, paragraphs and authors.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Passed to ``requests.get`` so a stalled server cannot block the
        notebook indefinitely (default: 10).

    Returns
    -------
    tuple
        ``(header, content, author_lst)``:

        * ``header`` - stripped text of the first ``<h1>``, or ``''`` when the
          page has no ``<h1>``.
        * ``content`` - list with the stripped text of every ``<p>``, with
          non-breaking spaces replaced by ordinary spaces.
        * ``author_lst`` - names found after the first standalone word "By",
          or ``['Author not found']`` when no byline could be extracted.

        ``(None, None, None)`` is returned (after printing the status code)
        when the response status is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract header; find() returns None when the tag is absent, so guard it.
    header_tag = soup.find(['h1'])
    header = header_tag.get_text().strip() if header_tag is not None else ''

    # Extract content
    content = [tag.get_text().strip().replace('\xa0', ' ') for tag in soup.find_all(['p'])]

    # Find the keyword 'By' to extract the author's name
    match = re.search(r'\bBy\s+([A-Za-z\s.,]+)', soup.get_text())
    if match is None:
        return header, content, ['Author not found']

    # Split on commas and on the standalone word "and" only. A plain
    # str.replace('and', ',') would also cut names such as "Sandra" or
    # "Alexander" in half.
    fragments = re.split(r',|\band\b', match.group(1))
    author_lst = [fragment.strip() for fragment in fragments if fragment.strip()]
    return header, content, author_lst or ['Author not found']

# Seed article whose text drives the keyword extraction in the next cell.
url = "https://www.cnn.com/2024/01/17/politics/biden-ukraine-white-house-meeting/index.html"
header, content, authors = scrape_site(url)

# scrape_site reports a non-200 response as (None, None, None). Stop here with a
# clear message instead of letting a later cell fail on `content` being None.
if content is None:
    raise RuntimeError(f"Could not download the seed article: {url}")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Merge the scraped paragraphs into one string; it is the only document fitted below.
article = " ".join(content)

# ENGLISH_STOP_WORDS is a frozenset. Newer scikit-learn releases validate this
# parameter and accept only 'english', a list or None, so pass it as a list.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
tfidf_matrix = tfidf_vectorizer.fit_transform([article])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement and fall back only on old installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Rank the vocabulary by score (highest first) and keep the top_n terms.
scores = tfidf_matrix.toarray().flatten()
indices = scores.argsort()[::-1]
top_n = 10
top_features = [(feature_names[i], scores[i]) for i in indices[:top_n]]

# Space-separated top terms; used later as the Google News search query.
keywords = " ".join([feature for feature, score in top_features])
for feature, score in top_features:
    print(f"{feature}: {score}")

house: 0.3868699071282999
said: 0.36650833306891567
border: 0.34614675900953146
white: 0.26470046277199466
meeting: 0.24433888871261045
ukraine: 0.20361574059384205
security: 0.18325416653445784
biden: 0.16289259247507362
national: 0.14253101841568944
president: 0.14253101841568944


In [3]:
from gnews import GNews
# Shared Google News client; its search options are set in the following cell.
google_news = GNews()

In [4]:
# Search settings applied to the shared `google_news` client before querying.
# max_results is also reused as the loop bound when the full articles are downloaded.
max_results = 20
# google_news.period = '7d'
google_news.max_results = max_results 
# google_news.country = 'United States'
google_news.language = 'english'
# google_news.exclude_websites = ['yahoo.com', 'cnn.com'] 
# Search window; the tuples are presumably (year, month, day) - confirm against the GNews docs.
google_news.start_date = (2020, 1, 1)
google_news.end_date = (2024, 2, 3)

In [5]:
# Search Google News with the extracted keywords and download each hit's full text.
articles = []
news = google_news.get_news(keywords)

# Slice rather than index up to max_results, so a shorter result list does not
# need an IndexError to end the loop.
for news_item in (news or [])[:max_results]:
    try:
        full_article = google_news.get_full_article(news_item['url'])
    except Exception as exc:
        # One failing URL should not abort the remaining downloads; report and move on.
        print(f"Skipping {news_item.get('url')}: {exc}")
        continue
    # The result is appended as returned; falsy entries are filtered out
    # when titles and texts are extracted.
    articles.append(full_article)

An error occurred while fetching the article: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/world/biden-republicans-talking-ukraine-us-border-white-house-meeting-2024-01-17/ on URL https://news.google.com/rss/articles/CBMiaWh0dHBzOi8vd3d3LnJldXRlcnMuY29tL3dvcmxkL2JpZGVuLXJlcHVibGljYW5zLXRhbGtpbmctdWtyYWluZS11cy1ib3JkZXItd2hpdGUtaG91c2UtbWVldGluZy0yMDI0LTAxLTE3L9IBAA?oc=5&hl=en-US&gl=US&ceid=US:en
An error occurred while fetching the article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.wsj.com/politics/policy/fight-over-ukraine-u-s-border-moves-to-white-house-ac15d700 on URL https://news.google.com/rss/articles/CBMiX2h0dHBzOi8vd3d3Lndzai5jb20vcG9saXRpY3MvcG9saWN5L2ZpZ2h0LW92ZXItdWtyYWluZS11LXMtYm9yZGVyLW1vdmVzLXRvLXdoaXRlLWhvdXNlLWFjMTVkNzAw0gEA?oc=5&hl=en-US&gl=US&ceid=US:en


In [6]:
import numpy as np
# Drop falsy entries (e.g. downloads that produced no article object) once,
# then read the headline and body text from what is left.
fetched_articles = [entry for entry in articles if entry]
title_text = [entry.title for entry in fetched_articles]
article_text = [entry.text for entry in fetched_articles]

In [7]:
import nltk
# Fetch the NLTK "punkt" data package. In the cells visible here, only the
# commented-out sent_tokenize chunker refers to NLTK tokenization.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/zhj003/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Weaviate

In [9]:
class Document:
    """Minimal document wrapper exposing ``page_content`` and ``metadata``.

    Used to hand plain article strings to a text splitter that expects
    objects carrying these two attributes.

    Parameters
    ----------
    text : str
        Stored as ``page_content``.
    metadata : dict, optional
        Stored (as a copy) as ``metadata``. Defaults to
        ``{'source': 'google news'}``.
    """

    def __init__(self, text, metadata=None):
        self.page_content = text
        # Copy so instances never share, or mutate, the caller's dictionary.
        self.metadata = {'source': 'google news'} if metadata is None else dict(metadata)

# Wrap each article body, split the wrapped documents and keep only the chunk strings.
# chunk_size is not a hard cap here: the recorded log reports chunks longer than 300.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0)
documents = [Document(body) for body in article_text]
chunked_articles = [
    chunk.page_content
    for chunk in text_splitter.split_documents(documents)
]

02/07/2024 10:05:10 PM - Created a chunk of size 304, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 344, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 341, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 380, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 373, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 333, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 512, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 398, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 335, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 306, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 361, which is longer 

02/07/2024 10:05:10 PM - Created a chunk of size 356, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 351, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 362, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 350, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 509, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 425, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 321, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 382, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 419, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 320, which is longer than the specified 300
02/07/2024 10:05:10 PM - Created a chunk of size 358, which is longer 

In [10]:
# from nltk.tokenize import sent_tokenize
# def chunk_text(text):
#     cleaned_text = text.replace('\n', ' ')
#     sentences = sent_tokenize(cleaned_text)
#     chunked_sentences = [' '.join(sentences[i:i+2]) for i in range(0, len(sentences), 2)]

#     return chunked_sentences
# chunked_articles = [chunk_text(text) for text in article_text]

In [10]:
import torch
from transformers import BertTokenizer, BertModel, pipeline

# Tokenizer/model pairs are cached per device so that repeated calls (for
# example one call per text chunk) do not reload the pretrained weights.
_BERT_CACHE = {}


def _load_bert(device):
    """Return the cached ``(tokenizer, model)`` pair for ``device``, loading it on first use."""
    key = str(device)
    if key not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
        bert_model.eval()
        _BERT_CACHE[key] = (tokenizer, bert_model)
    return _BERT_CACHE[key]


def text_embedding(data, batch_size=128):
    """Embed texts with ``bert-base-uncased`` using mean pooling over real tokens.

    Parameters
    ----------
    data : pandas.Series, sequence of str, or str
        Texts to embed. Anything exposing ``tolist()`` is converted with it,
        a bare string is treated as a single text, and any other iterable is
        materialised with ``list()``.
    batch_size : int, optional
        Number of texts sent through the model per forward pass (default 128).

    Returns
    -------
    numpy.ndarray
        Array with one row per input text. For empty input the result has
        shape ``(0, hidden_size)``.
    """
    if isinstance(data, str):
        texts = [data]
    elif hasattr(data, 'tolist'):
        texts = data.tolist()
    else:
        texts = list(data)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, bert_model = _load_bert(device)

    # torch.cat() rejects an empty list, so answer empty input explicitly.
    if not texts:
        return torch.empty((0, bert_model.config.hidden_size)).numpy()

    embeddings_list = []
    for start_idx in range(0, len(texts), batch_size):
        batch = texts[start_idx:start_idx + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            hidden = bert_model(**tokens).last_hidden_state

        # padding=True pads shorter texts up to the longest one in the batch.
        # Weight by the attention mask so padding positions do not dilute the mean.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)
        embeddings_list.append(token_sum / token_count)

    return torch.cat(embeddings_list, dim=0).cpu().numpy()

2024-02-07 22:05:20.934490: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
02/07/2024 10:05:22 PM - Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
02/07/2024 10:05:22 PM - NumExpr defaulting to 8 threads.


In [11]:
import pandas as pd

In [12]:
import weaviate
import json

import os

# Connection settings are read from the environment so that no secret is stored
# in the notebook. Set WEAVIATE_API_KEY (and optionally WEAVIATE_URL) before running.
# NOTE(review): the keys that were previously written in this cell should be
# treated as leaked and rotated.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL", "https://testing-cluster-2qgcoz4q.weaviate.network")

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"]),
    # Only needed if an OpenAI-backed vectorizer module is enabled:
    # additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [13]:
# Reset the schema before loading data.
# NOTE(review): judging by its name, delete_all() removes every class on the connected
# instance, not only this notebook's - confirm before running against a shared cluster.
client.schema.delete_all()

In [14]:
# schema = {
#     "classes": [
#         {
#             "class": "Test_dataset_1",
#             "vectorizer": None,
#             "properties": [
#                 {"name": "context", "dataType": ["string"], "index" : True},
#                 {"name": "vector", "dataType": ["int[]"], "index" : True}
#             ],
#         }
#     ],
#       # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
# #     "moduleConfig": {
# #         "text2vec-openai": {}
# #     }
# }

# client.schema.create(schema)

In [15]:
# Post data to weaviate cloud db
# The class is addressed as "Test_dataset_1", the spelling that appears as the
# key in the query responses, so every cell refers to it the same way.
# The loop variable is named `chunk` so it does not overwrite the `article`
# string built in the keyword-extraction cell.
for chunk in chunked_articles:
    properties = {"context": chunk}
    # text_embedding returns one row per input text; take the single row as a plain list.
    vector = text_embedding(pd.Series(chunk)).tolist()[0]
    client.data_object.create(properties, "Test_dataset_1", vector=vector)

In [17]:
# Preview a handful of stored objects (id + text). An explicit small limit keeps the
# cell output short, and the class name matches the key used in the response.
client.query.get("Test_dataset_1", ["_additional {id}", "context"]).with_limit(5).do()

{'data': {'Get': {'Test_dataset_1': [{'_additional': {'id': '00f38d56-18b0-4940-841e-2c193cd02ee7'},
     'context': 'Johnson said while House Republicans "understand the necessity about Ukraine funding" they still have questions about Ukraine\'s "strategy and endgame."'},
    {'_additional': {'id': '015a8894-7faf-45be-9e62-9f8e9d02b691'},
     'context': 'Nearly half of Americans (47%) now say Washington should urge Kyiv to settle for peace as soon as possible.\n\nVOA State Department Bureau Chief Nike Ching and Ukrainian Service reporter Iuliia Iarmolenko contributed to this report.'},
    {'_additional': {'id': '01881e20-87c1-41c1-8d7c-ef9704832481'},
     'context': 'Republicans, even defense hawks who strongly back Ukraine, insist the money must come with U.S. border provisions.'},
    {'_additional': {'id': '027ff80b-1a57-4ea0-bd16-39791f10551d'},
     'context': 'It also comes as Congress is about to quickly approve temporary funding to avoid a government shutdown, postponing th

In [18]:
# Page through every stored object, `limit` records at a time.
all_articles = []
limit = 100
offset = 0

while True:
    # Adjust the query to include limit and offset
    response = client.query.get("Test_dataset_1", ["context"]).with_limit(limit).with_offset(offset).do()

    # Named `page` rather than `articles`, so the list of downloaded news
    # articles built earlier in the notebook is not overwritten.
    page = ((response.get('data') or {}).get('Get') or {}).get('Test_dataset_1')
    if page is None:
        # No result list came back. Surface any error payload instead of stopping silently.
        if 'errors' in response:
            print(f"Query failed at offset {offset}: {response['errors']}")
        break

    all_articles.extend(page)
    # A short page means the last record has been reached.
    if len(page) < limit:
        break
    offset += limit

print(f"Total articles retrieved: {len(all_articles)}")

Total articles retrieved: 396


In [39]:
# Embed the probe sentence and package it as the argument for the near-vector search.
# "distance" is presumably the largest distance accepted for a match - confirm
# against the Weaviate nearVector documentation.
test_query = "We should have peace instead of war."
query_embedding = text_embedding(pd.Series(test_query)).tolist()[0]
query_vector = dict(vector=query_embedding, distance=1.0)

In [40]:
# Near-vector search over the stored chunks, returning each hit's text and distance.
# The class name is spelled "Test_dataset_1", matching the key under which the
# hits are read back from the response.
results = (
    client.query.get("Test_dataset_1", ["context"])
    .with_additional("distance")
    .with_near_vector(query_vector)
    .do()
)

In [44]:
# Display the raw response dictionary for inspection.
results

{'data': {'Get': {'Test_dataset_1': [{'_additional': {'distance': 0.30815715},
     'context': '“If we walk away, and Russia is able to sustain their onslaught and bring down Ukraine, what do you think’s going to happen in the Balkan countries?” Biden said during an event with American mayors. “It changes the dynamic.”'},
    {'_additional': {'distance': 0.3126707},
     'context': 'McConnell has said he wants to help Ukraine, but it can’t happen without some kind of border deal.'},
    {'_additional': {'distance': 0.31319594},
     'context': '“It’s time to cut a deal that both sides can agree to,” Young said Sunday.'},
    {'_additional': {'distance': 0.31415153},
     'context': '“We want to solve that, to secure the border. I just saw the president of the United States say that we’ve got to secure the border. He’s right. So, any effort that doesn’t do that will be rejected by Republicans,” Romney said.'},
    {'_additional': {'distance': 0.31623423},
     'context': 'The U.S. wants

In [45]:
# One line per hit: its distance followed by the stored text.
hits = results['data']['Get']['Test_dataset_1']
for hit in hits:
    extras = hit['_additional']
    print(extras['distance'], hit['context'])

0.30815715 “If we walk away, and Russia is able to sustain their onslaught and bring down Ukraine, what do you think’s going to happen in the Balkan countries?” Biden said during an event with American mayors. “It changes the dynamic.”
0.3126707 McConnell has said he wants to help Ukraine, but it can’t happen without some kind of border deal.
0.31319594 “It’s time to cut a deal that both sides can agree to,” Young said Sunday.
0.31415153 “We want to solve that, to secure the border. I just saw the president of the United States say that we’ve got to secure the border. He’s right. So, any effort that doesn’t do that will be rejected by Republicans,” Romney said.
0.31623423 The U.S. wants Ukraine to make battlefield progress rapidly without dragging NATO into a direct military confrontation with Moscow, said George Beebe, director of Grand Strategy at the Quincy Institute.
0.3194397 “We have to take care of our own house, we have to secure our own border before we talk about doing anythi