In [2]:
import os
import json
import openai
import asyncio
import tiktoken

from bs4 import BeautifulSoup
from pymongo import MongoClient
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

In [3]:
import nest_asyncio
nest_asyncio.apply()

In [4]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [5]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [6]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        response_format={ "type": "json_object" },
        temperature=temperature, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

### Load mongodb

In [7]:
# Connect to the MongoDB server
client = MongoClient("mongodb://localhost:27017/")  # Replace with your MongoDB connection string
db = client['crawler']  # Replace with your database name
cache_coll = db['httpcache']
batches_coll = db['batches']  # Replace with your collection name
urls_coll = db['urls_db'] 
content_coll = db['content'] 

In [8]:
query = {"fingerprint": '4f834bcc47a545cb55894d101761bf5ee80684e0'}
documents = list(content_coll.find(query))

# ids = [doc['batch_id'] for doc in documents]
# ids

len(documents)

1

In [9]:
body = documents[0]['body']
len(body)

182725

In [10]:
# async def extract_tech_content(url):
#     async with AsyncWebCrawler(verbose=True) as crawler:
#         result = await crawler.arun(
#             # url="https://www.nbcnews.com/business",
#             url=url,
#             extraction_strategy=LLMExtractionStrategy(
#                 provider="openai/gpt-4o",
#                 api_token=os.getenv('OPENAI_API_KEY'),
#                 instruction="Extract only content related to technology"
#             ),
#             bypass_cache=True,
#         )

#     return result

# async def make_requests_in_parallel(url):
#     """Make requests in parallel and return the responses."""
#     return await asyncio.gather(extract_tech_content(url))

### OpenAI scrape

In [11]:
from scrapy.selector import Selector

In [12]:
query = {"url": 'https://www.brat.ro/sati/rezultate/type/site/page/1/c/all'}
# query = {"url": 'https://www.brat.ro/sati/export-rezultate/export/xls/type/site/c/all/period_type/day/category/all/editor/all/regie/all/period_filter/2024-12-3/order_by/name/order/asc/'}
projection = {'url': 1, 'body': 1, '_id': 0}
documents = list(cache_coll.find(query, projection))

len(documents)

1

In [13]:
query = {"fingerprint": '4f834bcc47a545cb55894d101761bf5ee80684e0'}
documents = list(content_coll.find(query))

In [14]:
# documents
body = documents[0]['body']
type(body), len(body)

(str, 182725)

In [15]:
soup = BeautifulSoup(body, "html.parser")

In [16]:
# # Remove all text, leaving only tags
# for element in soup.find_all(string=True):
#     element.replace_with("")

print(len(soup.prettify()))

# # Remove all <script> tags
# for script in soup.find_all("script"):
#     script.decompose()

for tag in soup(['script', 'style', 'meta', 'head', 'nav', 'footer', 'aside']):
    tag.decompose()

print(len(soup.prettify()))

189004
79104


In [17]:
len(soup.get_text(separator=' ', strip=True))

11267

In [18]:
text = soup.get_text(separator=' ', strip=True)
len(text), text

(11267,
 'Sari direct la conținut CAUTĂ Actualitate joi, 28 noiembrie 2024, 00:19 Cât de periculoasă ar fi victoria lui Georgescu și cât ar dura ieşirea României din UE dacă independentul ar ajunge la Cotroceni – explicațiile unui vicepreşedinte al Parlamentului European Nicoleta Onofrei HotNews.ro Share Vicepreşedintele Parlamentului European, Nicu Ştefănuţă, explică, miercuri seară, cât de periculoasă ar fi victoria lui Călin Georgescu pentru politica externă a României, dar și cât ar dura o un ROEXIT și o eventuală ieșire din NATO, în cazul în care acesta ar ajunge la Cotroceni și ar vrea să îndepărteze țara noastră de alianța transatlantică și de blocul comunitar european, subliniind că preşedintele unei ţări nu poate deveni dictator peste noapte. În contextul în care Călinescu a declarat că „NATO este cea mai slabă alianță de pe pământ” și „dacă lucrurile continuă în felul acesta”, ar abandona-o instant, vicepreşedintele Parlamentului European Nicu Ștefănuță a făcut câteva preceză

In [19]:
num_tokens_from_string(text, 'gpt-4o')

3324

In [20]:
print(soup.prettify())

<body class="post-template-default single single-post postid-1847441 single-format-standard wp-embed-responsive cat-de-periculoasa-ar-fi-victoria-lui-georgescu-si-cat-ar-dura-iesirea-romaniei-din-ue-daca-independentul-ar-ajunge-la-cotroceni-explicatiile-unui-vicepresedinte-al-parlamentului-european-1847441 category-actualitate cat-color-blue">
 <div class="site" id="app">
  <a class="sr-only focus:not-sr-only" href="#main">
   Sari direct la conținut
  </a>
  <header class="site-header" id="js-site-header">
   <div class="row blue main-area">
    <div class="hn-container">
     <div class="main-row">
      <div class="box">
       <a href="https://hotnews.ro/">
        <img alt="HotNews.ro" class="logo" src="https://hotnews.ro/wp-content/themes/hotnews/public/images/hotnews_header.svg"/>
       </a>
      </div>
      <div class="box">
       <div class="menu-items">
        <div class="menu-btn" id="js-trigger-search-form">
         <img alt="search_icon" class="icon" src="https://hot

In [21]:
sel = Selector(text=body)

In [24]:
# foo = sel.xpath("//body//table").getall()
foo = sel.xpath("//body//tbody").getall()

len(foo), len(foo[0]), len(body)

IndexError: list index out of range

In [25]:
foo


[]

In [None]:
num_tokens_from_string(foo[0], 'gpt-4o'), num_tokens_from_string(foo[0], 'gpt-3.5-turbo')

In [None]:
num_tokens_from_string(soup.prettify(), 'gpt-4o'), num_tokens_from_string(soup.prettify(), 'gpt-3.5-turbo')

In [None]:
# prompt = f'you are webscraper. your job is to extract the data from the following html file and extract the table into json. the html follow here: {foo[0]}'
prompt = f"""the following text is a news article scraped off a news webpage and is polluted with bits of text that are not related to the subject matter of the article.
determine the relevant data and leave out any unrelated data.
return a json object with the article date, title and body
the text is bellow:
{text}"""
model = 'gpt-4o-mini'
res = get_completion(prompt, model=model)

In [None]:
res

In [None]:
print(res[7:])

In [None]:
foo = json.loads(res)

In [None]:
# prompt = f'you are webscraper. your job is to extract the data from the following html file and extract the table into json. the html follow here: {foo[0]}'
prompt = f'you are webscraper. your job is to extract the data from the following html file and extract the table into json. the html follow here: {soup.prettify()}'
model = 'gpt-4o-mini'
res = get_completion(prompt, model=model)

In [None]:
print(res)

In [None]:
body