In [16]:
from bs4 import BeautifulSoup
import os 
# Absolute path of the current working directory (abspath of the empty string);
# scrape() looks for the saved HTML file inside this directory.
dir_path = os.path.abspath('')
import requests
import time
import random

In [2]:
#open the file

def scrape(filename):
    """Extract article titles and publish dates from a saved MarketWatch search page.

    Args:
        filename: Name of an HTML file located in ``dir_path``.

    Returns:
        A list with one ``[title string(s)..., publish date]`` list per
        article. The date is the ``data-est`` attribute of the article's
        ``article__timestamp`` span. Articles that lack a link or a
        timestamp span are skipped.
    """
    # os.path.join instead of a hard-coded '\\' so the path also resolves off Windows.
    with open(os.path.join(dir_path, filename)) as html_file:
        content = html_file.read()

    # The file is fully read, so parsing can happen after it has been closed.
    soup = BeautifulSoup(content, 'lxml')

    # The articles normally sit inside the <mw-tabs> element; when that element
    # is absent, search the whole document instead of failing on None.
    tabs = soup.find("mw-tabs")
    if tabs is None:
        tabs = soup

    accum = []
    # each article is in a div with the class being article__content
    for article in tabs.find_all('div', class_="article__content"):
        link = article.find('a')
        timestamp = article.find('span', class_="article__timestamp")

        # Same policy as scrape_pages(): an incomplete article is skipped
        # rather than aborting the whole scrape.
        if link is None or timestamp is None:
            continue

        # stripped_strings yields the link text with surrounding whitespace
        # removed; usually this is a single string (the title).
        title_date = list(link.stripped_strings)
        title_date.append(timestamp["data-est"])
        accum.append(title_date)
    return accum
                

In [3]:
# Parse the locally saved search-results page and show what was extracted.
saved_page = 'Search - MarketWatch.html'
tuples = scrape(filename=saved_page)
print(tuples)

[['Oil futures hit five-month high on strong manufacturing data, escalating geopolitical tensions', '2024-04-01T16:07:00'], ['Companies will soon have to disclose how they’re managing climate-related risk. That’s good news for stock investors.', '2024-03-30T12:49:00'], ['The EPA Wants Greener Trucks. Why That’s Fine for Diesel Engine Giant Cummins.', '2024-03-29T13:56:00'], ["TotalEnergies releases its Universal Registration Document 2023 (Document d'enregistrement universel 2023) and its Form 20-F 2023 as well as the proposed resolutions for the Combined Shareholders' Meeting of May 24, 2024", '2024-03-29T12:14:00'], ['ETF flows in first quarter reflect investor hopes for ‘soft landing’', '2024-03-29T04:34:00'], ['TotalEnergies Celebrates its 100(th) Anniversary and Launches the Operation "100 for 100"', '2024-03-29T04:20:00'], ['Mitsui & Co. Plans to Invest in Vietnam Gas Project', '2024-03-29T02:03:00'], ['Oil ends higher, with U.S. prices up 16% for the quarter', '2024-03-28T15:38:

In [4]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# The classifier and its tokenizer are loaded from the same pretrained checkpoint.
model_name = 'yiyanghkust/finbert-tone'
finbert = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [6]:
# Class index -> sentiment name, used to turn the argmax of the model
# output into a readable label.
labels = {
    0: 'neutral',
    1: 'positive',
    2: 'negative',
}

In [7]:
#tokenize the first entry as a test
# (named `encoded`, not `input`, so the built-in input() is not shadowed
# for the rest of the kernel session)
encoded = tokenizer(tuples[0][0], return_tensors="pt", padding=True)

#enter the tokenized string into finbert and retrieve the output
output = finbert(**encoded)[0]

#get the actual prediction from the resulting array. This will be the argmax of the output array.
val = labels[np.argmax(output.detach().numpy())]
print(val)
print(output.detach().numpy())

positive
[[-6.342599  10.591372  -5.9413424]]


In [8]:
# Sanity check with an obviously negative headline: the first scraped headline
# with its sentiment-bearing words flipped ("high" -> "Low", "strong" -> "weak").
negative_headline = "Oil futures hit five-month Low on weak manufacturing data, escalating geopolitical tensions"
test = tokenizer(negative_headline, return_tensors="pt", padding=True)
test_out = finbert(**test)[0]

# Convert once and reuse for both the label lookup and the raw printout.
test_logits = test_out.detach().numpy()
val = labels[np.argmax(test_logits)]
print(val)
print(test_logits)

negative
[[-4.665029  -3.9620922 12.77002  ]]


In [9]:
#getting the main page of articles.
# The curl command below was copied from the browser for the MarketWatch
# "natural gas" search page; uncurl.parse() turns it into the equivalent
# requests.get(...) call, which is printed so it can be reused further down.
# NOTE(review): the command embeds cookies from a personal browser session
# (e.g. the `datadome` cookie); they will expire and should not be committed.
# NOTE(review): this import would normally live in the import cell at the top.
import uncurl
uncurled = uncurl.parse("curl 'https://www.marketwatch.com/search?q=natural%20gas&ts=5&sd=01%2F05%2F2007&ed=05%2F01%2F2024&tab=All%20News' \
  -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' \
  -H 'accept-language: en-US,en;q=0.9' \
  -H 'cache-control: max-age=0' \
  -H 'cookie: gdprApplies=false; refresh=off; _pnvl=false; pushly.user_puuid=N6mgtv7o13gye9Pov5S8WP6vW2WdTwDj; letsGetMikey=enabled; _pubcid=2008d82d-6476-4aa0-aca9-82881880d4d8; _sp_su=false; AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg=1; ajs_anonymous_id=298f408c-540f-4076-8b2c-15cd25be026a; _fbp=fb.1.1712330579933.1489826469; _meta_facebookTag_sync=1712330579934; _pcid=%7B%22browserId%22%3A%22lumtezcryp6s0mh2%22%7D; cX_P=lumtezcryp6s0mh2; _pctx=%7Bu%7DN4IgrgzgpgThIC4B2YA2qA05owMoBcBDfSREQpAeyRCwgEt8oBJAEzIE4AmHgZi4CsvAIwB2DqIAMADkHTRvEAF8gA; s_cc=true; _gcl_au=1.1.1051478361.1712330580; _ncg_domain_id_=34cab120-525d-43ef-810b-cea4b39df739.1.1712330580081.1775402580081; _scor_uid=2595a1a20ad64a989f47676ba304c36e; _ncg_g_id_=f18f861e-cfce-4c14-bd45-4b17515106b5.3.1712330128.1775402580081; _dj_sp_id=a77f52ff-1f26-4401-850b-2d11bfe89ccb; _cls_v=8c52e2bb-3fbe-4fb2-baba-83ceef014339; _cls_s=7bb3b0d8-8c66-4480-ad53-8cec7fc83bc1:0; cX_G=cx%3A2sncjh13mlfa82qelqjrstho9h%3A1yqdkcza0sdwu; permutive-id=5bde565c-3b5c-4549-b439-264ae36c1cae; _pnlspid=11018; wsjregion=na%2Cus; ccpaApplies=false; vcdpaApplies=false; regulationApplies=gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse; ab_uuid=490cc91d-6da6-4b4b-ab56-64bf275405fa; djvideovol=1; _ncg_id_=34cab120-525d-43ef-810b-cea4b39df739; recentqsmkii=Future-US-CL.1; _meta_cross_domain_id=f094a52d-7e9f-4d05-9436-3b67d001a186; mw_loc=%7B%22Region%22%3A%22NY%22%2C%22Country%22%3A%22US%22%2C%22Continent%22%3A%22NA%22%2C%22ApplicablePrivacy%22%3A0%7D; fullcss-section=section-b1201b4c3a.min.css; icons-loaded=true; _lr_geo_location_state=NY; _lr_geo_location=US; AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg=1585540135%7CMCIDTS%7C19847%7CMCMID%7C85637347026552322914309742615444452874%7CMCAAMLH-1715357857%7C7%7CMCAAMB-1715357857%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1714760257s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; _meta_cross_domain_recheck=1714753058189; _pnpdm=true; _pnss=blocked; 
_mfuuid_=2b44d50e-6996-447c-b928-6b488462cf90; _pubcid_cst=kSylLAssaw%3D%3D; utag_main=v_id:018eaedb1ee70016901b3d22e7a00506f001c06700978$_sn:4$_ss:0$_st:1714757595586$vapi_domain:marketwatch.com$ses_id:1714753057244%3Bexp-session$_pn:9%3Bexp-session$_prevpage:MW_Search%3Bexp-1714759395590; _rdt_uuid=1712330580074.7bfcaaa4-26e2-48f7-b6bf-9129ee0a5a1c; _dj_id.cff7=.1712330580.4.1714755796.1713834402.ee71490a-a388-487a-9ee8-e8d41320275a; _ncg_sp_id.f57d=34cab120-525d-43ef-810b-cea4b39df739.1712330580.4.1714755796.1713834403.f72e2fd6-cc5a-4394-895f-e888c2ab8442.15e3781d-229b-4f62-a103-d68d7668cce4.67ff2799-79fb-4df1-8724-9db592550931.1714753058174.13; __gads=ID=5c5ea10353d6cf9a:T=1712330583:RT=1714755796:S=ALNI_MascskHCRnL-ErSu1zZ_zwjawm6wQ; __gpi=UID=00000ddc0da89c36:T=1712330583:RT=1714755796:S=ALNI_MZOdKTohFxAEwIuOgp32bZzwrFpug; __eoi=ID=ec072ecbe38ede61:T=1712330583:RT=1714755796:S=AA-AfjbW9jqY1N0TjqjAMfFbhx4P; datadome=7mShbl~qJQTycHhoQFmP3YVGN9EjZTvq8COvGW2TOZO156Rybug8kyB8vZedyrm0py5mgIDMnS2Om8qqGb_wQfXBFlNZNF~yicrK5npfEANi3_q4qaFs2xB~eDYXNm0M; s_tp=3803; s_ppv=MW_Search%2C32%2C32%2C1203' \
  -H 'priority: u=0, i' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-platform: 'Windows'' \
  -H 'sec-fetch-dest: document' \
  -H 'sec-fetch-mode: navigate' \
  -H 'sec-fetch-site: same-origin' \
  -H 'sec-fetch-user: ?1' \
  -H 'upgrade-insecure-requests: 1' \
  -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'")

# Show the generated requests code.
print(uncurled)

requests.get("https://www.marketwatch.com/search?q=natural%20gas&ts=5&sd=01%2F05%2F2007&ed=05%2F01%2F2024&tab=All%20News",
    headers={
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "max-age=0",
        "priority": "u=0, i",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    },
    cookies={
        "AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1",
        "AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1585540135%7CMCIDTS%7C19847%7CMCMID%7C856373470265523229143

In [14]:
# Same conversion for the paginated `moreHeadlines` endpoint (pageNumber=1):
# the curl command captured from the browser is turned into a requests.get(...)
# call and printed.
# NOTE(review): the command embeds cookies and tracing headers from a personal
# browser session; they will expire and should not be committed.
next_page = uncurl.parse("curl 'https://www.marketwatch.com/search/moreHeadlines?q=natural%20gas&ts=5&sd=03%2F01%2F2024&ed=04%2F01%2F2024&partial=true&tab=All%20News&pageNumber=1' \
  -H 'accept: */*' \
  -H 'accept-language: en-US,en;q=0.9' \
  -H 'cookie: mw_loc=%7B%22Region%22%3A%22NY%22%2C%22Country%22%3A%22US%22%2C%22Continent%22%3A%22NA%22%2C%22ApplicablePrivacy%22%3A0%7D; gdprApplies=false; ab_uuid=44fdec96-d52b-4e73-8c2c-519002ec53e3; fullcss-section=section-b1201b4c3a.min.css; refresh=off; icons-loaded=true; pushly.user_puuid=qfCU4uL7vqeJRt2mnbrMjQiz1i1vsXdC; _pnss=none; letsGetMikey=enabled; _lr_geo_location_state=NY; _lr_geo_location=US; dnsDisplayed=undefined; ccpaApplies=false; signedLspa=undefined; _pubcid=6157263b-53a3-4284-b4b5-74adc3bcfceb; _pubcid_cst=kSylLAssaw%3D%3D; _sp_su=false; utag_main=v_id:018f43f8c41c009f28e6d96241b80506f006a06700978$_sn:1$_ss:1$_st:1714834127709$ses_id:1714832327709%3Bexp-session$_pn:1%3Bexp-session$_prevpage:MW_Search%3Bexp-1714835927713$vapi_domain:marketwatch.com; ccpaUUID=8bf5f977-ded3-4977-89d1-b2ae77e9e804; AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg=1; s_cc=true; AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg=1585540135%7CMCIDTS%7C19848%7CMCMID%7C85637347026552322914309742615444452874%7CMCAAMLH-1715437127%7C7%7CMCAAMB-1715437127%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1714839528s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0; _pcid=%7B%22browserId%22%3A%22lvs6w4kvhoy4t31d%22%7D; cX_P=lvs6w4kvhoy4t31d; _pctx=%7Bu%7DN4IgrgzgpgThIC4B2YA2qA05owMoBcBDfSREQpAeyRCwgEt8oBJAEzIE4AmHgZi4CsvAIwB2DqIAMADkHTRvEAF8gA; _ncg_sp_ses.f57d=*; _ncg_id_=4a5d0e62-9cc1-4018-94a9-d108d1026f6c; _dj_ses.cff7=*; _dj_id.cff7=.1714832328.1.1714832328.1714832328.94c0976b-46e4-442f-a83a-7094d617604d; _rdt_uuid=1714832328305.45446486-56a1-4f63-9c11-fa621ea2389f; ajs_anonymous_id=3efed52a-511d-4f1b-9b78-3c40d062336a; _fbp=fb.1.1714832328456.565096108; _meta_facebookTag_sync=1714832328456; _ncg_domain_id_=afc269ad-ad21-4b5b-a8c6-385afa549ece.1.1714832328107.1777904328107; _gcl_au=1.1.740111044.1714832329; _scor_uid=085e954bac9349bc830861950de62bef; _dj_sp_id=d2bd7665-7c5a-42d4-bdd1-0997df43050b; _ncg_g_id_=f18f861e-cfce-4c14-bd45-4b17515106b5.3.1712330128.1777904328107; 
_meta_cross_domain_id=85f0d8aa-3fab-491c-852d-5c7315f82421; _meta_cross_domain_recheck=1714832328796; _ncg_sp_id.f57d=4a5d0e62-9cc1-4018-94a9-d108d1026f6c.1714832328.1.1714832329..7e444741-d51a-4132-aa03-584ee5feb11f..42fed95e-5acb-478b-9eab-e3427c6e9ee1.1714832328233.3; permutive-id=5bde565c-3b5c-4549-b439-264ae36c1cae; cX_G=cx%3A2sncjh13mlfa82qelqjrstho9h%3A1yqdkcza0sdwu; _pnlspid=11018; datadome=v0pSezaEJYKq12~NjmlsZ2QfnUgUenjr8CBWzyAprGAPgxHYo6PoyGkSv4mip8HcEItCHRdqyti4MQYYkQ3mGILI1swjP_5mR3MWsFs8Kb4pQ0LAK88s0Zzg4rm8zjew; __gads=ID=15e0f355ec6479a2:T=1714832345:RT=1714832345:S=ALNI_MYg1vlnbgXQK6zuPvdLK4f8GqWLeQ; __gpi=UID=00000dfd06568f2d:T=1714832345:RT=1714832345:S=ALNI_Ma6xawG7_O6eXp3BMtRBQiEFY-q9A; __eoi=ID=b28565e0ebe9fa0c:T=1714832345:RT=1714832345:S=AA-AfjZHzlGLqqy6vp_o56BS1Sqn; s_tp=4194; s_ppv=MW_Search%2C84%2C84%2C3505.39990234375' \
  -H 'newrelic: eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjE2ODQyNzMiLCJhcCI6Ijc1NDg5OTM4MiIsImlkIjoiZDMzODY2YzFmNmZhNWNiYyIsInRyIjoiNmM0YWY5NzUyMmRiNjJlOGUwMDRhNDIwYTU1YTE0YjYiLCJ0aSI6MTcxNDgzMjM0NzA4MiwidGsiOiIxMDIyNjgxIn19' \
  -H 'priority: u=1, i' \
  -H 'referer: https://www.marketwatch.com/search?q=natural%20gas&ts=5&sd=03%2F01%2F2024&ed=04%2F01%2F2024&tab=All%20News&pageNumber=1' \
  -H 'sec-ch-device-memory: 8' \
  -H 'sec-ch-ua-arch: 'x86'' \
  -H 'sec-ch-ua-mobile: ?0' \
  -H 'sec-ch-ua-model: ""' \
  -H 'sec-ch-ua-platform: 'Windows'' \
  -H 'sec-fetch-dest: empty' \
  -H 'sec-fetch-mode: cors' \
  -H 'sec-fetch-site: same-origin' \
  -H 'traceparent: 00-6c4af97522db62e8e004a420a55a14b6-d33866c1f6fa5cbc-01' \
  -H 'tracestate: 1022681@nr=0-1-1684273-754899382-d33866c1f6fa5cbc----1714832347082' \
  -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'")

# Show the generated requests code.
print(next_page)

requests.get("https://www.marketwatch.com/search/moreHeadlines?q=natural%20gas&ts=5&sd=03%2F01%2F2024&ed=04%2F01%2F2024&partial=true&tab=All%20News&pageNumber=1",
    headers={
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9",
        "newrelic": "eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjE2ODQyNzMiLCJhcCI6Ijc1NDg5OTM4MiIsImlkIjoiZDMzODY2YzFmNmZhNWNiYyIsInRyIjoiNmM0YWY5NzUyMmRiNjJlOGUwMDRhNDIwYTU1YTE0YjYiLCJ0aSI6MTcxNDgzMjM0NzA4MiwidGsiOiIxMDIyNjgxIn19",
        "priority": "u=1, i",
        "referer": "https://www.marketwatch.com/search?q=natural%20gas&ts=5&sd=03%2F01%2F2024&ed=04%2F01%2F2024&tab=All%20News&pageNumber=1",
        "sec-ch-device-memory": "8",
        "sec-ch-ua-arch": "x86",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-model": "",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "traceparent": "00-6c4af97522db62e8e004a420a55a

In [17]:
def scrape_pages(num_pages=500, max_delay=2.0, timeout=30):
    """Scrape MarketWatch "natural gas" search results and label each headline.

    Requests pages ``1..num_pages`` of the ``moreHeadlines`` search endpoint,
    extracts every ``div.article__content`` from each response and classifies
    the headline with the module-level ``tokenizer``, ``finbert`` and
    ``labels``.

    Args:
        num_pages: Number of result pages to request; pages are numbered
            from 1 (default 500).
        max_delay: Upper bound, in seconds, of the random pause taken before
            each request (default 2.0).
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises instead of hanging indefinitely (default 30).

    Returns:
        A list of ``[title string(s)..., timestamp, sentiment label]`` lists,
        one per article that has a link with text and a timestamp carrying a
        ``data-est`` attribute.
    """
    reqLink = "https://www.marketwatch.com/search/moreHeadlines?q=natural%20gas&ts=5&sd=01%2F05%2F2007&ed=05%2F01%2F2024&partial=true&tab=All%20News&pageNumber="

    # Headers and cookies are identical for every page, so they are built once
    # outside the loop.
    # NOTE(review): these values were copied from a personal browser session;
    # they expire and would be better loaded from configuration than kept in
    # the notebook.
    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9",
        "newrelic": "eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjE2ODQyNzMiLCJhcCI6Ijc1NDg5OTM4MiIsImlkIjoiZDMzODY2YzFmNmZhNWNiYyIsInRyIjoiNmM0YWY5NzUyMmRiNjJlOGUwMDRhNDIwYTU1YTE0YjYiLCJ0aSI6MTcxNDgzMjM0NzA4MiwidGsiOiIxMDIyNjgxIn19",
        "priority": "u=1, i",
        "referer": "https://www.marketwatch.com/search?q=natural%20gas&ts=5&sd=03%2F01%2F2024&ed=04%2F01%2F2024&tab=All%20News&pageNumber=1",
        "sec-ch-device-memory": "8",
        "sec-ch-ua-arch": "x86",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-model": "",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "traceparent": "00-6c4af97522db62e8e004a420a55a14b6-d33866c1f6fa5cbc-01",
        "tracestate": "1022681@nr=0-1-1684273-754899382-d33866c1f6fa5cbc----1714832347082",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    cookies = {
        "AMCVS_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1",
        "AMCV_CB68E4BA55144CAA0A4C98A5%40AdobeOrg": "1585540135%7CMCIDTS%7C19848%7CMCMID%7C85637347026552322914309742615444452874%7CMCAAMLH-1715437127%7C7%7CMCAAMB-1715437127%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1714839528s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C4.4.0",
        "__eoi": "ID=b28565e0ebe9fa0c:T=1714832345:RT=1714832345:S=AA-AfjZHzlGLqqy6vp_o56BS1Sqn",
        "__gads": "ID=15e0f355ec6479a2:T=1714832345:RT=1714832345:S=ALNI_MYg1vlnbgXQK6zuPvdLK4f8GqWLeQ",
        "__gpi": "UID=00000dfd06568f2d:T=1714832345:RT=1714832345:S=ALNI_Ma6xawG7_O6eXp3BMtRBQiEFY-q9A",
        "_dj_id.cff7": ".1714832328.1.1714832328.1714832328.94c0976b-46e4-442f-a83a-7094d617604d",
        "_dj_ses.cff7": "*",
        "_dj_sp_id": "d2bd7665-7c5a-42d4-bdd1-0997df43050b",
        "_fbp": "fb.1.1714832328456.565096108",
        "_gcl_au": "1.1.740111044.1714832329",
        "_lr_geo_location": "US",
        "_lr_geo_location_state": "NY",
        "_meta_cross_domain_id": "85f0d8aa-3fab-491c-852d-5c7315f82421",
        "_meta_cross_domain_recheck": "1714832328796",
        "_meta_facebookTag_sync": "1714832328456",
        "_ncg_domain_id_": "afc269ad-ad21-4b5b-a8c6-385afa549ece.1.1714832328107.1777904328107",
        "_ncg_g_id_": "f18f861e-cfce-4c14-bd45-4b17515106b5.3.1712330128.1777904328107",
        "_ncg_id_": "4a5d0e62-9cc1-4018-94a9-d108d1026f6c",
        "_ncg_sp_id.f57d": "4a5d0e62-9cc1-4018-94a9-d108d1026f6c.1714832328.1.1714832329..7e444741-d51a-4132-aa03-584ee5feb11f..42fed95e-5acb-478b-9eab-e3427c6e9ee1.1714832328233.3",
        "_ncg_sp_ses.f57d": "*",
        "_pcid": "%7B%22browserId%22%3A%22lvs6w4kvhoy4t31d%22%7D",
        "_pctx": "%7Bu%7DN4IgrgzgpgThIC4B2YA2qA05owMoBcBDfSREQpAeyRCwgEt8oBJAEzIE4AmHgZi4CsvAIwB2DqIAMADkHTRvEAF8gA",
        "_pnlspid": "11018",
        "_pnss": "none",
        "_pubcid": "6157263b-53a3-4284-b4b5-74adc3bcfceb",
        "_pubcid_cst": "kSylLAssaw%3D%3D",
        "_rdt_uuid": "1714832328305.45446486-56a1-4f63-9c11-fa621ea2389f",
        "_scor_uid": "085e954bac9349bc830861950de62bef",
        "_sp_su": "false",
        "ab_uuid": "44fdec96-d52b-4e73-8c2c-519002ec53e3",
        "ajs_anonymous_id": "3efed52a-511d-4f1b-9b78-3c40d062336a",
        "cX_G": "cx%3A2sncjh13mlfa82qelqjrstho9h%3A1yqdkcza0sdwu",
        "cX_P": "lvs6w4kvhoy4t31d",
        "ccpaApplies": "false",
        "ccpaUUID": "8bf5f977-ded3-4977-89d1-b2ae77e9e804",
        "datadome": "v0pSezaEJYKq12~NjmlsZ2QfnUgUenjr8CBWzyAprGAPgxHYo6PoyGkSv4mip8HcEItCHRdqyti4MQYYkQ3mGILI1swjP_5mR3MWsFs8Kb4pQ0LAK88s0Zzg4rm8zjew",
        "dnsDisplayed": "undefined",
        "fullcss-section": "section-b1201b4c3a.min.css",
        "gdprApplies": "false",
        "icons-loaded": "true",
        "letsGetMikey": "enabled",
        "mw_loc": "%7B%22Region%22%3A%22NY%22%2C%22Country%22%3A%22US%22%2C%22Continent%22%3A%22NA%22%2C%22ApplicablePrivacy%22%3A0%7D",
        "permutive-id": "5bde565c-3b5c-4549-b439-264ae36c1cae",
        "pushly.user_puuid": "qfCU4uL7vqeJRt2mnbrMjQiz1i1vsXdC",
        "refresh": "off",
        "s_cc": "true",
        "s_ppv": "MW_Search%2C84%2C84%2C3505.39990234375",
        "s_tp": "4194",
        "signedLspa": "undefined",
        "utag_main": "v_id:018f43f8c41c009f28e6d96241b80506f006a06700978$_sn:1$_ss:1$_st:1714834127709$ses_id:1714832327709%3Bexp-session$_pn:1%3Bexp-session$_prevpage:MW_Search%3Bexp-1714835927713$vapi_domain:marketwatch.com"
    }

    accum = []
    for page_number in range(1, num_pages + 1):
        # Random pause between requests to avoid hammering the site.
        time.sleep(random.random() * max_delay)

        response = requests.get(
            reqLink + str(page_number),
            headers=headers,
            cookies=cookies,
            timeout=timeout,
        )
        soup = BeautifulSoup(response.text, "lxml")

        for article in soup.find_all('div', class_="article__content"):
            link = article.find('a')
            timestamp = article.find('span', class_="article__timestamp")

            # Skip entries that are not complete articles (no link or no
            # timestamp) instead of aborting the whole scrape.
            if link is None or timestamp is None:
                continue

            # get the title and clean the string of whitespace
            title_date = list(link.stripped_strings)
            published = timestamp.get("data-est")

            # Without a title there is nothing to classify; without a
            # publish date the row is unusable.
            if not title_date or published is None:
                continue
            title_date.append(published)

            token = tokenizer(title_date[0], return_tensors="pt", padding=True)
            fin_out = finbert(**token)[0]
            val = labels[np.argmax(fin_out.detach().numpy())]

            # Only the label is stored. The original expression also evaluated
            # the unrelated global `test_out` and threw the result away, which
            # made this function fail on a fresh kernel.
            title_date.append(val)
            accum.append(title_date)
    return accum

In [18]:
# Run the full scrape, then preview only the first 100 labelled headlines.
pages = scrape_pages()
preview = pages[:100]
print(preview)

[['Ariston Subsidiary Placed Under Management of Gazprom', '2024-04-29T03:16:00', 'neutral'], ["Sinopec's Net Profit Declined on Higher Costs", '2024-04-28T21:11:00', 'negative'], ['Energy Drops After Mixed Earnings -- Energy Roundup', '2024-04-26T17:26:00', 'negative'], ['These Stocks Moved the Most Today: Alphabet, Microsoft, Intel, Snap, Exxon, Roku, Skechers, ResMed, and More', '2024-04-26T16:12:00', 'neutral'], ['Oil prices score weekly gain, breaking run of back-to-back weekly losses', '2024-04-26T15:26:00', 'neutral'], ["Mexico's Pemex 1Q Profit Falls on Lower Sales, Less Exchange Gains", '2024-04-26T13:20:00', 'negative'], ['Chevron’s stock pulls back as profit drops amid plunge in natural-gas prices', '2024-04-26T12:23:00', 'negative'], ['Exxon Mobil’s stock falls after profit and production drop below forecasts', '2024-04-26T11:53:00', 'negative'], ['Exxon and Chevron Stocks Fall After Earnings. The Bar Was High.', '2024-04-26T11:07:00', 'positive'], ['Imperial Oil 1Q Net Inc

In [19]:
import json

# Persist the labelled headlines as pretty-printed UTF-8 JSON; non-ASCII
# characters are written as-is rather than as \u escapes.
with open('data.json', 'w', encoding='utf-8') as out_file:
    json.dump(
        pages,
        out_file,
        ensure_ascii=False,
        indent=4,
    )


In [20]:
# Number of labelled headlines collected by scrape_pages().
print(len(pages))

8982
