In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import csv


In [2]:
washington_state_wire = []

In [7]:
with open('news_articles_links/washington_state_wire.txt', 'r') as file:
    for line in file:
        url = line.strip()
        washington_state_wire.append(url)

In [24]:
def get_washington_news_article(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    
    title = soup.find('h1', class_='story-title').get_text(strip=True)
    date = soup.find('span', class_='story-date').get_text(strip=True)
    
    article_content = soup.find('div', class_='story-content').get_text(strip=True)
    news_source = url.split('/')[2].split('.')[0]
    if article_content:
        return title, article_content, date, news_source
    else:
        print("couldnt find the article content")

In [25]:
get_washington_news_article('https://washingtonstatewire.com/cities-awarded-funding-for-electric-vehicle-charging-stations/')

('Cities awarded funding for electric vehicle charging stations',
 'The Washington State Department of Commerce (COM) has awarded $9.8 million in Electrification of Transportation Systems (ETS) grants for 14 projects located in cities across the state.These grant awards follow an initial round of 37 applications totaling $25 million in requests for funding, according to COM.The Morning Wire: Keeping you informed on politics, policies, and personalities of Washington State.The agency says that priority was given to projects designed to close gaps in availability of EV charging infrastructure and aid communities disproportionately impacted by climate change and pollution from transportation systems.Investing in the electrification of transportation is essential to Washington’s clean energy future and for equitable economic recovery and growth throughout the state,” said Commerce Director Lisa Brown. “Importantly, these projects cover the entire state, supporting expansion of EV infrastru

In [4]:
def create_dataset(file_name, dir_path, news_source, news_fetcher):
    os.makedirs(dir_path, exist_ok=True)
    csv_filename = os.path.join(dir_path, file_name)
    
    with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['title', 'article', 'date', 'news_source'])
        
        for news in news_source:
            title, article, date, news_source = news_fetcher(news)
            writer.writerow([title, article, date, news_source])

In [5]:
dir_path = 'news/washington-state/'
create_dataset('articles.csv', dir_path, washington_state_wire, get_washington_news_article)

file_name = os.path.join(dir_path, 'articles.csv')
df = pd.read_csv(file_name)

df['state'] = 'washington-state'
df.to_csv(file_name, index=False)


In [6]:
df = pd.read_csv(file_name)
df.head()

Unnamed: 0,title,article,date,news_source,state
0,Cities awarded funding for electric vehicle ch...,The Washington State Department of Commerce (C...,"December 28, 2020",washingtonstatewire,washington-state
1,Adapt or mitigate? Washington Republicans roll...,A new proposal from Washington State Republica...,"November 30, 2021",washingtonstatewire,washington-state
2,Op-ed: Now is the Time to Electrify Public Veh...,Matthew Metz is the founder and co-executive d...,"June 5, 2018",washingtonstatewire,washington-state
3,Washington State passes law establishing 2030 ...,The Washington State legislature has passed gr...,"April 15, 2021",washingtonstatewire,washington-state
4,Electric Vehicle Tax Break Extension Faces Sti...,Consider the political oddity that is the elec...,"January 21, 2015",washingtonstatewire,washington-state


# Los angeles daily news

In [9]:
la_daily_news = []

In [10]:
with open('news_articles_links/la_daily_news.txt', 'r') as file:
    for line in file:
        url = line.strip()
        la_daily_news.append(url)

In [11]:
la_daily_news

['https://www.dailynews.com/2023/10/18/pennsylvania-regulators-follow-californias-lead-with-the-same-abysmal-results/',
 'https://www.dailynews.com/2023/10/22/proposed-tax-credit-could-help-landscapers-transition-to-electric-leaf-blowers-lawn-mowers/',
 'https://www.dailynews.com/2023/10/09/tesla-prices-now-rival-average-us-carsafter-billions-in-cuts/',
 'https://www.dailynews.com/2023/09/22/why-cant-americans-buy-cheap-chinese-evs/']

In [7]:
def get_la_news(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article_content = soup.find('div', class_='article-body').get_text(strip=True)
    title = soup.find('span', class_='metered').get_text(strip=True)
    date = soup.find('time', class_='updated').get_text(strip=True)
    news_source = url.split('/')[2].split('.')[1]
    
    if article_content:
        return title, article_content, date, news_source
    else:
        print("couldnt find article content")

In [8]:
dir_path = 'news/california/la/'
create_dataset('articles.csv', dir_path, la_daily_news, get_la_news)

file_name = os.path.join(dir_path, 'articles.csv')
df = pd.read_csv(file_name)

df['state'] = 'california-la'
df.to_csv(file_name, index=False)


In [9]:
df.head()

Unnamed: 0,title,article,date,news_source,state
0,Pennsylvania regulators follow…,Pennsylvania’s Peter Brothers Trucking deliver...,"October 18, 2023 at 5:01 a.m.",dailynews,california-la
1,Proposed tax credit could help landscapers…,When Sana Sirodan and her team launched Greenp...,"October 22, 2023 at 7:01 a.m.",dailynews,california-la
2,Tesla prices rival average US cars after…,By Tom Randall| BloombergTesla Inc.’s top-sell...,"October 9, 2023 at 9:40 a.m.",dailynews,california-la
3,Why can’t Americans buy cheap Chinese…,By Kyle Stock | BloombergEV variety is easy to...,"September 22, 2023 at 12:04 p.m.",dailynews,california-la


# San francisco chronicles

In [12]:
sf_chronicles_news = []

In [13]:
with open('news_articles_links/sf_chronicles_news.txt', 'r') as file:
    for line in file:
        url = line.strip()
        sf_chronicles_news.append(url)

In [14]:
sf_chronicles_news

['https://web.archive.org/web/20190323184608/https://www.sfchronicle.com/bayarea/article/Environmental-groups-want-SF-to-nudge-Uber-Lyft-13707375.php',
 'https://web.archive.org/web/20190716185842/https://www.sfchronicle.com/bayarea/article/To-cut-carbon-footprint-SF-moves-to-eliminate-14097997.php',
 'https://web.archive.org/web/20230320181546/https://www.sfchronicle.com/bayarea/article/With-new-grant-program-Bay-Area-spreads-electric-14888605.php',
 'https://web.archive.org/web/20230129201621/https://www.sfchronicle.com/bayarea/article/Berkeley-to-considering-banning-the-sale-of-15862688.php']

In [10]:
def get_sf_news(url):
    text = ''
    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    title = soup.find('h1', class_='header-title')
    if not title or title.text.strip() == '':
        title = soup.find('h1', class_='articleHeader--headline')
    title = title.text.strip() if title else None
    text = ' '.join(p.get_text(strip=True) for p in paragraphs)
    published_date = soup.find('time').text
    news_source = url.split('/')[7].split('.')[1]
    
    return title, text, published_date, news_source
 

In [11]:
dir_path = 'news/california/sf/'

create_dataset('articles.csv', dir_path, sf_chronicles_news, get_sf_news)

file_name = os.path.join(dir_path, 'articles.csv')
df = pd.read_csv(file_name)

df['state'] = 'california-sf'
df.to_csv(file_name, index=False)

In [12]:
df = pd.read_csv('news/california/sf/articles.csv')
df.head()

Unnamed: 0,title,article,date,news_source,state
0,"Environmental groups want SF to nudge Uber, Ly...",Environmental advocates want San Francisco to ...,"\n March 21, 2019\n",sfchronicle,california-sf
1,"To cut carbon footprint, SF moves to eliminate...",San Francisco officials want to shrink the cit...,"\n July 16, 2019\n",sfchronicle,california-sf
2,"With new grant program, Bay Area spreads elect...",This is a carousel. Use Next and Previous butt...,"Dec. 7, 2019",sfchronicle,california-sf
3,Berkeley considers banning the sale of gas-pow...,This is a carousel. Use Next and Previous butt...,"Jan. 11, 2021",sfchronicle,california-sf


# illinois

In [15]:
cst_news = []

In [16]:
with open('news_articles_links/cst_news.txt', 'r') as file:
    for line in file:
        url = line.strip()
        cst_news.append(url)

In [17]:
cst_news

['https://chicago.suntimes.com/2021/4/5/22368102/electric-cars-vehicles-charging-stations-illinois',
 'https://chicago.suntimes.com/politics/2021/11/16/22785414/illinois-electric-vehicle-legislation-ev-clean-transportation-revolution-pritzker',
 'https://chicago.suntimes.com/2021/4/4/22362953/electric-vehicles-volkswagen-trust-money-illinois-editorial',
 'https://chicago.suntimes.com/2023/7/17/23795105/electric-vehicle-manufacturing-illinois-pritzker-corporate-headquarters-jobs-editorial',
 'https://chicago.suntimes.com/2018/1/14/18356079/ford-to-bring-out-40-electrics-hybrids-by-2022']

In [13]:
def get_cst_news(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article_content = soup.find('div', class_='RichTextBody')
    if article_content:
        text = article_content.get_text()
    else:
        print("couldnt find article content")
    title = soup.find('h1', class_='Page-headline').get_text(strip=True)
    date = 'nan'
    news_source = '-'.join(url.split('/')[2].split('.')[:2])

    return title, text, date, news_source

dir_path = 'news/illinois/'

create_dataset('articles.csv', dir_path, cst_news, get_cst_news)

file_name = os.path.join(dir_path, 'articles.csv')
df = pd.read_csv(file_name)

df['state'] = 'illinois'
df.to_csv(file_name, index=False)

In [14]:
df.head()

Unnamed: 0,title,article,date,news_source,state
0,Electric vehicle charging stations could be bo...,\nI have owned and driven an electric vehicle ...,,chicago-suntimes,illinois
1,"Pritzker signs electric vehicle legislation, a...",\nGov. J.B. Pritzker on Tuesday signed legisla...,,chicago-suntimes,illinois
2,Illinois is sitting on millions that could be ...,\nIllinois has an electric vehicle chicken-and...,,chicago-suntimes,illinois
3,Bring electric vehicle manufacturing to Illino...,\nAs some corporate headquarters decamp for ot...,,chicago-suntimes,illinois
4,"Ford to bring out 40 electrics, hybrids by 2022",\nDETROIT — Ford Motor Co. said it’s doubling ...,,chicago-suntimes,illinois


# Texas

In [18]:
dallas_news = []

In [19]:
with open('news_articles_links/dallas_news.txt', 'r') as file:
    for line in file:
        url = line.strip()
        dallas_news.append(url)

In [20]:
dallas_news

['https://www.dallasnews.com/business/autos/2022/12/28/2023-tax-credits-may-boost-the-appeal-of-electric-vehicles/',
 'https://www.dallasnews.com/news/2022/10/08/are-electric-vehicles-the-future-heres-what-state-fair-of-texas-visitors-are-saying/',
 'https://www.dallasnews.com/business/autos/2023/04/17/only-10-electric-vehicles-will-qualify-for-7500-tax-credit/',
 'https://www.dallasnews.com/news/politics/2023/07/03/new-texas-laws-aimed-at-sharp-rise-in-electric-vehicle-ownership-in-lone-star-state/']

In [15]:
def get_dallas_news(url):
    text = ''
    response = requests.get(url)
    response.raise_for_status()  # This will raise an exception if there's a HTTP error

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    
    text = ' '.join(p.get_text(strip=True) for p in paragraphs)
    title = soup.find('span', class_='dmnc_generic-header-header-module__i2K-Y mr-7').get_text(strip=True)
    date = soup.find('div', class_='text-gray-medium')
    second_p = date.find_all('p')[1].get_text(strip=True)
    news_source = url.split('/')[2].split('.')[1]
    
    return title, text, second_p, news_source

dir_path = 'news/texas/'
create_dataset('articles.csv', dir_path, dallas_news, get_dallas_news)

file_name = os.path.join(dir_path, 'articles.csv')
df = pd.read_csv(file_name)

df['state'] = 'texas'
df.to_csv(file_name, index=False)

In [16]:
df.head()

Unnamed: 0,title,article,date,news_source,state
0,2023 tax credits may boost the appeal of elect...,businessAutos ByThe Associated Press 5:38 AM o...,"5:38 AM on Dec 28, 2022 CST",dallasnews,texas
1,Are electric vehicles the future? Here’s what ...,"News ByNoor Adatia 8:00 AM on Oct 8, 2022 CDT ...","8:00 AM on Oct 8, 2022 CDT",dallasnews,texas
2,"Only 10 electric vehicles will qualify for $7,...",businessAutos ByThe Associated Press 1:38 PM o...,"1:38 PM on Apr 17, 2023 CDT",dallasnews,texas
3,New Texas laws aimed at sharp rise in electric...,"newsPolitics ByAarón Torres 6:00 AM on Jul 3, ...","6:00 AM on Jul 3, 2023 CDT",dallasnews,texas


# Maryland

In [21]:
maryland_news = []

In [22]:
with open('news_articles_links/maryland_news.txt', 'r') as file:
    for line in file:
        url = line.strip()
        maryland_news.append(url)

In [23]:
maryland_news

['https://www.baltimoresun.com/business/ct-biz-electric-vehicles-tax-credits-ap-20220809-xyjdmajuurh4lehqct2mek7nly-story.html',
 'https://www.baltimoresun.com/opinion/columnists/dan-rodricks/bs-ed-rodricks-0816-electric-cars-20220816-jx5tqcuzhffmrh5un3mubollri-story.html',
 'https://www.baltimoresun.com/business/ct-biz-rivian-georgia-meeting-20220420-qjctyvdlqfal5gyncifphhxpzu-story.html',
 'https://www.baltimoresun.com/news/environment/bs-md-hogan-administration-california-electric-cars-rule-20221213-ag72mp7qrfe2jb23yhym5hapqe-story.html',
 'https://www.baltimoresun.com/news/environment/bs-md-california-electric-vehicle-rule-20230313-44vamz2vczevvahuhivdluqscy-story.html']

In [17]:
def get_maryland_news(url):
    text = ''
    response = requests.get(url)
    response.raise_for_status()  # This will raise an exception if there's a HTTP error

    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('h1', class_='headline').get_text(strip=True)
    
    paragraphs = soup.find('article', class_='article-body-wrapper-custom')
    text = ' '.join(p.get_text(strip=True) for p in paragraphs.find_all('p'))
    
    date = soup.find('time')
    datetime_value = date['datetime']
    
    news_source = url.split('/')[2].split('.')[1]
    
    return title, text, datetime_value, news_source

In [18]:
dir_path = 'news/maryland/'
create_dataset('articles.csv', dir_path, maryland_news, get_maryland_news)

file_name = os.path.join(dir_path, 'articles.csv')
df = pd.read_csv(file_name)

df['state'] = 'maryland'
df.to_csv(file_name, index=False)

In [19]:
df.head()

Unnamed: 0,title,article,date,news_source,state
0,Most electric vehicles won’t qualify for feder...,"DETROIT — A tax credit of up to $7,500 could b...",2022-08-09 10:54:00,baltimoresun,maryland
1,Dan Rodricks: All charged up for an electric c...,"If you own an electric car or truck, or drive ...",2022-08-16 11:12:00,baltimoresun,maryland
2,Rivian electric car plant blasted by foes at G...,"MONROE, Ga. — Opponents trying to derail a $5 ...",2022-04-20 12:13:00,baltimoresun,maryland
3,Hogan administration won’t match California’s ...,Maryland Gov. Larry Hogan’s administration won...,2022-12-14 15:59:00,baltimoresun,maryland
4,California electric vehicle rule takes another...,Maryland took a big step Monday toward eventua...,2023-03-13 17:52:00,baltimoresun,maryland
