In [51]:
## import libraries
import requests  # Makes HTTP requests to fetch web pages from URLs
from bs4 import BeautifulSoup  # Parses HTML content into navigable Python objects for web scraping
import pandas as pd  # Creates and manipulates DataFrames for organizing scraped data into tables
import time  # Adds delays between requests to avoid overwhelming the server
from random import uniform  # Generates random time intervals to make scraping delays less predictable

In [53]:
## creating a function that makes a request to the newsbank website and returns content as soup.

def makeSoup(url, timeout=30):
    """Request ``url`` and return its HTML parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 30). Without a timeout a stalled connection would
        block the notebook cell indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The parsed page when the server answers with status 200. For any
        other status the code is printed and ``None`` is returned, so
        callers should check for ``None`` before using the result.

    Notes
    -----
    A 200 status only means *a* page came back. A proxy in front of the
    site may answer 200 with its own log-in page, so verify the returned
    content (e.g. the page title) before scraping it.
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)

    # Guard clause: report the failing status and make the None return explicit.
    if response.status_code != 200:
        print(f"Your request returned {response.status_code}")
        return None

    return BeautifulSoup(response.text, "html.parser")

In [68]:
# Make the request directly (outside makeSoup) so the `response` object stays
# available in the notebook for inspection; inside makeSoup it is only a local
# variable, so `response.status_code` cannot be read from here.
# Caution: a 200 only means the server answered. The EZproxy host can answer 200
# with its log-in page instead of NewsBank results, so also check the page content.

url = "https://infoweb-newsbank-com.journalism.ezproxy.cuny.edu/apps/news/results?sort=YMD_date%3AD&p=AWNB&hide_duplicates=0&t=state%3ANY%21USA%2520-%2520New%2520York&maxresults=20&f=advanced&offset=0&val-base-0=%22boil%20water%20advisory%22&fld-base-0=Title&bln-base-1=and&val-base-1=2015-2025&fld-base-1=YMD_date"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
# timeout keeps the cell from hanging forever if the proxy never responds
response = requests.get(url, headers=headers, timeout=30)
print(response.status_code)

200


In [70]:
# `.text` holds the response body as a string (HTML for this page); as the last
# expression in the cell it is displayed in full as the cell output.
# NOTE(review): this shows the entire page source — consider slicing
# (e.g. the first few hundred characters) if only a quick look is needed.
response.text

'<!DOCTYPE html>\n<html lang="en">\n\t<head>\n\t\t<meta charset="utf-8">\n\t\t<title>Log In | Off Campus Access @ Craig Newmark Graduate School of Journalism</title>\n\t\t<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" crossorigin="anonymous" />\n\t\t<link href="public/favicon.ico" rel="shortcut icon" type="image/x-icon">\n\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t<script src="public/modal.js"></script>\n\t\t\t\t\t\n\t\t\t\t\t\n\t</head>\n\t<body>\n\t\t<main>\n\t\t\t<div class="container">\n\t\t\t\t<header class="d-flex flex-wrap justify-content-center py-3 mb-4 border-bottom">\n\t\t\t\t\t<a class="d-flex align-items-center mb-3 mb-md-0 me-md-auto link-body-emphasis text-decoration-none" href="https://www.journalism.cuny.edu/current-students/research-center/">\n\t\t\t\t\t\t<img class="bi me-4" src="public/logo.png" alt="Craig Newmark Graduate School of Journ

In [72]:
## parse the raw HTML string from the response into a BeautifulSoup tree,
## using Python's built-in "html.parser"
soup = BeautifulSoup(markup=response.text, features="html.parser")
soup  # last expression in the cell, so the parsed document is displayed

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Log In | Off Campus Access @ Craig Newmark Graduate School of Journalism</title>
<link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" rel="stylesheet"/>
<link href="public/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<script src="public/modal.js"></script>
</head>
<body>
<main>
<div class="container">
<header class="d-flex flex-wrap justify-content-center py-3 mb-4 border-bottom">
<a class="d-flex align-items-center mb-3 mb-md-0 me-md-auto link-body-emphasis text-decoration-none" href="https://www.journalism.cuny.edu/current-students/research-center/">
<img alt="Craig Newmark Graduate School of Journalism" class="bi me-4" src="public/logo.png" style="max-height:32px;"/>
<span class="fs-4"> Library</span>
</a>
<ul class="nav nav-pills">
<li class="nav-item"><a class="n

In [74]:
## prettify() returns the parsed document as a string with tags on their own
## lines and nested indentation; print() it so the line breaks actually render
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Log In | Off Campus Access @ Craig Newmark Graduate School of Journalism
  </title>
  <link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.2/dist/css/bootstrap.min.css" integrity="sha384-T3c6CoIi6uLrA9TneNEoa7RxnatzjcDSCmG1MXxSR1GAsXEV/Dwwykc2MPK8M2HN" rel="stylesheet"/>
  <link href="public/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <script src="public/modal.js">
  </script>
 </head>
 <body>
  <main>
   <div class="container">
    <header class="d-flex flex-wrap justify-content-center py-3 mb-4 border-bottom">
     <a class="d-flex align-items-center mb-3 mb-md-0 me-md-auto link-body-emphasis text-decoration-none" href="https://www.journalism.cuny.edu/current-students/research-center/">
      <img alt="Craig Newmark Graduate School of Journalism" class="bi me-4" src="public/logo.png" style="max-height:32px;"/>
      <span class="fs-4">
       Library
      </span>
  

# Things I need from the page:
1. 