<a href="https://colab.research.google.com/github/shaarialwi/Data-analyst/blob/main/Web_Scraping_for_Crime_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# URL of the single Chicago Sun-Times crime article that the single-page
# walkthrough fetches with requests.get and parses.
crime_news_url = "https://chicago.suntimes.com/crime/2021/3/19/22341321/4-year-old-boy-shot-washington-park"

In [None]:
from bs4 import BeautifulSoup # The library to parse HTML code 
import requests # The library to get the content of the website via the URL 

In [None]:
# Fetch the article page, waiting at most 5 seconds for the server to respond
page_response = requests.get(crime_news_url, timeout=5)

# Fail fast on 4xx/5xx responses so an error page is never parsed
# as if it were the article (raises requests.HTTPError)
page_response.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser
page_content = BeautifulSoup(page_response.content, "html.parser")

In [None]:
# Print the parsed document as text (the complete page HTML) for manual inspection
print(page_content)


<!DOCTYPE html>

<html lang="en">
<head>
<title>Washington Park shooting: 4-year-old boy shot on Michigan - Chicago Sun-Times</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="Chicago Sun-Times" name="apple-mobile-web-app-title">
<meta content="Sun-Times Wire" property="author">
<meta content="2021-03-19T21:39:59-05:00" property="article:published_time">
<meta content="2021-03-19T21:39:59-05:00" property="article:modified_time"/>
<link href="https://use.typekit.net" rel="preconnect"/>
<meta data-cdata='{"version":"8a470034e039723c0baf0727d836d6da","fonts_catalog":[{"slug":"cst-farnham-headline","family":"farnham-headline","weight":"400","style":"italic","woff2_url":"","woff_url":""},{"slug":"cst-farnham-headline","family":"farnham-headline","weight":"400","style":"normal","woff2_url":"","woff_url":""},{

In [None]:
titles = page_content.select('h1') # Query all h1 (element) from the page content

# Take the first <h1> as the article title; strip() removes the whitespace
# before and after the text.
# Note: titles[0] raises IndexError if the page contains no <h1>.
title = titles[0].text.strip()
print(title)

4-year-old boy shot in Washington Park


In [None]:
# Every <picture> element with the CSS class "c-picture", in document order
images = page_content.find_all("picture", class_="c-picture")

# Work with the first matching <picture>
image = images[0]

# All <img> tags nested inside that <picture>
img_tags = image.select('img')

# The first <img> carries the picture we want
img_tag = img_tags[0]

# Its src attribute is the image URL
image_path = img_tag['src']
print(image_path)

https://cdn.vox-cdn.com/thumbor/n1_u-EVGTNqt13oE9wzsi4RK3BA=/0x0:1087x725/1200x800/filters:focal(458x277:630x449)/cdn.vox-cdn.com/uploads/chorus_image/image/68997332/Evidence_Markers.0.jpg


In [None]:
# Scrape the article body: look for <div> elements whose class attribute
# is "c-entry-content piano-paywall-hide"
content_raw = page_content.find_all(
    "div",
    {"class": "c-entry-content piano-paywall-hide"}
)

# All <p> paragraphs inside the first matching div
paragraphs = content_raw[0].select('p')

# Join the paragraph texts with a single space. Plain concatenation glued the
# last sentence of one paragraph to the first of the next ("South Side.The boy ...").
# Paragraphs that are empty after stripping are skipped to avoid double spaces.
paragraph_texts = (p.text.strip() for p in paragraphs)
content = " ".join(text for text in paragraph_texts if text)
print(content)

p.text.strip() : A 4-year-old boy was shot Friday afternoon in Washington Park on the South Side.
content : A 4-year-old boy was shot Friday afternoon in Washington Park on the South Side.

p.text.strip() : The boy was riding in a vehicle with his family about 4 p.m. when someone in another vehicle pulled alongside and opened fire in the 6100 block of South Michigan Avenue, Chicago police said.
content : A 4-year-old boy was shot Friday afternoon in Washington Park on the South Side.The boy was riding in a vehicle with his family about 4 p.m. when someone in another vehicle pulled alongside and opened fire in the 6100 block of South Michigan Avenue, Chicago police said.

p.text.strip() : The boy was struck in the chin and taken to Comer Children’s Hospital in good condition, according to Deputy Chief Yolanda Talley.
content : A 4-year-old boy was shot Friday afternoon in Washington Park on the South Side.The boy was riding in a vehicle with his family about 4 p.m. when someone in anoth

In [None]:
# Record for the single scraped article: one plain value per field
news = {"title": title, "img_path": image_path, "content": content}
print(news)

# To load the record into a pandas DataFrame, each value is wrapped in a
# one-element list, so every key becomes a column holding a single row.
news_df = {field: [value] for field, value in news.items()}


{'title': '4-year-old boy shot in Washington Park', 'img_path': 'https://cdn.vox-cdn.com/thumbor/n1_u-EVGTNqt13oE9wzsi4RK3BA=/0x0:1087x725/1200x800/filters:focal(458x277:630x449)/cdn.vox-cdn.com/uploads/chorus_image/image/68997332/Evidence_Markers.0.jpg', 'content': 'A 4-year-old boy was shot Friday afternoon in Washington Park on the South Side.The boy was riding in a vehicle with his family about 4 p.m. when someone in another vehicle pulled alongside and opened fire in the 6100 block of South Michigan Avenue, Chicago police said.The boy was struck in the chin and taken to Comer Children’s Hospital in good condition, according to Deputy Chief Yolanda Talley.No one is in custody as Area Two detectives investigate.The shooting happened hours before a 10-year-old boy was wounded in a triple shooting in East Garfield Park.'}


In [None]:
import pandas as pd
import pandas_gbq
from google.colab import auth

# Run Colab's user authentication flow. The BigQuery upload done through
# pandas_gbq presumably relies on the credentials this sets up — confirm.
auth.authenticate_user()

In [None]:
# Build a one-row DataFrame from the dict of single-element columns
df = pd.DataFrame(news_df)
df.head()

Unnamed: 0,title,img_path,content
0,4-year-old boy shot in Washington Park,https://cdn.vox-cdn.com/thumbor/n1_u-EVGTNqt13...,A 4-year-old boy was shot Friday afternoon in ...


In [None]:
# Destination of the BigQuery upload: dataset and table_name are combined as
# f"{dataset}.{table_name}" and passed, together with project_id, to pandas_gbq.to_gbq.
project_id = "theleadio"
dataset = "DS360Dataset"
table_name = "CrimeNews"

In [None]:
# Push the table into Google BigQuery Table
pandas_gbq.to_gbq(
        df, 
        f"{dataset}.{table_name}", 
        project_id=project_id,
        if_exists='append'
      )

1it [00:03,  3.42s/it]


In [None]:
# Fetch the crime section index page, waiting at most 5 seconds for the server
url = "https://chicago.suntimes.com/crime/"
page_response = requests.get(url, timeout=5)

# Fail fast on 4xx/5xx responses so an error page is never parsed
# as if it were the index (raises requests.HTTPError)
page_response.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser
page_content = BeautifulSoup(page_response.content, "html.parser")

In [None]:
def scrape_article(article_url):
    """Fetch one article page and extract its title, lead image URL and body text.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        A dict with the keys "title", "img_path" and "content". A field is an
        empty string when the matching element is not found on the page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(article_url, timeout=5)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Title: text of the first <h1>, with surrounding whitespace removed
    headings = soup.select('h1')
    article_title = headings[0].text.strip() if headings else ""

    # Image: src of the first <img> inside the first <picture class="c-picture">
    article_image = ""
    pictures = soup.find_all("picture", {"class": "c-picture"})
    if pictures:
        picture_imgs = pictures[0].select('img')
        if picture_imgs:
            article_image = picture_imgs[0].get('src', "")

    # Body: paragraphs of the first content div, joined with a space so that
    # sentences from adjacent paragraphs do not run together
    article_text = ""
    bodies = soup.find_all(
        "div",
        {"class": "c-entry-content piano-paywall-hide"}
    )
    if bodies:
        paragraph_texts = (p.text.strip() for p in bodies[0].select('p'))
        article_text = " ".join(text for text in paragraph_texts if text)

    return {
        "title": article_title,
        "img_path": article_image,
        "content": article_text,
    }


# Each of these divs is one entry box on the index page
news_list = page_content.find_all("div", {"class": "c-entry-box--compact__body"})

# Collect all complete articles first, then upload them in one batch
records = []
for entry in news_list:
    # The article link is the <a href=...> inside the entry's <h2> headline
    link = entry.select_one("h2 a[href]")
    if link is None:
        # Entry without a headline link: nothing to follow
        continue

    try:
        record = scrape_article(link["href"])
    except requests.RequestException as err:
        # One unreachable article should not abort the whole run
        print(f"Skipping {link['href']}: {err}")
        continue

    # Keep only articles where title, image and content were all found
    if all(record.values()):
        records.append(record)

# Build the DataFrame once from all records instead of one frame per article
df = pd.DataFrame(records, columns=["title", "img_path", "content"])

# Push the rows into the Google BigQuery table with a single append
if not df.empty:
    pandas_gbq.to_gbq(
        df,
        f"{dataset}.{table_name}",
        project_id=project_id,
        if_exists='append'
    )