In [74]:
# import web driver
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC


import requests
import time
import random
import re
from seleniumrequests import Chrome
import getpass
import os
import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup

Prepare Chicago Crime Data

In [2]:
# # Preprocessing Chicago Crime Data

# # Import Chicago Crime Data : Source : https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2
# df_crime = pd.read_csv("chicago_crime.csv")

# # Convert text date to datetime
# df_crime['DATE_TIME'] = pd.to_datetime(df_crime['Date'],format="%m/%d/%Y %I:%M:%S %p", errors = 'coerce')

# # Strip hr, min, sec data so that aggregation per day is possible
# df_crime['DATE_TIME_TRUN'] = df_crime['DATE_TIME'].map(lambda x: x.strftime('%Y-%m-%d'))

# # Drop unneeded columns
# df_crime = df_crime[['ID','DATE_TIME_TRUN']].copy()

# # Just get the date and total crime count and create new df after aggregating by single day
# df_crime = df_crime.groupby(['DATE_TIME_TRUN'],as_index = False).count().copy()

# # Save cleaned data
# df_crime.to_csv('chicago_crime_cleaned_aggregated.csv',index=False)

In [3]:
# Load the pre-aggregated daily crime counts and build the helper columns in a
# single chain, so re-running the cell always starts from the CSV on disk.
df_crime = (
    pd.read_csv('chicago_crime_cleaned_aggregated.csv')
    .assign(
        # parsed timestamp version of the day string
        DATE_TIME=lambda frame: pd.to_datetime(frame['DATE_TIME_TRUN']),
        # drop the last three characters of the day string (the '-DD' part)
        YEAR_MONTH=lambda frame: frame['DATE_TIME_TRUN'].map(lambda day: day[:-3]),
    )
    # limited by weather data (string comparison on the day column)
    .loc[lambda frame: frame['DATE_TIME_TRUN'] > '2009-09-01']
    # the aggregated 'ID' column holds the number of crimes for that day
    .rename(columns={'ID': 'CRIME_COUNT'})
)

In [4]:
# Sanity check: summary statistics of the numeric columns of the filtered
# daily crime table (the displayed output covers CRIME_COUNT).
df_crime.describe()

Unnamed: 0,CRIME_COUNT
count,3592.0
mean,824.354399
std,152.031129
min,320.0
25%,716.0
50%,799.0
75%,923.0
max,1538.0


In [5]:
# Peek at the first rows to confirm the date filter and the derived
# DATE_TIME / YEAR_MONTH columns look as expected.
df_crime.head()

Unnamed: 0,DATE_TIME_TRUN,CRIME_COUNT,DATE_TIME,YEAR_MONTH
3166,2009-09-02,1105,2009-09-02,2009-09
3167,2009-09-03,1125,2009-09-03,2009-09
3168,2009-09-04,1172,2009-09-04,2009-09
3169,2009-09-05,1067,2009-09-05,2009-09
3170,2009-09-06,1105,2009-09-06,2009-09


# Weather Data Scraping

### Initiate Chrome browser

In [206]:
# Initiate Selenium Chrome driver for Mac
# NOTE(review): os.path.dirname() of a bare name without a path separator is
# '', and os.path.abspath('') resolves to the current working directory, so
# git_folder_location is the directory the notebook is run from rather than a
# folder looked up by the name 'metis_proj_2_luther' — confirm this is intended.
git_folder_location = os.path.abspath(os.path.dirname('metis_proj_2_luther'))
# The chromedriver binary is looked up directly inside that folder.
full_path_to_chromedriver = os.path.join(git_folder_location, "chromedriver")
# Chrome is the seleniumrequests class imported at the top of the notebook.
# The resulting module-level `driver` is what scrape_weather_url() drives.
driver = Chrome(executable_path = full_path_to_chromedriver)
# url = 'https://www.timeanddate.com/weather/usa/chicago/historic?month={}&year={}'.format(2,2010)
# driver.get(url)

### (Testing) scroll length

In [202]:
# url = 'https://www.timeanddate.com/weather/usa/chicago/historic?month={}&year={}'.format(2,2010)
# driver.get(url) 
# # Load Entire Page by Scrolling to charts
# SCROLL_PAUSE_TIME = 2# Scroll to Very Bottom to Load All
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight/4);") # Scroll down to bottom

### Web scraping function

In [207]:
def scrape_weather_url(url):
    """Scrape the per-day weather summaries from one monthly weather page.

    Opens ``url`` in the module-level Selenium ``driver``, then clicks each
    day link inside the ``weatherLinks`` block in turn and reads the text of
    the detail panel shown after every click.

    Parameters
    ----------
    url : str
        Address of the monthly historic-weather page to open.

    Returns
    -------
    tuple of five lists of str
        ``(high_low, weather_desc, humidity_barometer, wind, date_time)``.
        Each list holds one raw text entry per day link, in the reverse of
        the order in which the links were clicked (last clicked comes first).

    Raises
    ------
    IndexError
        If the loaded page contains no element with class ``weatherLinks``.
    selenium.common.exceptions.TimeoutException
        If a day link does not become clickable within 10 seconds.
    selenium.common.exceptions.NoSuchElementException
        If one of the detail panel elements is missing after a click.
    """
    day_link_xpath = "//div[@class='weatherLinks']/a[{}]"

    # open url
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    days_chain = [x.find_all('a') for x in soup.find_all(class_='weatherLinks')]
    if not days_chain:
        # Fail with a readable message instead of a bare "list index out of range".
        raise IndexError("no element with class 'weatherLinks' found on {}".format(url))
    time.sleep(5)

    # Scroll part-way down the page (scrollHeight / 3.5, not the very bottom);
    # presumably this brings the charts / day links into view — TODO confirm.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3.5);")

    # weather data holder
    high_low, weather_desc, humidity_barometer, wind, date_time = [], [], [], [], []

    # Explicit waits try to solve the loading issue.
    # Refer : https://selenium-python.readthedocs.io/waits.html
    wait = WebDriverWait(driver, 10)
    for ix in range(len(days_chain[0])):
        is_first_link = ix == 0
        day_link_locator = (By.XPATH, day_link_xpath.format(ix + 1))

        # The first link uses a shorter (5 s) presence timeout; later links use 10 s.
        presence_wait = WebDriverWait(driver, 5) if is_first_link else wait
        try:
            presence_wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'weatherLinks')))
        except TimeoutException:
            # Best effort: report and continue; the clickable wait below
            # still raises if the link never shows up.
            print("Loading took too much time!" )

        if is_first_link:
            # Extra settle time before the very first click on a fresh page.
            time.sleep(4)
        # Click the handle returned by the wait itself, so a handle fetched
        # before the wait cannot have gone stale by the time it is clicked.
        day_link = wait.until(EC.element_to_be_clickable(day_link_locator))
        day_link.click()
        if is_first_link:
            time.sleep(3)

        # Give the detail panel time to refresh after the click.
        time.sleep(2.5)

        # Selenium locating element document : https://selenium-python.readthedocs.io/locating-elements.html#locating-elements
        # find_element(By.XPATH, ...) replaces the find_element_by_xpath helpers,
        # which were removed in Selenium 4.3.
        # find_elements (plural) returns a list; the last match is the one used.
        high_low.append(driver.find_elements(By.XPATH, "//div[@class='temp']")[-1].text)
        weather_desc.append(driver.find_element(By.XPATH, "//div[@class='wdesc']").text)
        humidity_barometer.append(driver.find_element(By.XPATH, "//div[@class='mid__block']").text)
        wind.append(driver.find_element(By.XPATH, "//div[@class='right__block']").text)
        date_time.append(driver.find_elements(By.XPATH, "//div[@class='date']")[-1].text)

    # Results are returned with the most recently clicked day first; one
    # reverse at the end replaces a front-insert on every iteration.
    for series in (high_low, weather_desc, humidity_barometer, wind, date_time):
        series.reverse()
    return high_low, weather_desc, humidity_barometer, wind, date_time

 

### Remove month & year already scraped. Then start scraping

In [208]:
# all of month & year combo that need to be scraped
year_month_list = list(df_crime['YEAR_MONTH'].unique())

# already scraped (list from what's already saved in the folder)
# Saved files are recognised by name (df_weather_<month>_<year>.csv) rather
# than by position: os.listdir() returns entries in arbitrary order, so
# dropping "the first entry" does not reliably skip a hidden file.
weather_dir = os.path.join(git_folder_location, 'weather_data')
# Create the folder up front so the save cannot fail after a slow scrape.
os.makedirs(weather_dir, exist_ok=True)
weather_file_pattern = re.compile(r'df_weather_(\d{1,2})_(\d{4})\.csv')
done_matches = [weather_file_pattern.fullmatch(name) for name in os.listdir(weather_dir)]
done_list = [match.group(0) for match in done_matches if match]
# 'YYYY-MM' keys (month zero-padded) so they compare against YEAR_MONTH values
done_list_processed = ['{}-{}'.format(match.group(2), match.group(1).zfill(2))
                       for match in done_matches if match]

# remove done items from list of month & year combos that need to be scraped
# (membership filter: a saved month that is not in the crime date range is
# ignored instead of raising ValueError from list.remove)
done_months = set(done_list_processed)
year_month_list_done_removed = [ym for ym in year_month_list if ym not in done_months]

# define initial empty dataframe
df_weather = pd.DataFrame({'DATE_TIME':[], 'HIGH_LOW':[], 'WEATHER_DESC':[],'HUMIDITY_BAROMETER':[],'WIND':[]})
monthly_frames = []

# iterate through each year & month combo to scrape
for date in year_month_list_done_removed:
    month = int(date[5:])
    year = date[:4]
    url = 'https://www.timeanddate.com/weather/usa/chicago/historic?month={}&year={}'.format(month,year)
    high_low, weather_desc, humidity_barometer, wind, date_time = scrape_weather_url(url)
    df_weather_holder = pd.DataFrame({'DATE_TIME':date_time, 'HIGH_LOW':high_low, 'WEATHER_DESC':weather_desc,'HUMIDITY_BAROMETER':humidity_barometer,'WIND':wind})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    monthly_frames.append(df_weather_holder)
    df_weather = pd.concat(monthly_frames)
    # NOTE(review): df_weather holds every month scraped so far in this run, so
    # each file contains the cumulative data, not only its own month — this
    # matches the previous behaviour; confirm it is intended.
    df_weather.to_csv(os.path.join(weather_dir, 'df_weather_{}_{}.csv'.format(month,year)),index=False)
    

KeyboardInterrupt: 

### (Troubleshooting) Compare saved & actual weather data

In [210]:
# read_index = 0

# # all of month & year combo that need to be scraped
# year_month_list = list(df_crime['YEAR_MONTH'].unique())
# # already scraped (list from what's already saved in the folder)
# done_list = os.listdir(git_folder_location+'/weather_data/')[1:]
# done_list_processed = [x[11:][:-4].split('_') for x in done_list]
# for date in done_list_processed:
#     if len(date[0])<2:
#         date[0]='0'+date[0]
# done_list_processed = [x[1]+'-'+x[0] for x in done_list_processed]
# # remove done items from list of month & year combos that need to be scraped
# year_month_list_done_removed = year_month_list.copy()
# for done_item in done_list_processed:
#     year_month_list_done_removed.remove(done_item)
# # Open weather site for that month
# read_month = [x[11:][:-4].split('_') for x in done_list][read_index][0]
# read_year = [x[11:][:-4].split('_') for x in done_list][read_index][1]
# url = 'https://www.timeanddate.com/weather/usa/chicago/historic?month={}&year={}'.format(read_month,read_year)
# git_folder_location = os.path.abspath(os.path.dirname('metis_proj_2_luther'))
# full_path_to_chromedriver = os.path.join(git_folder_location, "chromedriver")
# driver = Chrome(executable_path = full_path_to_chromedriver)
# driver.get(url) 
# # Load Entire Page by Scrolling to charts
# SCROLL_PAUSE_TIME = 2# Scroll to Very Bottom to Load All
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight/4);") # Scroll down to bottom
# # Load csv file in panda.
# pd.read_csv(os.getcwd()+'/weather_data/'+done_list[read_index])

### Find Next Sibling
soup.find('a').findNextSibling()
### Return a list of all matches
soup.find_all('a')  
[link for link in soup.find_all('a') if 'joelcoen' in str(link)]
### Retrieve the url from an anchor tag
soup.find('a')['href']
### Find all based on id or class
soup.find_all(id='top_links')  
soup.find_all(class_='mp_box_content')
### Beautiful Soup - Chaining Finds
chain = [x.find_all('td') for x in soup.find_all(class_='mp_box_content')]  

To extract just the value of interest:   
soup.find(class_='mp_box_content').find_all('td')[1].text