In [None]:
from bs4 import BeautifulSoup as bsoup
from urllib.request import Request, urlopen
import urllib.error
import csv
import requests
import pandas as pd
import numpy as np
import time
import logging

In [None]:
# Send timestamped records at INFO level and above to a log file
# (relative path, so it is created in the current working directory).
logging.basicConfig(
    level=logging.INFO,
    filename='sightings_webscrape.log',
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s - %(message)s',
)

# Root logger used by the scraping code in this notebook.
logger = logging.getLogger()

In [None]:
def get_page(url, max_attempts=100, retry_delay=15):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The request is sent with a browser-like ``User-Agent`` header. Connection
    problems and empty responses are retried; an HTTP error status is treated
    as final.

    Args:
        url: Address of the page to download.
        max_attempts: Maximum number of download attempts before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        A BeautifulSoup object built with the ``html.parser`` backend, or
        ``None`` when the server answered with an HTTP error or when every
        attempt failed or returned an empty body.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    # The log lines report how many sightings have been collected so far.
    # That table is a notebook-level variable created in a later cell, so look
    # it up defensively instead of raising NameError inside an error handler.
    collected = len(globals().get("sightings", ()))

    for attempt in range(1, max_attempts + 1):
        try:
            # The context manager closes the connection once the body is read.
            with urlopen(req) as response:
                webpage = response.read()
        except urllib.error.HTTPError as e:
            # Return code error (e.g. 404, 501, ...): retrying will not help.
            logger.info('HTTPError: {}'.format(e.code))
            logger.info(f"Errored: {collected}; {url}")
            return None
        except urllib.error.URLError as e:
            # Not an HTTP-specific error (e.g. connection refused).
            logger.info('URLError: {}'.format(e.reason))
        except Exception:
            # `Exception` rather than a bare `except` so that Ctrl-C
            # (KeyboardInterrupt) can still stop a long scrape.
            logger.info(f"Errored: {collected}; {url}")
        else:
            if len(webpage) > 0:
                # Create a beautiful soup object so we can work with the data.
                return bsoup(webpage, 'html.parser')

        # Failed attempt or empty body: pause before retrying so the server
        # is not hammered with back-to-back requests.
        if attempt < max_attempts:
            time.sleep(retry_delay)
            logger.info(f"Trying again: {collected}; {url}")

    # Every attempt failed; signal that with None instead of an empty document.
    return None

def get_iterable_data(table):
    """Lazily yield the ``getText()`` result of each element in ``table``.

    Args:
        table: Iterable of objects exposing a ``getText()`` method.

    Yields:
        The text of each element, in iteration order.
    """
    yield from (cell.getText() for cell in table)

def get_sighting_detail(url_string):
    """Fetch a sighting's detail page and return the text of its second table row.

    Args:
        url_string: Address of the detail page.

    Returns:
        * ``"Summary detail page not found."`` when the page could not be
          downloaded,
        * ``None`` when the page has no ``<tbody>`` or the table has fewer
          than two rows,
        * otherwise the text of the second ``<tr>`` inside the first
          ``<tbody>``.
    """
    detail_page = get_page(url_string)
    if detail_page is None:
        logger.info(f"No summary page: {url_string}")
        return "Summary detail page not found."

    table_body = detail_page.find("tbody")
    if table_body is None:
        return None

    rows = table_body.find_all("tr")
    # The first row is skipped and the second one is returned (presumably the
    # first holds the report header and the second the narrative - verify
    # against the site). A shorter table is reported as "no summary" instead
    # of letting StopIteration escape.
    if len(rows) < 2:
        logger.info(f"No summary row: {url_string}")
        return None
    return rows[1].getText()

def monthly_report_detail(url_string, col_names, sightings, remaining):
    """Scrape one monthly report page and append its sightings to ``sightings``.

    Args:
        url_string: Address of the monthly report page.
        col_names: Unused; kept so existing callers keep working.
        sightings: DataFrame that receives one new row per scraped sighting.
            It is modified in place; new rows are labelled ``len(sightings)``,
            ``len(sightings) + 1``, ...
        remaining: Number of leading rows on the page to skip (rows stored by
            an earlier, interrupted run).

    Returns:
        The number of rows actually appended to ``sightings``.

    Raises:
        RuntimeError: If the report page could not be downloaded.
    """
    soup = get_page(url_string)
    if soup is None:
        # Fail loudly: silently skipping a page would leave a gap in the data.
        raise RuntimeError(f"Could not download monthly report page: {url_string}")

    # Listing columns, in the order the cells are read from each row.
    listing_fields = ("Date_Time", "City", "State", "Shape", "Duration", "Summary", "Posted")

    results = soup.find_all('tr', attrs={'valign': 'TOP'})
    index = len(sightings)
    added = 0
    # Skip the rows that were already stored; a non-positive `remaining`
    # means nothing is skipped.
    for result in results[max(remaining, 0):]:
        # Fresh dict per row. Fields missing from a short row stay None, so
        # the row is still stored rather than aborting the scrape.
        record = dict.fromkeys(listing_fields)
        record.update(zip(listing_fields, get_iterable_data(result.find_all('td'))))
        record["Detail_Link"] = "http://www.nuforc.org/webreports/" + result.find("a").get("href")
        record["Detail_Summary"] = get_sighting_detail(record["Detail_Link"])

        sightings.loc[index] = record
        logger.info(f"Record added: {record['Date_Time']} - {record['City']}, {record['State']}")
        print(f"Record added: {record['City']}, {record['State']}")
        index += 1
        added += 1
    return added

In [None]:
# Column layout of the sightings table. Defined here because a later cell
# passes `col_names` to monthly_report_detail.
col_names = ["Date_Time", "City", "State", "Shape", "Duration", "Summary", "Posted", "Detail_Link", "Detail_Summary"]
file_name = "sightings.pkl"

try:
    # NOTE(security): unpickling can execute arbitrary code - only load a
    # pickle that this notebook wrote itself.
    sightings = pd.read_pickle(file_name)
    logger.info(f"Data read from {file_name}")
except FileNotFoundError:
    # First run: no saved progress yet, so start from an empty table.
    sightings = pd.DataFrame(columns=col_names)
    logger.info(f"{file_name} not found; starting with an empty table")

len(sightings)

In [None]:
# Download the report index and scrape every monthly page it links to,
# resuming after the rows that are already stored in `sightings`.
url_string = "http://www.nuforc.org/webreports/ndxevent.html"
monthly_report = get_page(url_string)
if monthly_report is None:
    # get_page returns None when the download fails; stop with a clear
    # message instead of an AttributeError on the next line.
    raise RuntimeError(f"Could not download the report index: {url_string}")
monthly_report_list = monthly_report.find_all('tr', attrs={'valign': 'TOP'})

# Resume bookkeeping: the rows already collected are taken to be the first
# `remaining` sightings in index order, so whole pages are skipped while
# `remaining` exceeds their report count.
remaining = len(sightings)
for report in monthly_report_list:
    cols = report.find_all("td")
    report_number = int(cols[1].getText())

    if remaining > report_number:
        remaining = remaining - report_number
        continue

    if remaining > 0:
        logger.info(f"Resuming: {len(sightings)}")

    monthly_report_url = "http://www.nuforc.org/webreports/" + report.find("a").get("href")
    logger.info(f"new report page: {monthly_report_url}; Sightings in page: {report_number}; Total: {len(sightings)}")
    # The column list is taken from the frame itself, so this cell does not
    # depend on a separately defined `col_names` variable.
    sightings_added = monthly_report_detail(monthly_report_url, list(sightings.columns), sightings, remaining)
    logger.info(f"added {sightings_added} sightings, total now {len(sightings)}")
    print(f"added {sightings_added} sightings, total now {len(sightings)}")

    # Save after every page so an interrupted run can pick up from here.
    sightings.to_pickle(file_name)

    # Only the first page that is scraped can be partially complete.
    remaining = 0

In [None]:
# Number of sightings currently held in the table.
sightings.shape[0]

In [None]:
# Preview the first five rows of the table.
sightings.head(n=5)

In [None]:
# How many rows carry the "page could not be downloaded" placeholder.
int(sightings["Detail_Summary"].eq("Summary detail page not found.").sum())

In [None]:
# How many rows have no detail summary at all.
int(sightings["Detail_Summary"].isna().sum())

In [None]:
# Rows whose detail summary is either null or the download-failure placeholder.
no_detail_summary_df = sightings.loc[
    sightings["Detail_Summary"].isna()
    | sightings["Detail_Summary"].eq("Summary detail page not found.")
]
len(no_detail_summary_df)

In [None]:
# Request the detail page again for each sighting without a usable summary
# and print the result. Nothing is written back into `sightings` here.
count = 0
for index, row in no_detail_summary_df.iterrows():
    url = row["Detail_Link"]
    sighting_detail = get_sighting_detail(url)
    print(f"{count}, {row.Date_Time}: {sighting_detail}; {url}")
    count = count + 1