# Part 1: Data Acquisition

In this part, I will acquire the data by web crawling.
I will crawl the **National Centers for Environmental Information** site - "https://www.ncdc.noaa.gov/stormevents/choosedates.jsp?statefips=-999,ALL"

Import necessary packages

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

In [None]:
# Fetch a page and return its HTML as a pretty-printed string.
def getHtml(url):
    """Download *url* and return its markup re-serialised by BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``prettify()`` output of the parsed page, i.e. a string and not
        the ``BeautifulSoup`` object itself.
    """
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser").prettify()

In [None]:
# Return the table at position numOfTable in a parsed HTML document.
def getTable(html, numOfTable):
    """Return the ``numOfTable``-th ``<table>`` element of *html*.

    Args:
        html: Parsed document (or tag) exposing ``find_all``.
        numOfTable: Zero-based index of the wanted table.

    Returns:
        The matching table element, or ``None`` (after printing a message)
        when the document has no table at that index.
    """
    tables = html.find_all('table')
    try:
        return tables[numOfTable]
    except IndexError:
        # A page with too few tables is reported and signalled with None so
        # the caller can decide to skip it.
        print("An exception occurred: table " + str(numOfTable) + " not found")
        return None

In [None]:
# Return every row of a table.
def getRows(table):
    """Return all ``<tr>`` elements found inside *table*."""
    return table.find_all('tr')

In [None]:
# Flatten the two-cell rows of a table into a label/value list.
def getCell(rows):
    """Return the text of every row that has exactly two ``<td>`` cells.

    Args:
        rows: Iterable of row elements exposing ``find_all``.

    Returns:
        A flat list ``[first, second, first, second, ...]`` holding the text
        of both cells of each qualifying row, in row order. Rows with any
        other number of cells are ignored.
    """
    flat = []
    for tr in rows:
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        flat.extend(td.get_text() for td in tds)
    return flat

In [None]:
# Turn relative links into absolute Storm Events URLs, removing extra spaces.
def fixUrls(links):
    """Prefix each link with the Storm Events base address.

    Args:
        links: Iterable of relative link strings.

    Returns:
        A list of absolute URLs in the same order as *links*.
    """
    base = "https://www.ncdc.noaa.gov/stormevents/"
    # Strip the link itself: stripping after concatenation would leave any
    # leading whitespace of the link embedded in the middle of the URL.
    return [base + link.strip() for link in links]

On the **National Centers for Environmental Information** site, tornado storms are divided into time periods. Each time period has its own URL (main_url) with a different ID, and the HTML page behind each ID contains a table of URLs (secondary_url), one for each tornado storm in that time period.

In [None]:
# Collect the absolute URLs linked from the table at index 1 of one page.
def getAllUrl(main_url):
    """Return every link found in the table at index 1 of *main_url*.

    Args:
        main_url: Address of the page to download.

    Returns:
        A list of absolute URLs built by ``fixUrls``. The list is empty when
        the page has no table at index 1.
    """
    response = requests.get(main_url)
    soup = BeautifulSoup(response.content, "html.parser")
    main_table = getTable(soup, 1)
    if main_table is None:
        return []
    links = []
    for anchor in main_table.find_all('a'):
        href = anchor.get('href')
        # Skip placeholder anchors ("#") and anchors without an href, which
        # would otherwise turn into a bogus ".../None" address.
        if href is None or href == "#":
            continue
        links.append(str(href))
    return fixUrls(links)

In [None]:
# Append the years startYear .. endYear - 1 to yearsList.
def getYearsList(yearsList, startYear, endYear):
    """Extend *yearsList* in place with a run of consecutive years.

    Args:
        yearsList: List that receives the years.
        startYear: First year added.
        endYear: Stop value; it is not added itself.

    Returns:
        The same ``yearsList`` object.
    """
    yearsList.extend(range(startYear, endYear))
    return yearsList

In [None]:
# Append the months startMonth .. endMonth - 1 to monthsList.
def getMonthsList(monthsList, startMonth, endMonth):
    """Extend *monthsList* in place with a run of consecutive month numbers.

    Args:
        monthsList: List that receives the months.
        startMonth: First month added.
        endMonth: Stop value; it is not added itself.

    Returns:
        The same ``monthsList`` object.
    """
    monthsList.extend(range(startMonth, endMonth))
    return monthsList

In [None]:
# Build one tornado search URL per (year, month) pair.
def getMainUrlBetween2010To2021(yearsList, monthsList):
    """Return the search-page URLs for every year/month combination.

    Each URL covers the window from the 1st of ``month`` to the 1st of the
    following month. Month numbers are zero-padded to two digits.

    Args:
        yearsList: Iterable of integer years.
        monthsList: Iterable of integer months (1-12).

    Returns:
        A list of URLs ordered by year, then by month.
    """
    template = (
        "https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=%28C%29+Tornado"
        "&beginDate_mm={bm:02d}&beginDate_dd=01&beginDate_yyyy={by}"
        "&endDate_mm={em:02d}&endDate_dd=01&endDate_yyyy={ey}"
        "&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=-999%2CALL"
    )
    list_url = []
    for year in yearsList:
        for month in monthsList:
            if month == 12:
                # December ends on 1 January of the next year, not month 13.
                end_month, end_year = 1, year + 1
            else:
                end_month, end_year = month + 1, year
            list_url.append(template.format(bm=month, by=year, em=end_month, ey=end_year))
    return list_url

In [None]:
# Build one tornado search URL per (year, month) pair for the 2022 range.
def getMainUrlIn2022(yearsList, monthsList):
    """Return the search-page URLs for every year/month combination.

    Each URL covers the window from the 1st of ``month`` to the 1st of the
    following month. Month numbers are zero-padded to two digits, so
    two-digit months stay valid instead of becoming ``010``, ``011``, ...

    Args:
        yearsList: Iterable of integer years.
        monthsList: Iterable of integer months (1-12).

    Returns:
        A list of URLs ordered by year, then by month.
    """
    template = (
        "https://www.ncdc.noaa.gov/stormevents/listevents.jsp?eventType=%28C%29+Tornado"
        "&beginDate_mm={bm:02d}&beginDate_dd=01&beginDate_yyyy={by}"
        "&endDate_mm={em:02d}&endDate_dd=01&endDate_yyyy={ey}"
        "&hailfilter=0.00&tornfilter=0&windfilter=000&sort=DT&submitbutton=Search&statefips=-999%2CALL"
    )
    list_url = []
    for year in yearsList:
        for month in monthsList:
            if month == 12:
                # December ends on 1 January of the next year, not month 13.
                end_month, end_year = 1, year + 1
            else:
                end_month, end_year = month + 1, year
            list_url.append(template.format(bm=month, by=year, em=end_month, ey=end_year))
    return list_url


In [None]:
# Collect the secondary (per-storm) URLs of every search page, 2010 to 2022.
def getSecondaryUrls2010to2022(yearsList, monthsList, yearsList2022, monthsList2022):
    """Return the links found on all monthly search pages.

    Args:
        yearsList: Years passed to ``getMainUrlBetween2010To2021``.
        monthsList: Months passed to ``getMainUrlBetween2010To2021``.
        yearsList2022: Years passed to ``getMainUrlIn2022``.
        monthsList2022: Months passed to ``getMainUrlIn2022``.

    Returns:
        A list of secondary URLs in first-seen order, without repeats.
    """
    mainUrls = getMainUrlBetween2010To2021(yearsList, monthsList)
    mainUrls.extend(getMainUrlIn2022(yearsList2022, monthsList2022))
    # Consecutive search windows share their boundary date (one ends on the
    # 1st on which the next begins), so the same link can be listed by two
    # searches. A dict keeps insertion order while dropping the repeats.
    unique_urls = {}
    for url in mainUrls:
        unique_urls.update(dict.fromkeys(getAllUrl(url)))
    return list(unique_urls)

In [None]:
# Pick the requested fields out of a flat label/value list.
def getData(listOfCells, nameOfcells):
    """Return the value of each requested label, in the order requested.

    Args:
        listOfCells: Flat list ``[label, value, label, value, ...]``.
        nameOfcells: Labels to look up.

    Returns:
        A list with exactly one entry per label in *nameOfcells*: the value
        that follows the label in *listOfCells*, or NaN when the label is
        absent. If a label occurs more than once, its first value is used
        so the result always stays aligned with *nameOfcells*.
    """
    missing = float("nan")
    values_by_label = {}
    # Even positions hold labels, odd positions the matching values.
    for label, value in zip(listOfCells[0::2], listOfCells[1::2]):
        values_by_label.setdefault(label, value)
    return [values_by_label.get(name, missing) for name in nameOfcells]

The site uses different table layouts for tornado storms, so to keep the columns aligned I insert NaN wherever a field is missing.

In [None]:
# Insert NaN at the position of every requested field that is missing.
def addNaN(dataList, listOfCells, lenNameOfCells, nameOfcells):
    """Pad *dataList* in place so it lines up with *nameOfcells*.

    Args:
        dataList: Values found so far, ordered like the labels that were
            present.
        listOfCells: Flat list ``[label, value, label, value, ...]``.
        lenNameOfCells: Number of entries of *nameOfcells* to check.
        nameOfcells: Expected labels.

    Returns:
        The same ``dataList`` object, with NaN inserted at the index of
        every expected label that is not a label in *listOfCells*.
    """
    # Only even positions are labels; looking at values as well could hide
    # a missing field whose name happens to appear as some other value.
    labels = set(listOfCells[0::2])
    for position in range(lenNameOfCells):
        if nameOfcells[position] not in labels:
            dataList.insert(position, float("nan"))
    return dataList

In [None]:
# Scrape every storm page and append the collected fields to TornadoTable.csv.
def createDataFrame(secondaryUrls, nameOfcells):
    """Download each URL, extract the requested fields and write them to CSV.

    For every URL the table at index 0 is read, its two-cell rows are turned
    into label/value pairs and the values of *nameOfcells* are taken in
    order. Pages that cannot be downloaded, or that have no table, are
    reported and skipped.

    Args:
        secondaryUrls: Iterable of page URLs to scrape.
        nameOfcells: The seven labels to extract, in the order of the output
            columns.

    Returns:
        None. The rows are appended to ``TornadoTable.csv`` in the order the
        URLs were visited.
    """
    columns = ['Scale', 'Length', 'Width', 'Country', 'Begin Date', 'Begin Lat/Lon', 'Deaths Direct/Indirect']
    numOfTable = 0
    records = []
    for url in secondaryUrls:
        print(url)
        try:
            response = requests.get(url)
        except requests.RequestException:
            print("An exception occurred: " + url)
            response = None
        # Pause between requests whether or not the download succeeded.
        sleep(1)
        if response is None:
            # Nothing was downloaded, so there is no row to record.
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        table = getTable(soup, numOfTable)
        if table is None:
            continue
        dataList = getData(getCell(getRows(table)), nameOfcells)
        # Appending keeps the rows in the same order as the URLs.
        records.append(dataList[:len(columns)])

    df = pd.DataFrame(records, columns=columns)
    # NOTE: mode='a' appends, so running this again adds another header row
    # to an existing file.
    df.to_csv('TornadoTable.csv', mode='a', index=False, header=True)

# Main

In [None]:
 # Labels to extract from each storm page, in output-column order.
 nameOfCells = ['-- Scale', '-- Length', '-- Width', 'State', 'Begin Date', 'Begin Lat/Lon', 'Deaths Direct/Indirect']
 # The stop values are exclusive: this yields years 2010..2021 and months 1..11.
 # NOTE(review): month 12 is never passed, so no search window starts in
 # December - confirm this is intended.
 yearsList = getYearsList(list(), 2010, 2022)
 monthsList = getMonthsList(list(), 1, 12)
 # 2022 is handled separately with its own, shorter list of months.
 yearsList2022 = [2022]
 monthsList2022 = [1, 2, 3, 4, 5]
 secondaryUrls = getSecondaryUrls2010to2022(yearsList, monthsList, yearsList2022, monthsList2022)
 createDataFrame(secondaryUrls, nameOfCells)