# Webscraping Test

The goal of webscraping the GOSA website is to retrieve the urls needed to download relevant datasets.

The urls are located in the "DATA AVAILABLE FOR DOWNLOAD" column of the table on the GOSA Downloadable Data website.

In [1]:
# Import necessary packages (code adapted from https://towardsdatascience.com/web-scraping-scraping-table-data-1665b6b2271c)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [2]:
# Address of the GOSA "Downloadable Data" page that the notebook scrapes
url = "https://gosa.georgia.gov/dashboards-data-report-card/downloadable-data"

In [3]:
# Make a GET request to fetch the raw HTML content.
# The timeout keeps this cell from hanging indefinitely if the server never
# answers, and raise_for_status() stops the notebook on an HTTP error (4xx/5xx)
# instead of silently passing an error page on to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

In [4]:
# Build a parse tree of the whole page using the lxml parser
soup = BeautifulSoup(markup=html_content, features="lxml")
# print(soup.prettify())  # uncomment to dump the parsed HTML

In [5]:
#print(soup.prettify())

In reviewing the HTML above, it looks like the table class needed is "stacked-row-plus", and the columns needed are the first ("DATA CATEGORY") and the third ("DATA AVAILABLE FOR DOWNLOAD").

In [6]:
# Collect every <table> whose CSS class is "stacked-row-plus";
# find_all returns a list with one entry per matching table.
gdp = soup.find_all("table", class_="stacked-row-plus")
table_count = len(gdp)
print("Number of tables on site: ", table_count)

Number of tables on site:  1


In [7]:
# Work with the first matching table (index 0 of gdp)
table1 = gdp[0]
# Every <tr> in the table, header row included
body = table1.find_all("tr")
# Row 0 carries the column names; everything after it is data
head, body_rows = body[0], body[1:]

# Column names: the text of each <th>, with trailing newlines removed
headings = [th.text.rstrip("\n") for th in head.find_all("th")]
print(headings)

['DATA CATEGORY', 'DESCRIPTION', 'DATA AVAILABLE FOR DOWNLOAD']


In [8]:
# Gather the <td> cells of every data row

# One inner list per <tr>; the cells are stored as parsed tag objects, not as text
all_rows = [list(tr.find_all("td")) for tr in body_rows]

In [9]:
# Assemble the scraped cells into a frame, labelling columns with the table headings
df = pd.DataFrame(data=all_rows, columns=headings)

In [10]:
# Peek at the first five rows of the frame
df.head(n=5)

Unnamed: 0,DATA CATEGORY,DESCRIPTION,DATA AVAILABLE FOR DOWNLOAD
0,"[\n, [[ACT Scores (Highest)]], \n]","[\n, [ACT testing counts and average composite...","[\n, [ ], \n, [[2019-20], [], [2018-19], , []..."
1,[[ACT Scores (Recent)]],"[\n, [ACT testing counts and average composite...",[[2019-20]]
2,[[Advanced Placement (AP) Scores]],"[\n, [Number of students tested, number of AP ...","[\n, [ ], \n, [[2019-20], [], [2018-19], , []..."
3,[[Attendance]],"[\n, [Collected from the Student Record showin...","[\n, [ ], \n, [[2019-20], [], [2018-19], [], [..."
4,[[Certified Personnel]],[Certified Personnel data are compiled from in...,"[\n, [ ], \n, [[2019-20], [], [2018-19], [], [..."


In [11]:
# Inspect the raw download cell of the first row (label 0)
df.loc[0, 'DATA AVAILABLE FOR DOWNLOAD']

<td>
<p> </p>
<p><a href="https://download.gosa.ga.gov/2020/ACT_HIGHEST_2020_JUN_21_2021.csv">2019-20</a><br/><a href="https://download.gosa.ga.gov/2019/ACT_HIGHEST_2019_FEB_24_2020.csv">2018-19</a> <br/><a href="https://download.gosa.ga.gov/2018/ACT_HIGHEST_2018_FEB_24_2020.csv">2017-18</a><br/><a href="https://download.gosa.ga.gov/2017/ACT_HIGHEST_2017_FEB_24_2020.csv">2016-17</a><br/><a href="https://download.gosa.ga.gov/2016/ACT_HIGHEST_2016_FEB_24_2020.csv">2015-16</a><br/><a href="https://download.gosa.ga.gov/2015/ACT_HIGHEST_2015_FEB_24_2020.csv">2014-15</a><br/><a href="https://download.gosa.ga.gov/2014/ACT_HIGHEST_2014_FEB_24_2020.csv">2013-14</a><br/><a href="https://download.gosa.ga.gov/2013/ACT_HIGHEST_2013_FEB_24_2020.csv">2012-13</a><br/><a href="https://download.gosa.ga.gov/2012/ACT_HIGHEST_2012_FEB_24_2020.csv">2011-12</a><br/><a href="https://download.gosa.ga.gov/2011/ACT_HIGHEST_2011_FEB_24_2020.csv">2010-11</a></p>
<p> </p>
</td>

In [12]:
# TODO: reformat the Data Available for Download column

# Pull every <a> tag that carries an href out of the first row's download cell
first_download_cell = df['DATA AVAILABLE FOR DOWNLOAD'][0]
act_isolation = first_download_cell.find_all("a", href=True)

act_isolation

[<a href="https://download.gosa.ga.gov/2020/ACT_HIGHEST_2020_JUN_21_2021.csv">2019-20</a>,
 <a href="https://download.gosa.ga.gov/2019/ACT_HIGHEST_2019_FEB_24_2020.csv">2018-19</a>,
 <a href="https://download.gosa.ga.gov/2018/ACT_HIGHEST_2018_FEB_24_2020.csv">2017-18</a>,
 <a href="https://download.gosa.ga.gov/2017/ACT_HIGHEST_2017_FEB_24_2020.csv">2016-17</a>,
 <a href="https://download.gosa.ga.gov/2016/ACT_HIGHEST_2016_FEB_24_2020.csv">2015-16</a>,
 <a href="https://download.gosa.ga.gov/2015/ACT_HIGHEST_2015_FEB_24_2020.csv">2014-15</a>,
 <a href="https://download.gosa.ga.gov/2014/ACT_HIGHEST_2014_FEB_24_2020.csv">2013-14</a>,
 <a href="https://download.gosa.ga.gov/2013/ACT_HIGHEST_2013_FEB_24_2020.csv">2012-13</a>,
 <a href="https://download.gosa.ga.gov/2012/ACT_HIGHEST_2012_FEB_24_2020.csv">2011-12</a>,
 <a href="https://download.gosa.ga.gov/2011/ACT_HIGHEST_2011_FEB_24_2020.csv">2010-11</a>]

In [13]:
# Map each link's text (the school year) to its download URL
years = [anchor.text for anchor in act_isolation]
urls = [anchor['href'] for anchor in act_isolation]
# Pair labels with URLs by position; a repeated label keeps the last URL seen
url_dict = dict(zip(years, urls))

url_dict

{'2019-20': 'https://download.gosa.ga.gov/2020/ACT_HIGHEST_2020_JUN_21_2021.csv',
 '2018-19': 'https://download.gosa.ga.gov/2019/ACT_HIGHEST_2019_FEB_24_2020.csv',
 '2017-18': 'https://download.gosa.ga.gov/2018/ACT_HIGHEST_2018_FEB_24_2020.csv',
 '2016-17': 'https://download.gosa.ga.gov/2017/ACT_HIGHEST_2017_FEB_24_2020.csv',
 '2015-16': 'https://download.gosa.ga.gov/2016/ACT_HIGHEST_2016_FEB_24_2020.csv',
 '2014-15': 'https://download.gosa.ga.gov/2015/ACT_HIGHEST_2015_FEB_24_2020.csv',
 '2013-14': 'https://download.gosa.ga.gov/2014/ACT_HIGHEST_2014_FEB_24_2020.csv',
 '2012-13': 'https://download.gosa.ga.gov/2013/ACT_HIGHEST_2013_FEB_24_2020.csv',
 '2011-12': 'https://download.gosa.ga.gov/2012/ACT_HIGHEST_2012_FEB_24_2020.csv',
 '2010-11': 'https://download.gosa.ga.gov/2011/ACT_HIGHEST_2011_FEB_24_2020.csv'}

In [14]:
# Replace the data available for download entry with the dictionary of years and links
#df['DATA AVAILABLE FOR DOWNLOAD'][0] = url_dict

In [15]:
# Preview the first five rows again
df.head(n=5)

Unnamed: 0,DATA CATEGORY,DESCRIPTION,DATA AVAILABLE FOR DOWNLOAD
0,"[\n, [[ACT Scores (Highest)]], \n]","[\n, [ACT testing counts and average composite...","[\n, [ ], \n, [[2019-20], [], [2018-19], , []..."
1,[[ACT Scores (Recent)]],"[\n, [ACT testing counts and average composite...",[[2019-20]]
2,[[Advanced Placement (AP) Scores]],"[\n, [Number of students tested, number of AP ...","[\n, [ ], \n, [[2019-20], [], [2018-19], , []..."
3,[[Attendance]],"[\n, [Collected from the Student Record showin...","[\n, [ ], \n, [[2019-20], [], [2018-19], [], [..."
4,[[Certified Personnel]],[Certified Personnel data are compiled from in...,"[\n, [ ], \n, [[2019-20], [], [2018-19], [], [..."


In [18]:
# TODO: Reproduce the urls for each entry in the dataframe
for download_cell in df['DATA AVAILABLE FOR DOWNLOAD']:
    # Links found in this row; reassigned on every pass, so only the last row's remain
    isolation = download_cell.find_all("a", href=True)

In [19]:
# Define a function to accomplish the task of making dictionaries for years and links
def parse_year_url(df_row, frame=None):
    """Replace one row's download cell with a ``{link text: href}`` dictionary.

    The cell in the 'DATA AVAILABLE FOR DOWNLOAD' column is expected to expose
    ``find_all("a", href=True)`` (as a parsed HTML tag does). Every anchor found
    contributes one entry: its text as the key and its ``href`` as the value.
    If two anchors share the same text, the later one wins.

    Parameters
    ----------
    df_row
        Index label of the row to convert.
    frame : pandas.DataFrame, optional
        Frame to update in place. When omitted, the notebook-level ``df`` is
        used, which matches the original single-argument behaviour.

    Returns
    -------
    dict
        The dictionary that was stored in the cell. If the cell already holds
        a dictionary (the row was converted earlier), it is returned unchanged
        so re-running the cell is harmless.
    """
    if frame is None:
        frame = df

    column = 'DATA AVAILABLE FOR DOWNLOAD'
    cell = frame.at[df_row, column]

    # Already converted on a previous run: nothing left to parse.
    if isinstance(cell, dict):
        return cell

    url_dict = {anchor.text: anchor['href'] for anchor in cell.find_all("a", href=True)}

    # Write with a single .at call. Chained indexing (frame[column][df_row] = ...)
    # may assign to a temporary copy and leave the frame itself untouched.
    frame.at[df_row, column] = url_dict
    return url_dict

In [17]:
# TODO: Remove all the html from the text in the Data Category and Description columns