In [1]:
import copy
import datetime as dt
import os
import re
import urllib.parse
import urllib.request
from time import sleep
from time import time
from warnings import warn

import numpy as np
import pandas as pd
import pyodbc
import requests
from bs4 import BeautifulSoup as bs
from bs4 import SoupStrainer as ss
from sqlalchemy import create_engine

### Values used to build the URLs that will be scraped

In [2]:
# Root of the bid-tabulation pages; page names are appended to this prefix.
base_url = 'http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/'

# Page-name prefixes: state construction, local letting, state maintenance.
project_type = ['sc', 'll', 'sm']

# Every year from 2015 through the current calendar year, as strings.
year = list(map(str, range(2015, dt.datetime.today().year + 1)))

# Month numbers 1-12 as strings (not zero-padded here).
month = list(map(str, range(1, 13)))

In [3]:
# Same root URL as defined in the previous cell (repeated so this cell is self-contained).
base_url = 'http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/'

# HTTP headers sent with the index-page requests.
headers = {
    "Accept-Language": "en-US, en;q=0.5",
}

# Shared, module-level accumulators:
#   tot_data    - scraped tables, one entry per tab page (filled by web_scraper)
#   local_table - (column title, values) pairs (filled by create_df)
local_table, tot_data = [], []

# Date stamp (MM/DD/YYYY) taken once, when this cell runs.
curr_date = dt.datetime.now().strftime('%m/%d/%Y')

# Connecting to Azure SQL Server

In [4]:
# Database connection settings.
#
# Credentials must never be stored in the notebook. Every setting can be
# overridden through an environment variable; the non-secret ones fall back to
# the values this notebook has always used. The password has NO default:
# export TXDOT_SQL_PWD before starting the kernel.
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed in the notebook's history and should be rotated.
SERVER = os.environ.get('TXDOT_SQL_SERVER', 'tx-dot-dev-sqlsvr.database.windows.net')
DATABASE = os.environ.get('TXDOT_SQL_DATABASE', 'TX-DOT-DEV-DB')
USERNAME = os.environ.get('TXDOT_SQL_USERNAME', 'sysadmin')
PWD = os.environ.get('TXDOT_SQL_PWD')
if PWD is None:
    warn('TXDOT_SQL_PWD is not set; the database connection will fail to authenticate')
    # Keep PWD a string so the connection-string concatenation still works.
    PWD = ''
TABLE = "Raw_Data_Retrieval"

# ODBC driver name, in the braces the ODBC connection-string syntax expects.
driver = '{ODBC Driver 17 for SQL Server}'

In [5]:
# ODBC connection string, assembled as ';'-separated KEY=value pairs in the
# order DRIVER, SERVER, PORT, DATABASE, UID, PWD.
odbc_fields = [
    ('DRIVER', driver),
    ('SERVER', SERVER),
    ('PORT', '1433'),
    ('DATABASE', DATABASE),
    ('UID', USERNAME),
    ('PWD', PWD),
]
connection_string = ';'.join(key + '=' + value for key, value in odbc_fields)

# SQLAlchemy takes the raw ODBC string URL-encoded in the odbc_connect parameter.
params = urllib.parse.quote_plus(connection_string)
engine = create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)

# Scrapes All Data From Webpage

- Loops through all possible urls
- Checks that the Tab Page exists
- If so adds all the data in the table into tot_data

In [9]:
def _fetch_page(session, url, timeout):
    """Return the response for ``url``, or None (after a warning) on a network error."""
    try:
        return session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        warn('Request: {}; Error: {}'.format(url, exc))
        return None


def web_scraper(timeout=30):
    """Scrape every reachable bid-tab page into the module-level ``tot_data``.

    For each combination of ``project_type``, ``year`` and ``month`` the
    monthly index page ``<base_url><type><year><MM>.htm`` is requested. For
    every data row of the index page's second table, a tab-page id is built
    from the first two cells (slashes removed from the first) and that page is
    fetched. The first two tables of each tab page that is not titled
    'Page Not Found' are appended to ``tot_data``.

    Both ``tot_data`` and ``local_table`` are emptied first, so each call
    starts from a clean state.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).
        Without a timeout a stalled connection would block the run forever.

    Returns
    -------
    None
        Results are accumulated in ``tot_data``. Progress is printed, and
        failed requests are reported through ``warnings.warn``.
    """
    tot_data[:] = []
    local_table[:] = []
    num_req = 0  # number of tab pages successfully scraped so far

    # One session reuses the connection and applies the headers to every request.
    with requests.Session() as session:
        session.headers.update(headers)
        for proj_type in project_type:
            for yr in year:
                for mnth in month:
                    # Months are zero-padded to two digits in the page name.
                    index_url = base_url + proj_type + yr + mnth.zfill(2) + '.htm'
                    response = _fetch_page(session, index_url, timeout)
                    if response is None:
                        continue
                    if response.status_code != 200:
                        warn('Request: {}; Status code: {}'.format(index_url, response.status_code))
                        continue

                    index_tables = bs(response.text, 'lxml', parse_only=ss('table')).findAll('table')
                    # The project listing is the second table; skip pages without one.
                    if len(index_tables) < 2:
                        continue

                    # The first row of the listing is skipped (not a data row).
                    for row in index_tables[1].findAll('tr')[1:]:
                        cells = row.findAll('td')
                        if len(cells) < 2:
                            continue  # not enough cells to build a page id
                        web_id = cells[0].get_text().strip().replace('/', '') + cells[1].get_text().strip()
                        tab_url = base_url + web_id + '.htm'

                        tab_response = _fetch_page(session, tab_url, timeout)
                        if tab_response is None:
                            continue
                        temp_scrape = bs(tab_response.text, 'lxml', parse_only=ss(['table', 'title']))
                        title = temp_scrape.find('title')
                        if title is not None and title.get_text() == 'Page Not Found':
                            continue

                        tab_tables = temp_scrape.findAll('table')[0:2]
                        if len(tab_tables) < 2:
                            # create_df needs both tables; an incomplete page is unusable.
                            warn('Request: {}; expected 2 tables, found {}'.format(tab_url, len(tab_tables)))
                            continue

                        num_req += 1
                        print(str(num_req) + ": " + tab_url)
                        tot_data.append(tab_tables)

# Conversion to DataFrame

- Goes through all the scraped data and adds it to local_table under the correct columns
- Goes through both main url data and data from the Tabs url

In [16]:
def _parse_bid_page(page, num_info):
    """Split one scraped tab page into (info_values, estimate, bids, winning_bidder).

    ``page`` is the list of tables stored by the scraper. Returns None when the
    page does not have the layout the column mapping relies on.
    """
    if len(page) < 2:
        return None

    # First table: labels sit in the odd cells, their values in cells 2, 4, 6, ...
    info_cells = page[0].findAll('td')
    info_values = [info_cells[j].get_text() for j in range(2, len(info_cells), 2)]
    if len(info_values) != num_info:
        return None

    # Second table: cell 1 is the estimate; after the first four cells every
    # bidder occupies four cells, the last three being bid, over/under and name.
    bid_cells = [cell.get_text().strip() for cell in page[1].findAll('td')]
    tot_bidders = sum(text.count('Bidder') for text in bid_cells)
    if tot_bidders == 0 or len(bid_cells) < 4 + 4 * tot_bidders:
        return None

    estimate = bid_cells[1]
    bidder_cells = bid_cells[4:]
    winning_bidder = bidder_cells[3]  # name cell of the first listed bidder
    bids = [tuple(bidder_cells[1 + 4 * k:4 + 4 * k]) for k in range(tot_bidders)]
    return info_values, estimate, bids, winning_bidder


def create_df():
    """Flatten ``tot_data`` into one row per bidder and push it to the database.

    Column titles are read from the first usable scraped page and followed by
    the fixed columns 'Estimate: ', 'Bid: ', 'Over / Under %', 'Bidder: ',
    'Winning Bidder: ' and 'Date Accessed: '. Each page contributes one row per
    bidder, with the page-level values repeated on every row.

    ``local_table`` is rebuilt from scratch on each call, so calling this twice
    no longer duplicates columns. Pages with an unexpected layout are skipped
    with a warning instead of producing columns of unequal length.

    The resulting frame is appended to ``TABLE`` through ``engine`` and the
    ``dbo.Stage_Raw_Data`` stored procedure is then executed. Nothing is
    written when there are no rows.

    Returns
    -------
    None
    """
    template = next((page for page in tot_data if len(page) >= 2), None)
    if template is None:
        warn('create_df: no scraped pages available; run web_scraper() first')
        return

    local_table[:] = []

    # Titles are the odd-numbered cells of the first table, last cell excluded.
    label_cells = template[0].findAll('td')
    for cols in range(1, len(label_cells) - 1, 2):
        local_table.append((label_cells[cols].get_text().strip(), []))
    num_info = len(local_table)

    for title in ('Estimate: ', 'Bid: ', 'Over / Under %', 'Bidder: ',
                  'Winning Bidder: ', 'Date Accessed: '):
        local_table.append((title, []))
    columns = [values for _, values in local_table]

    for position, page in enumerate(tot_data):
        parsed = _parse_bid_page(page, num_info)
        if parsed is None:
            warn('create_df: skipping scraped page {} (unexpected layout)'.format(position))
            continue
        info_values, estimate, bids, winning_bidder = parsed
        for bid, over_under, bidder in bids:
            row = info_values + [estimate, bid, over_under, bidder, winning_bidder, curr_date]
            for column, value in zip(columns, row):
                column.append(value)

    Dict = {title: column for (title, column) in local_table}
    df = pd.DataFrame(Dict)
    if df.empty:
        warn('create_df: no rows were produced; nothing written to the database')
        return

    # Pushes Data to Database
    df.to_sql(TABLE, engine, if_exists='append')
    conn = pyodbc.connect(connection_string)
    try:
        cursor = conn.cursor()
        cursor.execute("{call dbo.Stage_Raw_Data}")
        conn.commit()
    finally:
        # Always release the connection, even if the stored procedure fails.
        conn.close()

In [11]:
web_scraper()

1: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/05053055.htm
2: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033001.htm
3: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033002.htm
4: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033003.htm
5: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033004.htm
6: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033007.htm
7: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033008.htm
8: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033009.htm
9: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033010.htm
10: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033011.htm
11: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033013.htm
12: http://www.dot.state.tx.us/insdtdot/orgchart/cmd/cserve/bidtab/11033014.htm
13: http://www.dot.state.tx.us/insdtdot/orgchart/

KeyboardInterrupt: 

In [17]:
create_df()