In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import concurrent.futures
import time

### Functions that Will Extract Location and Address Data

In [2]:
##this function extracts the location (City State and Zipcode) for each school.

def extractLocation(soup):
    address_element = soup.select_one('address') 
        #print(address_element)
        # Exclude the text within the span within the address element
    if address_element:
        city_state = address_element.find('span')
    
    else:
        #print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None  # Return None or some default value if the request fails

    return city_state.text

In [3]:
#this function extracts the address for each school.

def extractAddress(soup):
        # Find the address element
    address_element = soup.select_one('address') 
        #print(address_element)
        # Exclude the text within the span within the address element

    city_state = address_element.find('span')
    city_state.decompose()

    address_text = address_element.get_text(strip=True)
    if address_text != '':
        return address_text
    else:
        return None

In [4]:
#this function cleans up the location column by splitting it into city, state and zipcode columns.
def splitLocation(column):
    
    pattern = r'(?P<city>[^,]+).\s?(?P<state>\w\w)\s?(?P<zipcode>\d\d\d\d\d)?'
    #column = pd.Series(column)
    _city = column.str.extract(pattern)['city']
    _state = column.str.extract(pattern)['state']
    _zip = column.str.extract(pattern)['zipcode']
    
    return (_city, _state, _zip)

### Functions that extract the Score and Game info

In [5]:
#this funciton cleans up the gameinfo column by splitting it into venue, team2 and gametype (H or A) columns.
def gameInfo(column):
    pattern = r'(?P<venue>[vs@]*)(?P<name>[a-zA-Z]+(\.|-)?\s?[a-zA-Z]*\'?\w*)(?P<type>[*]*)'
    # Apply str.extract to create new columns based on the pattern
    _venue = column.str.extract(pattern)['venue']
    _team2 = column.str.extract(pattern)['name']
    _gametype = column.str.extract(pattern)['type']
    
    
    return (_venue, _team2, _gametype)

In [6]:
#this function cleans up the score column by splitting it into team, team1_score and team2_score columns.
def scoreInfo(column):
    # Define a regular expression pattern to extract W or L, and the scores
    pattern = r'([WL])(\d+)-(\d+)|([WL])\(FF\)'

    # Extract the components into separate columns
    extracted = column.str.extract(pattern)

    # Assign extracted values to new columns
    team = extracted[0].fillna(extracted[3])
    team1_score = extracted[1]#.fillna(pd.NA).astype('Int64')
    team2_score = extracted[2]#.fillna(pd.NA).astype('Int64')

    return team, team1_score, team2_score

### Functions that format the game and venue type correctly

In [7]:
#each game entry will have  *, **, or *** next to it which correponds to the game type. This function cleans up the game type column.
def cleanGameType(string):
    #string = str(len(string))
    if len(string) == 3:
        string = 'Tournament'
    elif len(string) == 2:
        string = 'Playoff'
    elif len(string) == 1:
        string = 'District'
    else:
        string = 'Regular Season'
    return string

In [8]:
def cleanVenueType(string):
    if string == '@':
        return 'A'
    return 'H'

### Making a Dict with each school's name and URL

In [9]:
#this function extracts the number of pages on Maxpreps for each state.
def get_num_pages(state):
    current_page = 1
    while True:
        url = f'https://www.maxpreps.com/{state}/basketball/21-22/rankings/{current_page}/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        next_page_button = soup.find_all('a', class_ = 'btn btn-default')
        button = [butt.text for butt in next_page_button]
        num = str(current_page + 1)
        if num not in button:
            return int(num)- 1
        current_page += 1

In [10]:
#this function will extract 
def extract_state_ranking_page(state, current_page):
    url = f'https://www.maxpreps.com/{state}/basketball/21-22/rankings/{current_page}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text , 'html.parser')
    school_elements = soup.find_all('th', class_='school')
    school_elements.pop(0)
    school_links = [school.find('a').get('href') for school in school_elements]
    return dict(zip((school.text for school in school_elements), school_links))

In [11]:
#this function will create a dictionary of schools and their respective links given a state.
def create_school_dict(state):
    school_dict = {}
    num_pages = get_num_pages(state)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_page = {executor.submit(extract_state_ranking_page, state, page): page for page in range(1, num_pages + 1)}
        for future in concurrent.futures.as_completed(future_to_page):
            try:
                page = future_to_page[future]
                data = future.result()
                school_dict.update(data)
            except Exception as e:
                print(f"Error fetching page {page}: {e}")
    return school_dict

### Mascot Extraction

In [12]:
#The purpose of extracting the mascot for each school is to give each school a unique indentifer that will later be used get the location and address of team 2.
def extractMascots(soup):
    src_links = []
    #soup = BeautifulSoup(html_content, 'html.parser')
    div_elements = soup.find_all('div', class_='sc-3a367303-0 guEKkV')
    
    for div in div_elements:
        img_element = div.find('img', class_='sc-e055731c-0 TgzVI photo-or-initial')
        if img_element:
            src_links.append(img_element.get('src'))
        else:
            src_links.append(None)
    
    return src_links

In [13]:
def formatMascot(string):
    if string is not None:
        output_string = string.replace('width=32&height=32', 'width=64&height=64')
        return output_string
    return None

### Sports Offered Extraction

In [14]:
#this function extracts the sports offered at a given school for both boys and girls. It takes in the school URL as input
def extractSports(url):
    url = f'http://www.maxpreps.com{url}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    elements = soup.find_all('div', class_ = 'sports-list-child')

    boys = []
    girls = []
    #check if school offers no sports
    if len(elements) == 0:
        return None

    for element in elements:
        gender = element.find('h2', class_ = 'sc-23c90da8-0 KWggY')
        sports = element.find_all('span', class_ = 'sport-name')
        if gender.text == 'Boys':
            for sport in sports:
                boys.append(sport.text)
        else:
            for sport in sports:
                girls.append(sport.text)
                
    return ','.join(boys), ','.join(girls)

### Merging DF

In [15]:
#This function will create a df that has the location and address of team 2. Merging was perfomed on the mascot column.
def mergeDF(df):
    df['Team 1 Mascot'] = df['Team 1 Mascot'].astype(str)
    df['Team 2 Mascot'] = df['Team 2 Mascot'].astype(str)
    df['Team 2 Mascot'] = df['Team 2 Mascot'].apply(formatMascot)
    X = df.loc[:, ['Team 1 Address', 'Team 1 City', 'Team 1 State', 'Team 1 Zipcode', 'Team 1 Mascot']]
    X = X.rename(columns={'Team 1 Address': 'Team 2 Address',
                                   'Team 1 City': 'Team 2 City',
                                   'Team 1 State': 'Team 2 State',
                                   'Team 1 Zipcode': 'Team 2 Zipcode',
                                     'Team 1 Mascot': 'Team 2 Mascot'})
    X = X.drop_duplicates()
    X = X[X['Team 2 Mascot'] != 'None']
    merged_df = pd.merge(df, X, how = 'left', left_on = 'Team 2 Mascot', right_on = 'Team 2 Mascot')
    
    return merged_df

### Put it all together

In [16]:
import concurrent.futures
from tenacity import retry, stop_after_attempt, wait_fixed
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [17]:
from tenacity import retry, stop_after_attempt, wait_fixed

In [18]:
#this function utilizes all previous functions. it will read in the school name and url, scrape the data, and clean everything up to a useable format.

@retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
def scrape_school_data(school, url):
    try:
        school_url = f'http://www.maxpreps.com{url}'
        #print(school_url)
        response = requests.get(school_url).text
        soup = BeautifulSoup(response, 'lxml')
        table = soup.find('table')
        data = pd.read_html(str(table))[0]
        data = data.drop('Game Info', axis=1)
        data['Team 1'] = school
        data['Team 1 Location'] = extractLocation(soup)
        data['Team 1 Address'] = extractAddress(soup)
        data['Team 1 City'] = splitLocation(data['Team 1 Location'])[0]
        data['Team 1 State'] = splitLocation(data['Team 1 Location'])[1]
        data['Team 1 Zipcode'] = splitLocation(data['Team 1 Location'])[2]
        data['Venue'] = gameInfo(data['Opponent'])[0].apply(cleanVenueType)
        data['Team 2'] = gameInfo(data['Opponent'])[1]
        data['Game Type'] = gameInfo(data['Opponent'])[2].apply(cleanGameType)
        data['Outcome'] = scoreInfo(data['Result'])[0]
        data['Team 1 Score'] = scoreInfo(data['Result'])[1]
        data['Team 2 Score'] = scoreInfo(data['Result'])[2]
        team1_mascot = soup.find('img', class_ = 'sc-e055731c-0 ddWTnk photo-or-initial')
        if team1_mascot:  
            data['Team 1 Mascot'] = team1_mascot.get('src')
        else:
            data['Team 1 Mascot'] = None
        team_2_mascots = extractMascots(soup)
        data['Team 2 Mascot'] = team_2_mascots
        sports_url = url.rsplit('/', 4)[0] + '/'
        #print(sports_url)
        sports_offered = extractSports(sports_url)
        data['Boys Sports'] = sports_offered[0]
        data['Girls Sports'] = sports_offered[1]

        return data

    except Exception as e:
        print(f"Error processing {school}: {e}")
        return None

In [19]:
#this function is given a state and a dictionary of schools and their urls. It will scrape all the schools in the dictionary and return a dataframe with all the data.
def scrape_all_schools_async(school_dict, state):

    grand_df = pd.DataFrame()

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(scrape_school_data, school, url) for school, url in school_dict.items()]

        with tqdm(total=len(futures), desc=f"Scraping Schools for {state}", unit=" schools") as pbar:
            for future in concurrent.futures.as_completed(futures):
                data = future.result()
                if data is not None:
                    grand_df = pd.concat([grand_df, data], ignore_index=True)
                pbar.update(1)

    return grand_df

### Running It

In [28]:
cols = ['Date', 'Team 1', 'Team 2', 'Venue', 'Game Type', 'Team 1 Score', 
       'Team 2 Score', 'Outcome', 'Team 1 Address', 'Team 1 City',
       'Team 1 State', 'Team 1 Zipcode','Team 2 Address','Team 2 City',
        'Team 2 State', 'Team 2 Zipcode', 'Boys Sports', 'Girls Sports', ]
merged = merged[cols]

In [29]:
null_values = merged.isnull()

# Sum the null values for each column
null_counts = null_values.sum()

In [53]:
states = [
    "ak", "al", "ar", "az", "ca", "co", "ct", "dc", "de", "fl", "ga",
    "hi", "ia", "id", "il", "in", "ks", "ky", "la", "ma", "md",
    "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", "ne", "nh",
    "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "ri", "sc",
    "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", "wy"
]

#this loop:
#(1) iterates through each state
#(2) in each iteration a school dictionary is created with school as key and url as value
#(3) the data is scraped for each school in the dictionary
#(4) Team 2 Location and Address is addeed using the mergeDF function
#(5) columns are reordered
#(6) the data for each state is saved in its own csv file
for state in states:
    school_dict = create_school_dict(state)
    school_dict = {k: school_dict[k] for k in sorted(school_dict)}
    #print(school_dict)
    data = scrape_all_schools_async(school_dict, state)
    merged = mergeDF(data)
    cols = ['Date', 'Team 1', 'Team 2', 'Venue', 'Game Type', 'Team 1 Score', 
       'Team 2 Score', 'Outcome', 'Team 1 Address', 'Team 1 City',
       'Team 1 State', 'Team 1 Zipcode','Team 2 Address','Team 2 City',
        'Team 2 State', 'Team 2 Zipcode', 'Boys Sports', 'Girls Sports',
        'Team 1 Mascot', 'Team 2 Mascot']
    merged = merged[cols]
    merged.to_csv(f'{state}.csv', index=False)

Scraping Schools for ak: 100%|██████████| 114/114 [00:14<00:00,  7.73 schools/s]
Scraping Schools for al: 100%|██████████| 477/477 [01:15<00:00,  6.33 schools/s]
Scraping Schools for ar: 100%|██████████| 294/294 [00:43<00:00,  6.70 schools/s]
Scraping Schools for az:  59%|█████▉    | 193/328 [00:29<00:19,  6.79 schools/s]

Error processing PHH Prep Open (Phoenix): No tables found
Error processing PHH Prep National (Phoenix): No tables found


Scraping Schools for az: 100%|██████████| 328/328 [00:48<00:00,  6.72 schools/s]
Scraping Schools for ca:  50%|████▌    | 718/1428 [01:49<02:09,  5.49 schools/s]

Error processing Liberty Charter (Alpine): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /ca/alpine/liberty-charter-lions/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e0ca9a00>: Failed to establish a new connection: [Errno 60] Operation timed out'))Error processing Liberty (Madera): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /ca/madera/liberty-hawks/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2dfc20f70>: Failed to establish a new connection: [Errno 60] Operation timed out'))



Scraping Schools for ca:  64%|█████▋   | 907/1428 [02:20<01:09,  7.51 schools/s]

Error processing Northgate (Walnut Creek): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /ca/walnut-creek/northgate-broncos/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e5c28280>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for ca: 100%|████████| 1428/1428 [03:48<00:00,  6.24 schools/s]
Scraping Schools for co: 100%|██████████| 343/343 [00:56<00:00,  6.02 schools/s]

Error processing Two Roads Charter School (Arvada): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /co/arvada/two-roads-charter-school-falcons/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e43d82e0>: Failed to establish a new connection: [Errno 60] Operation timed out'))



Scraping Schools for ct: 100%|██████████| 184/184 [00:27<00:00,  6.57 schools/s]
Scraping Schools for de: 100%|████████████| 60/60 [00:08<00:00,  7.33 schools/s]
Scraping Schools for fl:  26%|██▋       | 203/772 [00:29<01:26,  6.58 schools/s]

Error processing DME Academy Blue (Daytona Beach): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /fl/daytona-beach/dme-academy-blue-dme-academy/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e1bf9490>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for fl:  66%|██████▌   | 506/772 [01:19<00:34,  7.72 schools/s]

Error processing Osceola Christian Prep (Kissimmee): No tables found


Scraping Schools for fl:  86%|████████▌ | 665/772 [01:43<00:11,  9.03 schools/s]

Error processing Somerset Academy Central Miramar (Miramar): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /fl/miramar/somerset-academy-central-miramar-warriors/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2e6cb75e0>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for fl: 100%|██████████| 772/772 [01:59<00:00,  6.46 schools/s]
Scraping Schools for ga:  66%|██████▌   | 384/580 [00:59<00:22,  8.83 schools/s]

Error processing Montgomery County (Mt. Vernon): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /ga/mt-vernon/montgomery-county-eagles/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2e22fd160>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for ga: 100%|██████████| 580/580 [01:29<00:00,  6.48 schools/s]
Scraping Schools for hi: 100%|████████████| 57/57 [00:07<00:00,  7.93 schools/s]
Scraping Schools for ia:  67%|██████▋   | 242/362 [00:31<00:15,  7.59 schools/s]

Error processing Melcher-Dallas (Melcher): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /ia/melcher/melcher-dallas-saints/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2fada7070>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for ia: 100%|██████████| 362/362 [00:46<00:00,  7.73 schools/s]
Scraping Schools for id: 100%|██████████| 157/157 [00:19<00:00,  8.02 schools/s]
Scraping Schools for il:  18%|█▊        | 143/774 [00:20<01:46,  5.90 schools/s]

Error processing Crossroads Christian (Big Rock): No tables found


Scraping Schools for il: 100%|██████████| 774/774 [01:51<00:00,  6.95 schools/s]
Scraping Schools for in: 100%|██████████| 460/460 [01:04<00:00,  7.09 schools/s]
Scraping Schools for ks: 100%|██████████| 372/372 [00:49<00:00,  7.51 schools/s]
Scraping Schools for ky: 100%|██████████| 281/281 [00:40<00:00,  7.02 schools/s]
Scraping Schools for la: 100%|██████████| 396/396 [00:55<00:00,  7.17 schools/s]
Scraping Schools for ma: 100%|██████████| 341/341 [00:45<00:00,  7.43 schools/s]
Scraping Schools for md: 100%|██████████| 280/280 [00:35<00:00,  7.96 schools/s]
Scraping Schools for me: 100%|██████████| 133/133 [00:17<00:00,  7.80 schools/s]
Scraping Schools for mi:  33%|███▎      | 244/735 [00:32<00:54,  9.04 schools/s]

Error processing Dundee: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /mi/dundee/dundee-vikings/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e3da04f0>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mi:  52%|█████▏    | 380/735 [00:51<00:45,  7.75 schools/s]

Error processing Ida: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /mi/ida/ida-bluestreaks/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e5323460>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mi:  69%|██████▉   | 507/735 [01:08<00:30,  7.59 schools/s]

Error processing Michigan Math & Science (Center Line): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /mi/center-line/michigan-math-and-science/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2fc363430>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mi:  87%|████████▋ | 636/735 [01:27<00:17,  5.78 schools/s]

Error processing Summit Academy North (Romulus): No tables found


Scraping Schools for mi: 100%|██████████| 735/735 [01:40<00:00,  7.30 schools/s]
Scraping Schools for mn:  61%|██████    | 263/434 [00:36<00:24,  6.86 schools/s]

Error processing Litchfield: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /mn/litchfield/litchfield-dragons/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e22ed9d0>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mn: 100%|██████████| 434/434 [01:00<00:00,  7.22 schools/s]
Scraping Schools for mo:  50%|████▉     | 300/601 [00:40<00:36,  8.28 schools/s]

Error processing Kingston (Cadet): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /mo/cadet/kingston-cougars/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2e1687d90>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mo:  68%|██████▊   | 411/601 [00:55<00:20,  9.25 schools/s]

Error processing North Andrew (Rosendale): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /mo/rosendale/north-andrew-cardinals/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2e64b1dc0>: Failed to establish a new connection: [Errno 60] Operation timed out'))
Error processing Neosho: HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /mo/neosho/neosho-wildcats/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2e64b1ee0>: Failed to establish a new connection: [Errno 60] Operation timed out'))
Error processing New Bloomfield: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /mo/new-bloomfield/new-bloomfield-wildcats/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e2f811c0>: Failed to establish a new connection: [Errno 60] Operation time

Scraping Schools for mo:  72%|███████▏  | 433/601 [00:58<00:21,  7.80 schools/s]

Error processing Northwest (Cedar Hill): HTTPSConnectionPool(host='www.maxpreps.com', port=443): Max retries exceeded with url: /mo/cedar-hill/northwest-lions/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fc2fa84d4c0>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mo:  91%|█████████▏| 549/601 [01:15<00:07,  6.96 schools/s]

Error processing St. Francis Borgia (Washington): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /mo/washington/st-francis-borgia-knights/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e3f2b610>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for mo: 100%|██████████| 601/601 [01:21<00:00,  7.34 schools/s]
Scraping Schools for ms: 100%|██████████| 316/316 [00:42<00:00,  7.52 schools/s]
Scraping Schools for mt: 100%|██████████| 158/158 [00:22<00:00,  6.98 schools/s]

Error processing St. Regis: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /mt/st-regis/st-regis-tigers/basketball/21-22/schedule/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2fc24aca0>: Failed to establish a new connection: [Errno 60] Operation timed out'))



Scraping Schools for nc:  88%|████████▊ | 574/652 [01:22<00:08,  9.27 schools/s]

Error processing Valor Preparatory Academy (Concord): No tables found


Scraping Schools for nc: 100%|██████████| 652/652 [01:34<00:00,  6.92 schools/s]
Scraping Schools for nd: 100%|██████████| 122/122 [00:15<00:00,  7.63 schools/s]
Scraping Schools for ne: 100%|██████████| 283/283 [00:38<00:00,  7.39 schools/s]
Scraping Schools for nh: 100%|████████████| 88/88 [00:11<00:00,  7.89 schools/s]
Scraping Schools for nj: 100%|██████████| 423/423 [00:57<00:00,  7.35 schools/s]
Scraping Schools for nm: 100%|██████████| 150/150 [00:21<00:00,  6.92 schools/s]
Scraping Schools for nv: 100%|██████████| 114/114 [00:21<00:00,  5.20 schools/s]

Error processing Truckee: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /ca/truckee/truckee-wolverines/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2fb5f31f0>: Failed to establish a new connection: [Errno 60] Operation timed out'))



Scraping Schools for ny:  23%|██▎       | 138/597 [00:17<00:44, 10.31 schools/s]

Error processing Dundee: No tables found


Scraping Schools for ny:  71%|███████   | 422/597 [00:53<00:20,  8.73 schools/s]

Error processing Pittsford: No tables found


Scraping Schools for ny: 100%|██████████| 597/597 [01:16<00:00,  7.80 schools/s]
Scraping Schools for oh:  34%|███▍      | 290/856 [00:40<01:01,  9.20 schools/s]

Error processing Euclid: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /oh/euclid/euclid-panthers/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2fa7ed4f0>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for oh: 100%|██████████| 856/856 [02:02<00:00,  7.01 schools/s]
Scraping Schools for ok: 100%|██████████| 483/483 [01:00<00:00,  7.97 schools/s]
Scraping Schools for or: 100%|██████████| 266/266 [00:34<00:00,  7.76 schools/s]
Scraping Schools for pa:  46%|████▌     | 336/738 [00:46<00:50,  7.97 schools/s]

Error processing Holy Redeemer (Wilkes-Barre): HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /pa/wilkes-barre/holy-redeemer-royals/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e1deb340>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for pa: 100%|██████████| 738/738 [01:43<00:00,  7.16 schools/s]
Scraping Schools for ri: 100%|████████████| 52/52 [00:07<00:00,  6.74 schools/s]
Scraping Schools for sc: 100%|██████████| 301/301 [00:41<00:00,  7.23 schools/s]
Scraping Schools for sd: 100%|██████████| 158/158 [00:20<00:00,  7.80 schools/s]
Scraping Schools for tn: 100%|██████████| 432/432 [00:59<00:00,  7.31 schools/s]
Scraping Schools for tx:  13%|█▏       | 227/1688 [00:33<02:34,  9.46 schools/s]

Error processing Brookeland: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /tx/brookeland/brookeland-wildcats/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e666c9d0>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for tx:  44%|███▉     | 740/1688 [01:52<02:51,  5.52 schools/s]

Error processing Hondo: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /tx/hondo/hondo-owls/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e2b97f10>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for tx:  64%|█████   | 1080/1688 [02:49<01:47,  5.65 schools/s]

Error processing Montgomery: HTTPConnectionPool(host='www.maxpreps.com', port=80): Max retries exceeded with url: /tx/montgomery/montgomery-bears/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc2e6bffd60>: Failed to establish a new connection: [Errno 60] Operation timed out'))


Scraping Schools for tx:  68%|█████▍  | 1140/1688 [03:00<01:55,  4.73 schools/s]

Error processing Parkview Christian (Waco): No tables found


Scraping Schools for tx: 100%|████████| 1688/1688 [04:35<00:00,  6.14 schools/s]
Scraping Schools for ut: 100%|██████████| 142/142 [00:19<00:00,  7.11 schools/s]
Scraping Schools for va: 100%|██████████| 437/437 [00:58<00:00,  7.42 schools/s]
Scraping Schools for vt: 100%|████████████| 64/64 [00:09<00:00,  7.01 schools/s]
Scraping Schools for wa: 100%|██████████| 380/380 [00:49<00:00,  7.66 schools/s]
Scraping Schools for wi: 100%|██████████| 506/506 [01:09<00:00,  7.28 schools/s]
Scraping Schools for wv: 100%|██████████| 126/126 [00:16<00:00,  7.59 schools/s]
Scraping Schools for wy: 100%|████████████| 67/67 [00:09<00:00,  6.78 schools/s]
