In [18]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import concurrent.futures

In [19]:
#this function extracts the number of pages on Maxpreps for each state.
def get_num_pages(state):
    current_page = 1
    while True:
        url = f'https://www.maxpreps.com/{state}/basketball/21-22/rankings/{current_page}/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        next_page_button = soup.find_all('a', class_ = 'btn btn-default')
        button = [butt.text for butt in next_page_button]
        #print(button)
        num = str(current_page + 1)
        if num not in button:
            return int(num)- 1
        current_page += 1

In [32]:
#this function will extract 
def extract_state_ranking_page(state, current_page):
    url = f'https://www.maxpreps.com/{state}/basketball/21-22/rankings/{current_page}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text , 'html.parser')
    
    school_elements = soup.find_all('th', class_='school')
    school_elements.pop(0)
    school_names = [school.text for school in school_elements]
    
    school_rk_elements = soup.find_all('td', class_='rank first dw')
    school_rks = [rk.text for rk in school_rk_elements]

    school_sos_elements = soup.find_all('td', class_ = 'rating sorted dw')

In [33]:
extract_state_ranking_page('tx', 1)

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25']

In [21]:
#this function will create a dictionary of schools and their respective links given a state.
def create_school_dict(state):
    school_dict = {}
    num_pages = get_num_pages(state)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_page = {executor.submit(extract_state_ranking_page, state, page): page for page in range(1, num_pages + 1)}
        for future in concurrent.futures.as_completed(future_to_page):
            try:
                page = future_to_page[future]
                data = future.result()
                school_dict.update(data)
            except Exception as e:
                print(f"Error fetching page {page}: {e}")
    return school_dict

### Functions to Extract Strength of Schedule and Ranking

In [22]:
states = [
    "ak", "al", "ar", "az", "ca", "co", "ct", "dc", "de", "fl", "ga",
    "hi", "ia", "id", "il", "in", "ks", "ky", "la", "ma", "md",
    "me", "mi", "mn", "mo", "ms", "mt", "nc", "nd", "ne", "nh",
    "nj", "nm", "nv", "ny", "oh", "ok", "or", "pa", "ri", "sc",
    "sd", "tn", "tx", "ut", "va", "vt", "wa", "wi", "wv", "wy"
]

#this loop:
#(1) iterates through each state
#(2) in each iteration a school dictionary is created with school as key and url as value
#(3) the data is scraped for each school in the dictionary
#(4) Team 2 Location and Address is added using the mergeDF function
#(5) columns are reordered
#(6) the data for each state is saved in its own csv file
for state in states:
    school_dict = create_school_dict(state)
    school_dict = {k: school_dict[k] for k in sorted(school_dict)}
    

{'Timber Creek (Fort Worth)': '/tx/fort-worth/timber-creek-falcons/basketball/21-22/schedule/',
 'West Oso (Corpus Christi)': '/tx/corpus-christi/west-oso-bears/basketball/21-22/schedule/',
 'North Shore (Houston)': '/tx/houston/north-shore-mustangs/basketball/21-22/schedule/',
 'Southlake Carroll (Southlake)': '/tx/southlake/southlake-carroll-dragons/basketball/21-22/schedule/',
 'Grace Christian Academy (Houston)': '/tx/houston/grace-christian-academy-eagles/basketball/21-22/schedule/',
 'Flour Bluff (Corpus Christi)': '/tx/corpus-christi/flour-bluff-hornets/basketball/21-22/schedule/',
 'Round Rock HomeSchool (Round Rock)': '/tx/round-rock/round-rock-homeschool-raiders/basketball/21-22/schedule/',
 'Canton': '/tx/canton/canton-eagles/basketball/21-22/schedule/',
 'Little River Academy': '/tx/little-river-academy/little-river-academy-bumblebees/basketball/21-22/schedule/',
 'Klein Collins (Spring)': '/tx/spring/klein-collins-tigers/basketball/21-22/schedule/',
 'Klein': '/tx/klein/kl