In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
url = 'https://www.maxpreps.com/tx/duncanville/duncanville-panthers-and-pantherettes/basketball/21-22/schedule/'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

In [3]:
table = soup.find('table')

In [4]:
data = pd.read_html(str(table))[0]

### Making a Dictionary for each state where the keys are the school name and the values are the URL

In [5]:
url1 = 'https://www.maxpreps.com/basketball/21-22/rankings/1/'
response = requests.get(url1).text
soup = BeautifulSoup(response, 'html.parser')

In [6]:
#Creating a list that contains the name of each school
school_elements = soup.find_all('th', class_='school')

#Creating another list that contains the URL for each high school
school_links = []
for school in school_elements:
    school_links.append(school.find('a'))

In [7]:
school_elements.pop(0)
school_links.pop(0)
for i in range(len(school_elements)):
    school_elements[i] = school_elements[i].text
    school_links[i] = school_links[i].get('href')

In [8]:
school_dict = dict(zip(school_elements, school_links))

In [9]:
school_dict

{'Duncanville': '/tx/duncanville/duncanville-panthers-and-pantherettes/basketball/21-22/schedule/',
 'Centennial (Corona)': '/ca/corona/centennial-huskies/basketball/21-22/schedule/',
 'Calvary Christian Academy (Fort Lauderdale)': '/fl/fort-lauderdale/calvary-christian-academy-eagles/basketball/21-22/schedule/',
 'Roselle Catholic (Roselle)': '/nj/roselle/roselle-catholic-lions/basketball/21-22/schedule/',
 'North Little Rock': '/ar/north-little-rock/north-little-rock-charging-wildcats/basketball/21-22/schedule/',
 'Glenbard West (Glen Ellyn)': '/il/glen-ellyn/glenbard-west-hilltoppers/basketball/21-22/schedule/',
 'Cardinal Hayes (Bronx)': '/ny/bronx/cardinal-hayes-cardinals/basketball/21-22/schedule/',
 'Richardson': '/tx/richardson/richardson-eagles/basketball/21-22/schedule/',
 'Camden': '/nj/camden/camden-panthers/basketball/21-22/schedule/',
 'Paul VI (Chantilly)': '/va/chantilly/st-paul-vi-panthers/basketball/21-22/schedule/',
 'Imhotep Charter (Philadelphia)': '/pa/philadelphi

## PRACCY

In [13]:
school_dict = {'Duncanville': '/tx/duncanville/duncanville-panthers-and-pantherettes/basketball/21-22/schedule/'}
school = 'Duncanville'
url = 'https://www.maxpreps.com{}'.format(school_dict[school])
    
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

table = soup.find('table')
if table:
    # Convert the table to DataFrame
    data = pd.read_html(str(table))[0]
    data['Team 1'] = school
    data = data.drop('Game Info', axis = 1)
    
else:
    print(f"No table found on {url}")

In [14]:
data

Unnamed: 0,Date,Opponent,Result,Team 1
0,11/12,vsShadow Creek,W89-38,Duncanville
1,11/13,vsLancaster,W102-59,Duncanville
2,11/20,vsKimball,W84-82,Duncanville
3,11/24,vsWheeler,W80-58,Duncanville
4,11/26,vsCentennial,W75-70,Duncanville
5,11/27,vsSierra Canyon,W80-73,Duncanville
6,11/30,@Rock Hill,W90-66,Duncanville
7,12/3,vsZachary***,W86-56,Duncanville
8,12/4,vsCarver Collegiate Academy***,W93-52,Duncanville
9,12/9,vsMontverde Academy,W67-66,Duncanville


#### Extracting Location Data

In [3]:
def extractLocation(school):
    url = 'https://www.maxpreps.com{}'.format(school)
    #print(url)
    # Make a request to the webpage
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the address element
        address_element = soup.select_one('address') 
        #print(address_element)
        # Exclude the text within the span within the address element
        if address_element:
            city_state = address_element.find('span')
    
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None  # Return None or some default value if the request fails

    return city_state.text

In [4]:
def extractAddress(school):
    url = 'https://www.maxpreps.com{}'.format(school)

    # Make a request to the webpage
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the address element
        address_element = soup.select_one('address') 
        #print(address_element)
        # Exclude the text within the span within the address element
        if address_element:
            city_state = address_element.find('span')
            city_state.decompose()

            address_text = address_element.get_text(strip=True)
        else:
            print("No address element found.")
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        
    return address_text

In [14]:
# get list of the URLS for Team 2

team_2_elements = soup.find_all('a', class_='raRFl')
team_2_links = []
for element in team_2_elements:
    link = element.get('href')
    link = link[24:]
    team_2_links.append(link)

In [15]:
team_2_locations = []
team_2_addresses = []
for team in team_2_links:
    team_2_locations.append(extractLocation(team))
    team_2_addresses.append(extractAddress(team))


In [16]:
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor() as executor:
    team_2_locations = list(executor.map(extractLocation, team_2_links))
    team_2_addresses = list(executor.map(extractAddress, team_2_links))


In [17]:
data['Team 2 Location'] = team_2_locations
data['Team 2 Address'] = team_2_addresses

#### Extracting Venue, Score, Outcome, Game Type

In [5]:
import re
def gameInfo(column):
    pattern = r'(?P<venue>[vs@]*)(?P<name>[a-zA-Z]+(\.|-)?\s?[a-zA-Z]*\'?\w*)(?P<type>[*]*)'
    # Apply str.extract to create new columns based on the pattern
    _venue = column.str.extract(pattern)['venue']
    _team2 = column.str.extract(pattern)['name']
    _gametype = column.str.extract(pattern)['type']
    
    
    return (_venue, _team2, _gametype)

In [6]:
def cleanGameType(string):
    #string = str(len(string))
    if len(string) == 3:
        string = 'Tournament'
    elif len(string) == 2:
        string = 'Playoff'
    elif len(string) == 1:
        string = 'District'
    else:
        string = 'Regular Season'
    return string

In [7]:
def cleanVenueType(string):
    if string == '@':
        return 'A'
    return 'H'

In [8]:
def scoreInfo(column):
    pattern = r'(?P<Outcome>[WL]?)(?P<Team_1_Score>\d+)-(?P<Team_2_Score>\d+)'
    
    _outcome = column.str.extract(pattern)['Outcome']
    _t1score = column.str.extract(pattern)['Team_1_Score']
    _t2score = column.str.extract(pattern)['Team_2_Score']
    
    return (_outcome, _t1score, _t2score)

In [9]:
def splitLocation(column):
    
    pattern = r'(?P<city>[^,]+).\s?(?P<state>\w\w)\s?(?P<zipcode>\d\d\d\d\d)'

    _city = column.str.extract(pattern)['city']
    _state = column.str.extract(pattern)['state']
    _zip = column.str.extract(pattern)['zipcode']
    
    return (_city, _state, _zip)

In [23]:
data['Team 1 Location'] = extractLocation(school_dict['Duncanville'])
data['Team 1 Address'] = extractAddress(school_dict['Duncanville'])
data['Team 1 City'] = splitLocation(data['Team 1 Location'])[0]
data['Team 1 State'] = splitLocation(data['Team 1 Location'])[1]
data['Team 1 Zipcode'] = splitLocation(data['Team 1 Location'])[2]
data['Team 2 City'] = splitLocation(data['Team 2 Location'])[0]
data['Team 2 State'] = splitLocation(data['Team 2 Location'])[1]
data['Team 2 Zipcode'] = splitLocation(data['Team 2 Location'])[2]
data['Venue'] = gameInfo(data['Opponent'])[0].apply(cleanVenueType)
data['Team 2'] = gameInfo(data['Opponent'])[1]
data['Game Type'] = gameInfo(data['Opponent'])[2].apply(cleanGameType)
data['Outcome'] = scoreInfo(data['Result'])[0]
data['Team 1 Score'] = scoreInfo(data['Result'])[1]
data['Team 2 Score'] = scoreInfo(data['Result'])[2]


In [24]:
data.columns

Index(['Date', 'Opponent', 'Result', 'Team 1', 'Team 2 Location',
       'Team 2 Address', 'Team 1 Location', 'Team 1 Address', 'Team 1 City',
       'Team 1 State', 'Team 1 Zipcode', 'Team 2 City', 'Team 2 State',
       'Team 2 Zipcode', 'Venue', 'Team 2', 'Game Type', 'Outcome',
       'Team 1 Score', 'Team 2 Score'],
      dtype='object')

In [25]:
cols = ['Date', 'Team 1', 'Team 2', 'Venue', 'Game Type',
       'Team 1 Score', 'Team 2 Score', 'Outcome', 'Team 1 Address', 'Team 1 City',
       'Team 1 State', 'Team 1 Zipcode', 'Team 2 Address',
        'Team 2 City', 'Team 2 State', 'Team 2 Zipcode']

In [26]:
data = data[cols]

In [27]:
data

Unnamed: 0,Date,Team 1,Team 2,Venue,Game Type,Team 1 Score,Team 2 Score,Outcome,Team 1 Address,Team 1 City,Team 1 State,Team 1 Zipcode,Team 2 Address,Team 2 City,Team 2 State,Team 2 Zipcode
0,11/12,Duncanville,Shadow Creek,H,Regular Season,89,38,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,11850 Broadway,Pearland,TX,77584
1,11/13,Duncanville,Lancaster,H,Regular Season,102,59,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,200 East Wintergreen Rd.,Lancaster,TX,75146
2,11/20,Duncanville,Kimball,H,Regular Season,84,82,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,3606 S Westmoreland Rd,Dallas,TX,75233
3,11/24,Duncanville,Wheeler,H,Regular Season,80,58,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,375 Holt Rd NE,Marietta,GA,30068
4,11/26,Duncanville,Centennial,H,Regular Season,75,70,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,1820 Rimpau Ave,Corona,CA,92881
5,11/27,Duncanville,Sierra Canyon,H,Regular Season,80,73,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,20801 West Rinaldi St.,Chatsworth,CA,91311
6,11/30,Duncanville,Rock Hill,A,Regular Season,90,66,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,16061 Colt Rd.,Frisco,TX,75035
7,12/3,Duncanville,Zachary,H,Tournament,86,56,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,4100 Bronco Ln,Zachary,LA,70791
8,12/4,Duncanville,Carver Collegiate,H,Regular Season,93,52,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,3059 Higgins Blvd,New Orleans,LA,70126
9,12/9,Duncanville,Montverde Academy,H,Regular Season,67,66,W,900 W Camp Wisdom Rd,Duncanville,TX,75116,17235 7Th St,Montverde,FL,34756


In [24]:
school_dict = {'Duncanville': '/tx/duncanville/duncanville-panthers-and-pantherettes/basketball/21-22/schedule/',
 'Richardson': '/tx/richardson/richardson-eagles/basketball/21-22/schedule/',
 'McKinney': '/tx/mckinney/mckinney-lions/basketball/21-22/schedule/',
 'Mansfield Timberview (Arlington)': '/tx/arlington/mansfield-timberview-wolves/basketball/21-22/schedule/',
 'Guyer (Denton)': '/tx/denton/guyer-wildcats/basketball/21-22/schedule/',
 'Beaumont United (Beaumont)': '/tx/beaumont/beaumont-united-timberwolves/basketball/21-22/schedule/',
 'Plano': '/tx/plano/plano-wildcats/basketball/21-22/schedule/',
 'Westlake (Austin)': '/tx/austin/westlake-chaparrals/basketball/21-22/schedule/',
 'Atascocita (Humble)': '/tx/humble/atascocita-eagles/basketball/21-22/schedule/',
 'Oak Cliff Faith Family Academy (Dallas)': '/tx/dallas/oak-cliff-faith-family-academy-eagles/basketball/21-22/schedule/'}

In [25]:
grand_df = pd.DataFrame()
for school, url in school_dict.items():
    print(school, url)
    school_url = f'https://www.maxpreps.com{url}'
    print(school_url)
    response = requests.get(school_url).text
    soup = BeautifulSoup(response, 'lxml')
    table = soup.find('table')
    data = pd.read_html(str(table))[0]
    data = data.drop('Game Info', axis = 1)
    data['Team 1'] = school
    data['Team 1 Location'] = extractLocation(url)
    data['Team 1 Address'] = extractAddress(url)
    data['Team 1 City'] = splitLocation(data['Team 1 Location'])[0]
    data['Team 1 State'] = splitLocation(data['Team 1 Location'])[1]
    data['Team 1 Zipcode'] = splitLocation(data['Team 1 Location'])[2]
    data['Venue'] = gameInfo(data['Opponent'])[0].apply(cleanVenueType)
    data['Team 2'] = gameInfo(data['Opponent'])[1]
    data['Game Type'] = gameInfo(data['Opponent'])[2].apply(cleanGameType)
    data['Outcome'] = scoreInfo(data['Result'])[0]
    data['Team 1 Score'] = scoreInfo(data['Result'])[1]
    data['Team 2 Score'] = scoreInfo(data['Result'])[2]
    data = data.drop(['Opponent', 'Result', 'Team 1 Location'], axis = 1)
    
    grand_df = pd.concat([grand_df, data], ignore_index = True)

Duncanville /tx/duncanville/duncanville-panthers-and-pantherettes/basketball/21-22/schedule/
https://www.maxpreps.com/tx/duncanville/duncanville-panthers-and-pantherettes/basketball/21-22/schedule/
Richardson /tx/richardson/richardson-eagles/basketball/21-22/schedule/
https://www.maxpreps.com/tx/richardson/richardson-eagles/basketball/21-22/schedule/
McKinney /tx/mckinney/mckinney-lions/basketball/21-22/schedule/
https://www.maxpreps.com/tx/mckinney/mckinney-lions/basketball/21-22/schedule/
Mansfield Timberview (Arlington) /tx/arlington/mansfield-timberview-wolves/basketball/21-22/schedule/
https://www.maxpreps.com/tx/arlington/mansfield-timberview-wolves/basketball/21-22/schedule/
Guyer (Denton) /tx/denton/guyer-wildcats/basketball/21-22/schedule/
https://www.maxpreps.com/tx/denton/guyer-wildcats/basketball/21-22/schedule/
Beaumont United (Beaumont) /tx/beaumont/beaumont-united-timberwolves/basketball/21-22/schedule/
https://www.maxpreps.com/tx/beaumont/beaumont-united-timberwolves/ba

In [26]:
grand_df

Unnamed: 0,Date,Team 1,Team 1 Address,Team 1 City,Team 1 State,Team 1 Zipcode,Venue,Team 2,Game Type,Outcome,Team 1 Score,Team 2 Score
0,11/12,Duncanville,900 W Camp Wisdom Rd,Duncanville,TX,75116,H,Shadow Creek,Regular Season,W,89,38
1,11/13,Duncanville,900 W Camp Wisdom Rd,Duncanville,TX,75116,H,Lancaster,Regular Season,W,102,59
2,11/20,Duncanville,900 W Camp Wisdom Rd,Duncanville,TX,75116,H,Kimball,Regular Season,W,84,82
3,11/24,Duncanville,900 W Camp Wisdom Rd,Duncanville,TX,75116,H,Wheeler,Regular Season,W,80,58
4,11/26,Duncanville,900 W Camp Wisdom Rd,Duncanville,TX,75116,H,Centennial,Regular Season,W,75,70
...,...,...,...,...,...,...,...,...,...,...,...,...
379,3/1,Oak Cliff Faith Family Academy (Dallas),300 W Kiest Blvd,Dallas,TX,75224,H,Van Alstyne,Regular Season,W,91,45
380,3/4,Oak Cliff Faith Family Academy (Dallas),300 W Kiest Blvd,Dallas,TX,75224,H,Kaufman,Playoff,W,41,36
381,3/5,Oak Cliff Faith Family Academy (Dallas),300 W Kiest Blvd,Dallas,TX,75224,H,Carter,Playoff,W,70,56
382,3/11,Oak Cliff Faith Family Academy (Dallas),300 W Kiest Blvd,Dallas,TX,75224,H,Silsbee,Regular Season,W,70,62
