# Import Libraries and Packages

In [3]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [4]:
# import necessary libraries
import ast
import re
import threading
import time
import urllib
from datetime import date, timedelta
from threading import Timer
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read CSV and TXT Files

In [31]:
# Read the lookup names of the teams to exclude, one per line.
# The context manager closes the file handle as soon as the contents are read.
with open("data/bad_teams.txt", "r") as bad_teams_file:
    bad_teams = bad_teams_file.read().strip().split("\n")

In [28]:
# A forward slash keeps the path portable across operating systems and avoids
# the invalid "\c" escape sequence that the backslash form contained.
matchups_df = pd.read_csv("data/cbb_matchups.csv")
# matchups_dict = matchups_df.to_dict('records')

### Functions for URLs

In [6]:
def getHTMLdocument(url, timeout=30):
    """ 
    Sends GET request to the given url and returns the html document 
 
    Args: 
        url - string: HTTP URL to send the GET request
        timeout - int or float: seconds to wait for the server before giving up (default 30)
    Returns: 
        Response of GET request in HTML format (the body text; the status code is not checked)
     
    """ 
    # requests waits forever unless a timeout is given, so a single stalled
    # server would otherwise hang the whole scraping loop
    response = requests.get(url, timeout=timeout)

    return response.text

In [7]:
# function to test if a url returns a 404
def testURL(url):
    """ 
        Sends GET request to the givin url and returns the html document 
 
    Args: 
        url - string: HTTP URL to send the GET request
    Returns: 
        Response code of GET request along with the url
    """ 
    response_code = requests.get(url).status_code
    response_string = f'URL: {url}\nResponse: {response_code}\n'
    
    return response_string

# Create Matchups CSV

### Functions

In [14]:
def createDates(start, end):
    """ 
    Lists the in-season days between two dates (both ends included) 
 
    Args: 
        start - date: starting date
        end - date: ending date 
    Returns: 
        date_list - list<datetime.date>: days from start -> end that fall in November - April;
        empty when end is before start 
 
    """
    span_days = (end - start).days
    candidates = (start + timedelta(days=offset) for offset in range(span_days + 1))

    # Only months 11 - 4 (November - April) are kept, i.e. everything outside May - October
    return [day for day in candidates if not 5 <= day.month <= 10]

In [15]:
# function to create urls to use to get matchups
def createDatesURL(date_list):
    """ 
    Builds the scoreboard url for every date that is given
 
    Args: 
        date_list - list<datetime.date>: list of dates 
    Returns: 
        date_url_list - list<dict>: one {"url": str, "date": datetime.date} entry per given date, in the same order 
        
    """
    date_url_list = []

    for game_day in date_list:

        # Two-digit months (10 - 12) open a new season; every other month
        # belongs to the season that started the previous year
        if game_day.month >= 10:
            season = f'{game_day.year}-{game_day.year + 1}'
        else:
            season = f'{game_day.year - 1}-{game_day.year}'

        # Period is the zero-padded month followed by the zero-padded day
        period = f'{game_day.month:02d}{game_day.day:02d}'

        url = f'https://newsday.sportsdirectinc.com/sports-scores/College-Basketball-Scores-Matchups.aspx?Year={game_day.year}&Period={period}&CurrentSeason={season}'
        date_url_list.append({"url": url, "date": game_day})

    return date_url_list

In [16]:
# function to scrape webpage to get all matchups
def getMatchups(date_url_list):
    """ 
    Scrapes website to gather all games that were played in a given time frame
    Will gather the teams, rankings, url for the box score, and date
 
    Args: 
        date_url_list - list<dict>: dictionaries holding a scoreboard 'url' and the 'date' it belongs to 
    Returns: 
        matchups_list - list<dict>: one dictionary per game with the keys away_team, away_team_ranking,
        home_team, home_team_ranking, neutral, box_score_url ('na' when the game has no boxscore link) and date 
 
    """
    matchups_list = []
    
    for date_url in date_url_list:
        
        html_document = getHTMLdocument(date_url['url'])
        soup = BeautifulSoup(html_document, 'html.parser') # soup object
        results = soup.find(id='Scoreboard_7_All_Games') # all of the necessary info is in this div 

        if results is None: # dates that do not have games played on them will be None
            continue

        matchup_elements = results.find_all('div', class_='sdi-so-title') # each div in matchup elements is a matchup
        link_divs = results.find_all('div', class_='onoff') # divs where the box score urls are; looked up once per page

        for matchup_index, matchup_element in enumerate(matchup_elements):

            matchup_text = matchup_element.text.strip()
            teams = re.split(r'( vs )|( at )', matchup_text) # ex. Purdue(1) at Michigan -> ['Purdue(1)', ' at ', 'Michigan']
            teams = [i for i in teams if i is not None]
            neutral = teams[1] == ' vs ' # ' vs ' indicates a neutral game 

            for team_index in range(0, len(teams), 2):
                team = re.split(r'\((?=[\d])|(?<=[\d])\)', teams[team_index]) # Extracts ranking from team ex. Purdue(1) -> ['Purdue', '1']
                team = [i for i in team if i != ""]
                ranking = int(team[1]) if len(team) == 2 else 0 # If team has no ranking, assign 0

                if team_index == 0: # 0 index is away team
                    away_team = team[0]
                    away_team_ranking = ranking

                else: # other index is the home team
                    home_team = team[0]
                    home_team_ranking = ranking

            # Start from 'na' for every game so that a game whose links do not
            # include a boxscore cannot inherit the url of the previous game
            box_score_url = 'na'
            if matchup_index < len(link_divs):
                for link in link_divs[matchup_index].find_all('a', href=True):
                    if link.text == 'boxscore':
                        box_score_url = link['href']

            matchups_list.append({
                'away_team': away_team,
                'away_team_ranking': away_team_ranking,
                'home_team': home_team,
                'home_team_ranking': home_team_ranking,
                'neutral': neutral,
                'box_score_url': box_score_url,
                'date': date_url['date']
            })
            # {'away_team': 'Purdue', 'away_team_ranking': 1, 'home_team': 'Michigan', 'home_team_ranking': 0, 'neutral': False, ...}

    return matchups_list

In [17]:
def getBox(box_div):
    """ 
    Parses the rows of one team's box score table
    This will gather all major statistics of every player on that team's roster
 
    Args: 
        box_div - list: the table rows (tr elements) of the team's box score; the first two and last two
        rows are never read, and rows that do not have exactly 14 cells are skipped 
    Returns: 
        box - dict<dict>: dictionary of every player on the team keyed by the text of the row's first cell;
        each player is represented by a dictionary of that player's stats 
 
    """
    # cells 5 - 13 of a stat row are plain integers, in this order
    counting_stats = ('orb', 'drb', 'tot_reb', 'a', 'pf', 'stl', 'to', 'blk', 'pts')

    box = {}
    for row in range(2, len(box_div) - 2):
        stat_results = box_div[row].find_all("td")

        if len(stat_results) != 14:
            continue

        cell_text = [cell.text.strip() for cell in stat_results]
        stats_dict = {}

        # cell 0: player name, optionally linking to the player's page
        player_links = stat_results[0].find_all('a', href=True)
        if len(player_links) > 0:
            # the id is the run of characters between an "r<digit>" boundary and the following "<digit>."
            player_url_id = re.split(r'r(?=[\d])|(?<=[\d])\.', player_links[0]['href'])
            stats_dict['player_url_id'] = player_url_id[1]
        else:
            stats_dict['player_url_id'] = 'na'

        # cell 1: minutes played, only the part before an optional ':' is kept
        stats_dict['min_played'] = int(cell_text[1].split(':')[0])

        # cell 2: field goals as "made-attempted"
        fg_data = cell_text[2].split('-')
        stats_dict['fg'] = int(fg_data[0])
        stats_dict['fg_a'] = int(fg_data[1])

        # cell 3: three pointers as "made-attempted"; two pointers are the remainder of the field goals
        three_data = cell_text[3].split('-')
        stats_dict['two_point'] = stats_dict['fg'] - int(three_data[0])
        stats_dict['two_point_a'] = stats_dict['fg_a'] - int(three_data[1])
        stats_dict['three_point'] = int(three_data[0])
        stats_dict['three_point_a'] = int(three_data[1])

        # cell 4: free throws as "made-attempted" (key order ft_a, ft is kept from the existing output)
        ft_data = cell_text[4].split('-')
        stats_dict['ft_a'] = int(ft_data[1])
        stats_dict['ft'] = int(ft_data[0])

        for column, stat_name in enumerate(counting_stats, start=5):
            stats_dict[stat_name] = int(cell_text[column])

        box[cell_text[0]] = stats_dict
    return box
    

In [18]:
# every key of a game record, in the order the records are built
_GAME_FIELDS = (
    'game_time', 'arena', 'city', 'state', 'attendance',
    'away_1', 'away_2', 'away_ot', 'away_ot2', 'away_ot3', 'away_tot',
    'home_1', 'home_2', 'home_ot', 'home_ot2', 'home_ot3', 'home_tot',
    'away', 'home',
)


def _parseTeamLine(period_cells):
    """ 
    Converts one team's period cells (1st half, 2nd half, then up to three overtimes) into
    [first, second, ot, ot2, ot3, total]; overtimes that were not played count as 0
    """
    periods = [int(cell.text) for cell in period_cells]
    periods += [0] * (5 - len(periods))
    return periods + [sum(periods)]


def _parseLineScore(score_div):
    """ 
    Builds the twelve away_*/home_* score fields from the line score cells, or 'na' for
    each of them when the number of cells is not one of the supported layouts
    """
    suffixes = ('1', '2', 'ot', 'ot2', 'ot3', 'tot')

    # 8 cells = no overtime, 10/12/14 = one/two/three overtimes. Each team's
    # half of the cells starts with one cell and ends with one cell that are not read.
    if len(score_div) in (8, 10, 12, 14):
        per_team = len(score_div) // 2
        away = _parseTeamLine(score_div[1:per_team - 1])
        home = _parseTeamLine(score_div[per_team + 1:len(score_div) - 1])
    else:
        away = home = ['na'] * len(suffixes)

    line_score = {}
    for side, values in (('away', away), ('home', home)):
        for suffix, value in zip(suffixes, values):
            line_score[f'{side}_{suffix}'] = value
    return line_score


def getGame(url):
    """ 
    Scrapes website to gather all available information about a game including location, score, player stats, etc.
 
    Args: 
        url - str: url of the website that contains game information and box score 
    Returns: 
        game_results - dict<>: dictionary containing all information and stats of the game;
        every value is 'na' when the page has no 'sdi-rail-content' div, and the score fields
        are 'na' when the line score does not have 8, 10, 12 or 14 cells 
 
    """
    html_document = getHTMLdocument(url)
    soup = BeautifulSoup(html_document, 'html.parser') # soup object
    results = soup.find('div', id='sdi-rail-content') # all of the necessary info is in this div

    if results is None:
        return dict.fromkeys(_GAME_FIELDS, 'na')

    when_where_div = results.find("div", class_="sdi-quickhits").text.strip() # this div contains all the information for time and place of game
    when_where_list = re.split('When: |Where: |Officials: |Attendance: ', when_where_div)
    when_where_list = [i for i in when_where_list if i != ""]

    game_time = re.split('\xa0|, ', when_where_list[0])[0] # Extracts the time of the game ex. '7:00 PM'

    # Most games are structured ['arena', 'city', 'state']; out of country games
    # are structured ['arena', 'city'], and anything missing is recorded as None
    where_list = re.split(', ', when_where_list[1])
    arena = where_list[0]
    city = where_list[1] if len(where_list) > 1 else None
    state = where_list[2] if len(where_list) == 3 else None

    # some divs dont have attendance (they will not be len 4), and some have a
    # blank or non numeric one; all of those are recorded as None
    attendance = None
    if len(when_where_list) == 4:
        attendance_text = when_where_list[3].strip()
        if attendance_text.isnumeric(): # must be called: the bare method object is always truthy
            attendance = attendance_text

    # divs that contains the scores and boxscores of the game for both teams
    results_div = results.find_all("div", class_="sdi-so")
    score_div = results_div[0].find_all(class_='sdi-datacell')

    if len(results_div) == 3:
        away = getBox(results_div[1].find_all("tr"))
        home = getBox(results_div[2].find_all("tr"))
    else:
        away = 'na'
        home = 'na'

    game_results = {
        'game_time': game_time,
        'arena': arena,
        'city': city,
        'state': state,
        'attendance': attendance,
    }
    game_results.update(_parseLineScore(score_div))
    game_results['away'] = away
    game_results['home'] = home

    return game_results
    
    

In [19]:
def createBoxScores(matchups, start, end):
    """ 
    Scrapes the game page of every matchup in an index range that has a box score url
    Prints a progress line before each game is requested
 
    Args: 
        matchups - list<dict>: matchups with the keys box_score_url, away_team, home_team and date 
        start - int: index of the first matchup to scrape 
        end - int: index after the last matchup to scrape (exclusive) 
    Returns: 
        box_scores_list - list<dict>: the getGame result of every matchup in the range whose box_score_url
        is not 'na'; skipped matchups leave no entry, so positions do not line up with matchup indexes 
 
    """
    box_scores_list = []
    total = end - start
    
    for i in range(start, end):
        matchup = matchups[i]
        if matchup['box_score_url'] == 'na':
            continue

        # an f-string formats any value, so dates that are datetime.date objects
        # (not only strings read back from the csv) can be printed
        print(f"{i - start}/{total} - {matchup['away_team']} vs {matchup['home_team']}: {matchup['date']}")
        box_scores_list.append(getGame("https://newsday.sportsdirectinc.com" + matchup['box_score_url']))
    return box_scores_list

### Create dataframe of matchups

In [20]:
# date_list = createDates(date(2010, 11, 8), date(2023, 4, 3))
# date_url_list = createDatesURL(date_list)

In [21]:
# box_scores_list = createBoxScores(matchups_dict, 17398, 23389)
# df = pd.DataFrame(box_scores_list)
# df.to_csv('2013-2014.csv')

### Cleanup and organize dataframes and csv files 

In [51]:
# Load the 2010-2011 box score csv and print the frame
df = pd.read_csv("data/box_scores/2010-2011.csv")
print(df)

        id game_time                   arena          city         state  \
0        0   7:00 PM  Petersen Events Center    Pittsburgh  Pennsylvania   
1        1   8:00 PM       Assembly Hall-ILL     Champaign      Illinois   
2        2   8:00 PM          Comcast Center  College Park      Maryland   
3        3   9:00 PM      Frank Erwin Center        Austin         Texas   
4        4   7:00 PM  Petersen Events Center    Pittsburgh  Pennsylvania   
...    ...       ...                     ...           ...           ...   
5765  5765   7:00 PM   Madison Square Garden      New York      New York   
5766  5766  10:00 PM    Matthew Knight Arena        Eugene        Oregon   
5767  5767   6:09 PM         Reliant Stadium       Houston         Texas   
5768  5768   8:49 PM         Reliant Stadium       Houston         Texas   
5769  5769   9:20 PM         Reliant Stadium       Houston         Texas   

      attendance  away_1  away_2  away_ot  away_ot2  away_ot3  away_tot  \
0         92

In [53]:
df['matchup_id'] = df['id']

# Take the away box score of the last game in the frame. The csv stores it as
# the text of a dict, so it is parsed as a literal; ast.literal_eval only accepts
# literals and cannot run code that might be embedded in the file.
away_box = df['away'].iloc[-1]
temp = ast.literal_eval(away_box)

print(temp)

# placeholder: one (still empty) record per player of that box score
away = []

for player in temp:
    away.append({})

    print(player)


{'Matt Howard': {'player_url_id': '736508', 'min_played': 37, 'fg': 1, 'fg_a': 13, 'two_point': 0, 'two_point_a': 7, 'three_point': 1, 'three_point_a': 6, 'ft_a': 4, 'ft': 4, 'orb': 2, 'drb': 4, 'tot_reb': 6, 'a': 0, 'pf': 2, 'stl': 1, 'to': 0, 'blk': 0, 'pts': 7}, 'Shawn Vanzant': {'player_url_id': '736509', 'min_played': 36, 'fg': 2, 'fg_a': 10, 'two_point': 1, 'two_point_a': 5, 'three_point': 1, 'three_point_a': 5, 'ft_a': 0, 'ft': 0, 'orb': 2, 'drb': 6, 'tot_reb': 8, 'a': 2, 'pf': 1, 'stl': 3, 'to': 2, 'blk': 0, 'pts': 5}, 'Shelvin Mack': {'player_url_id': '740869', 'min_played': 36, 'fg': 4, 'fg_a': 15, 'two_point': 0, 'two_point_a': 4, 'three_point': 4, 'three_point_a': 11, 'ft_a': 2, 'ft': 1, 'orb': 4, 'drb': 5, 'tot_reb': 9, 'a': 1, 'pf': 1, 'stl': 0, 'to': 3, 'blk': 1, 'pts': 13}, 'Chase Stigall': {'player_url_id': '743557', 'min_played': 16, 'fg': 3, 'fg_a': 11, 'two_point': 0, 'two_point_a': 2, 'three_point': 3, 'three_point_a': 9, 'ft_a': 0, 'ft': 0, 'orb': 1, 'drb': 1, 'to

In [41]:
matchups_df

Unnamed: 0,away_team,away_team_ranking,home_team,home_team_ranking,neutral,box_score_url,date
0,Rhode Island,0,Pittsburgh,4,False,/basketball/ncaab-boxscores.aspx?page=/data/NC...,11/8/2010
1,UC Irvine,0,Illinois,16,False,/basketball/ncaab-boxscores.aspx?page=/data/NC...,11/8/2010
2,Seattle,0,Maryland,0,False,/basketball/ncaab-boxscores.aspx?page=/data/NC...,11/8/2010
3,Navy,0,Texas,25,False,/basketball/ncaab-boxscores.aspx?page=/data/NC...,11/8/2010
4,Illinois-Chicago,0,Pittsburgh,4,False,/basketball/ncaab-boxscores.aspx?page=/data/NC...,11/10/2010
...,...,...,...,...,...,...,...
76622,Utah Valley,0,UAB,0,True,/basketball/ncaab-boxscores.aspx?page=/data/NC...,3/28/2023
76623,UAB,0,North Texas,0,True,/basketball/ncaab-boxscores.aspx?page=/data/NC...,3/30/2023
76624,Florida Atlantic,9,San Diego St.,5,True,/basketball/ncaab-boxscores.aspx?page=/data/NC...,4/1/2023
76625,Miami-Florida,5,Connecticut,4,True,/basketball/ncaab-boxscores.aspx?page=/data/NC...,4/1/2023


### Create dataframe of teams

In [55]:
# One row per distinct home team name.
# NOTE(review): teams that only ever appear as the away side are not included - confirm that is intended
teams = pd.DataFrame(matchups_df['home_team'].unique(), columns=['team'])

In [56]:
teams

Unnamed: 0,team
0,Pittsburgh
1,Illinois
2,Maryland
3,Texas
4,North Texas
...,...
466,Lindenwood
467,Utah Tech
468,Southern Indiana
469,Houston Christian


In [57]:
# cleanup dataframe

# Ordered (old, new) substitutions that turn a lower-cased team name into its lookup name.
# The order matters: later entries rewrite the output of earlier ones
# (ex. "-st" -> "-state" is followed by "-stateate" -> "-state").
LOOKUP_REPLACEMENTS = [
    (" ", "-"),
    ("(", ""),
    (")", ""),
    (".", ""),
    ("'", ""),
    ("&", ""),
    ("-st", "-state"),
    ("-stateate", "-state"),
    ("maryland---e-shore", "maryland-eastern-shore"),
    ("tenn-martin", "tennessee-martin"),
    ("nc-state", "north-carolina-state"),
    ("vcu", "virginia-commonwealth"),
    ("american-u", "american"),
    ("middle-tennessee-state", "middle-tennessee"),
    ("liu-brooklyn", "long-island-university"),
    ("miami-florida", "miami-fl"),
    ("uc-santa-barbara", "california-santa-barbara"),
    ("st-josephs", "saint-josephs"),
    ("loyola-maryland", "loyola-md"),
    ("vmi", "virginia-military-institute"),
    ("siu---edwardsville", "southern-illinois-edwardsville"),
    ("albany", "albany-ny"),
    ("loyola-chicago", "loyola-il"),
    ("unlv", "nevada-las-vegas"),
    ("st-marys", "saint-marys-ca"),
    ("texas-am-cc", "texas-am-corpus-christi"),
    ("new-jersey-tech", "njit"),
    ("wis-milwaukee", "milwaukee"),
    ("uab", "alabama-birmingham"),
    ("md-baltimore-cty", "maryland-baltimore-county"),
    ("nc-greensboro", "north-carolina-greensboro"),
    ("st-francis-pa", "saint-francis-pa"),
    ("uc-riverside", "california-riverside"),
    ("elon-university", "elon"),
    ("bowling-green", "bowling-green-state"),
    ("monmouth-nj", "monmouth"),
    ("nc-wilmington", "north-carolina-wilmington"),
    ("charleston", "college-of-charleston"),
    ("indiana---purdue", "iupui"),
    ("prairie-view-am", "prairie-view"),
    ("nocarolina-at", "north-carolina-at"),
    ("st-johns", "st-johns-ny"),
    ("cal-state---bakersfield", "cal-state-bakersfield"),
    ("uc-davis", "california-davis"),
    ("cal-poly-slo", "cal-poly"),
    ("uc-irvine", "california-irvine"),
    ("boston-u", "boston-university"),
    ("nc-asheville", "north-carolina-asheville"),
    ("csu-northridge", "cal-state-northridge"),
    ("mount-state-marys", "mount-st-marys"),
    ("grambling-state", "grambling"),
    ("se-missouri-state", "southeast-missouri-state"),
    ("william--mary", "william-mary"),
    ("iupu---ft-wayne", "ipfw"),
    ("central-conn-state", "central-connecticut-state"),
    ("st-peters", "saint-peters"),
    ("college-of-charleston-southern", "charleston-southern"),
    ("umkc", "missouri-kansas-city"),
    ("southern-miss", "southern-mississippi"),
    ("the-citadel", "citadel"),
    ("texas-rio-grande-valley", "texas-pan-american"),
    ("little-rock", "arkansas-little-rock"),
    ("lu-lafayette", "louisiana-lafayette"),
    ("st-francis-brooklyn", "st-francis-ny"),
    ("ul-monroe", "louisiana-monroe"),
    ("unc-wilmington", "north-carolina-wilmington"),
    ("unorth-carolina-asheville", "north-carolina-asheville"),
    ("unorth-carolina-greensboro", "north-carolina-greensboro"),
    ("unorth-carolina-wilmington", "north-carolina-wilmington"),
    ("csu-fullerton", "cal-state-fullerton"),
    ("houston-christian", "houston-baptist"),
    ("queens-university-of-charlotte", "queens-nc"),
]

lookup = teams['team'].str.lower()
for old, new in LOOKUP_REPLACEMENTS:
    # Every pattern is meant literally. With regex=True recent pandas treats "."
    # as "any character" (which would blank out every name) and rejects "(" / ")"
    # as invalid patterns, so the replacements are done as plain text.
    lookup = lookup.str.replace(old, new, regex=False)
teams['lookup'] = lookup

In [58]:
# find the index label of the row of every team that needs to be deleted
# (the first matching row is used; a name that matches no row raises IndexError)
bad_team_index = [
    teams.index[teams['lookup'] == bad_team].tolist()[0]
    for bad_team in bad_teams
]

print(bad_team_index)

[34, 141, 266, 290, 320, 321, 335, 344, 348, 349, 350, 351, 352, 354, 355, 356, 359, 361, 362, 363, 364, 365, 165, 367, 369, 371, 372, 373, 374, 375, 376, 377, 378, 380, 382, 383, 384, 385, 386, 390, 391, 393, 394, 395, 396, 398, 401, 402, 403, 404, 407, 408, 417, 418, 419, 420, 421, 422, 426, 427, 428, 429, 431, 432, 433, 434, 437, 444, 446, 447, 451, 455, 461, 464, 467, 370, 265, 264]


In [59]:
# delete rows of teams that shouldn't be there
# One drop() call removes every flagged label at once instead of rebuilding the
# frame once per label; reset_index() keeps the old labels in an 'index' column.
teams = teams.drop(index=bad_team_index).reset_index()

In [60]:
teams

Unnamed: 0,index,team,lookup
0,0,Pittsburgh,pittsburgh
1,1,Illinois,illinois
2,2,Maryland,maryland
3,3,Texas,texas
4,4,North Texas,north-texas
...,...,...,...
388,465,Stonehill,stonehill
389,466,Lindenwood,lindenwood
390,468,Southern Indiana,southern-indiana
391,469,Houston Christian,houston-baptist


In [62]:
# function to create list of urls for the team pages
def createTeamURL():
    """ 
    Builds the url of the 2023 men's season page for every lookup name in the global teams dataframe
    (only the 2023 season is generated)
 
    Returns: 
        url_team_list - list<str>: one url per entry of teams['lookup'], in the same order 
 
    """
    url_prefix = "https://www.sports-reference.com/cbb/schools/"
    url_suffix = "/men/2023.html"

    return [url_prefix + team + url_suffix for team in teams['lookup']]

In [61]:
# plain list copy of the lookup names, in row order
teams_list = list(teams['lookup'])

In [64]:
# one team page url per row of teams
url_team_list = createTeamURL()

In [None]:
# for url in url_team_list:
#     testURL(url)
#     time.sleep(3.1)

In [66]:
# drop duplicate urls (presumably different team names that map to the same lookup name - verify)
url_team_set = set(url_team_list)

In [67]:
# number of distinct team page urls
print(len(url_team_set))

364


In [70]:
def getDivisions(url_list, delay=3):
    """ 
    Scrapes every team page for the division (conference) the team is listed in
    Prints each (division, url) pair as it is found
 
    Args: 
        url_list - iterable<str>: urls of the team pages 
        delay - int or float: seconds to pause after each request (default 3) 
    Returns: 
        divisions - list<tuple>: (division, url) pairs in iteration order; division is 'Independent'
        when the page text has no " in " part and 'na' when the page has no "meta" element 
 
    """
    divisions = []

    for url in url_list:
        # create document
        html_document = getHTMLdocument(url)

        # create soup object
        soup = BeautifulSoup(html_document, 'html.parser')

        results = soup.find(id="meta")

        # Reset for every url so that a page without the "meta" element cannot
        # reuse the division of the previous page (or fail on the first one)
        division = 'na'

        if results is not None:
            division_text = results.find_all("div")[1].find("p").text
            if " in " in division_text:
                # the division is the text between " in " and "MBB", minus the
                # single character that separates it from "MBB"
                start = division_text.index(" in ") + 4
                end = start + division_text[start:].index("MBB") - 1
                division = division_text[start:end]
            else:
                division = "Independent"

        time.sleep(delay)
        divisions.append((division, url))
        print((division, url))
    return divisions

In [72]:
# scrape the division of every distinct team page (url_team_set is a set, so the request order is arbitrary)
divs = getDivisions(url_team_set)

('SWAC', 'https://www.sports-reference.com/cbb/schools/arkansas-pine-bluff/men/2023.html')


KeyboardInterrupt: 

In [77]:
# sort the (division, url) pairs in place: by division first, then by url
divs.sort()

In [79]:
# keep only the first item (the division) of every pair in divs
divs2 = [pair[0] for pair in divs]

In [83]:
# distinct division names
divs2_set = set(divs2)
divs2_set

{'A-10',
 'A-Sun',
 'AAC',
 'ACC',
 'AEC',
 'Big 12',
 'Big East',
 'Big Sky',
 'Big South',
 'Big Ten',
 'Big West',
 'CAA',
 'CUSA',
 'Horizon',
 'Independent',
 'Ivy',
 'MAAC',
 'MAC',
 'MEAC',
 'MVC',
 'MWC',
 'NEC',
 'OVC',
 'Pac-12',
 'Patriot',
 'SEC',
 'SWAC',
 'Southern',
 'Southland',
 'Summit',
 'Sun Belt',
 'WAC',
 'WCC'}

In [85]:
# print how many entries of divs2 belong to each distinct division
for division in divs2_set:
    occurrences = divs2.count(division)
    print(division, occurrences)

AAC 11
Big 12 10
MAC 12
NEC 9
OVC 10
Big West 10
WCC 11
Independent 2
Big Ten 14
Southern 10
A-10 15
CUSA 11
Pac-12 12
MVC 12
MWC 11
SWAC 12
CAA 13
Horizon 11
AEC 9
MAAC 11
Big East 11
Big South 11
Sun Belt 14
A-Sun 14
Ivy 8
Patriot 10
Southland 10
ACC 15
Big Sky 10
SEC 14
MEAC 8
Summit 10
WAC 13


In [None]:
# divs_set = set(divs)

In [None]:
# for div in divs_set:
#     print(divs.count(div), div)

https://www.sports-reference.com/cbb/schools/purdue/men/2023.html

Western Illinois summit->ovc
byu wcc->big 12
houston wcc->big 12
cincinnati wcc->big 12
ucf wcc->big 12