# Import Libraries and Packages

In [1]:
# import necessary libraries
from datetime import date, timedelta
import re
from bs4 import BeautifulSoup
import requests
import urllib
import re
import numpy as np
import pandas as pd
import threading
from threading import Timer
from time import sleep
import time

# Read CSV and TXT Files

In [3]:
# Load bad_teams.txt as a list with one entry per line. The context manager closes the
# file handle as soon as the contents are read instead of leaving it open.
with open(r"C:\Users\James Seelos\Documents\Personal Projects\CBB Predictor\data\bad_teams.txt", "r") as bad_teams_file:
    bad_teams = bad_teams_file.read().strip().split("\n")

In [17]:
# Load the previously scraped matchups so the scraping cells do not have to be re-run.
# NOTE(review): the df.head(1) output in this notebook shows "Unnamed: 0" and "Unnamed: 0.1"
# columns, which suggests the CSV was saved with its index more than once — confirm, and
# consider dropping those columns or re-saving the file without the index.
df = pd.read_csv(r"C:\Users\James Seelos\Documents\Personal Projects\CBB Predictor\data\cbb_matchups.csv")

### Functions for URLs

In [4]:
def getHTMLdocument(url):
    """
    Fetches a page and hands back its body as text

    Args:
        url - string: HTTP URL to send the GET request to
    Returns:
        The body of the GET response as a string (HTML for the pages scraped here)

    """
    return requests.get(url).text

In [5]:
# function to test if a url returns a 404
def testURL(url):
    """
    Requests a url and reports the HTTP status code that came back

    Args:
        url - string: HTTP URL to send the GET request to
    Returns:
        A two-line string: the url on the first line and the response status code on the second
    """
    status = requests.get(url).status_code

    return f'URL: {url}\nResponse: {status}\n'

# Create Matchups CSV

### Functions

In [6]:
def createDates(start, end):
    """
    Lists every in-season day between two dates (both ends included)

    Args:
        start - date: first day of the range
        end - date: last day of the range
    Returns:
        date_list - list<datetime.date>: the days from start -> end that fall in
                    November through April; an empty list when end is before start

    """
    span_days = (end - start).days
    every_day = (start + timedelta(days=offset) for offset in range(span_days + 1))

    # Months 5 - 10 (May - October) are dropped
    return [day for day in every_day if not 5 <= day.month <= 10]

In [25]:
# function to create urls to use to get matchups
def createDatesURL(date_list):
    """
    Builds the scoreboard url for each date that is given

    Args:
        date_list - list<datetime.date>: list of dates
    Returns:
        date_url_list - list<dict>: one {"url": <scoreboard url>, "date": <the date>} entry per
                        input date, in the same order

    """
    def urlEntry(game_day):
        # Zero-pad month and day to two characters for the Period parameter
        year = str(game_day.year)
        month = str(game_day.month).zfill(2)
        day = str(game_day.day).zfill(2)

        # Two-digit months (October - December) open a season; every other month closes the
        # season that started the year before
        if game_day.month >= 10:
            season = f'{game_day.year}-{game_day.year + 1}'
        else:
            season = f'{game_day.year - 1}-{game_day.year}'

        url = f'https://newsday.sportsdirectinc.com/sports-scores/College-Basketball-Scores-Matchups.aspx?Year={year}&Period={month}{day}&CurrentSeason={season}'
        return {"url": url, "date": game_day}

    return [urlEntry(game_day) for game_day in date_list]

In [23]:
# function to scrape webpage to get all matchups
def _parseTeam(team_text):
    """
    Splits a team label into its name and ranking

    Args:
        team_text - string: team label as it appears in a matchup title ex. 'Purdue(1)' or 'Michigan'
    Returns:
        (name, ranking) - tuple<str, int>: ranking is 0 when the label carries no ranking

    """
    # Split on a '(' that is followed by a digit or a ')' that is preceded by a digit
    # ex. Purdue(1) -> ['Purdue', '1', ''] -> ['Purdue', '1']
    parts = [part for part in re.split(r'\((?=[\d])|(?<=[\d])\)', team_text) if part != ""]
    ranking = int(parts[1]) if len(parts) == 2 else 0 # If team has no ranking, assign 0

    return parts[0], ranking


def getMatchups(date_url_list):
    """
    Scrapes the scoreboard page of every given date and gathers the games listed on it
    Will gather the teams, rankings, url for the box score, and date

    Args:
        date_url_list - list<dict>: entries with a 'url' (scoreboard page) and a 'date' key
    Returns:
        matchups_list - list<dict>: one dict per game with the keys 'away_team', 'away_team_ranking',
                        'home_team', 'home_team_ranking', 'neutral', 'box_score_url' and 'date'
                        ex. {'away_team': 'Purdue', 'away_team_ranking': 1, 'home_team': 'Michigan',
                             'home_team_ranking': 0, 'neutral': False, 'box_score_url': ..., 'date': ...}
                        'box_score_url' is 'na' when no link is found for the game
    Raises:
        IndexError: when a matchup title does not contain ' vs ' or ' at '

    """
    matchups_list = []

    for date_url in date_url_list:

        html_document = getHTMLdocument(date_url['url'])
        soup = BeautifulSoup(html_document, 'html.parser') # soup object
        results = soup.find(id='Scoreboard_7_All_Games') # all of the necessary info is in this div

        if results is None: # dates that do not have games played on them will be None
            continue

        matchup_elements = results.find_all('div', class_='sdi-so-title') # each div in matchup elements is a matchup
        # divs where the box score urls are; looked up once per page instead of once per matchup
        link_containers = results.find_all('div', class_='onoff')

        for matchup_index, matchup_element in enumerate(matchup_elements):

            matchup_text = matchup_element.text.strip()
            # ex. Purdue(1) at Michigan -> ['Purdue(1)', ' at ', 'Michigan']
            # (the capture group that did not match comes back as None and is dropped)
            teams = [part for part in re.split(r'( vs )|( at )', matchup_text) if part is not None]
            neutral = teams[1] == ' vs ' # ' vs ' indicates a neutral game

            # first entry is the away team, last entry is the home team
            away_team, away_team_ranking = _parseTeam(teams[0])
            home_team, home_team_ranking = _parseTeam(teams[-1])

            # the link container is matched to the title by position; fall back to 'na' when the
            # container or the link inside it is missing
            if matchup_index < len(link_containers):
                links = link_containers[matchup_index].find_all('a', href=True)
            else:
                links = []
            box_score_url = links[0]['href'] if links else 'na'

            matchups_list.append({'away_team': away_team,
                                  'away_team_ranking': away_team_ranking,
                                  'home_team': home_team,
                                  'home_team_ranking': home_team_ranking,
                                  'neutral': neutral,
                                  'box_score_url': box_score_url,
                                  'date': date_url['date']})

    return matchups_list

In [19]:
# Peek at the first row of the loaded matchups frame
df.head(1)

Unnamed: 0.1,Unnamed: 0,away_team,away_team_ranking,home_team,home_team_ranking,neutral,box_score_url,date
0,0,Rhode Island,0,Pittsburgh,4,False,/basketball/ncaab-boxscores.aspx?page=/data/NC...,2010-11-08


In [None]:
def createBoxScores(matchups):
    """
    Fetches the box score for every matchup that has a box score url

    Args:
        matchups - list<dict>: matchup records shaped like the ones getMatchups builds; the keys
                   read here are 'box_score_url', 'home_team', 'away_team' and 'date'
    Returns:
        box_scores_list - list: whatever getGame returns for each matchup whose 'box_score_url'
                          is not 'na', in the same order as matchups

    """
    base_url = "https://newsday.sportsdirectinc.com"

    # 'box_score_url' is a site-relative path, so it is appended to the site root
    return [getGame(base_url + matchup['box_score_url'],
                    matchup['home_team'],
                    matchup['away_team'],
                    matchup['date'])
            for matchup in matchups
            if matchup['box_score_url'] != 'na']

In [None]:
# Collect the getGame result for every matchup that has a box score url.
# NOTE(review): `matchups` is not defined in any cell shown in this notebook, and each row is
# read by position here (4 = box score url; 1, 0 and 3 are passed as getGame's home_team,
# away_team and date) — presumably rows of a list/tuple; confirm the layout against wherever
# `matchups` is built.
box_scores = []

for matchup in matchups:
    if matchup[4] != 'na': # 'na' marks a matchup without a box score link
#         time.sleep(1)
        box_scores.append(getGame("https://newsday.sportsdirectinc.com" + matchup[4], matchup[1], matchup[0], matchup[3]))

In [39]:
# Smoke test of getGame against a single box score page; the three 'hello' strings are
# placeholders for the home_team, away_team and date arguments.
getGame('https://newsday.sportsdirectinc.com/basketball/ncaab-boxscores.aspx?page=/data/NCAAB/results/2010-2011/boxscore796041.html', 'hello', 'hello', 'hello')

When: 7:00 PM ET, Monday, November 8, 2010Where: Petersen Events Center, Pittsburgh, PennsylvaniaOfficials: 
  # Bryan Kersey, # Mike Eades, # Ray NatiliAttendance: 
    9256
['7:00 PM\xa0ET,\xa0Monday, November 8, 2010', 'Petersen Events Center, Pittsburgh, Pennsylvania', '\n  # Bryan Kersey, # Mike Eades, # Ray Natili', '\n    9256']
7:00 PM


In [38]:
def getGame(url, home_team, away_team, date):
    """
    Fetches one box score page and parses the time and place of the game

    Work in progress: only the When / Where fields are parsed so far. The parsed field list is
    printed and nothing is returned; the rest of the parsing (arena, attendance, scores, player
    stats) and the return statement are still commented out after this function body.

    Args:
        url - string: full url of the box score page
        home_team - string: home team name (not used by the active code yet)
        away_team - string: away team name (not used by the active code yet)
        date - date of the game (not used by the active code yet)
    Returns:
        None
    Raises:
        ValueError: when the page has no 'sdi-rail-content' div, no 'sdi-quickhits' div inside it,
                    or fewer than two fields in that div

    """
    html_document = getHTMLdocument(url)
    soup = BeautifulSoup(html_document, 'html.parser') # soup object
    results = soup.find('div', id='sdi-rail-content') # all of the necessary info is in this div
    if results is None:
        # name the page instead of failing with an AttributeError on None
        raise ValueError(f"No 'sdi-rail-content' div found at {url}")

    quickhits_div = results.find("div", class_="sdi-quickhits") # this div contains all the information for time and place of game
    if quickhits_div is None:
        raise ValueError(f"No 'sdi-quickhits' div found at {url}")

    when_where_div = quickhits_div.text.strip()
    # Splitting on the labels leaves only the values ex. [when, where, officials, attendance];
    # the empty string in front of the first label is dropped
    when_where_list = re.split('When: |Where: |Officials: |Attendance: ', when_where_div)
    when_where_list = [i for i in when_where_list if i != ""]
    print(when_where_list) # debug output while the parsing is being rebuilt

    if len(when_where_list) < 2:
        raise ValueError(f"Expected at least the When and Where fields at {url}, got {when_where_list}")

    # game_time and where_list are not used yet; the commented-out continuation of this
    # function reads them
    game_time = re.split('\xa0|, ', when_where_list[0])[0] # ex. '7:00 PM'
    where_list = re.split(', ', when_where_list[1]) # ex. ['Petersen Events Center', 'Pittsburgh', 'Pennsylvania']
#     if len(where_list) == 3:
#         arena = where_list[0]
#         city = where_list[1]
#         state = where_list[2]
#     else:
#         arena = where_list[0]
#         city = where_list[1]
#         state = 'na'
#     if len(when_where_list) < 6:
#         if when_where_list[4].strip().isnumeric:
#             attendance = when_where_list[4].strip()
#         else:
#             attendance = 0
#     else:
#         attendance = 0
    
    
#     div_results = results.find_all("div", class_="sdi-so")
    
    
#     score_div = div_results[0].find_all(class_='sdi-datacell')

#     if len(score_div) == 8:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = 0
#         away_ot2 = 0
#         away_ot3 = 0
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[5].text)
#         home_2 = int(score_div[6].text)
#         home_ot = 0
#         home_ot2 = 0
#         home_ot3 = 0
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3
        
#     elif len(score_div) == 10:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = int(score_div[3].text)
#         away_ot2 = 0
#         away_ot3 = 0
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[6].text)
#         home_2 = int(score_div[7].text)
#         home_ot = int(score_div[8].text)
#         home_ot2 = 0
#         home_ot3 = 0
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3
        
#     elif len(score_div) == 12:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = int(score_div[3].text)
#         away_ot2 = int(score_div[4].text)
#         away_ot3 = 0
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[7].text)
#         home_2 = int(score_div[8].text)
#         home_ot = int(score_div[9].text)
#         home_ot2 = int(score_div[10].text)
#         home_ot3 = 0
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3
        
#     elif len(score_div) == 14:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = int(score_div[3].text)
#         away_ot2 = int(score_div[4].text)
#         away_ot3 = int(score_div[5].text)
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[8].text)
#         home_2 = int(score_div[9].text)
#         home_ot = int(score_div[10].text)
#         home_ot2 = int(score_div[11].text)
#         home_ot3 = int(score_div[12].text)
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3

    
#     away = {}
#     home = {}
    
    
#     away_box = div_results[1].find_all("tr")
     
#     for row in range(2, len(away_box) - 2):        
#         stat_results = away_box[row].find_all("td")
#         stats_list = []
#         for stat in range(len(stat_results)):
#             if stat == 1:
#                 min_played = int(stat_results[stat].text.strip().split(':')[0])
#                 stats_list.append(min_played)
                
#             elif stat == 2:
#                 fg_data = stat_results[stat].text.strip().split('-')
#                 fg = int(fg_data[0])
#                 fga = int(fg_data[1])
#                 stats_list.append(fg)
#                 stats_list.append(fga)
                
#             elif stat == 3:
#                 three_data = stat_results[stat].text.strip().split('-')
#                 three_point = int(three_data[0])
#                 three_point_a = int(three_data[1])
#                 two_point = int(stats_list[1]) - three_point
#                 two_point_a = int(stats_list[2]) - three_point_a
#                 stats_list.append(two_point)
#                 stats_list.append(two_point_a)
#                 stats_list.append(three_point)
#                 stats_list.append(three_point_a)
                
#             elif stat == 4:
#                 ft_data = stat_results[stat].text.strip().split('-')
#                 ft = int(ft_data[0])
#                 fta = int(ft_data[1])
#                 stats_list.append(ft)
#                 stats_list.append(fta)
                
#             elif stat == 5:
#                 orb = int(stat_results[stat].text.strip())
#                 stats_list.append(orb)
                
#             elif stat == 6:
#                 drb = int(stat_results[stat].text.strip())
#                 stats_list.append(drb)    
                
#             elif stat == 7:
#                 tot_reb = int(stat_results[stat].text.strip())
#                 stats_list.append(tot_reb)
                
#             elif stat == 8:
#                 a = int(stat_results[stat].text.strip())
#                 stats_list.append(a)
                
#             elif stat == 9:
#                 pf = int(stat_results[stat].text.strip())
#                 stats_list.append(pf)
                
#             elif stat == 10:
#                 stl = int(stat_results[stat].text.strip())
#                 stats_list.append(stl)
                
#             elif stat == 11:
#                 to = int(stat_results[stat].text.strip())
#                 stats_list.append(to)
                
#             elif stat == 12:
#                 blk = int(stat_results[stat].text.strip())
#                 stats_list.append(blk)
                
#             elif stat == 13:
#                 pts = int(stat_results[stat].text.strip())
#                 stats_list.append(pts)
            
#             away[stat_results[0].text.strip()] = stats_list
            
            
#     home_box = div_results[2].find_all("tr")
     
#     for row in range(2, len(home_box) - 2):        
#         stat_results = home_box[row].find_all("td")
#         stats_list = []
#         for stat in range(len(stat_results)):
#             if stat == 1:
#                 min_played = int(stat_results[stat].text.strip().split(':')[0])
#                 stats_list.append(min_played)
                
#             elif stat == 2:
#                 fg_data = stat_results[stat].text.strip().split('-')
#                 fg = int(fg_data[0])
#                 fga = int(fg_data[1])
#                 stats_list.append(fg)
#                 stats_list.append(fga)
                
#             elif stat == 3:
#                 three_data = stat_results[stat].text.strip().split('-')
#                 three_point = int(three_data[0])
#                 three_point_a = int(three_data[1])
#                 two_point = int(stats_list[1]) - three_point
#                 two_point_a = int(stats_list[2]) - three_point_a
#                 stats_list.append(two_point)
#                 stats_list.append(two_point_a)
#                 stats_list.append(three_point)
#                 stats_list.append(three_point_a)
                
#             elif stat == 4:
#                 ft_data = stat_results[stat].text.strip().split('-')
#                 ft = int(ft_data[0])
#                 fta = int(ft_data[1])
#                 stats_list.append(ft)
#                 stats_list.append(fta)
                
#             elif stat == 5:
#                 orb = int(stat_results[stat].text.strip())
#                 stats_list.append(orb)
                
#             elif stat == 6:
#                 drb = int(stat_results[stat].text.strip())
#                 stats_list.append(drb)    
                
#             elif stat == 7:
#                 tot_reb = int(stat_results[stat].text.strip())
#                 stats_list.append(tot_reb)
                
#             elif stat == 8:
#                 a = int(stat_results[stat].text.strip())
#                 stats_list.append(a)
                
#             elif stat == 9:
#                 pf = int(stat_results[stat].text.strip())
#                 stats_list.append(pf)
                
#             elif stat == 10:
#                 stl = int(stat_results[stat].text.strip())
#                 stats_list.append(stl)
                
#             elif stat == 11:
#                 to = int(stat_results[stat].text.strip())
#                 stats_list.append(to)
                
#             elif stat == 12:
#                 blk = int(stat_results[stat].text.strip())
#                 stats_list.append(blk)
                
#             elif stat == 13:
#                 pts = int(stat_results[stat].text.strip())
#                 stats_list.append(pts)
            
#             home[stat_results[0].text.strip()] = stats_list
            
# #     print(away_team, home_team, date)
#     return (game_time, date, arena, city, state, attendance, away_team, home_team, away_1, away_2, away_ot, away_ot2, away_ot3, away_tot, home_1, home_2, home_ot, home_ot2, home_ot3, home_tot, away, home)
    
    

In [None]:
# def getGame(url, home_team, away_team, date):

#     # create document
#     html_document = getHTMLdocument(url)

#     # create soap object
#     soup = BeautifulSoup(html_document, 'html.parser')

#     results = soup.find("div", id="sdi-rail-content")
    
#     when_where_div = results.find("div", class_="sdi-quickhits").text
    
#     when_where_list = re.split('When: |Where: |Officials: |Attendance: ', when_where_div)
#     game_time = re.split('\xa0|, ', when_where_list[1])[0]
#     where_list = re.split(', ', when_where_list[2])
#     if len(where_list) == 3:
#         arena = where_list[0]
#         city = where_list[1]
#         state = where_list[2]
#     else:
#         arena = where_list[0]
#         city = where_list[1]
#         state = 'na'
#     if len(when_where_list) < 6:
#         if when_where_list[4].strip().isnumeric:
#             attendance = when_where_list[4].strip()
#         else:
#             attendance = 'na'
#     else:
#         attendance = 'na'
    
    
#     div_results = results.find_all("div", class_="sdi-so")
    
    
#     score_div = div_results[0].find_all(class_='sdi-datacell')

#     if len(score_div) == 8:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = 0
#         away_ot2 = 0
#         away_ot3 = 0
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[5].text)
#         home_2 = int(score_div[6].text)
#         home_ot = 0
#         home_ot2 = 0
#         home_ot3 = 0
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3
        
#     elif len(score_div) == 10:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = int(score_div[3].text)
#         away_ot2 = 0
#         away_ot3 = 0
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[6].text)
#         home_2 = int(score_div[7].text)
#         home_ot = int(score_div[8].text)
#         home_ot2 = 0
#         home_ot3 = 0
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3
        
#     elif len(score_div) == 12:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = int(score_div[3].text)
#         away_ot2 = int(score_div[4].text)
#         away_ot3 = 0
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[7].text)
#         home_2 = int(score_div[8].text)
#         home_ot = int(score_div[9].text)
#         home_ot2 = int(score_div[10].text)
#         home_ot3 = 0
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3
        
#     elif len(score_div) == 14:
#         away_1 = int(score_div[1].text)
#         away_2 = int(score_div[2].text)
#         away_ot = int(score_div[3].text)
#         away_ot2 = int(score_div[4].text)
#         away_ot3 = int(score_div[5].text)
#         away_tot = away_1 + away_2 + away_ot + away_ot2 + away_ot3

#         home_1 = int(score_div[8].text)
#         home_2 = int(score_div[9].text)
#         home_ot = int(score_div[10].text)
#         home_ot2 = int(score_div[11].text)
#         home_ot3 = int(score_div[12].text)
#         home_tot = home_1 + home_2 + home_ot + home_ot2 + home_ot3

    
#     away = {}
#     home = {}
    
    
#     away_box = div_results[1].find_all("tr")
     
#     for row in range(2, len(away_box) - 2):        
#         stat_results = away_box[row].find_all("td")
#         stats_list = []
#         for stat in range(len(stat_results)):
#             if stat == 1:
#                 min_played = int(stat_results[stat].text.strip().split(':')[0])
#                 stats_list.append(min_played)
                
#             elif stat == 2:
#                 fg_data = stat_results[stat].text.strip().split('-')
#                 fg = int(fg_data[0])
#                 fga = int(fg_data[1])
#                 stats_list.append(fg)
#                 stats_list.append(fga)
                
#             elif stat == 3:
#                 three_data = stat_results[stat].text.strip().split('-')
#                 three_point = int(three_data[0])
#                 three_point_a = int(three_data[1])
#                 two_point = int(stats_list[1]) - three_point
#                 two_point_a = int(stats_list[2]) - three_point_a
#                 stats_list.append(two_point)
#                 stats_list.append(two_point_a)
#                 stats_list.append(three_point)
#                 stats_list.append(three_point_a)
                
#             elif stat == 4:
#                 ft_data = stat_results[stat].text.strip().split('-')
#                 ft = int(ft_data[0])
#                 fta = int(ft_data[1])
#                 stats_list.append(ft)
#                 stats_list.append(fta)
                
#             elif stat == 5:
#                 orb = int(stat_results[stat].text.strip())
#                 stats_list.append(orb)
                
#             elif stat == 6:
#                 drb = int(stat_results[stat].text.strip())
#                 stats_list.append(drb)    
                
#             elif stat == 7:
#                 tot_reb = int(stat_results[stat].text.strip())
#                 stats_list.append(tot_reb)
                
#             elif stat == 8:
#                 a = int(stat_results[stat].text.strip())
#                 stats_list.append(a)
                
#             elif stat == 9:
#                 pf = int(stat_results[stat].text.strip())
#                 stats_list.append(pf)
                
#             elif stat == 10:
#                 stl = int(stat_results[stat].text.strip())
#                 stats_list.append(stl)
                
#             elif stat == 11:
#                 to = int(stat_results[stat].text.strip())
#                 stats_list.append(to)
                
#             elif stat == 12:
#                 blk = int(stat_results[stat].text.strip())
#                 stats_list.append(blk)
                
#             elif stat == 13:
#                 pts = int(stat_results[stat].text.strip())
#                 stats_list.append(pts)
            
#             away[stat_results[0].text.strip()] = stats_list
            
            
#     home_box = div_results[2].find_all("tr")
     
#     for row in range(2, len(home_box) - 2):        
#         stat_results = home_box[row].find_all("td")
#         stats_list = []
#         for stat in range(len(stat_results)):
#             if stat == 1:
#                 min_played = int(stat_results[stat].text.strip().split(':')[0])
#                 stats_list.append(min_played)
                
#             elif stat == 2:
#                 fg_data = stat_results[stat].text.strip().split('-')
#                 fg = int(fg_data[0])
#                 fga = int(fg_data[1])
#                 stats_list.append(fg)
#                 stats_list.append(fga)
                
#             elif stat == 3:
#                 three_data = stat_results[stat].text.strip().split('-')
#                 three_point = int(three_data[0])
#                 three_point_a = int(three_data[1])
#                 two_point = int(stats_list[1]) - three_point
#                 two_point_a = int(stats_list[2]) - three_point_a
#                 stats_list.append(two_point)
#                 stats_list.append(two_point_a)
#                 stats_list.append(three_point)
#                 stats_list.append(three_point_a)
                
#             elif stat == 4:
#                 ft_data = stat_results[stat].text.strip().split('-')
#                 ft = int(ft_data[0])
#                 fta = int(ft_data[1])
#                 stats_list.append(ft)
#                 stats_list.append(fta)
                
#             elif stat == 5:
#                 orb = int(stat_results[stat].text.strip())
#                 stats_list.append(orb)
                
#             elif stat == 6:
#                 drb = int(stat_results[stat].text.strip())
#                 stats_list.append(drb)    
                
#             elif stat == 7:
#                 tot_reb = int(stat_results[stat].text.strip())
#                 stats_list.append(tot_reb)
                
#             elif stat == 8:
#                 a = int(stat_results[stat].text.strip())
#                 stats_list.append(a)
                
#             elif stat == 9:
#                 pf = int(stat_results[stat].text.strip())
#                 stats_list.append(pf)
                
#             elif stat == 10:
#                 stl = int(stat_results[stat].text.strip())
#                 stats_list.append(stl)
                
#             elif stat == 11:
#                 to = int(stat_results[stat].text.strip())
#                 stats_list.append(to)
                
#             elif stat == 12:
#                 blk = int(stat_results[stat].text.strip())
#                 stats_list.append(blk)
                
#             elif stat == 13:
#                 pts = int(stat_results[stat].text.strip())
#                 stats_list.append(pts)
            
#             home[stat_results[0].text.strip()] = stats_list
            
#     print(away_team, home_team, date)
#     return (game_time, date, arena, city, state, attendance, away_team, home_team, away_1, away_2, away_ot, away_ot2, away_ot3, away_tot, home_1, home_2, home_ot, home_ot2, home_ot3, home_tot, away, home)
    
    

### Create dataframe of matchups

In [9]:
# Every day from 2010-11-08 through 2023-04-03 (createDates keeps November - April only),
# then one scoreboard url per day
date_list = createDates(date(2010, 11, 8), date(2023, 4, 3))
date_url_list = createDatesURL(date_list)

In [10]:
# WILL CREATE LIST OF ALL GAMES PLAYED
# NOT NECESSARY TO RUN IF CSV FILE IS IMPORTED

# cbb_matchups = getMatchups(date_url_list)
# df = pd.DataFrame.from_dict(cbb_matchups)

### Create dataframe of teams

In [None]:
# Build the team list from `df`, the matchups frame loaded from cbb_matchups.csv near the top
# of the notebook. `cbb_matchups` only exists when the commented-out scraping cell has been
# run, so relying on it fails on a fresh kernel.
teams = pd.DataFrame(df['home_team'].unique(), columns=['team'])

In [None]:
# Display the teams frame
teams

In [None]:
# function to create list of urls for the team pages
def createTeamURL(lookups=None, year=2023):
    """
    Creates the sports-reference team-page URL for each team slug

    Args:
        lookups - iterable<string>: URL slugs of the teams; when omitted the
                  global teams['lookup'] column is used (original behaviour)
        year - int: year placed in the page path (defaults to 2023)
    Returns:
        url_team_list - list<string>: one URL per slug, in input order

    """
    # Fall back to the notebook-level frame so existing createTeamURL() calls keep working
    if lookups is None:
        lookups = teams['lookup']

    base_url = "https://www.sports-reference.com/cbb/schools/"
    page_suffix = "/men/" + str(year) + ".html"

    url_team_list = [base_url + team + page_suffix for team in lookups]

    return url_team_list

In [None]:
# cleanup dataframe

# Ordered (old, new) substring substitutions that turn a lower-cased team name into
# the slug used in the team-page URL.
# ORDER MATTERS: later entries repair side effects of earlier ones (e.g. "-st" ->
# "-state" turns "-state" into "-stateate", which the next entry undoes).
LOOKUP_REPLACEMENTS = [
    (" ", "-"),
    ("(", ""),
    (")", ""),
    (".", ""),
    ("'", ""),
    ("&", ""),
    ("-st", "-state"),
    ("-stateate", "-state"),
    ("maryland---e-shore", "maryland-eastern-shore"),
    ("tenn-martin", "tennessee-martin"),
    ("nc-state", "north-carolina-state"),
    ("vcu", "virginia-commonwealth"),
    ("american-u", "american"),
    ("middle-tennessee-state", "middle-tennessee"),
    ("liu-brooklyn", "long-island-university"),
    ("miami-florida", "miami-fl"),
    ("uc-santa-barbara", "california-santa-barbara"),
    ("st-josephs", "saint-josephs"),
    ("loyola-maryland", "loyola-md"),
    ("vmi", "virginia-military-institute"),
    ("siu---edwardsville", "southern-illinois-edwardsville"),
    ("albany", "albany-ny"),
    ("loyola-chicago", "loyola-il"),
    ("unlv", "nevada-las-vegas"),
    ("st-marys", "saint-marys-ca"),
    ("texas-am-cc", "texas-am-corpus-christi"),
    ("new-jersey-tech", "njit"),
    ("wis-milwaukee", "milwaukee"),
    ("uab", "alabama-birmingham"),
    ("md-baltimore-cty", "maryland-baltimore-county"),
    ("nc-greensboro", "north-carolina-greensboro"),
    ("st-francis-pa", "saint-francis-pa"),
    ("uc-riverside", "california-riverside"),
    ("elon-university", "elon"),
    ("bowling-green", "bowling-green-state"),
    ("monmouth-nj", "monmouth"),
    ("nc-wilmington", "north-carolina-wilmington"),
    ("charleston", "college-of-charleston"),
    ("indiana---purdue", "iupui"),
    ("prairie-view-am", "prairie-view"),
    ("nocarolina-at", "north-carolina-at"),
    ("st-johns", "st-johns-ny"),
    ("cal-state---bakersfield", "cal-state-bakersfield"),
    ("uc-davis", "california-davis"),
    ("cal-poly-slo", "cal-poly"),
    ("uc-irvine", "california-irvine"),
    ("boston-u", "boston-university"),
    ("nc-asheville", "north-carolina-asheville"),
    ("csu-northridge", "cal-state-northridge"),
    ("mount-state-marys", "mount-st-marys"),
    ("grambling-state", "grambling"),
    ("se-missouri-state", "southeast-missouri-state"),
    ("william--mary", "william-mary"),
    ("iupu---ft-wayne", "ipfw"),
    ("central-conn-state", "central-connecticut-state"),
    ("st-peters", "saint-peters"),
    ("college-of-charleston-southern", "charleston-southern"),
    ("umkc", "missouri-kansas-city"),
    ("southern-miss", "southern-mississippi"),
    ("the-citadel", "citadel"),
    ("texas-rio-grande-valley", "texas-pan-american"),
    ("little-rock", "arkansas-little-rock"),
    ("lu-lafayette", "louisiana-lafayette"),
    ("st-francis-brooklyn", "st-francis-ny"),
    ("ul-monroe", "louisiana-monroe"),
    ("unc-wilmington", "north-carolina-wilmington"),
    ("unorth-carolina-asheville", "north-carolina-asheville"),
    ("unorth-carolina-greensboro", "north-carolina-greensboro"),
    ("unorth-carolina-wilmington", "north-carolina-wilmington"),
    ("csu-fullerton", "cal-state-fullerton"),
    ("houston-christian", "houston-baptist"),
    ("queens-university-of-charlotte", "queens-nc"),
]

# Always rebuild from the untouched 'team' column so re-running this cell is safe.
lookup = teams['team'].str.lower()

# Every pattern is a plain substring, so match literally (regex=False). With
# regex=True, "(" and ")" are invalid regular expressions and "." matches every
# character on pandas versions that compile single-character patterns as regexes.
for old, new in LOOKUP_REPLACEMENTS:
    lookup = lookup.str.replace(old, new, regex=False)

teams['lookup'] = lookup

In [None]:
# find indices of rows that need deleted
# Collect every row label whose lookup equals a bad team. A bad team that is not
# present in `teams` contributes nothing (instead of raising IndexError on [0]),
# and a lookup that appears on several rows has all of those rows collected.
bad_team_index = []
for team in bad_teams:
    bad_team_index.extend(teams.index[teams['lookup'] == team].tolist())

print(bad_team_index)

In [None]:
# delete rows of teams that shouldn't be there
# A single drop call removes all collected labels at once instead of copying the
# frame once per label. reset_index() (without drop=True) renumbers the rows and
# keeps the previous labels as a column, exactly as before.
# NOTE: this rebinds `teams`; bad_team_index refers to the labels from before the
# reset, so re-run the index-collecting cell before running this one again.
teams = teams.drop(bad_team_index).reset_index()

In [None]:
# Display the teams frame after the bad-team rows were dropped and the index reset.
teams

In [None]:
# Build one team-page URL per entry of teams['lookup'].
url_team_list = createTeamURL()

In [None]:
# for url in url_team_list:
#     testURL(url)
#     time.sleep(3.1)

In [None]:
# Collapse duplicate URLs; note that a set does not keep the list's order.
url_team_set = set(url_team_list)

In [None]:
# Number of distinct team-page URLs.
print(len(url_team_set))

In [None]:
def getDivisions(url_list, delay=3.1):
    """
    Scrapes the conference ("division") shown on each team page

    For every URL the page is downloaded and the second <div> inside the
    element with id="meta" is read. When its first <p> contains " in ", the
    text between " in " and "MBB" (minus the character right before "MBB")
    is the division; otherwise the team is recorded as "Independent".

    Args:
        url_list - iterable<string>: team-page URLs to visit
        delay - float: seconds to sleep after each request (defaults to 3.1)
    Returns:
        divisions - list<string>: one entry per page that has a "meta"
                    element; pages without it are skipped, so the list can
                    be shorter than url_list

    """
    divisions = []

    for url in url_list:
        # create document
        html_document = getHTMLdocument(url)

        # create soup object
        soup = BeautifulSoup(html_document, 'html.parser')

        results = soup.find(id="meta")

        # Reset on every pass: without this, a page lacking the "meta" element
        # raised NameError on the first URL or printed the previous team's division.
        division = None

        if results is not None:
            division_text = results.find_all("div")[1].find("p").text
            if " in " in division_text:
                start = division_text.index(" in ") + 4
                # Search for "MBB" only after " in "; raises ValueError when absent
                end = division_text.index("MBB", start)
                # end - 1 drops the single character separating the name from "MBB"
                division = division_text[start : end - 1]
            else:
                division = "Independent"

            divisions.append(division)

        # Pause between requests
        time.sleep(delay)
        print(url, division)
    return divisions

In [None]:
# divs = getDivisions(url_team_set)

In [None]:
# divs

In [None]:
# divs_set = set(divs)

In [None]:
# divs_set

In [None]:
# for div in divs_set:
#     print(divs.count(div), div)

https://www.sports-reference.com/cbb/schools/purdue/men/2023.html

Western Illinois summit->ovc
byu wcc->big 12
houston wcc->big 12
cincinnati wcc->big 12
ucf wcc->big 12

### Create dataframes for seasons

{
    'angelo stuart': {
                        'mp': 27,
                        'fg': 1,
                        'fga': 11,
                        '2p': 1,
                        '2a': 2,
                        '3p': 0,
                        '3a': 9,
                        'ft': 0,
                        'fta': 0,
                        'orb': 0,
                        'drb': 1,
                        'totreb': 1,
                        'ast': 0,
                        'stl': 0,
                        'blk': 0,
                        'tov': 1,
                        'pf': 2,
                     },
     'bj freeman': {etc...
                        

('Johnson University (FL)', 'Stetson', 'https://newsday.sportsdirectinc.com/sports-scores/College-Basketball-Scores-Matchups.aspx?Year=2022&Period=1110&CurrentSeason=2022-2023',
  datetime.date(2022, 11, 10),
  'na',
  False)

In [None]:
# Display box_scores.
# NOTE(review): box_scores is not assigned in the cells visible here — confirm where it is produced.
box_scores

In [None]:
# Wrap box_scores in a DataFrame.
test = pd.DataFrame(box_scores)

In [None]:
# Write the frame to 'file_name.csv' (relative path, so it lands in the current working directory).
test.to_csv('file_name.csv')