In [2]:
import requests
from bs4 import BeautifulSoup as bs
import csv
import pandas as pd

In [3]:
# Base URL shared by every sports-reference college-basketball page
base_url = "https://www.sports-reference.com/cbb/"
# Seasons covered by the analysis, oldest first
years = ["2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023"]
# Create add_on based on the year. The season is looked up by value rather than
# by position (previously years[9]) so the choice cannot silently change if the
# list is reordered or extended; a missing season raises ValueError right here.
add_on = "seasons/men/" + years[years.index("2023")] + "-advanced-school-stats.html"

# Example data extraction with year 2023.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports HTTP errors (e.g. rate limiting) at the fetch
# instead of as a confusing parsing failure in a later cell.
response = requests.get(base_url + add_on, timeout=30)
response.raise_for_status()

Explanation of Variables:
    - Success of a team in March Madness is evaluated by its number of wins:
        0 - first round
        1 - second round
        2 - sweet sixteen
        3 - elite eight
        4 - final four
        5 - final
        6 - championship
    - Simple Rating System: 
        A rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average. Non-Division I games are excluded from the ratings
    - Strength of Schedule:
        A rating of strength of schedule. The rating is denominated in points above/below average, where zero is average. Non-Division I games are excluded from the ratings
    - Pace Factor:
        An estimate of school possessions per 40 minutes
    - Offensive Rating:
        An estimate of points scored (for teams) or points produced (for players) per 100 possessions
    - Free Throw Attempt Rate:
        Number of FT Attempts Per FG Attempt
    - Three-Point Attempt Rate:
        Percentage of FG Attempts from Three-Point Range
    - True Shooting Percentage:
        A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws
    - Total Rebound Percentage:
        An estimate of the percentage of available rebounds a player grabbed while they were on the floor
    - Assist Percentage:
        An estimate of the percentage of teammate field goals a player assisted while they were on the floor
    - Steal Percentage:
        An estimate of the percentage of opponent possessions that end with a steal by the player while they were on the floor
    - Block Percentage:
        An estimate of the percentage of opponent two-point field goal attempts blocked by the player while they were on the floor
    - Effective Field Goal Percentage:
        This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal
    - Turnover Percentage:
        An estimate of turnovers per 100 plays
    - Offensive Rebound Percentage:
        An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor


In [75]:
# Gathering data from the 2023 CSV file.
# Each non-empty row is read as: column 0 = team name, column 1 = number of
# tournament wins (converted to int).
team_name_2023 = []
tournament_success_2023 = []

# newline='' is what the csv module documentation prescribes for files handed
# to csv.reader, so line endings inside quoted fields are interpreted correctly.
with open('2023_TournamentCSV.csv', 'r', newline='') as csv_file_2023:
    csv_reader = csv.reader(csv_file_2023)
    for row in csv_reader:
        # csv.reader yields an empty list for a blank line (for example a
        # trailing newline at the end of the file); skip it instead of
        # crashing on row[0].
        if not row:
            continue
        team_name_2023.append(row[0])
        tournament_success_2023.append(int(row[1]))


# One list per scraped statistic. They start empty and are filled by a later
# cell, in the same order as team_name_2023.
simple_rating_system_2023 = []
strength_of_schedule_2023 = []
pace_factor_2023 = []
offensive_rating_2023 = []
free_throw_attempt_rate_2023 = []
three_point_attempt_rate_2023 = []
true_shooting_percentage_2023 = []
total_rebound_percentage_2023 = []
assist_percentage_2023 = []
steal_percentage_2023 = []
block_percentage_2023 = []
effective_field_goal_percentage_2023 = []
turnover_percentage_2023 = []
offensive_rebound_percentage_2023 = []





In [76]:
# Web scraping using a BeautifulSoup object
soup_2023 = bs(response.content, 'html.parser')
_tbodies_2023 = soup_2023.select('tbody')
if not _tbodies_2023:
    # Same exception type as the bare [0] lookup used to raise, but with a
    # message that points at the likely cause.
    raise IndexError(
        "No <tbody> element found in the downloaded page; "
        "the request may have failed or the page layout changed"
    )
table_lst_2023 = _tbodies_2023[0].select('tr')


def loop_through(tag):
    """Return the text of one table cell, cleaned up for name matching.

    The non-breaking-space + 'NCAA' suffix is removed (presumably the marker
    the site appends to tournament teams' names -- confirm against the page)
    and surrounding whitespace is stripped.
    """
    return tag.get_text().replace('\xa0NCAA', '').strip()


# Positions of the wanted statistics among a row's cells, in the order they
# are stored in Dict_2023:
# SRS, SOS, Pace, ORtg, FTr, 3PAr, TS%, TRB%, AST%, STL%, BLK%, eFG%, TOV%, ORB%
_STAT_COLUMNS = (6, 7, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32)

# Set lookup instead of scanning the name list once per table row
_wanted_teams_2023 = set(team_name_2023)

# Maps team name -> list of the 14 statistics above (as floats)
Dict_2023 = {}

for teams in table_lst_2023:
    team = [loop_through(cell) for cell in teams]
    # Rows too short to carry a school name cannot be a team row
    if len(team) < 2:
        continue
    team_name = team[1]
    if team_name in _wanted_teams_2023:
        Dict_2023[team_name] = [float(team[column]) for column in _STAT_COLUMNS]

# Number of CSV teams that were matched on the stats page
print(len(Dict_2023.keys()))

# Name mismatches between the CSV and the site would otherwise only show up
# later as a KeyError; list them here (prints nothing when every team matched).
_unmatched_2023 = sorted(_wanted_teams_2023.difference(Dict_2023))
if _unmatched_2023:
    print("Teams from the CSV not found on the stats page:", _unmatched_2023)


64


In [77]:
# Build the per-statistic columns in the same order as team_name_2023.
# The lists are rebuilt by assignment rather than appended to, so re-running
# this cell gives the same result instead of doubling every column (which
# would make the columns longer than team_name_2023 and break the DataFrame).
_missing_2023 = [name for name in team_name_2023 if name not in Dict_2023]
if _missing_2023:
    # Still a KeyError, as a direct lookup would raise, but naming every
    # missing team at once.
    raise KeyError(f"No scraped statistics for: {_missing_2023}")

# One 14-element stats list per team, aligned with team_name_2023
_rows_2023 = [Dict_2023[name] for name in team_name_2023]

simple_rating_system_2023 = [row[0] for row in _rows_2023]
strength_of_schedule_2023 = [row[1] for row in _rows_2023]
pace_factor_2023 = [row[2] for row in _rows_2023]
offensive_rating_2023 = [row[3] for row in _rows_2023]
free_throw_attempt_rate_2023 = [row[4] for row in _rows_2023]
three_point_attempt_rate_2023 = [row[5] for row in _rows_2023]
true_shooting_percentage_2023 = [row[6] for row in _rows_2023]
total_rebound_percentage_2023 = [row[7] for row in _rows_2023]
assist_percentage_2023 = [row[8] for row in _rows_2023]
steal_percentage_2023 = [row[9] for row in _rows_2023]
block_percentage_2023 = [row[10] for row in _rows_2023]
effective_field_goal_percentage_2023 = [row[11] for row in _rows_2023]
turnover_percentage_2023 = [row[12] for row in _rows_2023]
offensive_rebound_percentage_2023 = [row[13] for row in _rows_2023]

In [78]:
# Assemble the 2023 columns into a single pandas DataFrame.
# The keyword order below is the column order of the resulting frame.
data_2023 = dict(
    team_name=team_name_2023,
    tournament_success=tournament_success_2023,
    simple_rating_system=simple_rating_system_2023,
    strength_of_schedule=strength_of_schedule_2023,
    pace_factor=pace_factor_2023,
    offensive_rating=offensive_rating_2023,
    free_throw_attempt_rate=free_throw_attempt_rate_2023,
    three_point_attempt_rate=three_point_attempt_rate_2023,
    true_shooting_percentage=true_shooting_percentage_2023,
    total_rebound_percentage=total_rebound_percentage_2023,
    assist_percentage=assist_percentage_2023,
    steal_percentage=steal_percentage_2023,
    block_percentage=block_percentage_2023,
    effective_field_goal_percentage=effective_field_goal_percentage_2023,
    turnover_percentage=turnover_percentage_2023,
    offensive_rebound_percentage=offensive_rebound_percentage_2023,
)

df_2023 = pd.DataFrame(data_2023)


In [79]:

# Show the assembled 2023 table as the cell output (one row per team; the
# rendered view below elides the middle rows with "...").
df_2023


Unnamed: 0,team_name,tournament_success,simple_rating_system,strength_of_schedule,pace_factor,offensive_rating,free_throw_attempt_rate,three_point_attempt_rate,true_shooting_percentage,total_rebound_percentage,assist_percentage,steal_percentage,block_percentage,effective_field_goal_percentage,turnover_percentage,offensive_rebound_percentage
0,Alabama,2,23.18,9.64,72.6,110.4,0.366,0.472,0.557,54.4,54.3,8.2,11.3,0.521,15.9,34.2
1,Texas A&M-Corpus Christi,0,-2.58,-7.03,71.7,110.9,0.348,0.342,0.559,52.6,55.2,11.7,5.3,0.513,15.2,33.7
2,Maryland,1,14.59,8.36,64.5,107.7,0.343,0.369,0.545,51.1,46.2,8.4,10.1,0.508,14.3,30.0
3,West Virginia,0,15.94,10.88,69.6,108.8,0.394,0.363,0.557,52.0,50.1,9.6,8.6,0.515,16.0,34.0
4,Virginia,0,13.28,5.98,62.5,108.1,0.347,0.356,0.544,50.2,65.4,10.9,13.8,0.512,12.1,25.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Boise State,0,12.78,6.20,66.8,106.8,0.303,0.378,0.553,52.6,43.8,7.5,8.0,0.521,14.6,27.4
60,Gonzaga,3,18.99,8.04,71.9,119.5,0.337,0.323,0.602,54.0,50.7,10.0,9.3,0.582,13.0,31.4
61,Grand Canyon,0,4.42,1.27,66.9,111.5,0.358,0.418,0.573,52.1,48.4,7.3,8.1,0.542,15.4,30.8
62,TCU,1,15.60,8.77,70.9,105.8,0.341,0.299,0.533,50.7,58.5,11.3,12.5,0.500,14.4,32.2
