In [82]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Scrape the fully expanded Bart Torvik player-stats page for one season
def get_html_from_torvik_players(year, max_clicks=40):
    """Return the page HTML of the Bart Torvik player-stats table for one season.

    Opens the page in a Chrome WebDriver session and repeatedly clicks the
    link inside the element with id "expand" so that more rows are loaded,
    then returns the resulting page source.

    Parameters
    ----------
    year : int
        Season year. The URL's date window runs from ``{year - 1}1101`` to
        ``{year}0501``.
    max_clicks : int, optional
        Upper bound on the number of expand clicks (default 40).

    Returns
    -------
    str
        ``driver.page_source`` after the expand loop has finished.

    Notes
    -----
    The browser is always closed, even when loading the page or reading the
    source raises, so a failed or interrupted run does not leave a stray
    Chrome process behind.
    """
    last_year = year - 1

    url = f"https://barttorvik.com/playerstat.php?link=y&sIndex=53&year={year}&minmin=5&start={last_year}1101&end={year}0501"

    driver = webdriver.Chrome()  # or webdriver.Firefox() if you prefer
    try:
        driver.get(url)

        for _ in range(max_clicks):
            try:
                # Wait up to 10 seconds for the "Show 100 more" element to be clickable
                expand_element = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, "expand"))
                )
                # The clickable link is the <a> tag inside the element with id "expand"
                expand_element.find_element(By.TAG_NAME, "a").click()

                # Give the newly requested rows time to load before the next click.
                time.sleep(2)
            except Exception as e:
                # Best effort: a missing/unclickable button (or any other error)
                # ends the expansion, and whatever has loaded so far is returned.
                print("No more 'Show 100 more' button found or an error occurred:", e)
                break

        return driver.page_source
    finally:
        # Runs on success, on error and on KeyboardInterrupt alike.
        driver.quit()


In [70]:
from bs4 import BeautifulSoup
import pandas as pd


In [75]:
def get_data_from_html(html_source):
    """Parse the player table out of a page's HTML into a DataFrame.

    The table is located by its exact ``style`` attribute. For every ``<tr>``
    (taken from ``<tbody>`` when present, otherwise from the table itself) the
    text of the cells at a fixed set of positions is kept. Rows that end up
    with no kept cells are dropped; rows with fewer cells than expected are
    kept as shorter lists (they are not padded here).

    Raises
    ------
    ValueError
        If no table with the expected style attribute is found.
    """
    soup = BeautifulSoup(html_source, "html.parser")

    # The table has no id/class to hook onto, so match on its inline style.
    table = soup.find("table", {"style": "white-space:nowrap;margin:auto;table-layout:fixed"})
    if not table:
        raise ValueError("Table not found!")

    # Cell positions to keep, in ascending order; one per output column below.
    keep_positions = (0, 2, 3, 4, 6, 7, 10, 11, 13, 16,
                      18, 19, 20, 21, 22, 23, 24, 26, 27, 28)

    column_names = [
        "Rk", "Class", "Height", "Player", "Team",
        "Conf", "Min%", "PRPG!", "BPM", "ORTG",
        "USG", "EFG", "TS", "OR", "DR", "AST", "TO",
        "BLK", "STL", "FTR",
    ]

    tbody = table.find("tbody")
    row_tags = tbody.find_all("tr") if tbody else table.find_all("tr")

    records = []
    for tr in row_tags:
        # Both <td> and <th> count as cells, in case a row uses header cells for data.
        texts = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
        picked = [texts[pos] for pos in keep_positions if pos < len(texts)]
        if picked:
            records.append(picked)

    return pd.DataFrame(records, columns=column_names)


In [85]:
# Seasons to scrape in the loop below: 2008 through 2023 inclusive.
years = list(range(2008, 2024))

In [88]:
# Build the 2024 frame explicitly so this cell works on a fresh kernel
# (previously `df_2024` only existed as leftover kernel state).
df_2024 = get_data_from_html(get_html_from_torvik_players(2024))
df_2024["Season"] = 2024

In [89]:
# Start the list with the 2024 frame (df_2024 must already exist in the kernel),
# then append one scraped frame per season in `years`.
all_dfs = [df_2024]

import tqdm

for year in tqdm.tqdm(years):
    # Each iteration opens a browser session, expands the page and parses the table.
    html = get_html_from_torvik_players(year)
    tmp_df = get_data_from_html(html)
    tmp_df["Season"] = year  # tag every row with the season it was scraped for
    all_dfs.append(tmp_df)


100%|██████████| 16/16 [37:21<00:00, 140.08s/it]


In [91]:
# Stack the per-season frames row-wise; each frame's own row labels are kept
# (no ignore_index), so index values are not unique in the result.
final_df = pd.concat(all_dfs, axis=0)

In [98]:
# Keep only rows that have a Min% value.
final_df = final_df[final_df["Min%"].notnull()]

In [99]:
# Persist the combined table; to_csv writes the index as the first column by default.
final_df.to_csv("torvik_player_data.csv")

In [128]:
# Houston's 2023 rows, best PRPG! first.
# - .copy() gives an independent frame, so the column assignments in the next
#   cell do not write into a slice of final_df.
# - PRPG! holds scraped text, so sort on its numeric value rather than
#   lexicographically (unparseable values become NaN and sort last).
tmp = (
    final_df[(final_df.Team == "Houston") & (final_df.Season == 2023)]
    .copy()
    .sort_values(
        by="PRPG!",
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors="coerce"),
    )
)

In [None]:
# Cast the two inputs to float, then derive each player's value
# (minutes share times PRPG!) and that value's share of the frame's total.
tmp["Min%"] = tmp["Min%"].astype(float)
tmp["PRPG!"] = tmp["PRPG!"].astype(float)
tmp["total_value"] = tmp["Min%"].mul(tmp["PRPG!"])
tmp["total_value_perc"] = tmp["total_value"].div(tmp["total_value"].sum())

In [106]:
# Connecticut's 2009 rows, best PRPG! first. PRPG! holds scraped text, so the
# sort key converts it to numbers instead of comparing strings.
final_df[(final_df.Team == "Connecticut")
         & (final_df.Season == 2009)].sort_values(
    by="PRPG!",
    ascending=False,
    key=lambda col: pd.to_numeric(col, errors="coerce"),
)

Unnamed: 0,Rk,Class,Height,Player,Team,Conf,Min%,PRPG!,BPM,ORTG,...,EFG,TS,OR,DR,AST,TO,BLK,STL,FTR,Season
11,12,Jr,7-3,Hasheem Thabeet,Connecticut,BE,75.1,4.0,11.2,118.7,...,64.4,64.8,13.4,21.7,3.0,17.5,11.9,1.2,78.3,2009
216,217,Sr,6-2,A.J. Price,Connecticut,BE,73.0,3.7,5.7,108.5,...,50.7,53.9,2.1,9.2,28.5,20.2,0.0,1.2,30.3,2009
219,220,Sr,6-7,Jeff Adrien,Connecticut,BE,82.0,3.6,5.7,110.0,...,51.1,53.4,10.5,19.4,9.3,14.5,3.0,0.9,50.8,2009
133,134,Jr,6-3,Jerome Dyson,Connecticut,BE,47.7,3.1,6.7,106.8,...,45.5,51.0,4.9,9.5,20.5,16.7,1.0,3.7,45.5,2009
244,245,Fr,6-1,Kemba Walker,Connecticut,BE,59.3,2.3,5.4,107.6,...,50.0,55.4,4.0,10.3,21.0,21.7,0.6,2.5,54.7,2009
336,337,Sr,6-3,Craig Austrie,Connecticut,BE,60.7,2.1,4.6,113.3,...,43.7,50.5,2.8,4.7,15.2,13.6,1.1,1.5,41.4,2009
181,182,Jr,6-9,Stanley Robinson,Connecticut,BE,44.9,2.0,6.1,104.7,...,51.9,54.2,9.2,14.9,8.8,20.5,4.1,1.5,30.6,2009
123,124,Jr,6-9,Gavin Edwards,Connecticut,BE,28.1,1.4,6.8,123.9,...,64.6,67.6,12.5,14.6,5.2,19.3,5.9,1.7,59.5,2009
85,86,So,6-4,Donnell Beverly,Connecticut,BE,5.5,0.7,7.7,147.5,...,64.3,68.3,5.7,10.9,25.7,11.0,0.0,2.9,42.9,2009


### Scrape ESPN

In [None]:
# import requests
# from datetime import datetime, timedelta

# def fetch_game_ids(start_date, end_date):
#     """Fetch all NCAA tournament First Round game IDs within a date range."""
#     game_ids = []
    
#     current_date = datetime.strptime(start_date, "%Y-%m-%d")
#     end_date = datetime.strptime(end_date, "%Y-%m-%d")

#     while current_date <= end_date:
#         formatted_date = current_date.strftime("%Y%m%d")
#         url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/scoreboard?dates={formatted_date}"
#         headers = {"User-Agent": "Mozilla/5.0"}

#         response = requests.get(url, headers=headers)
#         if response.status_code == 200:
#             data = response.json()
#             for event in data.get("events", []):
#                 game_id = event.get("id")
#                 game_note = event.get("name", "").lower()  # Check if it's First Round

#                 if "first round" in game_note:  # Filter for First Round games
#                     game_ids.append(game_id)
        
#         current_date += timedelta(days=1)

#     return game_ids


TypeError: fetch_game_ids() missing 1 required positional argument: 'end_date'

In [None]:
# import requests

# # Set the date to March 22, 2024
# date = "20240322"
# url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/scoreboard?dates={date}"
# headers = {"User-Agent": "Mozilla/5.0"}

# # Fetch data from ESPN API
# response = requests.get(url, headers=headers)

# if response.status_code == 200:
#     data = response.json()

#     games = data.get("events", [])  # Get list of games

#     for game in games:
#         game_id = game.get("id", "N/A")  # Game ID
#         game_name = game.get("name", "N/A")  # Game title

#         # Extract teams and scores
#         competitors = game.get("competitions", [{}])[0].get("competitors", [])
#         teams = []
#         for team in competitors:
#             team_name = team["team"]["displayName"]
#             score = team.get("score", "N/A")
#             home_away = team.get("homeAway", "N/A")
#             teams.append(f"{team_name} ({home_away}) - {score}")

#         # Extract game status
#         status = game.get("status", {}).get("type", {}).get("description", "Unknown")

#         print(f"Game ID: {game_id}")
#         print(f"Matchup: {game_name}")
#         print(f"Teams: {', '.join(teams)}")
#         print(f"Status: {status}")
#         print("-" * 40)
# else:
#     print(f"Failed to fetch data: {response.status_code}")


Game ID: 401638599
Matchup: Stetson Hatters at UConn Huskies
Teams: UConn Huskies (home) - 91, Stetson Hatters (away) - 52
Status: Final
----------------------------------------
Game ID: 401638601
Matchup: Grambling Tigers at Purdue Boilermakers
Teams: Purdue Boilermakers (home) - 78, Grambling Tigers (away) - 50
Status: Final
----------------------------------------
Game ID: 401638600
Matchup: Longwood Lancers at Houston Cougars
Teams: Houston Cougars (home) - 86, Longwood Lancers (away) - 46
Status: Final
----------------------------------------
Game ID: 401638607
Matchup: Western Kentucky Hilltoppers at Marquette Golden Eagles
Teams: Marquette Golden Eagles (home) - 87, Western Kentucky Hilltoppers (away) - 69
Status: Final
----------------------------------------
Game ID: 401638602
Matchup: Colgate Raiders at Baylor Bears
Teams: Baylor Bears (home) - 92, Colgate Raiders (away) - 67
Status: Final
----------------------------------------
Game ID: 401638612
Matchup: Yale Bulldogs at A

In [260]:
# Scoreboard dates (YYYYMMDD) to pull game ids for; the first four characters
# of each entry are later used as the Season label.
# NOTE(review): these look like two tournament dates per season, 2008-2024 —
# confirm each pair against the actual schedule (including whether any games
# exist for the 2020 dates).
all_dates = [
    "20080321", "20080320", "20090320", "20090319", "20100319", "20100318",
    "20110318", "20110317", "20120322", "20120321", "20130321", "20130320",
    "20140321", "20140320", "20150320", "20150319", "20160318", "20160317",
    "20170317", "20170316", "20180316", "20180315", "20190322", "20190321",
    "20200320", "20200319", "20210319", "20210318", "20220318", "20220317",
    "20230317", "20230316", "20240322", "20240321"
]


In [None]:
import requests

def get_game_ids(date: str, timeout: float = 30):
    """Return the ESPN event ids listed on the men's college basketball scoreboard for a date.

    Parameters
    ----------
    date : str
        Date in ``YYYYMMDD`` form, inserted into the ``dates`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout, requests can block indefinitely on a stalled
        connection.

    Returns
    -------
    list
        The ``id`` of every entry under ``events`` that has one. An empty list
        is returned (and a message printed) when the response status is not 200.
    """
    url = f"https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/scoreboard?dates={date}"
    headers = {"User-Agent": "Mozilla/5.0"}

    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []

    events = response.json().get("events", [])
    # Skip events without an id so callers never receive None as a game id.
    return [event["id"] for event in events if event.get("id") is not None]


['401638599', '401638601', '401638600', '401638607', '401638602', '401638612', '401638609', '401638604', '401638613', '401638610', '401638605', '401638603', '401638608', '401638614', '401638611', '401638606']


In [None]:
def get_box_score(game_id, timeout=30):
    """Fetch one game's ESPN summary and return its player box score as a DataFrame.

    Parameters
    ----------
    game_id : str or int
        ESPN event id, inserted into the ``event`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per player who has a non-empty ``stats`` list, with columns
        ``team``, ``player_name`` followed by the stat names reported for the
        first team. If the summary contains no player box score, an empty
        frame with just ``team`` and ``player_name`` columns is returned.

    Raises
    ------
    requests.HTTPError
        If the summary request returns an error status.
    """
    url = f'https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/summary?event={game_id}'
    # Same User-Agent header that get_game_ids sends.
    headers = {"User-Agent": "Mozilla/5.0"}

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # response.json() avoids depending on a separately imported json module.
    data = response.json()

    base_columns = ['team', 'player_name']
    teams = data.get('boxscore', {}).get('players', [])
    if not teams:
        return pd.DataFrame(columns=base_columns)

    # Stat column names are taken from the first team's first statistics group.
    columns = base_columns + teams[0]['statistics'][0]["names"]

    all_data = []
    # At most the first two entries are read, matching the two teams of a game.
    for team in teams[:2]:
        team_name = team['team']['location']
        for player in team['statistics'][0]["athletes"]:
            stats = player['stats']
            # Players with an empty stats list are skipped.
            if len(stats) > 0:
                all_data.append([team_name, player["athlete"]["displayName"]] + stats)

    return pd.DataFrame(all_data, columns=columns)


In [263]:
# Collect one box-score frame per game for every date in all_dates.
bs = []

for date in tqdm.tqdm(all_dates):
    game_ids = get_game_ids(date)
    for game in game_ids:
        b = get_box_score(game)
        # Season is the year part of the date string, so it is stored as a str.
        b["Season"] = date[0:4]
        bs.append(b)
    

100%|██████████| 34/34 [04:00<00:00,  7.07s/it]


In [265]:
# Stack the per-game frames row-wise; each frame's own row labels are kept.
ncaam_box_scores = pd.concat(bs, axis = 0)

In [267]:
# Persist the combined box scores; to_csv writes the index as the first column by default.
ncaam_box_scores.to_csv("ncaam_box_scores.csv")

In [277]:
# Kansas rows from the 2024 box scores. Season is compared with the string
# "2024" because the box-score frames store it as date[0:4], a str.
tmp_box = ncaam_box_scores[(ncaam_box_scores.Season == "2024") & 
                 (ncaam_box_scores.team == "Kansas")]

In [274]:
# Kansas's 2024 rows from the Torvik table; .copy() makes an independent frame
# so later column assignments on tmp do not touch final_df.
tmp = final_df[(final_df.Team == "Kansas")
         & (final_df.Season == 2024)].copy()

In [275]:
# Cast the two inputs to float, then derive each player's value
# (minutes share times PRPG!) and that value's share of the frame's total.
tmp["Min%"] = tmp["Min%"].astype(float)
tmp["PRPG!"] = tmp["PRPG!"].astype(float)
tmp["total_value"] = tmp["Min%"].mul(tmp["PRPG!"])
tmp["total_value_perc"] = tmp["total_value"].div(tmp["total_value"].sum())

In [276]:
# Display the Torvik-side frame with the derived value columns.
tmp

Unnamed: 0,Rk,Class,Height,Player,Team,Conf,Min%,PRPG!,BPM,ORTG,...,OR,DR,AST,TO,BLK,STL,FTR,Season,total_value,total_value_perc
74,75,Sr,7-2,Hunter Dickinson,Kansas,B12,77.8,4.3,7.8,111.3,...,10.1,27.5,15.5,12.1,5.1,1.6,24.9,2024,334.54,0.263571
152,153,Sr,6-7,Kevin McCullar Jr.,Kansas,B12,64.5,4.2,6.2,107.5,...,2.8,15.7,21.8,15.1,1.1,2.4,42.3,2024,270.9,0.213431
222,223,Fr,6-9,Johnny Furphy,Kansas,B12,58.7,2.6,5.4,117.5,...,6.6,16.3,7.1,12.1,1.6,2.0,40.3,2024,152.62,0.120243
278,279,Jr,6-7,KJ Adams Jr.,Kansas,B12,83.8,3.5,4.9,113.2,...,5.7,9.9,16.6,16.1,2.0,2.0,40.5,2024,293.3,0.23108
284,285,Sr,6-10,Parker Braun,Kansas,B12,18.6,0.6,4.9,116.8,...,2.5,19.1,7.2,21.3,10.1,1.2,19.1,2024,11.16,0.008793
677,678,Jr,6-2,Dajuan Harris Jr.,Kansas,B12,89.0,2.0,2.8,103.3,...,1.2,4.7,29.4,21.9,1.1,2.4,22.5,2024,178.0,0.140239
837,838,Fr,6-4,Jamari McDowell,Kansas,B12,16.0,0.4,2.3,100.9,...,8.8,11.1,9.8,14.3,1.6,1.6,38.0,2024,6.4,0.005042
1261,1262,Sr,6-4,Nicolas Timberlake,Kansas,B12,38.5,0.7,0.8,100.2,...,0.5,13.4,7.0,13.3,0.4,1.9,29.2,2024,26.95,0.021233
2294,2295,Fr,6-3,Elmarko Jackson,Kansas,B12,46.1,-0.1,-1.9,86.6,...,1.8,6.3,15.2,27.5,0.9,2.4,29.8,2024,-4.61,-0.003632


In [278]:
# Display the box-score rows selected into tmp_box.
tmp_box

Unnamed: 0,team,player_name,MIN,FG,3PT,FT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,Season
11,Kansas,KJ Adams Jr.,39,10-13,0-0,0-4,1,3,4,6,0,1,3,2,20,2024
12,Kansas,Johnny Furphy,35,5-9,2-5,4-6,3,5,8,3,1,1,4,3,16,2024
13,Kansas,Hunter Dickinson,37,9-14,0-1,1-4,2,18,20,5,2,4,0,3,19,2024
14,Kansas,Dajuan Harris Jr.,36,3-8,1-5,6-6,1,3,4,7,0,0,4,4,13,2024
15,Kansas,Nicolas Timberlake,26,5-8,3-6,6-7,0,2,2,3,1,0,0,4,19,2024
16,Kansas,Parker Braun,2,0-0,0-0,0-0,0,0,0,0,0,1,0,0,0,2024
17,Kansas,Jamari McDowell,3,0-0,0-0,0-0,0,0,0,0,0,0,1,1,0,2024
18,Kansas,Elmarko Jackson,22,3-6,0-1,0-0,0,3,3,1,1,0,5,4,6,2024


In [286]:
# Distinct player names in the Torvik-side frame.
all_players = tmp["Player"].unique().tolist()

In [289]:
# Distinct player names in the box-score frame.
played = tmp_box["player_name"].unique().tolist()

In [292]:
from itertools import product
import Levenshtein  # Install with: pip install python-Levenshtein

def find_players_with_high_distance(list1, list2, min_threshold=3):
    """Return the entries of ``list1`` that have no close match in ``list2``.

    For each string in ``list1`` the smallest Levenshtein distance to any
    string in ``list2`` is computed; the string is reported when that distance
    is at least ``min_threshold``.

    Parameters
    ----------
    list1 : iterable of str
        Names to check.
    list2 : iterable of str
        Names to match against. May be empty, in which case nothing can match
        and every entry of ``list1`` is reported.
    min_threshold : int, optional
        Minimum closest-match distance for a name to be reported (default 3).

    Returns
    -------
    list of str
        Unmatched names, in the order they appear in ``list1``.
    """
    # Materialize once so a one-shot iterable is not exhausted after the first name.
    candidates = list(list2)
    high_distance_players = []

    for str1 in list1:
        # default=None covers an empty candidate list, where min() would otherwise raise.
        closest = min(
            (Levenshtein.distance(str1, str2) for str2 in candidates),
            default=None,
        )
        if closest is None or closest >= min_threshold:
            high_distance_players.append(str1)

    return high_distance_players

# Names from all_players whose closest name in played is at least
# 3 edits away (the function's default threshold).
players_with_high_distance = find_players_with_high_distance(all_players, played)

print(players_with_high_distance)


['Kevin McCullar Jr.']
