## Ken Pom Stats
- using the kenpompy library to load advanced stats that aren't in the Kaggle dataset
- extracting average height, effective height, experience, adjusted tempo, and adjusted offensive and defensive efficiency (the adjusted figures account for strength of schedule)
- requires a KenPom subscription to run. Create a file `login.txt` in the parent directory of this notebook (it is read from `../login.txt`), with your account email on line 1 and your password on line 2
- this notebook writes one `height-exp-eff-<year>.csv` file per season into the `data_2025/kp/` folder, which is used in `data_preprocessing.ipynb` to add KenPom features to our dataset

In [2]:
# Read the KenPom credentials from the login file:
# the first line holds the account email, the second line the password.
with open("../login.txt", "r") as login_file:
    email, password = (login_file.readline().strip() for _ in range(2))

In [None]:
import os

import pandas as pd
from kenpompy.summary import get_height, get_efficiency
from kenpompy.utils import login

# Spelling table: alternative team-name spellings and the TeamID each one refers to.
# The spellings are normalized exactly like the KenPom names below (periods removed,
# lowercased) so the two can be matched by plain string equality.
team_spellings = pd.read_csv("../data_2025/MTeamSpellings.csv", encoding="Windows-1252")
team_spellings['TeamNameSpelling'] = team_spellings['TeamNameSpelling'].str.replace('.', '', regex=False).str.lower()
# Normalization can make two spellings identical; keep the first so each spelling
# resolves to a single TeamID.
team_spellings = team_spellings.drop_duplicates(subset=['TeamNameSpelling'], keep='first')

# Lookup Series (normalized spelling -> TeamID). It does not depend on the season,
# so build it once instead of on every loop iteration.
spelling_to_id = team_spellings.set_index('TeamNameSpelling')['TeamID']

# to_csv does not create missing directories, so make sure the output folder exists.
kp_dir = "../data_2025/kp"
os.makedirs(kp_dir, exist_ok=True)

years = [str(yr) for yr in range(2008, 2024)]
browser = login(email, password)
for yr in years:
    height_df = get_height(browser, yr)[["Team", "AvgHgt", "EffHgt", "Experience"]]
    eff_df = get_efficiency(browser=browser, season=yr)[["Team", "Tempo-Adj", "Off. Efficiency-Adj", "Def. Efficiency-Adj"]]

    # Outer join so a team present in only one of the two tables is still kept.
    merged_df = pd.merge(height_df, eff_df, on="Team", how="outer")

    # Normalize the KenPom team names the same way as the spelling table.
    merged_df['Team'] = merged_df['Team'].str.replace('.', '', regex=False).str.lower()

    # Replace each name with its TeamID; names with no match keep their normalized
    # text so they can be found and patched by hand afterwards.
    # NOTE(review): when some names are unmatched the matched IDs are likely written
    # as floats (e.g. 1104.0) - confirm the downstream notebook handles that.
    merged_df['Team'] = merged_df['Team'].map(spelling_to_id).fillna(merged_df['Team'])

    # One CSV per season.
    merged_df.to_csv(f"{kp_dir}/height-exp-eff-{yr}.csv", index=False)


In [None]:
import os
import pandas as pd

# Gather every distinct value of the "Team" column across the exported KenPom CSVs.
data_dir = "../data_2025/kp/"
team_names = set()

for csv_name in os.listdir(data_dir):
    # Skip anything in the folder that is not a CSV export.
    if not csv_name.endswith(".csv"):
        continue

    csv_path = os.path.join(data_dir, csv_name)
    try:
        df = pd.read_csv(csv_path)
        if "Team" not in df.columns:
            print(f"Warning: 'Team' column not found in {csv_name}")
        else:
            team_names |= set(df["Team"].tolist())
    except Exception as e:
        # Best effort: report the problem file and carry on with the rest.
        print(f"Error reading {csv_name}: {e}")

In [None]:
# manual dictionary of team -> ids for where the auto mapping failed
m = {"arkansas pine bluff": 1115, "bethune cookman": 1126, "cal st bakersfield": 1167, "illinois chicago": 1227, "liu": 1254, "louisiana monroe": 1419, "texas a&m corpus chris": 1394}

# Patch every exported season file in place with the manual name -> ID overrides.
for filename in os.listdir(data_dir):
    # Only process CSV exports: other directory entries (hidden files, checkpoint
    # folders, ...) cannot be parsed by read_csv and would abort the loop.
    if not filename.endswith(".csv"):
        continue

    filepath = os.path.join(data_dir, filename)
    df = pd.read_csv(filepath)

    # A CSV without a "Team" column has nothing to patch; leave it untouched.
    if "Team" not in df.columns:
        print(f"Warning: 'Team' column not found in {filename}")
        continue

    df["Team"] = df["Team"].replace(m)  # Replace team names with IDs
    df.to_csv(filepath, index=False)  # Save the modified DataFrame back to the file