## Assuming you've already ran 'Lineup_Merge.py' and have your progression data in a folder called 'output'

In [1]:
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Load Data Function

In [8]:


teams_list = ['Akron', 'Ball St.', 'Bowling Green', 'Buffalo', 'Central Mich.',
            'Eastern Mich.', 'Kent St.', 'Miami (OH)', 'NIU', 'Ohio',
            'Toledo', 'Western Mich.','Brown', 'Columbia', 'Cornell', 'Dartmouth',
            'Harvard', 'Penn', 'Princeton', 'Yale','Gonzaga', 'LMU (CA)', 'Oregon St.', 'Pacific',
            'Pepperdine', 'Portland', "Saint Mary's (CA)", 'San Diego',
            'San Francisco', 'Santa Clara', 'Washington St.']

CONFERENCE_MAPPING = {
    'MAC': ['Akron', 'Ball St.', 'Bowling Green', 'Buffalo', 'Central Mich.',
            'Eastern Mich.', 'Kent St.', 'Miami (OH)', 'NIU', 'Ohio',
            'Toledo', 'Western Mich.'],
    'IVY': ['Brown', 'Columbia', 'Cornell', 'Dartmouth',
            'Harvard', 'Penn', 'Princeton', 'Yale'],
    'WCC': ['Gonzaga', 'LMU (CA)', 'Oregon St.', 'Pacific',
            'Pepperdine', 'Portland', "Saint Mary's (CA)", 'San Diego',
            'San Francisco', 'Santa Clara', 'Washington St.']
}

def load_all_team_data(teams_list):
    """Load and merge all team progression and top lineup data"""
    # Initialize containers
    all_progression = []
    all_top_lineups = []
    TEAM_TO_CONFERENCE = {}
    for conf, teams in CONFERENCE_MAPPING.items():
        for t in teams:
            TEAM_TO_CONFERENCE[t] = conf
    for team in teams_list:
        # Load progression data
        prog_file = f"output/{team.replace('/', '_')}_progression.csv"  # Handle special chars
        if Path(prog_file).exists():
            prog_df = pd.read_csv(prog_file)
            prog_df['team'] = team
            # Add conference column using mapping
            prog_df['conference'] = TEAM_TO_CONFERENCE.get(team, 'Unknown')
            all_progression.append(prog_df)
        
        # Load top lineups data
        top_file = f"output/{team.replace('/', '_')}_top_lineups.csv"
        if Path(top_file).exists():
            top_df = pd.read_csv(top_file)
            top_df['team'] = team
            # Add conference column using mapping
            top_df['conference'] = TEAM_TO_CONFERENCE.get(team, 'Unknown')
            all_top_lineups.append(top_df)
        
    # Combine all data
    combined_progression = pd.concat(all_progression, ignore_index=True)
    #drop 'interval' column if it exists
    if 'interval' in combined_progression.columns:
        combined_progression.drop(columns=['interval'], inplace=True)
    combined_top_lineups = pd.concat(all_top_lineups, ignore_index=True)
    
    combined_progression.fillna(0, inplace=True)
    combined_top_lineups.fillna(0, inplace=True)
    
    return combined_progression, combined_top_lineups


### Load all data

In [9]:
combined_progression, combined_top_lineups = load_all_team_data(teams_list)

## Lineups with more than delta = 50 minutes in any given interval

In [12]:
delta = 50

lineups_over_50_per_interval = combined_progression[combined_progression['minutes'] > delta][['lineup','conference', 'team', 'interval_num', 'minutes', 'plusminus_per40']].sort_values(
    by='plusminus_per40', ascending=False
)


styled = lineups_over_50_per_interval.style.background_gradient(
    cmap = 'RdYlGn',
    subset = ['plusminus_per40']
).format(
    {
    'plusminus_per40': '{:.1f}',
    'minutes': '{:.0f}'
    }
)
display(styled)
print('Number of distinct teams : ',len(combined_progression['team'].unique()))
print('\n'*2)
print('Number of Lineups with over 50 minutes in an interval : ', len(lineups_over_50_per_interval))


Unnamed: 0,lineup,conference,team,interval_num,minutes,plusminus_per40
4605,HTUR-GAND-LCHA-SGLE-EROD,IVY,Harvard,3,68,44.4
6855,MMEE-AMAR-ESHE-MBUR-THUL,WCC,Portland,2,68,36.5
3823,MAVL-KHEN-RWEI-CCOL-SRAF,IVY,Columbia,3,54,24.6
6947,MMEE-AMAR-ESHE-MBUR-THUL,WCC,Portland,3,114,23.8
4991,ACHE-SBEL-OHUT-FTAL-PHIL,IVY,Princeton,1,56,23.6
6750,MMEE-AMAR-ESHE-MBUR-THUL,WCC,Portland,1,88,23.3
6315,AJAM-LSMI-SWAR-EELL-MRAD,WCC,Pacific,2,63,21.7
624,LFLE-AVEL-PKOH-EPOR-KMCG,MAC,Bowling Green,2,67,19.1
8490,EVIL-ATUH-JVIL-DMEN-TWAL,WCC,Washington St.,3,60,18.7
7006,MMEE-AMAR-ESHE-MBUR-THUL,WCC,Portland,4,147,17.7


Number of distinct teams :  31



Number of Lineups with over 50 minutes in an interval :  75


## Lineups with more than delta = 50 minutes in any given interval (sorted by interval)

In [16]:
delta = 50

lineups_over_50_per_interval = combined_progression[combined_progression['minutes'] > delta][['lineup', 'team', 'interval_num', 'minutes', 'plusminus_per40']].sort_values(
    by='interval_num', ascending=True
)


styled = lineups_over_50_per_interval.style.background_gradient(
    cmap = 'RdYlGn',
    subset = ['plusminus_per40']
).format(
    {
    'plusminus_per40': '{:.1f}',
    'minutes': '{:.0f}'
    }
)
display(styled)

print(lineups_over_50_per_interval.aggregate(
    {
        'minutes': ['mean','std','max']
    }
))
print('Number of distinct teams : ',len(combined_progression['team'].unique()))
print('\n'*2)
print('Number of Lineups with over 50 minutes in an interval : ', len(lineups_over_50_per_interval))

Unnamed: 0,lineup,team,interval_num,minutes,plusminus_per40
289,LAUS-ABEC-MBIS-ESTU-MKIE,Ball St.,1,69,-9.3
4166,VPAG-ZOZE-AELD-OAUS-CMEY,Dartmouth,1,73,-17.1
4029,AKIL-KLAN-CJAC-SPAR-EPAP,Cornell,1,72,3.9
3979,AKIL-KLAN-CJAC-RKAU-EPAP,Cornell,1,56,-0.7
5286,ALEE-MCHA-KCAP-MEGG-GTHY,Yale,1,70,-11.4
3648,KHEN-RWEI-PPAG-CCOL-SRAF,Columbia,1,101,-4.8
5593,ATUR-IBET-CO'C-YEJI-MHUI,Gonzaga,1,74,10.3
6068,KSHU-TBOL-CFER-AMAR-KREE,Oregon St.,1,56,13.5
6294,AJAM-LSMI-SWAR-EELL-MRAD,Pacific,1,71,-11.2
4422,HTUR-GAND-LCHA-SGLE-EROD,Harvard,1,112,2.9


         minutes
mean   76.311200
std    24.733915
max   154.890000
Number of distinct teams :  31



Number of Lineups with over 50 minutes in an interval :  75


## Team and Lineups with more than 50 minutes played full season

In [18]:
# First, compute total minutes per lineup
lineup_total_minutes = (
    combined_progression.groupby(['team', 'lineup'])['minutes']
    .sum()
    .reset_index()
    .rename(columns={'minutes': 'total_minutes'})
)

# Filter to only lineups that played more than 50 minutes total
full_season_lineups = lineup_total_minutes[lineup_total_minutes['total_minutes'] > delta]

# Count lineups per team
team_full_season_counts = (
    full_season_lineups.groupby('team')
    .size()
    .reset_index(name='num_lineups_over_50')
    .sort_values(by= 'num_lineups_over_50', ascending = True)
)

display(team_full_season_counts)
print('Average number of lineups with more than 50 minutes: ',team_full_season_counts['num_lineups_over_50'].mean().round(2))
print(team_full_season_counts.aggregate(
    {
        'num_lineups_over_50': ['mean','std','max']
    }
))


Unnamed: 0,team,num_lineups_over_50
29,Western Mich.,1
11,Harvard,1
30,Yale,2
14,Miami (OH),2
5,Central Mich.,2
16,Ohio,3
13,LMU (CA),3
12,Kent St.,3
22,Princeton,3
9,Eastern Mich.,3


Average number of lineups with more than 50 minutes:  3.9
      num_lineups_over_50
mean             3.903226
std              1.660418
max              8.000000


## Each Team's Top 2 Lineups (By Plusminus-Per40)

In [19]:
# Get top 2 lineups per team by plusminus_per40_mean
top2_per_team = (
    combined_top_lineups
    .sort_values(['team', 'plusminus_per40'], ascending=[True, False])
    .groupby('team')
    .head(2)
    [['team', 'lineup', 'minutes', 'plusminus_per40']]
    
)

# Style the table
styled_top2 = top2_per_team.style.background_gradient(
    cmap='RdYlGn',
    subset=['plusminus_per40']
).format({
    'plusminus_per40': '{:.1f}',
    'minutes': '{:.0f}',
})

display(styled_top2)

Unnamed: 0,team,lineup,minutes,plusminus_per40
9,Akron,EHAL-AMOB-SBRO-MVEJ-LTAP,33,14.6
5,Akron,EHAL-AMOB-SBRO-MVEJ-NCLA,42,11.4
20,Ball St.,LAUS-ABEC-GKIN-ARIC-MKIE,35,32.3
13,Ball St.,LAUS-ABEC-ARIC-ESTU-MKIE,90,23.2
31,Bowling Green,AVEL-JDON-PKOH-JFEA-EPOR,31,18.1
34,Bowling Green,AVEL-JDON-PKOH-EPOR-KMCG,24,13.5
154,Brown,IMAU-GARN-GPOW-AMOR-GAIE,19,19.3
149,Brown,IMAU-GARN-GPOW-OYOU-GAIE,47,13.7
43,Buffalo,CWAT-PLOP-THAR-KLEW-ADAV,44,41.8
47,Buffalo,LCOR-CWAT-THAR-SGIN-ADAV,31,28.5


## Get top 2 lineups per team by plusminus_per40, but only for lineups with at least 50 minutes played

In [21]:
# Get top 2 lineups per team by plusminus_per40, but only for lineups with at least 50 minutes played

min_minutes = 50

top2_per_team_50min = (
    combined_top_lineups[combined_top_lineups['minutes'] >= min_minutes]
    .sort_values(['team', 'plusminus_per40'], ascending=[True, False])
    .groupby('team')
    .head(2)
    [['team', 'lineup', 'minutes', 'plusminus_per40']]
)

# Style the table for better visualization
styled_top2_50min = top2_per_team_50min.style.background_gradient(
    cmap='RdYlGn',
    subset=['plusminus_per40']
).format({
    'plusminus_per40': '{:.1f}',
    'minutes': '{:.0f}',
})

display(styled_top2_50min)

Unnamed: 0,team,lineup,minutes,plusminus_per40
0,Akron,EHAL-AMOB-ZRAS-SBRO-NCLA,80,0.0
1,Akron,EHAL-ZRAS-SBRO-MVEJ-NCLA,80,-3.5
13,Ball St.,LAUS-ABEC-ARIC-ESTU-MKIE,90,23.2
12,Ball St.,LAUS-ABEC-MBIS-ARIC-MKIE,279,10.0
25,Bowling Green,LFLE-AVEL-PKOH-EPOR-KMCG,134,11.7
24,Bowling Green,LFLE-AVEL-JDON-PKOH-EPOR,197,6.9
144,Brown,IMAU-GARN-OYOU-ENEL-GAIE,103,8.9
146,Brown,IMAU-GARN-AMOR-OYOU-ENEL,57,1.4
38,Buffalo,LCOR-CWAT-THAR-KLEW-JBEA,104,18.0
40,Buffalo,LCOR-CWAT-THAR-KLEW-ASEA,97,16.9


## Conference-Wide Analysis Pipeline

In [50]:
import numpy as np
def analyze_conference(progression_df, top_lineups_df):
    """Complete analysis pipeline for WCC"""
    # 1. Filter to only include top lineups from each team
    top_lineup_names = top_lineups_df.groupby('team')['lineup'].unique()
    progression_top = progression_df[
        progression_df.apply(lambda x: x['lineup'] in top_lineup_names[x['team']], axis=1)
    ]
    
    # 2. Calculate stability metrics with weighted stats
    def weighted_mean(df):
        return np.average(df['plusminus_per40'],weights = df['minutes'])
    def weighted_std(df):
        avg = weighted_mean(df)
        variance = np.average((df['plusminus_per40'] - avg) ** 2, weights=df['minutes'])
        return np.sqrt(variance)

    stability_stats = progression_top.groupby(['team', 'lineup'],group_keys = False).apply(
        lambda x : pd.Series({
            'minutes_sum': x['minutes'].sum(),
            'minutes_count': len(x),
            'minutes_std': x['minutes'].std(),
            'plusminus_per40_mean': weighted_mean(x),
            'plusminus_per40_std': weighted_std(x),
            'netrating_mean': np.average(x['netrating'], weights=x['minutes']),
            'netrating_std': np.sqrt(np.average((x['netrating']-np.average(x['netrating'], weights=x['minutes']))**2, 
                                              weights=x['minutes']))
        })
    ).reset_index()

    # Flatten multi-index columns
   # stability_stats.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col for col in stability_stats.columns.values]
    
    # 3. Calculate coefficient of variation
    stability_stats['plusminus_per40_cv'] = (stability_stats['plusminus_per40_std'] / 
                                     stability_stats['plusminus_per40_mean']).abs().fillna(0)
    stability_stats['netrating_cv'] = (stability_stats['netrating_std'] /
                                     stability_stats['netrating_mean']).abs().fillna(0)
    
    
    # 4. Determine minimum sample sizes
    min_samples = []
    for (team, lineup), group in progression_top.groupby(['team', 'lineup']):
        # Calculate PROPER cumulative plusminus_per40
        cum_minutes = group['minutes'].cumsum()
        cum_plusminus = group['plusminus'].cumsum()
        cum_p40 = (cum_plusminus/cum_minutes)*40
        
        # Check for stabilization
        stabilized = False
        # Iterate through cumulative minutes and p40
        for i in range(1, len(cum_minutes)):
            if abs(cum_p40.iloc[i] - cum_p40.iloc[i-1]) < 5 and abs(cum_minutes.iloc[i]-cum_minutes.iloc[i-1])>2:  # <5 p40 change and more than 2 minutes change
                min_samples.append({
                    'team': team,
                    'lineup': lineup,
                    'stabilized_at': cum_minutes.iloc[i],
                    'stabilized_p40': cum_p40.iloc[i],
                    'final_plusminus_per40': (group['plusminus'].sum() / group['minutes'].sum()) * 40,
                    'total_minutes': cum_minutes.iloc[-1]
                })
                stabilized = True
                #print(f"Stabilized: {team} - {lineup} at {cum_minutes.iloc[i]} minutes with net rating {cum_p40.iloc[-1]}")
                break
        
        
        
        min_samples_df = pd.DataFrame(min_samples)
    # Calculate % of lineups that stabilized
    num_stabilized = len(min_samples)
    num_total = stability_stats.shape[0]
    percent_stabilized = round(100 * num_stabilized / num_total, 2)
    print(f"Percentage of lineups that stabilized: {percent_stabilized}%")

    return stability_stats, min_samples_df



## Run analysis pipeline - Min_Samples Table

In [51]:
stability_stats, min_samples = analyze_conference(combined_progression, combined_top_lineups)
numerical_cols = stability_stats.select_dtypes(include=['float64']).columns
stability_stats[numerical_cols] = stability_stats[numerical_cols].round(2)

numerical_cols = min_samples.select_dtypes(include=['float64']).columns
min_samples[numerical_cols] = min_samples[numerical_cols].round(2)
display(min_samples)
#print('\n',min_samples.iloc[149])

Percentage of lineups that stabilized: 56.45%


  avg = avg_as_array = np.multiply(a, wgt,
  stability_stats = progression_top.groupby(['team', 'lineup'],group_keys = False).apply(


Unnamed: 0,team,lineup,stabilized_at,stabilized_p40,final_plusminus_per40,total_minutes
0,Akron,EHAL-AMOB-SBRO-MVEJ-NCLA,42.13,11.39,11.39,42.13
1,Akron,EHAL-AMOB-ZRAS-MVEJ-NCLA,47.51,-25.26,-25.26,47.51
2,Akron,EHAL-AMOB-ZRAS-SBRO-LTAP,50.36,-6.35,-8.22,72.95
3,Akron,EHAL-AMOB-ZRAS-SBRO-NCLA,49.15,-4.07,0.00,79.56
4,Akron,EHAL-ZRAS-SBRO-MVEJ-LTAP,36.08,9.98,9.98,36.08
...,...,...,...,...,...,...
205,Western Mich.,MWAG-MASE-AKOU-HSPI-ECAR,22.97,-24.38,-24.38,22.97
206,Yale,ALEE-CMOO-KODU-MEGG-AGUI,17.82,-13.47,-13.47,17.82
207,Yale,ALEE-KCAP-KODU-MEGG-GTHY,24.21,-19.83,-18.79,29.81
208,Yale,ALEE-MCHA-CMOO-KCAP-MEGG,53.54,-17.18,-17.18,53.54


### In case you want to reference the progression stats

In [35]:

display(combined_progression)

#print('\n',combined_progression.iloc[5075])
#print('\n',combined_progression.iloc[5093])
#print('\n',combined_progression.iloc[5222])
#print('\n',combined_progression.iloc[5249])

Unnamed: 0,lineup,interval_num,possessions,minutes,plusminus_per40,netrating,plusminus,team,conference
0,EHAL-AMOB-SBRO-MVEJ-NCLA,1,1.56,0.82,0.00,0.00,0,Akron,MAC
1,KRHO-ZRAS-TCLA-SBRO-LTAP,1,7.40,3.83,-20.89,-27.03,-2,Akron,MAC
2,KRHO-ZRAS-TCLA-MVEJ-LTAP,1,10.20,5.97,-53.60,-78.47,-8,Akron,MAC
3,EHAL-ZRAS-TCLA-SBRO-NCLA,1,0.40,0.00,inf,500.00,2,Akron,MAC
4,EHAL-ZRAS-SBRO-MVEJ-LTAP,1,0.40,0.27,296.30,500.00,2,Akron,MAC
...,...,...,...,...,...,...,...,...,...
8565,EVIL-MALS-CABR-JVIL-DMEN,4,6.50,3.82,41.88,61.54,4,Washington St.,WCC
8566,EVIL-ATUH-CABR-JVIL-TWAL,4,17.73,10.16,66.93,95.87,17,Washington St.,WCC
8567,EVIL-ATUH-JVIL-DMEN-TWAL,4,97.80,56.27,11.37,16.36,16,Washington St.,WCC
8568,ATUH-KGAR-CABR-JVIL-TWAL,4,9.29,5.65,0.00,0.00,0,Washington St.,WCC


### In case you want to reference top_lineup stats

In [36]:
display(combined_top_lineups)

Unnamed: 0,lineup,possessions,minutes,plusminus,plusminus_per40,netrating,offrating,defrating,team,conference
0,EHAL-AMOB-ZRAS-SBRO-NCLA,134.63,79.56,0,0.00,0.00,89.14,89.14,Akron,MAC
1,EHAL-ZRAS-SBRO-MVEJ-NCLA,131.04,80.24,-7,-3.49,-5.34,84.71,90.05,Akron,MAC
2,EHAL-AMOB-ZRAS-SBRO-LTAP,124.94,72.95,-15,-8.22,-12.01,87.25,99.25,Akron,MAC
3,EHAL-AMOB-ZRAS-MVEJ-LTAP,94.88,54.97,-32,-23.29,-33.73,95.91,129.64,Akron,MAC
4,EHAL-AMOB-ZRAS-MVEJ-NCLA,82.41,47.51,-30,-25.26,-36.41,61.89,98.29,Akron,MAC
...,...,...,...,...,...,...,...,...,...,...
367,EVIL-ATUH-MALS-DMEN-TWAL,43.34,26.22,14,21.36,32.30,129.20,96.90,Washington St.,WCC
368,EVIL-ATUH-MALS-TWAL-ACOV,40.52,24.84,-1,-1.61,-2.47,96.24,98.71,Washington St.,WCC
369,EVIL-KGAR-CABR-DMEN-TWAL,31.01,18.33,5,10.91,16.12,99.97,83.85,Washington St.,WCC
370,ATUH-CABR-JVIL-TWAL-ACOV,29.90,16.34,-4,-9.79,-13.38,70.23,83.61,Washington St.,WCC


### Let's Look at lineups stabilized with at least 35 minutes and the DIFF between stabilized p40 and final_p40

In [38]:
min_samples35 = min_samples[min_samples['stabilized_at']>35]
min_samples35['p40_diff'] = min_samples35['final_plusminus_per40'] - min_samples35['stabilized_p40']
min_samples35['p40_diff'] = min_samples35['p40_diff'].round(2)
display(min_samples35.sort_values(by='total_minutes',ascending = False))
print(min_samples35.aggregate(
    {'p40_diff':['mean','std','max']
    }
))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_samples35['p40_diff'] = min_samples35['final_plusminus_per40'] - min_samples35['stabilized_p40']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_samples35['p40_diff'] = min_samples35['p40_diff'].round(2)


Unnamed: 0,team,lineup,stabilized_at,stabilized_p40,final_plusminus_per40,total_minutes,p40_diff
141,Portland,MMEE-AMAR-ESHE-MBUR-THUL,269.88,26.83,23.62,416.59,-3.21
113,Oregon St.,KSHU-TBOL-CFER-AMAR-KREE,269.14,-2.38,0.80,348.67,3.18
76,LMU (CA),BWIL-NEVA-ACLA-MHER-CHEI,317.64,-4.91,-4.91,317.64,0.00
70,Kent St.,DGRA-MBAB-JTYL-JBAT-BDUN,192.89,9.95,10.07,285.89,0.12
64,Harvard,HTUR-GAND-LCHA-SGLE-EROD,132.86,6.02,14.85,269.28,8.83
...,...,...,...,...,...,...,...
4,Akron,EHAL-ZRAS-SBRO-MVEJ-LTAP,36.08,9.98,9.98,36.08,0.00
54,Eastern Mich.,MAMA-OSMI-KLEW-OWES-SELE,35.97,6.67,6.67,35.97,0.00
42,Cornell,AKIL-KLAN-PENG-SPAR-RKAU,35.44,-15.80,-15.80,35.44,0.00
164,San Diego,ARAN-HRHO-JRHO-DMOO-KHOR,35.40,-24.86,-24.86,35.40,0.00


       p40_diff
mean  -0.502500
std    4.480154
max   13.950000


## Stability Stats

In [39]:
display(stability_stats)

Unnamed: 0,team,lineup,minutes_sum,minutes_count,minutes_std,plusminus_per40_mean,plusminus_per40_std,netrating_mean,netrating_std,plusminus_per40_cv,netrating_cv
0,Akron,EHAL-AMOB-SBRO-MVEJ-LTAP,32.87,3.0,10.39,14.60,37.92,25.70,41.43,2.60,1.61
1,Akron,EHAL-AMOB-SBRO-MVEJ-NCLA,42.13,4.0,9.23,11.39,13.26,16.30,19.38,1.16,1.19
2,Akron,EHAL-AMOB-ZRAS-MVEJ-LTAP,54.97,4.0,10.28,-23.28,44.17,-31.77,58.03,1.90,1.83
3,Akron,EHAL-AMOB-ZRAS-MVEJ-NCLA,47.51,4.0,7.45,-25.26,29.25,-38.32,41.90,1.16,1.09
4,Akron,EHAL-AMOB-ZRAS-NCLA-LTAP,28.46,3.0,13.36,2.81,1.06,4.56,1.72,0.38,0.38
...,...,...,...,...,...,...,...,...,...,...,...
367,Yale,ALEE-MCHA-KODU-MEGG-AGUI,23.01,3.0,11.56,24.34,28.37,31.92,46.07,1.17,1.44
368,Yale,ALON-MCHA-KCAP-MEGG-GTHY,15.10,2.0,7.24,-21.19,19.55,-36.06,43.29,0.92,1.20
369,Yale,MCHA-CMOO-KCAP-KODU-MEGG,15.75,1.0,,-33.02,0.00,-47.23,0.00,0.00,0.00
370,Yale,MCHA-CMOO-KCAP-MEGG-MSCH,14.53,2.0,3.44,30.28,13.69,38.62,16.42,0.45,0.43


### In case you want to reference progression stats

In [40]:
display(combined_progression)

Unnamed: 0,lineup,interval_num,possessions,minutes,plusminus_per40,netrating,plusminus,team,conference
0,EHAL-AMOB-SBRO-MVEJ-NCLA,1,1.56,0.82,0.00,0.00,0,Akron,MAC
1,KRHO-ZRAS-TCLA-SBRO-LTAP,1,7.40,3.83,-20.89,-27.03,-2,Akron,MAC
2,KRHO-ZRAS-TCLA-MVEJ-LTAP,1,10.20,5.97,-53.60,-78.47,-8,Akron,MAC
3,EHAL-ZRAS-TCLA-SBRO-NCLA,1,0.40,0.00,inf,500.00,2,Akron,MAC
4,EHAL-ZRAS-SBRO-MVEJ-LTAP,1,0.40,0.27,296.30,500.00,2,Akron,MAC
...,...,...,...,...,...,...,...,...,...
8565,EVIL-MALS-CABR-JVIL-DMEN,4,6.50,3.82,41.88,61.54,4,Washington St.,WCC
8566,EVIL-ATUH-CABR-JVIL-TWAL,4,17.73,10.16,66.93,95.87,17,Washington St.,WCC
8567,EVIL-ATUH-JVIL-DMEN-TWAL,4,97.80,56.27,11.37,16.36,16,Washington St.,WCC
8568,ATUH-KGAR-CABR-JVIL-TWAL,4,9.29,5.65,0.00,0.00,0,Washington St.,WCC


## Generate Key Insights

In [52]:
def generate_wcc_insights(stats_df, min_samples_df):
    """Print actionable conference insights"""
    # Conference averages
    # Calculate without extreme values
    q1 = stats_df['plusminus_per40_cv'].quantile(0.25)
    q3 = stats_df['plusminus_per40_cv'].quantile(0.75)
    iqr = q3 - q1
    filtered = stats_df[stats_df['plusminus_per40_cv'] <= (q3 + 1.5*iqr)]
    #print(filtered['plusminus_per40_cv'].median())


    avg_stabilization = min_samples_df['stabilized_at'].median()
    avg_cv = filtered['plusminus_per40_cv'].median()

    print(f"=== WCC Conference Insights ===\n")
    print(f"1. Lineups typically stabilize after {avg_stabilization:.0f} minutes")
    print(f"   - Median coefficient of variation: {avg_cv:.2f}")
    
    # Team rankings
    team_stability = filtered.groupby('team')['plusminus_per40_cv'].mean().sort_values()
    print("\n2. Teams by lineup stability (lower CV is better):")
    print(team_stability.round(2))
    
    # Sample size recommendations
    print("\n3. Evaluation guidelines:")
    print(f"   - <{avg_stabilization:.0f} minutes: Insufficient sample")
    print(f"   - {avg_stabilization:.0f}-100 minutes: Preliminary assessment")
    print("   - 100+ minutes: Reliable evaluation")



In [53]:
generate_wcc_insights(stability_stats, min_samples)

=== WCC Conference Insights ===

1. Lineups typically stabilize after 35 minutes
   - Median coefficient of variation: 1.17

2. Teams by lineup stability (lower CV is better):
team
Princeton            0.80
San Francisco        1.01
Buffalo              1.07
Harvard              1.10
San Diego            1.15
Bowling Green        1.19
NIU                  1.22
Miami (OH)           1.26
Ohio                 1.28
Cornell              1.31
Dartmouth            1.36
Akron                1.38
Yale                 1.53
Eastern Mich.        1.54
Saint Mary's (CA)    1.57
Gonzaga              1.60
Santa Clara          1.71
Columbia             1.73
Western Mich.        1.75
Pepperdine           1.81
Ball St.             1.85
Toledo               2.01
Washington St.       2.03
Portland             2.04
Pacific              2.23
Oregon St.           2.27
Brown                2.30
LMU (CA)             2.32
Central Mich.        2.34
Penn                 2.41
Kent St.             2.57
Name: plusmin

## "Use" Actionable Insights

#### Note that there are 372 distinct lineups in this dataset

In [59]:
#lineups with more than avg_stabilization minutes played full season sorted by plusminus_per40 and with a cv of less than 3
avg_stabilization = 35

lineups_over_50_full_season = stability_stats[(stability_stats['minutes_sum'] > avg_stabilization) & (stability_stats['plusminus_per40_cv'] < 3)][['lineup', 'team', 'minutes_sum', 'plusminus_per40_mean','plusminus_per40_cv']]
lineups_over_50_full_season_sorted = lineups_over_50_full_season.sort_values(by='plusminus_per40_mean', ascending=False)
styled_full_season_sorted = lineups_over_50_full_season_sorted.style.background_gradient(
    cmap='RdYlGn',
    subset=['plusminus_per40_mean']
). format({
    'plusminus_per40_mean': '{:.1f}',
    'minutes_sum': '{:.0f}',
    'plusminus_per40_cv': '{:.1f}'
})
display(styled_full_season_sorted)

print("\n Number of lineups in this criteria: ", len(lineups_over_50_full_season))
print("'%' of lineups that meet this criteria", round(len(lineups_over_50_full_season)/372*100,2),'%')

Unnamed: 0,lineup,team,minutes_sum,plusminus_per40_mean,plusminus_per40_cv
132,AROC-HTUR-GAND-LCHA-EROD,Harvard,43,49.9,0.8
262,RMOG-AMAR-ESHE-MBUR-LSPE,Portland,40,48.6,0.8
140,KWHI-HTUR-GAND-SGLE-EROD,Harvard,44,42.2,0.4
50,CWAT-PLOP-THAR-KLEW-ADAV,Buffalo,44,41.8,0.6
127,ATUR-TDAL-CO'C-YEJI-ELIT,Gonzaga,87,30.4,0.1
338,EVIL-ATUH-CABR-JVIL-TWAL,Washington St.,37,30.1,1.0
301,EPAP-EPOR-AZIA-DSAN-FWER,San Francisco,37,30.1,0.8
142,KWHI-HTUR-SGLE-AWRI-EROD,Harvard,47,29.2,0.2
328,DROB-KGOS-SMIK-FFED-NGAR,Toledo,38,24.3,0.4
325,DROB-KCAR-KGOS-SMIK-JCOO,Toledo,70,24.1,1.0



 Number of lineups in this criteria:  141
'%' of lineups that meet this criteria 37.9 %


## Lastly, just for fun, let's look at team combo pairs

In [62]:
all_team_pairs = []

for team, group in stability_stats.groupby('team'):
    # Step 1: Get top 2 lineups by minutes for this team
    top_2_lineups = (
        group.sort_values('minutes_sum', ascending=False)
        .head(2)['lineup'].tolist()
    )
    top_2_data = group[group['lineup'].isin(top_2_lineups)]

    # Step 2: Get players in top 2 lineups (ordered, deduped)
    top_players_ordered = []
    for lu in top_2_lineups:
        top_players_ordered += lu.split('-')
    top_players_ordered = list(dict.fromkeys(top_players_ordered))

    # Step 3: Build player pair combos for all top 2 lineups
    player_combos = []
    for _, row in top_2_data.iterrows():
        players = row['lineup'].split('-')
        for i in range(len(players)):
            for j in range(i+1, len(players)):
                player_combos.append({
                    'team': team,
                    'player1': players[i],
                    'player2': players[j],
                    'plusminus_per40': row['plusminus_per40_mean'],
                    'minutes': row['minutes_sum']
                })
    combo_df = pd.DataFrame(player_combos)
    if combo_df.empty:
        continue
    combo_df = combo_df[combo_df['minutes']>150]
    # Step 4: Aggregate across all appearances (mean plusminus, sum minutes)
    top_pairs = (
        combo_df.groupby(['team', 'player1', 'player2'])
        .agg({'plusminus_per40': 'mean', 'minutes': 'sum'})
        .reset_index()
    )

    # Step 5: Filter to pairs where both players are in top 2 lineups
    filtered_pairs = top_pairs[
        top_pairs['player1'].isin(top_players_ordered) &
        top_pairs['player2'].isin(top_players_ordered)
    ].copy()

    # Step 6: Sort by player1’s appearance order
    filtered_pairs['player1'] = pd.Categorical(filtered_pairs['player1'],
                                               categories=top_players_ordered,
                                               ordered=True)
    filtered_pairs = filtered_pairs.sort_values(['player1', 'minutes'], ascending=[True, False])

    all_team_pairs.append(filtered_pairs)

# Combine all teams
final_pairs = pd.concat(all_team_pairs, ignore_index=True)

# Style and display
styled_pairs = final_pairs.style.background_gradient(
    cmap='RdYlGn',
    subset=['plusminus_per40']
).format({
    'plusminus_per40': '{:.1f}',
    'minutes': '{:.0f}',
})

display(styled_pairs)

Unnamed: 0,team,player1,player2,plusminus_per40,minutes
0,Ball St.,LAUS,ABEC,10.0,279
1,Ball St.,LAUS,ARIC,10.0,279
2,Ball St.,LAUS,MBIS,10.0,279
3,Ball St.,LAUS,MKIE,10.0,279
4,Ball St.,ABEC,ARIC,10.0,279
5,Ball St.,ABEC,MBIS,10.0,279
6,Ball St.,ABEC,MKIE,10.0,279
7,Ball St.,MBIS,ARIC,10.0,279
8,Ball St.,MBIS,MKIE,10.0,279
9,Ball St.,ARIC,MKIE,10.0,279


## Save Results

In [30]:

stability_stats.to_csv('output/wcc_stability_stats.csv', index=False)
min_samples.to_csv('output/wcc_min_samples.csv', index=False)