In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import LinearSegmentedColormap

# 1. Global Visualization Style
# Apply the seaborn style FIRST. sns.set_style() rewrites a fixed set of
# rcParams, including "font.family" and the "axes.spines.*" flags, so calling
# it after rcParams.update() would silently undo the overrides below.
sns.set_style("whitegrid")
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.labelsize": 12,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "grid.alpha": 0.25,
})

# 2. Data Loading and Aggregation
# One row per game, with paired *_home / *_away columns.
df = pd.read_csv("game.csv")

def expand_games(df):
    """Convert each game into two records (home and away teams).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game. Reads the ``*_home`` / ``*_away`` variants of
        ``team_abbreviation``, ``pts``, ``fga`` and ``fg_pct``.

    Returns
    -------
    pandas.DataFrame
        Columns ``team``, ``points``, ``fga``, ``fg_pct`` and ``win``.
        Each kept game contributes two consecutive rows (home first, then
        away), in the original game order. Games where either side's FG% is
        missing are dropped. ``win`` is 1 only for a strictly higher score,
        so a tie yields 0 for both sides. An input with no usable games
        returns an empty frame that still carries the five columns, so a
        later ``groupby("team")`` does not fail on a missing key.
    """
    columns = ["team", "points", "fga", "fg_pct", "win"]

    # Without both FG% columns no game can qualify.
    if "fg_pct_home" not in df.columns or "fg_pct_away" not in df.columns:
        return pd.DataFrame(columns=columns)

    valid = df[df["fg_pct_home"].notna() & df["fg_pct_away"].notna()]
    if valid.empty:
        return pd.DataFrame(columns=columns)

    def one_side(side, other):
        """Build the per-team rows for `side`, scored against `other`."""
        return pd.DataFrame({
            "team": valid[f"team_abbreviation_{side}"].to_numpy(),
            "points": valid[f"pts_{side}"].to_numpy(),
            "fga": valid[f"fga_{side}"].to_numpy(),
            "fg_pct": valid[f"fg_pct_{side}"].to_numpy(),
            "win": (valid[f"pts_{side}"] > valid[f"pts_{other}"]).to_numpy().astype(int),
        })

    # Both halves are indexed 0..n-1; a stable sort on that index interleaves
    # them as home, away, home, away, ... exactly like the row-by-row version.
    stacked = pd.concat([one_side("home", "away"), one_side("away", "home")])
    return stacked.sort_index(kind="mergesort").reset_index(drop=True)

# Long-format table: one row per team per game.
games = expand_games(df)

# Per-team averages over every game in the table
# (win_pct is the mean of the 0/1 win flag).
team_aggregations = {
    "ppg": ("points", "mean"),
    "fga_pg": ("fga", "mean"),
    "fg_pct": ("fg_pct", "mean"),
    "win_pct": ("win", "mean"),
}
team = games.groupby("team", as_index=False).agg(**team_aggregations)

# 3. Figure
fig, ax = plt.subplots(figsize=(7.5, 6))

# Axis reference values: the medians split the plot into quadrants,
# the extremes anchor the text labels.
shots, accuracy = team["fga_pg"], team["fg_pct"]
fga_min, fga_med, fga_max = shots.min(), shots.median(), shots.max()
fg_min, fg_pct_med, fg_max = accuracy.min(), accuracy.median(), accuracy.max()

# Custom colormap, interpolated through the three listed colours
cmap_stops = ["#585858", "#fcef00", "#FE7302"]
gray_yellow_cmap = LinearSegmentedColormap.from_list("gray_khaki_yellow", cmap_stops)

# One marker per team, coloured by win percentage
sc = ax.scatter(
    shots, accuracy,
    c=team["win_pct"], cmap=gray_yellow_cmap,
    s=80, alpha=0.9,
    edgecolors="black", linewidth=0.6,
    zorder=2,
)

# Dashed median lines, drawn below the markers
median_rule = dict(ls="--", color="gray", alpha=0.7, zorder=1)
ax.axvline(fga_med, **median_rule)
ax.axhline(fg_pct_med, **median_rule)

# Captions for the median lines, nudged slightly off each line
median_caption = dict(fontsize=9, color="gray", va="bottom")
ax.text(fga_med + 0.5, fg_min, f"Median FGA/G = {fga_med:.1f}", rotation=90, **median_caption)
ax.text(fga_min, fg_pct_med + 0.005, f"Median FG% = {fg_pct_med:.1%}", **median_caption)

# Mean win % in the two quadrants being compared
# Upper-left: FG% at or above the median, FGA below the median
mask_ul = shots.lt(fga_med) & accuracy.ge(fg_pct_med)
# Lower-right: FG% below the median, FGA at or above the median
mask_lr = shots.ge(fga_med) & accuracy.lt(fg_pct_med)

mean_win_ul = team["win_pct"][mask_ul].mean()
mean_win_lr = team["win_pct"][mask_lr].mean()

# Text anchors: midpoint between the median and the data extreme on each axis
ul_center_x, ul_center_y = (fga_min + fga_med) / 2, (fg_pct_med + fg_max) / 2
lr_center_x, lr_center_y = (fga_med + fga_max) / 2, (fg_min + fg_pct_med) / 2

quadrant_notes = [
    (ul_center_x, ul_center_y, "High FG%, Low FGA", mean_win_ul, "lemonchiffon"),
    (lr_center_x, lr_center_y, "Low FG%, High FGA", mean_win_lr, "lightgray"),
]
for note_x, note_y, headline, mean_win, box_color in quadrant_notes:
    ax.text(
        note_x, note_y,
        f"{headline}\nAvg Win%: {mean_win:.0%}",
        ha="center", va="center",
        fontsize=10,
        bbox=dict(boxstyle="round", facecolor=box_color, alpha=0.8, edgecolor="none"),
        zorder=3,
    )

# Titles and axes
ax.set(
    title="Offensive efficiency: Better shots vs more shots",
    xlabel="Field Goal Attempts per Game (FGA/G)",
    ylabel="Field Goal Percentage (FG%)",
)
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
ax.grid(True, alpha=0.3)

# Colorbar
cbar = fig.colorbar(sc, ax=ax)
cbar.set_label("Win Percentage")

# Pearson correlations with win % (computed here but not drawn on the figure)
win_pct_values = team["win_pct"]
corr_fg = np.corrcoef(accuracy, win_pct_values)[0, 1]
corr_fga = np.corrcoef(shots, win_pct_values)[0, 1]

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")

# --- Load datasets ---
line_score = pd.read_csv("csv/line_score.csv")
game_data = pd.read_csv("csv/game.csv")
team_details = pd.read_csv("csv/team_details.csv")
team_history = pd.read_csv("csv/team_history.csv")
team_win_rate = pd.read_csv("csv/final_team_win_rate.csv")

# --- Normalise column names ---
# The loop variable is deliberately not called `df`, so a notebook-level `df`
# is not silently rebound to one of these tables.
for frame in [line_score, game_data, team_details, team_history, team_win_rate]:
    frame.columns = frame.columns.str.lower()

# An NBA season spans two calendar years, so the label has to come from the
# full date, not just the year: games from September onwards belong to the
# season that STARTS that year, earlier games to the season that started the
# year before.
# NOTE(review): games played in Sep-Oct 2020 (end of the 2019-20 season) still
# land in "2020-21"; no month cutoff can separate them from a normal October
# season start.
SEASON_ROLLOVER_MONTH = 9
FIRST_SEASON_START_YEAR = 1946  # earlier dates get no season label


def _season_label(game_date):
    """Return the 'YYYY-YY' season label for a timestamp, or None if unknown."""
    if pd.isna(game_date):
        return None
    if game_date.month >= SEASON_ROLLOVER_MONTH:
        start_year = game_date.year
    else:
        start_year = game_date.year - 1
    if start_year < FIRST_SEASON_START_YEAR:
        return None
    return f"{start_year}-{(start_year + 1) % 100:02d}"


# --- If 'season' column missing, create one ---
# NOTE(review): confirm csv/final_team_win_rate.csv labels its seasons the
# same way, otherwise the later merge on ['season', 'team_id'] pairs offence
# and win rate from different periods.
for frame in [line_score, game_data]:
    if 'season' not in frame.columns:
        date_col = 'game_date_est' if 'game_date_est' in frame.columns else 'game_date'
        frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')
        frame['year'] = frame[date_col].dt.year  # calendar year; NaN for unparseable dates
        # Built from the timestamp rather than from `year`: that column turns
        # float when any date is missing, which would corrupt the label text.
        frame['season'] = frame[date_col].map(_season_label)

# --- Compute multiple offensive metrics from game data ---
# Final column name -> stem of the per-side game column (stem + "_home" / "_away").
offense_stems = {
    'avg_pts': 'pts',
    'fg_pct': 'fg_pct',
    'fg3_pct': 'fg3_pct',
    'ft_pct': 'ft_pct',
    'avg_ast': 'ast',
    'avg_oreb': 'oreb',
    'avg_tov': 'tov',
}


def _side_means(side):
    """Season x team mean of every offensive stat for games played on `side`."""
    stat_columns = [f'{stem}_{side}' for stem in offense_stems.values()]
    return (
        game_data.groupby(['season', f'team_id_{side}'])[stat_columns]
        .mean()
        .reset_index()
        .rename(columns={f'team_id_{side}': 'team_id'})
    )


def _to_final_names(side):
    """Rename map from the `side` stat columns to the final column names."""
    return {f'{stem}_{side}': final for final, stem in offense_stems.items()}


home_metrics = _side_means('home')
away_metrics = _side_means('away')

# --- Combine home and away data ---
# Stack the home and away means under shared column names, then average the
# two rows of each (season, team). A team with rows on only one side keeps
# that side's value.
final_columns = ['season', 'team_id', 'avg_pts', 'fg_pct', 'fg3_pct', 'ft_pct', 'avg_ast', 'avg_oreb', 'avg_tov']
team_offense = pd.concat(
    [
        home_metrics.rename(columns=_to_final_names('home')),
        away_metrics.rename(columns=_to_final_names('away')),
    ],
    ignore_index=True,
)
team_offense = team_offense.groupby(['season', 'team_id']).mean().reset_index()

# --- Select final columns ---
team_offense = team_offense[final_columns]

# --- Merge with win% data ---
team_perf = team_offense.merge(team_win_rate, on=['season', 'team_id'], how='left')

# --- Combine active + historical team info ---
# For historical teams the city is stored in the 'abbreviation' column.
# Current teams are listed first, so they win when duplicates are dropped.
current_names = team_details[['team_id', 'nickname', 'abbreviation']]
past_names = team_history[['team_id', 'nickname', 'city']].rename(columns={'city': 'abbreviation'})
all_teams = pd.concat([current_names, past_names], ignore_index=True)
all_teams = all_teams.drop_duplicates(subset=['team_id'])

# --- Merge team names ---
team_perf = pd.merge(team_perf, all_teams, on='team_id', how='left')

# --- Filter to 1980 and later ---
# The first four characters of the season label are compared as a year.
is_1980_or_later = team_perf['season'].map(lambda s: int(str(s)[:4]) >= 1980)
team_perf = team_perf[is_1980_or_later]

# --- Sort ---
# Chronological, best win % first within each season.
report_columns = [
    'season', 'team_id', 'nickname', 'abbreviation',
    'avg_pts', 'fg_pct', 'fg3_pct', 'ft_pct',
    'avg_ast', 'avg_oreb', 'avg_tov', 'win_pct',
]
team_perf = team_perf[report_columns].sort_values(by=['season', 'win_pct'], ascending=[True, False])

# --- Save and preview ---
output_path = "csv/team_offense_vs_wins.csv"
team_perf.to_csv(output_path, index=False)
print(team_perf.head(10))
print(f"‚úÖ Final cleaned data: {len(team_perf)} rows")
print("‚úÖ Saved as csv/team_offense_vs_wins.csv")
df = pd.read_csv("csv/team_offense_vs_wins.csv")

# Display name per metric; the dict order is also the panel order
metric_names = {
    'avg_pts': 'Average Points per Game',
    'fg_pct': 'Field Goal Percentage',
    'fg3_pct': '3-Point Percentage',
    'ft_pct': 'Free Throw Percentage',
    'avg_ast': 'Average Assists',
    'avg_oreb': 'Average Offensive Rebounds',
    'avg_tov': 'Average Turnovers'
}
metrics = list(metric_names)

# Keep only rows where every plotted column is present
df = df.dropna(subset=metrics + ['win_pct'])

# Drop the extremes: win rates at or below 1% and at or above 99%
df = df[df['win_pct'].gt(0.01) & df['win_pct'].lt(0.99)]

# 2 x 4 grid: one panel per metric
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

corr_box = dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8)
for ax, metric in zip(axes, metrics):
    pretty_name = metric_names[metric]
    ax.scatter(df[metric], df['win_pct'], alpha=0.6, edgecolor='k')
    ax.set_title(f"{pretty_name} vs Win Percentage")
    ax.set_xlabel(pretty_name)
    ax.set_ylabel("Win Percentage")
    ax.grid(True, linestyle='--', alpha=0.6)

    # Correlation coefficient in the top-left corner of the panel
    correlation = df[metric].corr(df['win_pct'])
    ax.text(0.05, 0.95, f'Corr: {correlation:.3f}', transform=ax.transAxes,
            bbox=corr_box, verticalalignment='top')

# Remove the panels that have no metric assigned
for spare_ax in axes[len(metrics):]:
    fig.delaxes(spare_ax)

fig.tight_layout()
fig.suptitle("Offensive Metrics vs Win Percentage (1980‚ÄìPresent)", fontsize=16, y=1.02)
plt.show()
df = pd.read_csv("csv/team_offense_vs_wins.csv")

# Drop rows with missing values and remove outliers
df = df.dropna(subset=['avg_pts', 'win_pct', 'fg_pct', 'fg3_pct', 'ft_pct', 'avg_ast', 'avg_oreb', 'avg_tov'])
df = df[(df['win_pct'] > 0.01) & (df['win_pct'] < 0.99)]

# Define metrics and their properties
metrics_config = {
    'avg_pts': {'name': 'Average Points', 'unit': 'points', 'bin_size': 5},
    'fg_pct': {'name': 'Field Goal %', 'unit': 'percentage', 'bin_size': 0.02},
    'fg3_pct': {'name': '3-Point %', 'unit': 'percentage', 'bin_size': 0.02},
    'ft_pct': {'name': 'Free Throw %', 'unit': 'percentage', 'bin_size': 0.02},
    'avg_ast': {'name': 'Average Assists', 'unit': 'assists', 'bin_size': 2},
    'avg_oreb': {'name': 'Offensive Rebounds', 'unit': 'rebounds', 'bin_size': 1},
    'avg_tov': {'name': 'Turnovers', 'unit': 'turnovers', 'bin_size': 1}
}

# Bins with fewer team-seasons than this are left out of the charts
MIN_TEAMS_PER_BIN = 3


def _bin_summary(data, metric, bin_size, unit):
    """Median win % and row count per fixed-width bin of `metric`.

    Returns a frame with columns 'range', 'median_win_pct' and 'team_count',
    limited to bins holding at least MIN_TEAMS_PER_BIN rows. The index is
    reset to 0..k-1 so that it equals the bar position on a categorical axis.
    """
    empty = pd.DataFrame(columns=['range', 'median_win_pct', 'team_count'])
    if data[metric].dropna().empty:
        return empty

    # Edges start at the bin boundary at or below the minimum and run one
    # full bin past the boundary at or above the maximum.
    lower = np.floor(data[metric].min() / bin_size) * bin_size
    upper = np.ceil(data[metric].max() / bin_size) * bin_size + bin_size
    bin_edges = np.arange(lower, upper, bin_size)

    if unit == 'percentage':
        bin_labels = [f"{lo*100:.0f}-{(lo+bin_size)*100:.0f}%" for lo in bin_edges[:-1]]
    else:
        bin_labels = [f"{lo:.0f}-{lo+bin_size:.0f}" for lo in bin_edges[:-1]]

    binned = data.assign(
        metric_bin=pd.cut(data[metric], bins=bin_edges, labels=bin_labels, include_lowest=True)
    )
    # observed=True drops empty bins up front; they would fail the minimum
    # count filter anyway.
    summary = (
        binned.groupby('metric_bin', observed=True)['win_pct']
        .agg(['median', 'count'])
        .reset_index()
    )
    summary.columns = ['range', 'median_win_pct', 'team_count']
    summary = summary[summary['team_count'] >= MIN_TEAMS_PER_BIN]
    # Without this reset the surviving rows keep their pre-filter index, and
    # annotations placed by index drift away from their bars.
    return summary.reset_index(drop=True)


def _plot_binned_bars(ax, summary, *, annotation_kwargs, trend_label=None, **bar_kwargs):
    """Draw the bars, the 'n=' annotations and a linear trend line on `ax`."""
    positions = np.arange(len(summary))
    ax.bar(summary['range'].astype(str), summary['median_win_pct'], **bar_kwargs)

    # Bars on a categorical axis sit at 0, 1, 2, ... in row order
    for pos, median, count in zip(positions, summary['median_win_pct'], summary['team_count']):
        ax.text(pos, median + 0.01, f"n={count}", ha='center', **annotation_kwargs)

    # A trend line needs at least two bars
    if len(summary) > 1:
        trend = np.poly1d(np.polyfit(positions, summary['median_win_pct'], 1))
        ax.plot(positions, trend(positions), "r--", alpha=0.8, linewidth=2, label=trend_label)


# Create subplots
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, (metric, config) in zip(axes, metrics_config.items()):
    bin_summary = _bin_summary(df, metric, config['bin_size'], config['unit'])
    if bin_summary.empty:
        continue

    _plot_binned_bars(ax, bin_summary, annotation_kwargs={'fontsize': 8},
                      alpha=0.7, edgecolor='black')
    ax.set_xlabel(f'{config["name"]} Range')
    ax.set_ylabel('Median Win Percentage')
    ax.set_title(f'{config["name"]} vs Success Rate')
    ax.tick_params(axis='x', rotation=45)

# Remove the panels that have no metric assigned
for spare_ax in axes[len(metrics_config):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.suptitle('Offensive Metrics Binned Analysis: Metric Ranges vs Median Win Percentage',
             fontsize=16, y=1.02)
plt.show()

# Print correlation summary
print("Correlation with Win Percentage:")
print("=" * 50)
for metric, config in metrics_config.items():
    correlation = df[metric].corr(df['win_pct'])
    print(f"{config['name']:20} | Correlation: {correlation:7.3f}")

# --- One figure per metric (same binning, larger format) ---
for metric, config in metrics_config.items():
    bin_summary = _bin_summary(df, metric, config['bin_size'], config['unit'])
    if bin_summary.empty:
        continue

    fig, ax = plt.subplots(figsize=(10, 6))
    _plot_binned_bars(ax, bin_summary,
                      annotation_kwargs={'fontsize': 9, 'fontweight': 'bold'},
                      trend_label='Trend line',
                      alpha=0.7, edgecolor='black', color='skyblue')
    if len(bin_summary) > 1:
        ax.legend()

    # Correlation uses the unbinned data
    correlation = df[metric].corr(df['win_pct'])

    ax.set_xlabel(f'{config["name"]} Range')
    ax.set_ylabel('Median Win Percentage')
    ax.set_title(f'{config["name"]} vs Success Rate\nCorrelation: {correlation:.3f}')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, linestyle='--', alpha=0.3, axis='y')
    fig.tight_layout()
    plt.show()

    print(f"{config['name']}: {len(bin_summary)} bins, correlation = {correlation:.3f}")
    print("-" * 50)

# Re-imports kept from the original cell; the first one used to sit inside the loop body.
import pandas as pd
import matplotlib.pyplot as plt

# Champion scoring-range analysis: how many champions fall into each
# points-per-game bin, with the five most recent champions highlighted.
#
# NOTE(review): `team_data` is not defined in any cell shown here, so on a
# fresh kernel this raises NameError unless it is created elsewhere. From its
# use below it needs the columns 'season', 'is_champion' (boolean),
# 'nickname', 'abbreviation', 'avg_pts' and 'win_pct'.

# Champion rows of the five listed seasons, copied so the edit below does not
# touch team_data
recent_champs = ['2018-19', '2019-20', '2020-21', '2021-22', '2022-23']
recent_champ_data = team_data[
    team_data['season'].isin(recent_champs) & team_data['is_champion']
][['season', 'nickname', 'avg_pts']].copy()

# NOTE(review): this overwrites the measured avg_pts of every 'Bucks' row in
# recent_champ_data with a hand-picked 114.0 so that it lands in the 110-115
# bin. team_data is left unchanged, so the "all champions" counts below still
# use the original value and the two series can disagree for that team.
# Confirm this manual adjustment is intended.
recent_champ_data.loc[recent_champ_data['nickname'] == 'Bucks', 'avg_pts'] = 114.0

# Define bins and categorize both datasets
# right=False makes each bin left-closed: [95, 100), [100, 105), ...
# Values outside 95-125 get no bin (NaN).
bins = [95, 100, 105, 110, 115, 120, 125]
labels = ['95‚Äì100', '100‚Äì105', '105‚Äì110', '110‚Äì115', '115‚Äì120', '120‚Äì125']

# Adds a 'points_bin' column to team_data in place.
team_data['points_bin'] = pd.cut(team_data['avg_pts'], bins=bins, labels=labels, right=False)
recent_champ_data['points_bin'] = pd.cut(recent_champ_data['avg_pts'], bins=bins, labels=labels, right=False)

# Distribution for all champions
# reindex(labels, fill_value=0) guarantees one row per bin, in bin order.
champ_bins = (
    team_data[team_data['is_champion']]
    .groupby('points_bin')
    .size()
    .reindex(labels, fill_value=0)
    .reset_index(name='all_champs')
)

# Distribution for recent champions
recent_bins = (
    recent_champ_data.groupby('points_bin')
    .size()
    .reindex(labels, fill_value=0)
    .reset_index(name='recent_champs')
)

# Merge datasets
combined_bins = pd.merge(champ_bins, recent_bins, on='points_bin')
# Recent champions as a percentage of all champions in the same bin; a zero
# denominator is replaced by 1 to avoid dividing by zero.
combined_bins['recent_pct'] = (combined_bins['recent_champs'] / combined_bins['all_champs'].replace(0, 1)) * 100

# --- Plot
plt.figure(figsize=(10,6))
bar_width = 0.6

# Base (all champions)
# NOTE(review): the year ranges in the two legend labels are fixed text; this
# cell does not filter team_data by year, so they must match how team_data
# was built.
plt.bar(combined_bins['points_bin'], combined_bins['all_champs'], 
        color='lightgrey', edgecolor='black', width=bar_width, label='All Champions (1993‚Äì2023)')

# Overlay (recent champions), drawn on top of the base bars at the same positions
plt.bar(combined_bins['points_bin'], combined_bins['recent_champs'], 
        color='gold', edgecolor='black', width=bar_width, label='Recent Champions (2018‚Äì2023)')

# Annotate with text: "<count> (<share of all champions in the bin>%)"
# idx is the row index of combined_bins and is used as the bar's x position.
for idx, row in combined_bins.iterrows():
    if row['recent_champs'] > 0:
        plt.text(idx, row['recent_champs'] + 0.2, 
                 f"{int(row['recent_champs'])} ({row['recent_pct']:.0f}%)", 
                 ha='center', fontsize=9, fontweight='bold', color='black')

plt.title('Offensive Point Range of NBA Champions since 1993', 
          fontsize=14, fontweight='bold')
plt.xlabel('Average Points per Game (Team Season)', fontsize=12)
plt.ylabel('Number of Champions', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()
# Filter 2022-23 teams within the 110-115 PPG scoring range (110 inclusive,
# 115 exclusive), highest scorers first
offensive_range_teams = team_data[
    (team_data['season'] == '2022-23') &
    (team_data['avg_pts'] >= 110) &
    (team_data['avg_pts'] < 115)
][['nickname', 'abbreviation', 'avg_pts', 'win_pct']].sort_values('avg_pts', ascending=False)

print("üèÄ Teams in 2022‚Äì23 within the 110‚Äì115 PPG range:")
print(offensive_range_teams.to_string(index=False))


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

def main(csv_path=r"C:\Users\Jonnie\Downloads\NBA dataset\csv\game.csv"):
    """Plot team-season win % against a composite defensive metric.

    Every regular-season team-season in the champions' range is drawn as a
    grey point (x = normalised per-game sum of blocks, steals, defensive
    rebounds and opponent turnovers; y = win %). Champions are highlighted
    and an OLS best-fit line is overlaid.

    Parameters
    ----------
    csv_path : str, optional
        Location of game.csv. The default is the original author's local
        path; pass your own location when running elsewhere.

    Raises
    ------
    ValueError
        If no regular-season games fall inside the requested seasons.
    """
    # Champions keyed by the calendar year in which the title was won,
    # i.e. the year the season ended (2012 -> 2011-12 season).
    champions = {
        2012: "MIA",
        2013: "MIA",
        2014: "SAS",
        2015: "GSW",
        2016: "CLE",
        2017: "GSW",
        2018: "GSW",
        2019: "TOR",
        2020: "LAL",
        2021: "MIL",
        2022: "GSW",
    }
    first_season, last_season = min(champions), max(champions)

    # Load and keep regular-season games only
    df = pd.read_csv(csv_path, parse_dates=["game_date"], low_memory=False)
    df = df[df["season_type"].str.strip().str.lower() == "regular season"].copy()

    # Label every game with the year its season ENDS, so one "year" is one
    # NBA season and lines up with the champions table. Grouping by calendar
    # year would mix the end of one season with the start of the next.
    # Games from September onwards are counted towards the next season.
    game_dates = df["game_date"]
    df["year"] = game_dates.dt.year + (game_dates.dt.month >= 9)
    df = df[df["year"].between(first_season, last_season)].copy()
    df["year"] = df["year"].astype(int)

    # Ensure numeric; unparseable or missing box-score values count as 0
    for c in ["stl_home", "blk_home", "dreb_home", "tov_home",
              "stl_away", "blk_away", "dreb_away", "tov_away"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)

    # Per game rows from both perspectives; turnovers forced by a team are
    # the turnovers committed by its opponent
    home = pd.DataFrame({
        "team": df["team_abbreviation_home"].astype(str),
        "year": df["year"],
        "games": 1,
        "wins": (df["wl_home"].astype(str).str.upper() == "W").astype(int),
        "blk": df["blk_home"],
        "stl": df["stl_home"],
        "dreb": df["dreb_home"],
        "opp_tov_forced": df["tov_away"],
    })
    away = pd.DataFrame({
        "team": df["team_abbreviation_away"].astype(str),
        "year": df["year"],
        "games": 1,
        "wins": (df["wl_away"].astype(str).str.upper() == "W").astype(int),
        "blk": df["blk_away"],
        "stl": df["stl_away"],
        "dreb": df["dreb_away"],
        "opp_tov_forced": df["tov_home"],
    })
    tall = pd.concat([home, away], ignore_index=True)

    # Aggregate to TEAM x SEASON
    agg = tall.groupby(["team", "year"], as_index=False).agg(
        games=("games", "sum"),
        wins=("wins", "sum"),
        blk=("blk", "sum"),
        stl=("stl", "sum"),
        dreb=("dreb", "sum"),
        opp_tov_forced=("opp_tov_forced", "sum"),
    )
    if agg.empty:
        raise ValueError(
            f"No regular-season games found for seasons "
            f"{first_season}-{last_season} in {csv_path!r}"
        )

    agg["win_pct_percent"] = (agg["wins"] / agg["games"]) * 100.0
    agg["def_metric_per_game"] = (
        agg["blk"] + agg["stl"] + agg["dreb"] + agg["opp_tov_forced"]
    ) / agg["games"]

    # Normalize defense metric to a 1-100 scale (min-max)
    min_def, max_def = agg["def_metric_per_game"].min(), agg["def_metric_per_game"].max()
    spread = max_def - min_def
    if spread > 0:
        agg["def_metric_norm"] = 1 + 99 * (agg["def_metric_per_game"] - min_def) / spread
    else:
        # Every team-season is identical: place all points mid-scale instead
        # of dividing by zero.
        agg["def_metric_norm"] = 50.5

    # Quadrant boundaries: mean defensive score and a .500 record
    avg_def_norm = agg["def_metric_norm"].mean()
    avg_win_percent = 50.0

    # Plot all team-season points
    plt.figure(figsize=(11, 7), dpi=130)
    plt.scatter(
        agg["def_metric_norm"], agg["win_pct_percent"],
        color="lightgrey", s=35, alpha=0.75, label="Teams (each year)"
    )

    # Quadrant lines
    plt.axhline(y=avg_win_percent, color="gray", linestyle="--", lw=1)
    plt.axvline(x=avg_def_norm, color="gray", linestyle="--", lw=1)

    # OLS line of best fit
    x = agg["def_metric_norm"].values.reshape(-1, 1)
    y = agg["win_pct_percent"].values
    model = LinearRegression()
    model.fit(x, y)
    x_line = np.linspace(x.min(), x.max(), 200).reshape(-1, 1)
    y_line = model.predict(x_line)
    plt.plot(x_line, y_line, color="black", linewidth=1.3, label="Line of Best Fit")

    # Highlight + label champions (skipped silently when the team-season is absent)
    for yr, tm in champions.items():
        row = agg[(agg["year"] == yr) & (agg["team"] == tm)]
        if not row.empty:
            x_ = float(row["def_metric_norm"].iloc[0])
            y_ = float(row["win_pct_percent"].iloc[0])
            plt.scatter(x_, y_, color="gold", edgecolor="black", s=160, zorder=5)
            plt.text(
                x_ + 0.5, y_ + 0.6, f"{tm}'{str(yr)[-2:]}",
                fontsize=9, weight="bold"
            )

    # Titles and labels
    plt.title("Defense Wins Championships")
    plt.xlabel("Defensive Impact (Stl, Blk, DReb, OTov)")
    plt.ylabel("Win Percentage (%)")
    plt.grid(True, linestyle="--", alpha=0.3)
    plt.legend()
    plt.show()

# Run main() when this code is executed as a script. Inside a Jupyter/IPython
# kernel __name__ is also "__main__", so running this cell calls main() too.
if __name__ == "__main__":
    main()



In [None]:
# %%
# --- [SINGLE BLOCK] NBA CHAMPIONSHIP QUADRANT ANALYSIS ---
# %%
# --- STEP 1: IMPORTS ---
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
from IPython.display import display, HTML
import plotly.io as pio
pio.templates.default = "plotly_white"
warnings.filterwarnings('ignore')

print("üèÄ NBA DEFENSE WINS CHAMPIONSHIPS - FULL ANALYSIS")
print("=" * 60)

# %%
# --- STEP 2: LOAD AND CLEAN DATA ---
print("Loading game.csv and fixing season_year using game_date...")
try:
    df = pd.read_csv('game.csv')

    # Convert 'game_date' to datetime objects
    df['game_date'] = pd.to_datetime(df['game_date'], errors='coerce')
    
    # Drop any rows that had a bad date
    df.dropna(subset=['game_date'], inplace=True)

    # Create the new, reliable 'season_year' column.
    # Logic: The NBA season crosses the new year (e.g., Feb 2015 is the 2014 season).
    # We use August (month 8) as the cutoff.
    df['season_year'] = np.where(
        df['game_date'].dt.month < 8, 
        df['game_date'].dt.year - 1, 
        df['game_date'].dt.year
    )
    print(f"‚úÖ Data loaded. Cleaned season range: {df['season_year'].min()} to {df['season_year'].max()}")

except Exception as e:
    print(f"‚ùå FAILED TO LOAD 'game.csv'. Please ensure the file is in the correct directory.")
    print(f"Error: {e}")
    df = pd.DataFrame() # Create empty df to prevent further crashes

# %%
# --- STEP 3: CHAMPIONS DICTIONARY ---
# Maps a year to the full team name of that year's NBA champion.
# Keys are the calendar year in which the title was won, i.e. the year the
# season ENDED (e.g. 2014 for the 2013-14 season).
# NOTE(review): wherever this is matched against game data, confirm that
# `season_year` follows the same end-year convention; a start-year label
# would pair each champion with the following season.
CHAMPIONS = {
    1985: 'Los Angeles Lakers', 1986: 'Boston Celtics', 1987: 'Los Angeles Lakers',
    1988: 'Los Angeles Lakers', 1989: 'Detroit Pistons', 1990: 'Detroit Pistons',
    1991: 'Chicago Bulls', 1992: 'Chicago Bulls', 1993: 'Chicago Bulls',
    1994: 'Houston Rockets', 1995: 'Houston Rockets', 1996: 'Chicago Bulls',
    1997: 'Chicago Bulls', 1998: 'Chicago Bulls', 1999: 'San Antonio Spurs',
    2000: 'Los Angeles Lakers', 2001: 'Los Angeles Lakers', 2002: 'Los Angeles Lakers',
    2003: 'San Antonio Spurs', 2004: 'Detroit Pistons', 2005: 'San Antonio Spurs',
    2006: 'Miami Heat', 2007: 'San Antonio Spurs', 2008: 'Boston Celtics',
    2009: 'Los Angeles Lakers', 2010: 'Los Angeles Lakers', 2011: 'Dallas Mavericks',
    2012: 'Miami Heat', 2013: 'Miami Heat', 2014: 'San Antonio Spurs',
    2015: 'Golden State Warriors', 2016: 'Cleveland Cavaliers', 2017: 'Golden State Warriors',
    2018: 'Golden State Warriors', 2019: 'Toronto Raptors', 2020: 'Los Angeles Lakers',
    2021: 'Milwaukee Bucks', 2022: 'Golden State Warriors', 2023: 'Denver Nuggets'
}

# %%
# --- STEP 4: PROCESSING FUNCTIONS (using season_year) ---
def process_team_stats(df):
    """Roll game rows up to one row per (season_year, team).

    Each team's home games and away games are summarised separately (win
    count plus per-game means), the two summaries are inner-joined on
    season_year / team_id / team_name, and every stat is then the plain
    average of its home mean and its away mean. ``opp_*`` columns hold the
    opponent's numbers in the team's games.

    Returns the merged frame: key columns, the ``*_home`` columns, the
    ``*_away`` columns, ``total_wins`` and the combined stats.
    """
    print("Processing team stats using 'season_year'...")

    # (output stat, game-column stem, True when the value is the opponent's)
    stat_sources = [
        ('opp_fg_pct', 'fg_pct', True),
        ('opp_fg3_pct', 'fg3_pct', True),
        ('opp_pts', 'pts', True),
        ('dreb', 'dreb', False),
        ('stl', 'stl', False),
        ('blk', 'blk', False),
        ('opp_tov', 'tov', True),
        ('pts', 'pts', False),
        ('fg_pct', 'fg_pct', False),
        ('fg3_pct', 'fg3_pct', False),
        ('ast', 'ast', False),
    ]

    def count_wins(results):
        """Number of 'W' entries in a win/loss column."""
        return (results == 'W').sum()

    def summarise_side(side, other):
        """Season x team summary of the games the team played on `side`."""
        spec = {f'wins_{side}': (f'wl_{side}', count_wins)}
        for stat, stem, from_opponent in stat_sources:
            source_side = other if from_opponent else side
            spec[f'{stat}_{side}'] = (f'{stem}_{source_side}', 'mean')

        group_keys = ['season_year', f'team_id_{side}', f'team_name_{side}']
        summary = df.groupby(group_keys).agg(**spec).reset_index()
        return summary.rename(columns={
            f'team_id_{side}': 'team_id',
            f'team_name_{side}': 'team_name',
        })

    home_stats = summarise_side('home', 'away')
    away_stats = summarise_side('away', 'home')

    # Only teams with both home and away rows in a season survive the join
    team_stats = home_stats.merge(
        away_stats, on=['season_year', 'team_id', 'team_name'], how='inner'
    )

    # Season totals / averages
    team_stats['total_wins'] = team_stats['wins_home'] + team_stats['wins_away']
    for stat, _stem, _from_opponent in stat_sources:
        team_stats[stat] = (team_stats[f'{stat}_home'] + team_stats[f'{stat}_away']) / 2

    print("‚úÖ Team stats processed.")
    return team_stats

def calculate_ratings(team_stats):
    """Score every team-season on offense and defense and rank it within its season.

    For each distinct ``season_year`` the component stats are min-max scaled
    to [0, 1] across that season's rows (inverted where a lower raw value is
    better), combined into weighted ``defensive_rating`` / ``offensive_rating``
    values on a 0-100 scale, and ranked in descending order of rating with
    ``method='min'`` (rank 1 is the highest rating; ties share the lowest rank).

    Args:
        team_stats: DataFrame with one row per team-season. Reads the columns
            ``season_year``, ``opp_fg_pct``, ``opp_fg3_pct``, ``opp_pts``,
            ``dreb``, ``stl``, ``blk``, ``opp_tov``, ``fg_pct``, ``fg3_pct``,
            ``pts`` and ``ast``.

    Returns:
        A new DataFrame (the input is not modified) whose rows are grouped by
        season in order of first appearance, with the ``*_score`` columns, the
        two rating columns and the two rank columns appended. An input without
        rows yields a copy that carries those columns, all empty.
    """
    print("Calculating ratings and ranks...")

    # (score column, source stat, higher_is_better, weight).
    # The weights in each list add up to 1, so a rating spans 0-100.
    defensive_parts = [
        ('def_fg_score', 'opp_fg_pct', False, 0.25),
        ('def_3pt_score', 'opp_fg3_pct', False, 0.15),
        ('def_pts_score', 'opp_pts', False, 0.25),
        ('def_reb_score', 'dreb', True, 0.15),
        ('def_stl_score', 'stl', True, 0.10),
        ('def_blk_score', 'blk', True, 0.05),
        ('def_tov_score', 'opp_tov', True, 0.05),
    ]
    offensive_parts = [
        ('off_fg_score', 'fg_pct', True, 0.25),
        ('off_3pt_score', 'fg3_pct', True, 0.15),
        ('off_pts_score', 'pts', True, 0.40),
        ('off_ast_score', 'ast', True, 0.20),
    ]

    def normalize(col, higher_is_better=True):
        """Min-max scale ``col`` to [0, 1]; a column without spread maps to 0.5."""
        min_val = col.min()
        max_val = col.max()
        if (max_val - min_val) == 0:
            return 0.5  # neutral score; also avoids dividing by zero
        scaled = (col - min_val) / (max_val - min_val)
        return scaled if higher_is_better else 1 - scaled

    def add_rating(frame, parts, rating_col):
        """Add the component scores in ``parts`` and their weighted 0-100 total to ``frame``."""
        weighted_total = 0
        for score_col, stat_col, higher_is_better, weight in parts:
            frame[score_col] = normalize(frame[stat_col], higher_is_better=higher_is_better)
            weighted_total = weighted_total + frame[score_col] * weight
        frame[rating_col] = weighted_total * 100

    if team_stats.empty:
        # With no seasons to iterate there is nothing to concatenate, and
        # pd.concat raises ValueError on an empty list. Return the expected
        # schema instead so callers can keep testing `.empty`.
        rated = team_stats.copy()
        added_columns = (
            [score_col for score_col, _, _, _ in defensive_parts] + ['defensive_rating']
            + [score_col for score_col, _, _, _ in offensive_parts] + ['offensive_rating']
            + ['defensive_rank', 'offensive_rank']
        )
        for column in added_columns:
            rated[column] = pd.Series(dtype='float64')
        print("‚úÖ Ratings and ranks calculated.")
        return rated

    all_seasons_data = []

    for season in team_stats['season_year'].unique():
        # Work on a copy so the caller's frame never gains the new columns.
        season_data = team_stats[team_stats['season_year'] == season].copy()

        add_rating(season_data, defensive_parts, 'defensive_rating')
        add_rating(season_data, offensive_parts, 'offensive_rating')

        # Highest rating gets rank 1 within the season.
        season_data['defensive_rank'] = season_data['defensive_rating'].rank(ascending=False, method='min')
        season_data['offensive_rank'] = season_data['offensive_rating'].rank(ascending=False, method='min')

        all_seasons_data.append(season_data)

    # Combine all processed seasons back together
    final_team_stats = pd.concat(all_seasons_data)
    print("‚úÖ Ratings and ranks calculated.")
    return final_team_stats

# %%
# --- STEP 5: RUN PROCESSING ---
# Build the rating tables only when the raw game data actually loaded.
if df.empty:
    print("Skipping processing as data failed to load.")
    team_stats = pd.DataFrame()  # empty placeholder so later cells can test `.empty`
else:
    team_stats = process_team_stats(df)
    team_stats = calculate_ratings(team_stats)

# %%
# --- STEP 6: VISUALIZATION (THE CHAMPIONSHIP QUADRANT) ---
print("\nCreating the single best visualization: The Championship Quadrant...")

if team_stats.empty:
    print("\n‚ùå No data processed. Visualization skipped.")
else:
    # Flag every team-season whose name equals the champion listed for that
    # season in CHAMPIONS (keys are coerced to int before the lookup).
    champion_map = {int(season): winner for season, winner in CHAMPIONS.items()}

    def _is_title_team(row):
        """True when this row's team_name is the champion mapped to its season_year."""
        return row['team_name'] == champion_map.get(row['season_year'])

    team_stats['is_champion'] = team_stats.apply(_is_title_team, axis=1)

    # Split the frame into the two groups that are drawn as separate traces.
    champion_flag = team_stats['is_champion']
    champion_teams = team_stats[champion_flag == True]
    all_other_teams = team_stats[champion_flag == False]

    # First and last season, shown in the chart subtitle.
    season_values = team_stats['season_year']
    min_season = str(season_values.min())
    max_season = str(season_values.max())

    print(f"Plotting {len(all_other_teams)} team-seasons...")
    print(f"Highlighting {len(champion_teams)} championship winners...")

    # Quadrant boundaries: medians taken over every team-season.
    x_median = team_stats['offensive_rating'].median()
    y_median = team_stats['defensive_rating'].median()

    # Pieces shared by both scatter traces.
    body_font = "Inter, Arial, sans-serif"
    hover_fields = ['season_year', 'defensive_rank', 'offensive_rank', 'total_wins']
    hover_rule = '-------------------------<br>'
    hover_footer = (
        'Offensive Rank: #%{customdata[2]:.0f}<br>'
        'Wins: %{customdata[3]}<br>'
        'Def Rating: %{y:.1f}<br>'
        'Off Rating: %{x:.1f}'
    )

    def _rating_scatter(frame, trace_name, marker_style, hover_head, hover_def_rank):
        """Build one offensive-vs-defensive rating scatter trace for ``frame``."""
        return go.Scatter(
            x=frame['offensive_rating'],
            y=frame['defensive_rating'],
            mode='markers',
            marker=marker_style,
            text=frame['team_name'],
            customdata=frame[hover_fields],
            hovertemplate=hover_head + hover_rule + hover_def_rank + hover_footer,
            name=trace_name,
        )

    fig = go.Figure()

    # Background cloud: every non-champion team-season as a faint grey dot.
    fig.add_trace(_rating_scatter(
        all_other_teams,
        'All Other Teams',
        dict(
            size=6,
            color='rgba(180, 180, 180, 0.3)',
            line=dict(color='rgba(180, 180, 180, 0.4)', width=1),
        ),
        '<b>%{text}</b> (%{customdata[0]})<br>',
        'Defensive Rank: #%{customdata[1]:.0f}<br>',
    ))

    # Foreground: champions as large gold stars, added second so they sit on top.
    fig.add_trace(_rating_scatter(
        champion_teams,
        'Championship Winners',
        dict(
            size=16,
            color='gold',
            symbol='star',
            line=dict(color='black', width=1.5),
        ),
        '<b>üèÜ %{text} üèÜ</b> (%{customdata[0]})<br>',
        '<b>Defensive Rank: #%{customdata[1]:.0f}</b><br>',
    ))

    # Dashed median lines that split the plane into four quadrants.
    divider_style = dict(line_dash="dash", line_color="rgba(50, 50, 50, 0.5)")
    fig.add_hline(y=y_median, **divider_style)
    fig.add_vline(x=x_median, **divider_style)

    # One label per quadrant, nudged 1% off the vertical median line and pinned
    # to the top or bottom of the observed defensive-rating range.
    rating_ceiling = team_stats['defensive_rating'].max()
    rating_floor = team_stats['defensive_rating'].min()
    right_of_median = x_median * 1.01
    left_of_median = x_median * 0.99
    quadrant_labels = [
        # (x, y, text, font size, font color, xanchor, yanchor)
        (right_of_median, rating_ceiling,
         "<b>CHAMPIONSHIP ZONE</b><br>(Elite Offense & Defense)",
         14, 'rgba(0, 100, 0, 0.8)', 'left', 'top'),
        (left_of_median, rating_ceiling,
         "<b>GRIT & GRIND</b><br>(Elite Defense, Poor Offense)",
         12, 'rgba(0, 0, 139, 0.7)', 'right', 'top'),
        (left_of_median, rating_floor,
         "<b>LOTTERY BOUND</b><br>(Poor Offense & Defense)",
         12, 'rgba(100, 100, 100, 0.8)', 'right', 'bottom'),
        (right_of_median, rating_floor,
         "<b>OFFENSE ONLY</b><br>(Elite Offense, Poor Defense)",
         12, 'rgba(139, 0, 0, 0.7)', 'left', 'bottom'),
    ]
    for label_x, label_y, label_text, label_size, label_color, anchor_x, anchor_y in quadrant_labels:
        fig.add_annotation(
            x=label_x, y=label_y,
            text=label_text,
            showarrow=False,
            font=dict(size=label_size, color=label_color, family=body_font),
            xanchor=anchor_x, yanchor=anchor_y,
            bgcolor="rgba(255, 255, 255, 0.7)",
        )

    # Data-source footnote in paper coordinates, just below the x-axis.
    fig.add_annotation(
        text=f"Source: Author's analysis of NBA game data (game.csv)",
        xref="paper", yref="paper",
        x=0, y=-0.08,
        showarrow=False,
        font=dict(size=10, color="grey"),
        xanchor='left'
    )

    # Final layout: centred title, bare axes, horizontal legend above the plot,
    # and extra top/bottom margin for the title and the footnote.
    axis_title_font = dict(size=14, family=body_font)
    fig.update_layout(
        title=dict(
            text=f'<b>The Championship Formula: Where Do Champions Live?</b><br><sup>Plotting all NBA team seasons from {min_season} to {max_season}</sup>',
            font=dict(size=26, family='Inter, Arial Black, sans-serif', color='#1a1a1a'),
            x=0.5,
            xanchor='center',
        ),
        xaxis_title='Offensive Rating (Better ‚Üí)',
        yaxis_title='Defensive Rating (Better ‚Üí)',
        xaxis=dict(showgrid=False, zeroline=False, title_font=dict(axis_title_font)),
        yaxis=dict(showgrid=False, zeroline=False, title_font=dict(axis_title_font)),
        height=800,
        template='plotly_white',
        plot_bgcolor='white',
        font=dict(family=body_font, size=12, color="#333333"),
        legend=dict(orientation="h", yanchor="bottom", y=1.01, xanchor="center", x=0.5),
        margin=dict(l=60, r=40, b=100, t=120),
    )

    fig.show()

    # %%
    # --- STEP 7: FINAL CONCLUSION ---
    banner = "=" * 60
    print("\n" + banner)
    print("üèÜ CONCLUSION: THE QUADRANT SAYS IT ALL")
    print(banner)

    if champion_teams.empty:
        print("No champion data was found to plot, but the chart framework is ready.")
    else:
        # Share of champions above the median on defense, and on both axes.
        strong_offense = champion_teams['offensive_rating'] > x_median
        strong_defense = champion_teams['defensive_rating'] > y_median
        champs_in_top_right = (strong_offense & strong_defense).mean() * 100
        champs_with_top_def = strong_defense.mean() * 100

        print(f"""
    The "Championship Quadrant" visualization tells a clear story:

    ‚úÖ {champs_with_top_def:.0f}% of champions from your dataset
       had an ABOVE-AVERAGE defense (landed in the top half).

    ‚úÖ {champs_in_top_right:.0f}% of champions landed in the
       "Championship Zone" (above-average offense AND defense).

    The data is clear: While a great offense is important,
    an elite defense is almost always a non-negotiable
    requirement for winning an NBA title.
    """)

    print("\nüìä Analysis complete! Hover over any dot for full details.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Install a global filter that ignores every warning raised after this point.
warnings.filterwarnings('ignore')

#Get champions
def get_nba_champions():
    """Map each year from 2000 through 2023 to that year's champion abbreviation.

    NOTE(review): the keys look like Finals (season-ending) years, while
    load_and_process_data derives ``year`` from ``season_id`` -- confirm that
    both use the same year convention before trusting the champion highlight.
    """
    winners = (
        'LAL', 'LAL', 'LAL', 'SAS', 'DET',   # 2000-2004
        'SAS', 'MIA', 'SAS', 'BOS', 'LAL',   # 2005-2009
        'LAL', 'DAL', 'MIA', 'MIA', 'SAS',   # 2010-2014
        'GSW', 'CLE', 'GSW', 'GSW', 'TOR',   # 2015-2019
        'LAL', 'MIL', 'GSW', 'DEN',          # 2020-2023
    )
    return dict(zip(range(2000, 2024), winners))

#Calculate Team stats
def calculate_team_stats_corrected(df):
    """Build one row of per-game averages for every team in every year.

    A team's games are collected from both sides of the schedule: rows where
    it is the home team and rows where it is the away team. The ``off_*``
    columns average the team's own values; ``def_pts`` and ``def_fg_pct``
    average the opponent's values; ``def_reb``, ``def_stl`` and ``def_blk``
    average the team's own defensive rebounds, steals and blocks. Null values
    are dropped before averaging, and a stat with no values is reported as 0.

    Args:
        df: game-level DataFrame with a ``year`` column and the ``*_home`` /
            ``*_away`` columns read below.

    Returns:
        DataFrame with one row per (year, team id/abbreviation/name) holding
        the averages plus ``win_pct`` and ``total_games``. Teams without any
        non-null points value are left out.
    """
    # Output key -> (column read from the team's home rows, column read from
    # its away rows), listed in the order the columns are read.
    stat_sources = {
        'off_pts': ('pts_home', 'pts_away'),
        'off_fg_pct': ('fg_pct_home', 'fg_pct_away'),
        'off_fg3_pct': ('fg3_pct_home', 'fg3_pct_away'),
        'off_ft_pct': ('ft_pct_home', 'ft_pct_away'),
        'off_ast': ('ast_home', 'ast_away'),
        'off_tov': ('tov_home', 'tov_away'),
        'off_reb': ('oreb_home', 'oreb_away'),
        # Opponent values: the away side when hosting, the home side on the road.
        'def_pts': ('pts_away', 'pts_home'),
        'def_fg_pct': ('fg_pct_away', 'fg_pct_home'),
        'def_reb': ('dreb_home', 'dreb_away'),
        'def_stl': ('stl_home', 'stl_away'),
        'def_blk': ('blk_home', 'blk_away'),
    }
    # Order in which the averaged stats appear in each output record.
    output_order = [
        'off_pts', 'def_pts', 'off_fg_pct', 'def_fg_pct', 'off_fg3_pct',
        'off_ft_pct', 'off_ast', 'off_tov', 'off_reb', 'def_reb',
        'def_stl', 'def_blk',
    ]

    records = []

    for season in df['year'].unique():
        season_games = df[df['year'] == season]

        # Every (id, abbreviation, name) combination seen as host or visitor.
        hosts = season_games[['team_id_home', 'team_abbreviation_home', 'team_name_home']].drop_duplicates()
        hosts.columns = ['team_id', 'team_abbr', 'team_name']
        visitors = season_games[['team_id_away', 'team_abbreviation_away', 'team_name_away']].drop_duplicates()
        visitors.columns = ['team_id', 'team_abbr', 'team_name']
        roster = pd.concat([hosts, visitors]).drop_duplicates()

        for _, club in roster.iterrows():
            club_id = club['team_id']
            club_abbr = club['team_abbr']

            at_home = season_games[season_games['team_id_home'] == club_id].copy()
            on_road = season_games[season_games['team_id_away'] == club_id].copy()

            games_played = len(at_home) + len(on_road)
            if games_played == 0:
                continue

            # Non-null samples per stat: home rows first, then away rows.
            samples = {
                key: list(at_home[home_col].dropna()) + list(on_road[away_col].dropna())
                for key, (home_col, away_col) in stat_sources.items()
            }

            # Games whose result is not 'W' (including missing results) count as non-wins.
            wins = (at_home['wl_home'] == 'W').sum() + (on_road['wl_away'] == 'W').sum()
            win_share = wins / games_played

            # Skip teams that have no recorded points at all.
            if not samples['off_pts']:
                continue

            record = {
                'year': season,
                'team_id': club_id,
                'team_abbr': club_abbr,
                'team_name': club['team_name'],
            }
            for key in output_order:
                values = samples[key]
                record[key] = np.mean(values) if values else 0
            record['win_pct'] = win_share
            record['total_games'] = games_played
            records.append(record)

    return pd.DataFrame(records)

#Create spider chart
def create_spider_chart_viz_corrected(team_stats, year=2023, num_teams=6,
                                      pct_cap=20, save_path=None, show_champion=True):
    """Draw a radar ("spider") chart comparing the top teams of one year.

    Each spoke shows a team's percent deviation from that year's league
    average (the mean over the year's rows in ``team_stats``) for one of eight
    indicators. Teams are chosen by a "championship score": ``win_pct * 100``
    plus weighted deviations (0.20 offense, 0.20 defense, 0.10 shooting,
    0.05 three-point, 0.10 steals+blocks, 0.05 rebounding). When the champion
    returned by ``get_nba_champions`` for ``year`` is present and
    ``show_champion`` is true, it is drawn in gold and the remaining
    ``num_teams - 1`` slots go to the highest-scoring other teams.

    Args:
        team_stats: per-team, per-year averages with the columns produced by
            ``calculate_team_stats_corrected``.
        year: value of the ``year`` column to plot.
        num_teams: number of teams to draw, counting the highlighted champion.
        pct_cap: minimum half-width of the radial axis, in percent; the axis
            grows beyond it when a plotted deviation is larger.
        save_path: if truthy, the figure is also written there at 300 dpi.
        show_champion: whether to highlight the year's champion.

    Returns:
        The matplotlib Figure. If ``team_stats`` has no rows for ``year`` a
        placeholder figure titled ``NO DATA FOR <year>`` is returned and
        nothing is saved.

    Note:
        Updates ``plt.rcParams`` globally, so the styling also applies to
        figures created afterwards.
    """
    plt.rcParams.update({
        'font.family': 'sans-serif', 'font.sans-serif': ['DejaVu Sans', 'Arial'],
        'text.color': '#000', 'axes.labelcolor': '#000',
        'xtick.color': '#404040', 'ytick.color': '#404040',
        'grid.color': '#B0B0B0', 'grid.linestyle': '--', 'grid.linewidth': 0.6
    })

    fig, ax = plt.subplots(figsize=(10,10), subplot_kw=dict(projection='polar'), facecolor='white')
    ax.set_facecolor('white')

    # Filter for the specified year
    cur = team_stats[team_stats['year'] == year].copy()

    if cur.empty:
        ax.set_title(f'NO DATA FOR {year}', fontsize=18, fontweight='bold', color='red', pad=30)
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])
        return fig

    print(f"Found {len(cur)} teams for year {year}")

    # Champion abbreviation for this year (None when the year is not listed).
    champion_abbr = get_nba_champions().get(year, None)

    # League averages, plus the two combined indicators.
    league = cur.mean(numeric_only=True)
    league['def_impact'] = league['def_stl'] + league['def_blk']
    league['rebounding'] = league['off_reb'] + league['def_reb']

    def pct_dev(num, denom):
        """Percent by which ``num`` exceeds ``denom``; 0 when ``denom`` is 0."""
        if denom == 0:
            return 0
        return (num/denom - 1.0) * 100.0

    # Percent deviations from the league average for every team.
    cur['off_efficiency'] = cur['off_pts'].apply(lambda x: pct_dev(x, league['off_pts']))
    # Arguments swapped on purpose: allowing fewer points than average scores positive.
    cur['def_efficiency'] = cur['def_pts'].apply(lambda x: pct_dev(league['def_pts'], x))
    cur['shooting'] = cur['off_fg_pct'].apply(lambda x: pct_dev(x, league['off_fg_pct']))
    cur['three_pt'] = cur['off_fg3_pct'].apply(lambda x: pct_dev(x, league['off_fg3_pct']))
    cur['free_throw'] = cur['off_ft_pct'].apply(lambda x: pct_dev(x, league['off_ft_pct']))
    cur['def_impact'] = (cur['def_stl'] + cur['def_blk']).apply(lambda x: pct_dev(x, league['def_impact']))
    cur['rebounding'] = (cur['off_reb'] + cur['def_reb']).apply(lambda x: pct_dev(x, league['rebounding']))
    cur['win_pct_dev'] = cur['win_pct'].apply(lambda x: pct_dev(x, league['win_pct']))

    def calc_championship_score(row):
        """Weighted score used to pick which teams are drawn."""
        return (
            row['win_pct'] * 100 +  # win percentage dominates the score
            row['off_efficiency'] * 0.20 +
            row['def_efficiency'] * 0.20 +
            row['shooting'] * 0.10 +
            row['three_pt'] * 0.05 +
            row['def_impact'] * 0.10 +
            row['rebounding'] * 0.05
        )

    cur['championship_score'] = cur.apply(calc_championship_score, axis=1)

    # Select teams to show
    champion_data = None
    if show_champion and champion_abbr and champion_abbr in cur['team_abbr'].values:
        # The champion takes one slot; fill the rest with the best other teams.
        champion_data = cur[cur['team_abbr'] == champion_abbr].iloc[0]
        contenders = cur[cur['team_abbr'] != champion_abbr]
        teams_to_plot = contenders.nlargest(num_teams - 1, 'championship_score')
    else:
        teams_to_plot = cur.nlargest(num_teams, 'championship_score')

    print(f"Selected teams: {list(teams_to_plot['team_abbr'].values)}")
    if champion_data is not None:
        print(f"Champion: {champion_abbr}")

    # Spoke labels and the matching deviation columns.
    categories = ['Offensive\nEfficiency', 'Defensive\nEfficiency', 'Shooting %',
                  '3-Point %', 'Free Throw %', 'Defensive\nImpact', 'Rebounding', 'Win %']
    keys = ['off_efficiency', 'def_efficiency', 'shooting', 'three_pt',
            'free_throw', 'def_impact', 'rebounding', 'win_pct_dev']
    n = len(categories)

    # One angle per spoke; the first is repeated to close each polygon.
    angles = np.linspace(0, 2*np.pi, n, endpoint=False).tolist()
    angles += angles[:1]

    # Color palette
    colors = ['#0072B2', '#D55E00', '#009E73', '#CC79A7', '#E69F00', '#56B4E9', '#999999']

    # Radial limit: at least pct_cap, wider if any plotted deviation exceeds it.
    # Values are gathered as plain float arrays so the maximum does not depend
    # on the dtype produced by concatenating frames.
    plotted_values = [teams_to_plot[keys].to_numpy(dtype=float).ravel()]
    if champion_data is not None:
        plotted_values.append(champion_data[keys].to_numpy(dtype=float))
    plotted_values = np.concatenate(plotted_values)
    finite_values = plotted_values[np.isfinite(plotted_values)]
    rlim = max(pct_cap, float(np.abs(finite_values).max())) if finite_values.size else pct_cap
    ax.set_ylim(-rlim, rlim)

    # Plot champion
    if champion_data is not None:
        vals = [champion_data[k] for k in keys]
        vals += vals[:1]  # Complete the circle
        ax.plot(angles, vals, '-', linewidth=3.0, color='gold',
                label=f"üèÜ {champion_data['team_abbr']} ({champion_data['win_pct']:.3f}) - CHAMPION", zorder=5)
        ax.scatter(angles, vals, s=80, color='gold', marker='*',
                   edgecolors='black', linewidth=1.5, zorder=6)

    # Plot other selected teams
    for idx, (_, row) in enumerate(teams_to_plot.iterrows()):
        vals = [row[k] for k in keys]
        vals += vals[:1]  # Complete the circle

        # Small angular offset per team so coincident lines stay distinguishable.
        jitter = (idx - (len(teams_to_plot)-1)/2) * 0.004
        ang = [a + jitter for a in angles]

        color = colors[idx % len(colors)]
        ax.plot(ang, vals, '-', linewidth=2.2, color=color,
                label=f"{row['team_abbr']} ({row['win_pct']:.3f})", zorder=3)
        ax.scatter(ang, vals, s=22, color=color, zorder=4)

    # Set ticks and labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, size=10, fontweight='bold')
    ax.set_rlabel_position(180/n)

    yticks = np.linspace(-rlim, rlim, 5)
    ax.set_yticks(yticks)
    ax.set_yticklabels([f"{y:.0f}%" for y in yticks], size=9)

    # Reference circle at 0% (league average)
    ref_ang = np.linspace(0, 2*np.pi, 200)
    ax.plot(ref_ang, [0.0]*200, color='black', linestyle='--', linewidth=1.0,
            alpha=0.85, label='League Avg (0%)')

    # Title
    ax.set_title(f'And DeBron Chooses You! - {year} Season\n(% Deviation from League Average)',
                 fontsize=16, fontweight='bold', pad=30, color='black')

    # Legend
    leg = ax.legend(loc='upper right', bbox_to_anchor=(1.35, 1.1), fontsize=10)
    leg.get_frame().set_facecolor('white')
    leg.get_frame().set_edgecolor('#A0A0A0')

    # Recommendation box. The text is built here, immediately before it is
    # drawn, and describes the highest-scoring team among those plotted as
    # contenders (the highlighted champion is not in that list).
    if len(teams_to_plot) > 0:
        best_team = teams_to_plot.iloc[0]
        rec_text = f"üìä RECOMMENDATION FOR DEBRON\n"
        rec_text += f"Best Fit: {best_team['team_abbr']}\n"
        rec_text += f"Win Rate: {best_team['win_pct']:.1%}\n"

        # Identify key strength
        if best_team['off_efficiency'] > best_team['def_efficiency']:
            rec_text += "Strength: Elite Offense"
        elif best_team['def_efficiency'] > best_team['off_efficiency']:
            rec_text += "Strength: Elite Defense"
        else:
            rec_text += "Strength: Balanced Team"

        ax.text(0.02, 0.02, rec_text, transform=ax.transAxes,
                fontsize=9, bbox=dict(boxstyle='round,pad=0.5',
                facecolor='lightgreen', alpha=0.7, edgecolor='darkgreen'),
                verticalalignment='bottom')

    plt.tight_layout()

    if save_path:
        # Save through the figure object so the right figure is written even
        # if another figure has become pyplot's "current" one.
        fig.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')

    return fig

# Load and process data function
def load_and_process_data(filepath, start_year=2000, end_year=2023,
                          season_types=('Regular Season', 'Playoffs')):
    """Load the game CSV, keep the wanted season types, and add a ``year`` column.

    Args:
        filepath: path of the CSV file; it must contain ``season_type`` and
            ``season_id`` columns.
        start_year: first ``year`` value to keep (inclusive).
        end_year: last ``year`` value to keep (inclusive).
        season_types: ``season_type`` values to keep.

    Returns:
        DataFrame of the matching games with an integer ``year`` column taken
        from characters 1-4 of ``season_id``.
        NOTE(review): presumably the leading character encodes the season type
        and the next four the season's starting year -- confirm against the
        data dictionary.
    """
    games = pd.read_csv(filepath)

    # Take an explicit copy of the filtered rows: adding a column to a
    # filtered slice is chained assignment and triggers pandas'
    # SettingWithCopyWarning.
    games = games[games['season_type'].isin(list(season_types))].copy()

    # Extract year
    games['year'] = games['season_id'].astype(str).str[1:5].astype(int)

    in_window = (games['year'] >= start_year) & (games['year'] <= end_year)
    return games[in_window]

# Main execution
# Print the usage banner only when this code runs as the top-level program.
# The module name lives in the dunder variable __name__, which is "__main__"
# in a notebook cell or a directly executed script.
if __name__ == "__main__":
    # Example usage
    print("Spider Chart Visualization - Corrected Version")
    print("This version properly processes both home and away games")
    print("and automatically selects the best teams based on championship probability")
    print("\nUsage:")
    print("1. Load your data: df = load_and_process_data('game.csv')")
    print("2. Calculate stats: team_stats = calculate_team_stats_corrected(df)")
    print("3. Create viz: fig = create_spider_chart_viz_corrected(team_stats, year=2023)")

# Load data
df = load_and_process_data('game.csv')

# Calculate corrected team stats
team_stats = calculate_team_stats_corrected(df)

# Create the visualization
fig = create_spider_chart_viz_corrected(
    team_stats,
    year=2022,
    num_teams=6,
    #show_champion=True,  # Set False to hide champion
    save_path='spider_chart_debron.png'
)