# DATASETS

In [None]:
import pandas as pd

# Folder holding the CSV exports. When running on Google Colab, point this at
# '/content/drive/Shared drives/analitica_visual_project/datasets/' instead.
shared_drive_path = 'datasets/'

# EuroLeague tables, read in a fixed order and unpacked into their frames.
df_players, df_box_score, df_teams, df_points, df_header = (
    pd.read_csv(shared_drive_path + csv_name)
    for csv_name in (
        'euroleague_players.csv',
        'euroleague_box_score.csv',
        'euroleague_teams.csv',
        'euroleague_points.csv',
        'euroleague_header.csv',
    )
)

# EuroCup box score, kept separate from the EuroLeague one.
df_box_score_eurocup = pd.read_csv(shared_drive_path + 'eurocup_box_score.csv')

In [None]:
# Build per-game team statistics for the EuroLeague box score.
#
# Outputs of this cell:
#   df_team_stats_each_game         one row per TOTAL row of the box score, with
#                                   'points_received' and 'win' added
#   df_teams                        gains 'points_received_per_game'
#   merged_df_team_stats_each_game  one row per game_id with '_a' / '_b' columns

# Keep only the aggregate rows (dorsal == 'TOTAL'). The explicit .copy() makes
# this an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
df_team_stats_each_game = df_box_score[df_box_score['dorsal'] == 'TOTAL'].copy()

# Points received = points of the other TOTAL row sharing the same game_id:
# the next row in the group when there is one, otherwise the previous row.
df_team_stats_each_game['points_received'] = (
    df_team_stats_each_game.groupby('game_id')['points'].shift(-1)
    .fillna(df_team_stats_each_game.groupby('game_id')['points'].shift(1))
)

# win = 1 when the row scored strictly more points than it received, else 0.
df_team_stats_each_game['win'] = (
    df_team_stats_each_game['points'] > df_team_stats_each_game['points_received']
).astype(int)

# Average points received per team and season.
team_stats = df_team_stats_each_game[['team_id', 'season_code', 'points_received']]
points_received_avg = (
    team_stats.groupby(['team_id', 'season_code'])['points_received']
    .mean()
    .reset_index(name='points_received_per_game')
)

# Attach the average to df_teams. Dropping a pre-existing column first keeps the
# cell idempotent: re-running it would otherwise leave
# 'points_received_per_game_x' / '_y' columns behind.
df_teams = (
    df_teams
    .drop(columns=['points_received_per_game'], errors='ignore')
    .merge(points_received_avg, on=['team_id', 'season_code'], how='left')
)

# Player-level columns carry no information on TOTAL rows.
columns_to_drop = ['game_player_id', 'player_id', 'is_starter', 'is_playing', 'dorsal', 'player', 'minutes']
df_team_stats_each_game = df_team_stats_each_game.drop(columns=columns_to_drop)

# Split each game's rows into two sides.
# duplicated(keep='first') flags every occurrence of a game_id except the first,
# so team_a_stats holds the LATER row of each game and team_b_stats the EARLIER one.
# NOTE(review): this assumes exactly two TOTAL rows per game_id - confirm.
team_a_stats = df_team_stats_each_game[df_team_stats_each_game['game_id'].duplicated(keep='first')]
team_b_stats = df_team_stats_each_game[df_team_stats_each_game['game_id'].duplicated(keep='last')]

# One row per game, with the two sides' columns suffixed '_a' and '_b'.
merged_df_team_stats_each_game = pd.merge(team_a_stats, team_b_stats, on='game_id', suffixes=('_a', '_b'))

# Drop the '_b' copies of the game-level columns and strip the '_a' suffix from
# the remaining ones ('season_code_b' is left in place, as before).
merged_df_team_stats_each_game = (
    merged_df_team_stats_each_game
    .drop(columns=['game_b', 'round_b', 'phase_b'])
    .rename(columns={
        'season_code_a': 'season_code',
        'game_a': 'game',
        'round_a': 'round',
        'phase_a': 'phase',
    })
)

In [None]:
# Display the per-game matchup table (one row per game_id, '_a' / '_b' columns).
merged_df_team_stats_each_game

In [None]:
# Build per-game team statistics for the EuroCup box score.
#
# Outputs of this cell:
#   df_team_stats_each_game_eurocup         one row per TOTAL row, with
#                                           'points_received' and 'win' added
#   merged_df_team_stats_each_game_eurocup  one row per game_id with '_a' / '_b' columns

# Keep only the aggregate rows (dorsal == 'TOTAL'). The explicit .copy() makes
# this an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
df_team_stats_each_game_eurocup = df_box_score_eurocup[df_box_score_eurocup['dorsal'] == 'TOTAL'].copy()

# Points received = points of the other TOTAL row sharing the same game_id:
# the next row in the group when there is one, otherwise the previous row.
df_team_stats_each_game_eurocup['points_received'] = (
    df_team_stats_each_game_eurocup.groupby('game_id')['points'].shift(-1)
    .fillna(df_team_stats_each_game_eurocup.groupby('game_id')['points'].shift(1))
)

# win = 1 when the row scored strictly more points than it received, else 0.
df_team_stats_each_game_eurocup['win'] = (
    df_team_stats_each_game_eurocup['points'] > df_team_stats_each_game_eurocup['points_received']
).astype(int)

# Player-level columns carry no information on TOTAL rows.
columns_to_drop = ['game_player_id', 'player_id', 'is_starter', 'is_playing', 'dorsal', 'player', 'minutes']
df_team_stats_each_game_eurocup = df_team_stats_each_game_eurocup.drop(columns=columns_to_drop)

# Split each game's rows into two sides.
# duplicated(keep='first') flags every occurrence of a game_id except the first,
# so team_a_stats_eurocup holds the LATER row of each game and
# team_b_stats_eurocup the EARLIER one.
# NOTE(review): this assumes exactly two TOTAL rows per game_id - confirm.
team_a_stats_eurocup = df_team_stats_each_game_eurocup[df_team_stats_each_game_eurocup['game_id'].duplicated(keep='first')]
team_b_stats_eurocup = df_team_stats_each_game_eurocup[df_team_stats_each_game_eurocup['game_id'].duplicated(keep='last')]

# One row per game, with the two sides' columns suffixed '_a' and '_b'.
merged_df_team_stats_each_game_eurocup = pd.merge(team_a_stats_eurocup, team_b_stats_eurocup, on='game_id', suffixes=('_a', '_b'))

# Drop the '_b' copies of the game-level columns and strip the '_a' suffix from
# the remaining ones ('season_code_b' is left in place, as before).
merged_df_team_stats_each_game_eurocup = (
    merged_df_team_stats_each_game_eurocup
    .drop(columns=['game_b', 'round_b', 'phase_b'])
    .rename(columns={
        'season_code_a': 'season_code',
        'game_a': 'game',
        'round_a': 'round',
        'phase_a': 'phase',
    })
)

In [None]:
# Stack the EuroCup matchups on top of the EuroLeague ones (row-wise) and
# renumber the index from zero.
merged_df_team_stats_each_game_all_games = pd.concat(
    [merged_df_team_stats_each_game_eurocup, merged_df_team_stats_each_game],
    ignore_index=True,
)

# Show the combined table
merged_df_team_stats_each_game_all_games


In [None]:
# Display the teams table.
df_teams

In [None]:
# Canonical club name -> every spelling / sponsor name it should absorb.
team_mappings = {
    'OLYMPIACOS': ['OLYMPIACOS PIRAEUS B.C.', 'OLYMPIACOS PIRAEUS', 'OLYMPIACOS PIRAEUS BC', 'OLYMPIACOS'],
    'BARCELONA': ['AXA FC BARCELONA', 'REGAL FC BARCELONA', 'REGAL FC BARCELONA ', 'FC BARCELONA REGAL',
                  'FC BARCELONA', 'FC BARCELONA LASSA'],
    'REAL MADRID': ['REAL MADRID'],
    'CSKA MOSCOW': ['CSKA MOSCOW'],
    'ZALGIRIS': ['ZALGIRIS', 'ZALGIRIS KAUNAS'],
    'PANATHINAIKOS': ['PANATHINAIKOS', 'PANATHINAIKOS BSA ATHENS', 'PANATHINAIKOS ATHENS',
                      'PANATHINAIKOS SUPERFOODS ATHENS', 'PANATHINAIKOS OPAP ATHENS', 'PANATHINAIKOS AKTOR ATHENS'],
    'PARTIZAN': ['PARTIZAN BC', 'PARTIZAN', 'PARTIZAN IGOKEA', 'PARTIZAN MT:S', 'PARTIZAN MT:S BELGRADE',
                 'PARTIZAN NIS BELGRADE', 'PARTIZAN MOZZART BET BELGRADE'],
    'MILAN': ['MILANO', 'ARMANI JEANS MILANO', 'EA7 EMPORIO ARMANI MILANO', 'AX ARMANI EXCHANGE OLIMPIA MILAN',
              'AX ARMANI EXCHANGE MILAN'],
    'FENERBAHCE': ['FENERBAHCE ULKER', 'FENERBAHCE ULKER ISTANBUL', 'FENERBAHCE ISTANBUL',
                   'FENERBAHCE BEKO ISTANBUL', 'FB DOGUS'],
    'MACCABI TEL AVIV': ['MACCABI TEL AVIV', 'MACCABI ELITE TEL AVIV', 'MACCABI ELECTRA TEL AVIV',
                         'MACCABI FOX TEL AVIV', 'MACCABI PLAYTIKA TEL AVIV'],
    'EFES': ['EFES PILSEN ISTANBUL', 'ANADOLU EFES ISTANBUL'],
    'BASKONIA': ['CAJA LABORAL BASKONIA', 'CAJA LABORAL', 'LABORAL KUTXA VITORIA',
                 'LABORAL KUTXA VITORIA GASTEIZ', 'KIROLBET BASKONIA VITORIA GASTEIZ',
                 'KIROLBET BASKONIA VITORIA-GASTEIZ', 'TD SYSTEMS BASKONIA VITORIA-GASTEIZ',
                 'BITCI BASKONIA VITORIA-GASTEIZ', 'BASKONIA VITORIA-GASTEIZ',
                 'CAZOO BASKONIA VITORIA-GASTEIZ'],
    'VALENCIA': ['POWER ELECTRONICS VALENCIA', 'VALENCIA BASKET'],
    'UNICAJA': ['UNICAJA', 'UNICAJA MALAGA'],
    'ALBA BERLIN': ['ALBA BERLIN'],
    'BAYERN MUNICH': ['FC BAYERN MUNICH'],
    'BROSE BAMBERG': ['BROSE BASKETS', 'BASKETS BAMBERG', 'BROSE BASKETS BAMBERG', 'BROSE BAMBERG'],
    'GALATASARAY': ['GALATASARAY MEDICAL PARK', 'GALATASARAY LIV HOSPITAL ISTANBUL', 'GALATASARAY ODEABANK ISTANBUL'],
    'CRVENA ZVEZDA': ['CRVENA ZVEZDA TELEKOM BELGRADE', 'CRVENA ZVEZDA MTS BELGRADE',
                      'CRVENA ZVEZDA MERIDIANBET BELGRADE'],
    'BILBAO': ['BILBAO BASKET', 'BIZKAIA BILBAO BASKET', 'GESCRAP BB'],
    'VIRTUS BOLOGNA': ['VIRTUS VIDIVICI BOLOGNA', 'VIRTUS SEGAFREDO BOLOGNA'],
    'DARUSSAFAKA': ['DARUSSAFAKA DOGUS ISTANBUL', 'DARUSSAFAKA TEKFEN ISTANBUL'],
    'BUDUCNOST': ['BUDUCNOST VOLI PODGORICA'],
    'MONACO': ['AS MONACO'],
    'KHIMKI': ['BC KHIMKI', 'KHIMKI MOSCOW REGION', 'BC KHIMKI MOSCOW REGION'],
    'LIETUVOS RYTAS': ['LIETUVOS RYTAS', 'LIETUVOS RYTAS VILNIUS'],
    'CIBONA': ['KK CIBONA', 'CIBONA'],
    'UNION OLIMPIJA': ['UNION OLIMPIJA', 'UNION OLIMPIJA LJUBLJANA'],
    'PROKOM': ['PROKOM TREFL SOPOT', 'ASSECO PROKOM SOPOT', 'ASSECO PROKOM GDYNIA', 'ASSECO PROKOM'],
    'VIRTUS ROMA': ['LOTTOMATICA ROMA', 'VIRTUS ROMA'],
    'ASVEL': ['ASVEL LYON', 'LDLC ASVEL VILLEURBANNE'],
    'NANTERRE': ['JSF NANTERRE'],
    'STRASBOURG': ['STRASBOURG'],
    'LIMOGES': ['LIMOGES CSP'],
    'CHOLET': ['CHOLET BASKET'],
    'CHALON': ['ELAN CHALON-SUR-SAONE'],
    'PARIS BASKETBALL': ['PARIS BASKETBALL']
}


# Reverse lookup: variation -> canonical name. Keys are stripped and upper-cased
# so that lookups ignore case and surrounding whitespace.
name_to_normalized = {
    variation.strip().upper(): normalized_name
    for normalized_name, variations in team_mappings.items()
    for variation in variations
}


def normalize_team_name(team_name):
    """Return the canonical club name for ``team_name``.

    The lookup ignores case and leading/trailing whitespace.

    Parameters
    ----------
    team_name : str or any
        A team name. Non-string values (e.g. ``None`` or the float NaN that
        pandas uses for missing cells) are returned untouched instead of
        raising ``AttributeError`` on ``.upper()``.

    Returns
    -------
    The key of ``team_mappings`` whose variations contain the name, or
    ``team_name`` itself, unchanged, when there is no match.
    """
    if not isinstance(team_name, str):
        return team_name
    return name_to_normalized.get(team_name.strip().upper(), team_name)

# Add 'team_a_' / 'team_b_' columns holding the normalized form of
# 'team_a' / 'team_b'; the original columns are left as they are.
for side in ('a', 'b'):
    df_header[f'team_{side}_'] = df_header[f'team_{side}'].apply(normalize_team_name)

# Preview the first rows of the updated header table
df_header.head()


In [None]:
# Hand-maintained table from the 'team_id' codes used in df_teams to the
# canonical club names (the same names used as keys of team_mappings).
# NOTE(review): 'EFES' is the only four-letter key here - confirm that it and
# 'IST' match the codes actually present in df_teams['team_id'].
team_id_to_name = dict(
    OLY='OLYMPIACOS',
    BAR='BARCELONA',
    MAD='REAL MADRID',
    CSK='CSKA MOSCOW',
    ZAL='ZALGIRIS',
    PAN='PANATHINAIKOS',
    PAR='PARTIZAN',
    MIL='MILAN',
    IST='FENERBAHCE',
    TEL='MACCABI TEL AVIV',
    EFES='EFES',
    BAS='BASKONIA',
    VAL='VALENCIA',
    MAL='UNICAJA',
    BER='ALBA BERLIN',
    MUN='BAYERN MUNICH',
    BAM='BROSE BAMBERG',
    GAL='GALATASARAY',
    RED='CRVENA ZVEZDA',
    BIL='BILBAO',
    VIR='VIRTUS BOLOGNA',
    DAR='DARUSSAFAKA',
    BUD='BUDUCNOST',
    MCO='MONACO',
    KHI='KHIMKI',
    LIE='LIETUVOS RYTAS',
    CIB='CIBONA',
    LJU='UNION OLIMPIJA',
    SOP='PROKOM',
    ROM='VIRTUS ROMA',
    ASV='ASVEL',
    NTR='NANTERRE',
    STR='STRASBOURG',
    LMG='LIMOGES',
    CHO='CHOLET',
    CHA='CHALON',
    PRS='PARIS BASKETBALL',
)


def map_team_id_to_name(team_id):
    """Translate a team_id code into its club name.

    Codes missing from ``team_id_to_name`` are handed back unchanged; the
    lookup is exact (case-sensitive).
    """
    if team_id in team_id_to_name:
        return team_id_to_name[team_id]
    return team_id

# New 'team_name' column: each team_id translated element-wise through
# map_team_id_to_name (unknown ids are kept as they are).
df_teams['team_name'] = df_teams['team_id'].map(map_team_id_to_name)

# Preview the first rows of the updated teams table
df_teams.head()


In [None]:
# List the column names of the players table.
df_players.columns

In [None]:
# List the column names of the EuroLeague box-score table.
df_box_score.columns

In [None]:
# List the column names of the teams table (including any columns added above).
df_teams.columns

In [None]:
# List the column names of the points table.
df_points.columns

In [None]:
# List the column names of the game-header table.
df_header.columns

In [None]:
# List the column names of the per-game team statistics (TOTAL rows only).
df_team_stats_each_game.columns

In [None]:
# List the column names of the one-row-per-game matchup table.
merged_df_team_stats_each_game.columns

# FIRST PLOTS / LINE CHARTS

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Rows of the points table recording a made three-pointer (action_id '3FGM')
df_three_pointers = df_points.loc[df_points['action_id'].eq('3FGM')]

# Number of such rows per game (season_code is carried along as a group key)
three_pointers_per_game = (
    df_three_pointers
    .groupby(['game_id', 'season_code'])['action_id']
    .count()
    .reset_index(name='three_pointers_per_game')
)

# Mean of the per-game counts within each season, ordered by season_code
average_three_pointers_by_season = (
    three_pointers_per_game
    .groupby('season_code')['three_pointers_per_game']
    .mean()
    .reset_index(name='average_three_pointers')
    .sort_values('season_code')
)

# Line chart: one marker per season
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    average_three_pointers_by_season['season_code'],
    average_three_pointers_by_season['average_three_pointers'],
    marker='o',
    color='green',
)
ax.set_title('Average Three-Pointers Scored Per Game Per Season', fontsize=16)
ax.set_xlabel('Season', fontsize=14)
ax.set_ylabel('Average Three-Pointers Scored Per Game', fontsize=14)
ax.tick_params(axis='x', rotation=45)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Rows of the points table tagged '3FGA' or '3FGM'; together they are counted
# as three-point attempts (presumably '3FGA' marks the missed ones - confirm
# against the data dictionary).
df_three_pointers = df_points.loc[df_points['action_id'].isin(['3FGA', '3FGM'])]

# Number of such rows per game (season_code is carried along as a group key)
three_pointers_per_game = (
    df_three_pointers
    .groupby(['game_id', 'season_code'])['action_id']
    .count()
    .reset_index(name='three_pointers_per_game')
)

# Mean of the per-game counts within each season, ordered by season_code
average_three_pointers_by_season = (
    three_pointers_per_game
    .groupby('season_code')['three_pointers_per_game']
    .mean()
    .reset_index(name='average_three_pointers')
    .sort_values('season_code')
)

# Line chart: one marker per season
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    average_three_pointers_by_season['season_code'],
    average_three_pointers_by_season['average_three_pointers'],
    marker='o',
    color='green',
)
ax.set_title('Average Three-Pointers Attempted Per Game Per Season', fontsize=16)
ax.set_xlabel('Season', fontsize=14)
ax.set_ylabel('Average Three-Pointers Attempted Per Game', fontsize=14)
ax.tick_params(axis='x', rotation=45)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Rows of the points table tagged as any two- or three-point field-goal action
# ('2FGA', '2FGM', '3FGA', '3FGM'). The variable names below still say
# "three_pointers" for continuity with the previous cells, but they now cover
# every field-goal attempt.
df_three_pointers = df_points.loc[df_points['action_id'].isin(['3FGA', '2FGA', '3FGM', '2FGM'])]

# Number of such rows per game (season_code is carried along as a group key)
three_pointers_per_game = (
    df_three_pointers
    .groupby(['game_id', 'season_code'])['action_id']
    .count()
    .reset_index(name='three_pointers_per_game')
)

# Mean of the per-game counts within each season, ordered by season_code
average_three_pointers_by_season = (
    three_pointers_per_game
    .groupby('season_code')['three_pointers_per_game']
    .mean()
    .reset_index(name='average_three_pointers')
    .sort_values('season_code')
)

# Line chart: one marker per season
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    average_three_pointers_by_season['season_code'],
    average_three_pointers_by_season['average_three_pointers'],
    marker='o',
    color='green',
)
ax.set_title('Average Shots Attempted Per Game Per Season', fontsize=16)
ax.set_xlabel('Season', fontsize=14)
ax.set_ylabel('Average Shots Attempted Per Game', fontsize=14)
ax.tick_params(axis='x', rotation=45)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Work on the first row of the header table only.
first_game = df_header.iloc[0]

# Quarter score columns; each exists once per side with an '_a' / '_b' suffix.
# The values are treated as running (cumulative) scores at the end of each quarter.
quarters = ['score_quarter_1', 'score_quarter_2', 'score_quarter_3', 'score_quarter_4']

cumulative_team_a = [first_game[q + '_a'] for q in quarters]
cumulative_team_b = [first_game[q + '_b'] for q in quarters]

# Points scored inside each quarter: the first running score as is, then the
# difference between consecutive running scores.
points_team_a = [cumulative_team_a[0]] + [
    later - earlier for earlier, later in zip(cumulative_team_a, cumulative_team_a[1:])
]
points_team_b = [cumulative_team_b[0]] + [
    later - earlier for earlier, later in zip(cumulative_team_b, cumulative_team_b[1:])
]

# Shared bar geometry for both panels
labels = ['Q1', 'Q2', 'Q3', 'Q4']
x = range(len(labels))
width = 0.4

# (subplot position, team A values, team B values, title, y-axis label)
quarter_panels = [
    (1, points_team_a, points_team_b, 'Points Scored Per Quarter', 'Points'),
    (2, cumulative_team_a, cumulative_team_b, 'Cumulative Scores Per Quarter', 'Cumulative Points'),
]

plt.figure(figsize=(14, 6))
for position, values_a, values_b, panel_title, panel_ylabel in quarter_panels:
    plt.subplot(1, 2, position)
    # Side-by-side bars: team A shifted left, team B shifted right
    plt.bar([p - width/2 for p in x], values_a, width=width, label='Team A', color='red')
    plt.bar([p + width/2 for p in x], values_b, width=width, label='Team B', color='blue')
    plt.xticks(x, labels)
    plt.title(panel_title, fontsize=16)
    plt.xlabel('Quarter', fontsize=14)
    plt.ylabel(panel_ylabel, fontsize=14)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)

# Adjust layout and display the plots
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-player totals: sum of the counting columns over all of a player's rows
df_players_total = df_players.groupby('player')[[
    'points', 'three_points_made', 'total_rebounds', 'assists', 'steals', 'blocks_favour'
]].sum()

# Per-player averages: plain mean of the *_per_game columns over the same rows
df_players_avg = df_players.groupby('player')[[
    'points_per_game', 'three_points_made_per_game', 'total_rebounds_per_game',
    'assists_per_game', 'steals_per_game', 'blocks_favour_per_game'
]].mean()

# Ten largest values of every total column
(top_10_points, top_10_three_pointers, top_10_rebounds,
 top_10_assists, top_10_steals, top_10_blocks) = (
    df_players_total[stat].nlargest(10)
    for stat in ('points', 'three_points_made', 'total_rebounds',
                 'assists', 'steals', 'blocks_favour')
)

# Ten largest values of every per-game column
(top_10_avg_points, top_10_avg_three_pointers, top_10_avg_rebounds,
 top_10_avg_assists, top_10_avg_steals, top_10_avg_blocks) = (
    df_players_avg[stat].nlargest(10)
    for stat in ('points_per_game', 'three_points_made_per_game', 'total_rebounds_per_game',
                 'assists_per_game', 'steals_per_game', 'blocks_favour_per_game')
)

# Two figures. Each entry: (grid shape, figure size, panels), where a panel is
# (ranking, title, y-axis label, bar colour) and panels fill the grid row by row.
# The assists rankings are computed above but are not drawn by this cell.
player_figures = [
    ((3, 2), (14, 18), [
        (top_10_points, 'Top 10 Players by Total Points', 'Total Points', 'blue'),
        (top_10_avg_points, 'Top 10 Players by Average Points Per Game', 'Average Points per Game', 'green'),
        (top_10_three_pointers, 'Top 10 Players by Total Three Pointers', 'Total Three Pointers', 'red'),
        (top_10_avg_three_pointers, 'Top 10 Players by Average Three Pointers Per Game',
         'Average Three Pointers per Game', 'orange'),
        (top_10_rebounds, 'Top 10 Players by Total Rebounds', 'Total Rebounds', 'purple'),
        (top_10_avg_rebounds, 'Top 10 Players by Average Rebounds Per Game', 'Average Rebounds per Game', 'yellow'),
    ]),
    ((2, 2), (14, 12), [
        (top_10_steals, 'Top 10 Players by Total Steals', 'Total Steals', 'cyan'),
        (top_10_avg_steals, 'Top 10 Players by Average Steals Per Game', 'Average Steals per Game', 'pink'),
        (top_10_blocks, 'Top 10 Players by Total Blocks', 'Total Blocks', 'magenta'),
        (top_10_avg_blocks, 'Top 10 Players by Average Blocks Per Game', 'Average Blocks per Game', 'brown'),
    ]),
]

for grid_shape, figure_size, panels in player_figures:
    fig, axes = plt.subplots(*grid_shape, figsize=figure_size)
    for ax, (ranking, panel_title, panel_ylabel, bar_color) in zip(axes.flat, panels):
        ax.bar(ranking.index, ranking.values, color=bar_color)
        ax.set_title(panel_title)
        ax.set_xlabel('Player')
        ax.set_ylabel(panel_ylabel)
        ax.tick_params(axis='x', rotation=45)

    # Adjust layout for better readability
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-team totals: sum of the counting columns over all of a team's rows
df_teams_total = df_teams.groupby('team_id')[[
    'points', 'three_points_made', 'total_rebounds', 'assists', 'steals', 'blocks_favour'
]].sum()

# Per-team averages: plain mean of the *_per_game columns over the same rows
df_teams_avg = df_teams.groupby('team_id')[[
    'points_per_game', 'three_points_made_per_game', 'total_rebounds_per_game',
    'assists_per_game', 'steals_per_game', 'blocks_favour_per_game'
]].mean()

# Ten largest values of every total column
(top_10_teams_points, top_10_teams_three_pointers, top_10_teams_rebounds,
 top_10_teams_assists, top_10_teams_steals, top_10_teams_blocks) = (
    df_teams_total[stat].nlargest(10)
    for stat in ('points', 'three_points_made', 'total_rebounds',
                 'assists', 'steals', 'blocks_favour')
)

# Ten largest values of every per-game column
(top_10_teams_avg_points, top_10_teams_avg_three_pointers, top_10_teams_avg_rebounds,
 top_10_teams_avg_assists, top_10_teams_avg_steals, top_10_teams_avg_blocks) = (
    df_teams_avg[stat].nlargest(10)
    for stat in ('points_per_game', 'three_points_made_per_game', 'total_rebounds_per_game',
                 'assists_per_game', 'steals_per_game', 'blocks_favour_per_game')
)

# Two figures. Each entry: (grid shape, figure size, panels), where a panel is
# (ranking, title, y-axis label, bar colour) and panels fill the grid row by row.
# The assists rankings are computed above but are not drawn by this cell.
team_figures = [
    ((3, 2), (14, 18), [
        (top_10_teams_points, 'Top 10 Teams by Total Points', 'Total Points', 'blue'),
        (top_10_teams_avg_points, 'Top 10 Teams by Average Points Per Game', 'Average Points per Game', 'green'),
        (top_10_teams_three_pointers, 'Top 10 Teams by Total Three Pointers', 'Total Three Pointers', 'red'),
        (top_10_teams_avg_three_pointers, 'Top 10 Teams by Average Three Pointers Per Game',
         'Average Three Pointers per Game', 'orange'),
        (top_10_teams_rebounds, 'Top 10 Teams by Total Rebounds', 'Total Rebounds', 'purple'),
        (top_10_teams_avg_rebounds, 'Top 10 Teams by Average Rebounds Per Game', 'Average Rebounds per Game', 'yellow'),
    ]),
    ((2, 2), (14, 12), [
        (top_10_teams_steals, 'Top 10 Teams by Total Steals', 'Total Steals', 'cyan'),
        (top_10_teams_avg_steals, 'Top 10 Teams by Average Steals Per Game', 'Average Steals per Game', 'pink'),
        (top_10_teams_blocks, 'Top 10 Teams by Total Blocks', 'Total Blocks', 'magenta'),
        (top_10_teams_avg_blocks, 'Top 10 Teams by Average Blocks Per Game', 'Average Blocks per Game', 'brown'),
    ]),
]

for grid_shape, figure_size, panels in team_figures:
    fig, axes = plt.subplots(*grid_shape, figsize=figure_size)
    for ax, (ranking, panel_title, panel_ylabel, bar_color) in zip(axes.flat, panels):
        ax.bar(ranking.index, ranking.values, color=bar_color)
        ax.set_title(panel_title)
        ax.set_xlabel('Team')
        ax.set_ylabel(panel_ylabel)
        ax.tick_params(axis='x', rotation=45)

    # Adjust layout for better readability
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Restrict the teams table to the rows of season 'E2023'
df_teams_season = df_teams.loc[df_teams['season_code'].eq('E2023')]

# Per-team totals within that season
df_teams_total_season = df_teams_season.groupby('team_id')[[
    'points', 'three_points_made', 'total_rebounds', 'assists', 'steals', 'blocks_favour'
]].sum()

# Per-team mean of the *_per_game columns within that season
df_teams_avg_season = df_teams_season.groupby('team_id')[[
    'points_per_game', 'three_points_made_per_game', 'total_rebounds_per_game',
    'assists_per_game', 'steals_per_game', 'blocks_favour_per_game'
]].mean()

# Ten largest values of every total column
(top_10_teams_points_season, top_10_teams_three_pointers_season, top_10_teams_rebounds_season,
 top_10_teams_assists_season, top_10_teams_steals_season, top_10_teams_blocks_season) = (
    df_teams_total_season[stat].nlargest(10)
    for stat in ('points', 'three_points_made', 'total_rebounds',
                 'assists', 'steals', 'blocks_favour')
)

# Ten largest values of every per-game column
(top_10_teams_avg_points_season, top_10_teams_avg_three_pointers_season,
 top_10_teams_avg_rebounds_season, top_10_teams_avg_assists_season,
 top_10_teams_avg_steals_season, top_10_teams_avg_blocks_season) = (
    df_teams_avg_season[stat].nlargest(10)
    for stat in ('points_per_game', 'three_points_made_per_game', 'total_rebounds_per_game',
                 'assists_per_game', 'steals_per_game', 'blocks_favour_per_game')
)

# Two figures. Each entry: (grid shape, figure size, panels), where a panel is
# (ranking, title, y-axis label, bar colour) and panels fill the grid row by row.
# The assists rankings are computed above but are not drawn by this cell.
season_figures = [
    ((3, 2), (14, 18), [
        (top_10_teams_points_season, 'Top 10 Teams by Total Points (Season E2023)', 'Total Points', 'blue'),
        (top_10_teams_avg_points_season, 'Top 10 Teams by Average Points Per Game (Season E2023)',
         'Average Points per Game', 'green'),
        (top_10_teams_three_pointers_season, 'Top 10 Teams by Total Three Pointers (Season E2023)',
         'Total Three Pointers', 'red'),
        (top_10_teams_avg_three_pointers_season, 'Top 10 Teams by Average Three Pointers Per Game (Season E2023)',
         'Average Three Pointers per Game', 'orange'),
        (top_10_teams_rebounds_season, 'Top 10 Teams by Total Rebounds (Season E2023)', 'Total Rebounds', 'purple'),
        (top_10_teams_avg_rebounds_season, 'Top 10 Teams by Average Rebounds Per Game (Season E2023)',
         'Average Rebounds per Game', 'yellow'),
    ]),
    ((2, 2), (14, 12), [
        (top_10_teams_steals_season, 'Top 10 Teams by Total Steals (Season E2023)', 'Total Steals', 'cyan'),
        (top_10_teams_avg_steals_season, 'Top 10 Teams by Average Steals Per Game (Season E2023)',
         'Average Steals per Game', 'pink'),
        (top_10_teams_blocks_season, 'Top 10 Teams by Total Blocks (Season E2023)', 'Total Blocks', 'magenta'),
        (top_10_teams_avg_blocks_season, 'Top 10 Teams by Average Blocks Per Game (Season E2023)',
         'Average Blocks per Game', 'brown'),
    ]),
]

for grid_shape, figure_size, panels in season_figures:
    fig, axes = plt.subplots(*grid_shape, figsize=figure_size)
    for ax, (ranking, panel_title, panel_ylabel, bar_color) in zip(axes.flat, panels):
        ax.bar(ranking.index, ranking.values, color=bar_color)
        ax.set_title(panel_title)
        ax.set_xlabel('Team')
        ax.set_ylabel(panel_ylabel)
        ax.tick_params(axis='x', rotation=45)

    # Adjust layout for better readability
    plt.tight_layout()
    plt.show()


# PREDICTING WHO WOULD WIN A GAME

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

In [None]:

# Step 3.1: Define numeric features
# Every box-score stat appears twice in the merged frame: once with suffix _a
# (team A) and once with suffix _b (team B). All _a columns come first.
_box_score_stats = [
    'two_points_made', 'two_points_attempted', 'three_points_made', 'three_points_attempted',
    'free_throws_made', 'free_throws_attempted', 'offensive_rebounds', 'defensive_rebounds',
    'total_rebounds', 'assists', 'steals', 'turnovers', 'blocks_favour', 'blocks_against',
    'fouls_committed', 'fouls_received', 'valuation', 'points_received',
]
numeric_features = [f'{stat}_{side}' for side in ('a', 'b') for stat in _box_score_stats]

# Compute correlation matrix (features plus the win_a target) and keep the
# target column only, strongest positive correlation first.
correlation_matrix = merged_df_team_stats_each_game_all_games[[*numeric_features, 'win_a']].corr()
correlations_with_target = (
    correlation_matrix['win_a']
    .drop('win_a')
    .sort_values(ascending=False)
)

# Visualize the correlations as a horizontal bar chart
plt.figure(figsize=(12, 8))
sns.barplot(
    x=correlations_with_target.values,
    y=correlations_with_target.index,
    palette='viridis',
)
plt.title('Correlation of Features with Target (win_a)')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Features')
# Dashed reference line separating positive from negative correlations
plt.axvline(0, color='black', linewidth=0.8, linestyle='--')
plt.tight_layout()
plt.show()

# Display the correlations as text as well
print("Top Correlated Features with Target (win_a):")
print(correlations_with_target)


In [None]:
# Step 3.2: Define features and target for modeling
# The same seven stats are used for both sides; team A columns come first.
_model_stats = [
    'defensive_rebounds', 'offensive_rebounds', 'assists', 'three_points_made',
    'turnovers', 'blocks_favour', 'steals',
]
features = [f'{stat}_{side}' for side in ('a', 'b') for stat in _model_stats]

target = 'win_a'

# The classifiers are trained on the "all games" frame
X = merged_df_team_stats_each_game_all_games[features]
y = merged_df_team_stats_each_game_all_games[target]

# Step 3.3: Train-test split (80/20, fixed seed for reproducibility)
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition, one per line
print("\n".join(
    f"Length of {name}: {len(part)}"
    for name, part in (
        ("X_train", X_train_all),
        ("X_test", X_test_all),
        ("y_train", y_train_all),
        ("y_test", y_test_all),
    )
))

In [None]:
# Logistic Regression: fit on the training split, predict on the held-out split
logistic_model = LogisticRegression(random_state=42).fit(X_train_all, y_train_all)
logistic_preds = logistic_model.predict(X_test_all)

print("Logistic Regression Performance:")
print(classification_report(y_test_all, logistic_preds))
print(f"Accuracy: {accuracy_score(y_test_all, logistic_preds)}")
# NOTE(review): if win_a is a 0/1 label, this MSE is just the misclassification rate
print(f"MSE: {mean_squared_error(y_test_all, logistic_preds)}")


In [None]:
# Random Forest Classifier: fit on the training split, predict on the held-out split
rf_model = RandomForestClassifier(random_state=42).fit(X_train_all, y_train_all)
rf_preds = rf_model.predict(X_test_all)

print("\nRandom Forest Performance:")
print(classification_report(y_test_all, rf_preds))
print(f"Accuracy: {accuracy_score(y_test_all, rf_preds)}")
# NOTE(review): if win_a is a 0/1 label, this MSE is just the misclassification rate
print(f"MSE: {mean_squared_error(y_test_all, rf_preds)}")


In [None]:
# Example team IDs for the matchup
team_a_id = 'BAR'
team_b_id = 'ASV'

# Season E2024 per-game averages for each side (first matching row of df_teams)
team_a_stats = df_teams.loc[(df_teams['team_id'] == team_a_id) & (df_teams['season_code'] == 'E2024')].iloc[0]
team_b_stats = df_teams.loc[(df_teams['team_id'] == team_b_id) & (df_teams['season_code'] == 'E2024')].iloc[0]

# Map each team's '<stat>_per_game' average onto the model's '<stat>_a' / '<stat>_b'
# feature names. Team A entries come first so the column order matches training.
input_features = {
    f'{stat}_{side}': side_stats[f'{stat}_per_game']
    for side, side_stats in (('a', team_a_stats), ('b', team_b_stats))
    for stat in (
        'defensive_rebounds', 'offensive_rebounds', 'assists', 'three_points_made',
        'turnovers', 'blocks_favour', 'steals',
    )
}

# Single-row frame: one hypothetical game
input_data = pd.DataFrame([input_features])

# Predict the winner with both classifiers (1 means team A wins)
logistic_prediction = logistic_model.predict(input_data)[0]
rf_prediction = rf_model.predict(input_data)[0]

print(f"Logistic Model Prediction: {'Team A Wins' if logistic_prediction == 1 else 'Team B Wins'}")
print(f"Random Forest Prediction: {'Team A Wins' if rf_prediction == 1 else 'Team B Wins'}")


# PREDICTING SCORES

In [None]:
# Define features for both teams: the same seven stats per side, team A first
features = [
    f'{stat}_{side}'
    for side in ('a', 'b')
    for stat in (
        'defensive_rebounds', 'offensive_rebounds', 'assists',
        'three_points_made', 'turnovers', 'blocks_favour', 'steals',
    )
]

# Targets: final score of each side
targets = ['points_a', 'points_b']

# Input features and targets. Note this uses merged_df_team_stats_each_game,
# not the "all games" frame used by the win/loss classifiers.
X = merged_df_team_stats_each_game[features]
y = merged_df_team_stats_each_game[targets]

# Split data into train and test sets (80/20, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

# Linear Regression model: one independent LinearRegression per target column
linear_regressor = MultiOutputRegressor(LinearRegression()).fit(X_train, y_train)

# Predictions on the held-out split; column 0 is points_a, column 1 is points_b
y_pred_linear = linear_regressor.predict(X_test)

# Evaluate each target separately
mse_linear_points_a, mse_linear_points_b = (
    mean_squared_error(y_test[target_name], y_pred_linear[:, column])
    for column, target_name in enumerate(('points_a', 'points_b'))
)

print(f"Linear Regression - MSE for points_a: {mse_linear_points_a}")
print(f"Linear Regression - MSE for points_b: {mse_linear_points_b}")

# Predict points for the example teams (input_data is the single-row matchup
# frame built in the win-prediction section)
predicted_points_linear = linear_regressor.predict(input_data)
predicted_points_a_linear, predicted_points_b_linear = predicted_points_linear[0]

print(f"Linear Regression Predicted Points - Team A: {predicted_points_a_linear:.2f}")
print(f"Linear Regression Predicted Points - Team B: {predicted_points_b_linear:.2f}")



# Recorded output from a previous run:
#Linear Regression - MSE for points_a: 52.00382467233077
#Linear Regression - MSE for points_b: 50.91385157685053 -> better not to use Eurocup
#Linear Regression Predicted Points - Team A: 79.82
#Linear Regression Predicted Points - Team B: 78.80


# PREDICTING SCORES OTHER MODELS

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting model: one boosted regressor per target (points_a, points_b)
gb_regressor = MultiOutputRegressor(GradientBoostingRegressor(random_state=42)).fit(X_train, y_train)
y_pred_gb = gb_regressor.predict(X_test)

# Evaluate each target separately; prediction column 0 is points_a, column 1 is points_b
mse_gb_points_a, mse_gb_points_b = (
    mean_squared_error(y_test[target_name], y_pred_gb[:, column])
    for column, target_name in enumerate(('points_a', 'points_b'))
)
print(f"Gradient Boosting - MSE for points_a: {mse_gb_points_a}")
print(f"Gradient Boosting - MSE for points_b: {mse_gb_points_b}")




In [None]:
from sklearn.svm import SVR

# Support Vector Regressor (RBF kernel), one per target (points_a, points_b)
svr_regressor = MultiOutputRegressor(SVR(kernel='rbf', C=1.0, epsilon=0.1)).fit(X_train, y_train)
y_pred_svr = svr_regressor.predict(X_test)

# Evaluate each target separately; prediction column 0 is points_a, column 1 is points_b
mse_svr_points_a, mse_svr_points_b = (
    mean_squared_error(y_test[target_name], y_pred_svr[:, column])
    for column, target_name in enumerate(('points_a', 'points_b'))
)
print(f"SVR - MSE for points_a: {mse_svr_points_a}")
print(f"SVR - MSE for points_b: {mse_svr_points_b}")

# Recorded output from a previous run:
#SVR - MSE for points_a: 52.63342699131339 -> better not to use Eurocup
#SVR - MSE for points_b: 52.26377120953878


In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron (hidden layers of 100 and 50 units), one per target
mlp_regressor = MultiOutputRegressor(
    MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
).fit(X_train, y_train)
y_pred_mlp = mlp_regressor.predict(X_test)

# Evaluate each target separately; prediction column 0 is points_a, column 1 is points_b
mse_mlp_points_a, mse_mlp_points_b = (
    mean_squared_error(y_test[target_name], y_pred_mlp[:, column])
    for column, target_name in enumerate(('points_a', 'points_b'))
)
print(f"MLP - MSE for points_a: {mse_mlp_points_a}")
print(f"MLP - MSE for points_b: {mse_mlp_points_b}")


# Recorded output from a previous run:
#MLP - MSE for points_a: 54.561366774148205
#MLP - MSE for points_b: 52.910915759019524 -> better not to use Eurocup

# PREDICTING TEAM RECORD (SEASON SIMULATION)

In [None]:
# Stats fed to the score model, as (stat stem, required) pairs. The df_teams
# column is '<stem>_per_game' and the model feature is '<stem>_a' / '<stem>_b'.
# Stats flagged as not required fall back to 0 when the column is absent.
_MATCHUP_STATS = (
    ('defensive_rebounds', True),
    ('offensive_rebounds', True),
    ('assists', True),
    ('three_points_made', True),
    ('turnovers', True),
    ('blocks_favour', False),
    ('steals', False),
)


def _season_team_stats(df_teams, team_id, season_code):
    """Return the first df_teams row for `team_id` within `season_code`."""
    in_season = (df_teams['team_id'] == team_id) & (df_teams['season_code'] == season_code)
    return df_teams[in_season].iloc[0]


def _matchup_frame(home_stats, away_stats):
    """Build the one-row model input with `home_stats` as team A and `away_stats` as team B."""
    row = {}
    # Team A columns are added first so the column order matches the training features
    for side, stats in (('a', home_stats), ('b', away_stats)):
        for stat, required in _MATCHUP_STATS:
            column = f'{stat}_per_game'
            row[f'{stat}_{side}'] = stats[column] if required else stats.get(column, 0)
    return pd.DataFrame([row])


def simulate_season(team_id, df_teams, model, season_code='E2024'):
    """Simulate a home-and-away round robin for one team using a score model.

    Each opponent found in `season_code` is played twice: once with the
    selected team as team A (local) and once as team B (away). The selected
    team wins a game only when its predicted score is strictly higher than the
    opponent's; a tie counts as a loss.

    Parameters
    ----------
    team_id : str
        Identifier of the team to simulate (matched against `df_teams['team_id']`).
    df_teams : pandas.DataFrame
        Team table with `team_id`, `season_code` and the `*_per_game` columns
        listed in `_MATCHUP_STATS`.
    model : object
        Fitted estimator whose `predict` accepts the one-row feature frame and
        returns `[[points_a, points_b]]`.
    season_code : str, default 'E2024'
        Season whose teams and per-game averages are used. Both the list of
        opponents and every stats row are restricted to this season.

    Returns
    -------
    dict
        `{'team_id', 'wins', 'losses', 'results'}`, where `results` is a list
        of per-game dicts with `team_a`, `team_b`, `points_a`, `points_b`
        and `winner`.

    Raises
    ------
    ValueError
        If `team_id` has no row in `df_teams` for `season_code`.
    """
    # Teams taking part in the requested season
    teams = df_teams[df_teams['season_code'] == season_code]['team_id'].unique()

    # Ensure the selected team is valid
    if team_id not in teams:
        raise ValueError(f"Team ID {team_id} not found in the current season!")

    wins = 0
    losses = 0
    results = []  # Detailed per-game results

    # Stats must come from the simulated season; without the season filter the
    # first row of any season would be picked.
    team_stats = _season_team_stats(df_teams, team_id, season_code)

    for opponent_id in teams:
        if opponent_id == team_id:
            continue  # A team does not play itself

        opponent_stats = _season_team_stats(df_teams, opponent_id, season_code)

        # Game 1: selected team is local (team A). Game 2: selected team is away (team B).
        fixtures = (
            (team_id, team_stats, opponent_id, opponent_stats),
            (opponent_id, opponent_stats, team_id, team_stats),
        )
        for home_id, home_stats, away_id, away_stats in fixtures:
            predicted_scores = model.predict(_matchup_frame(home_stats, away_stats))
            points_a = predicted_scores[0][0]
            points_b = predicted_scores[0][1]

            # Compare from the selected team's point of view
            if home_id == team_id:
                team_won = points_a > points_b
            else:
                team_won = points_b > points_a

            if team_won:
                wins += 1
            else:
                losses += 1

            results.append({
                'team_a': home_id,
                'team_b': away_id,
                'points_a': points_a,
                'points_b': points_b,
                'winner': team_id if team_won else opponent_id,
            })

    # Return the season record and detailed results
    return {'team_id': team_id, 'wins': wins, 'losses': losses, 'results': results}

# Example usage: simulate the season for one team with the linear score model
team_to_simulate = 'BAR'
season_results = simulate_season(team_to_simulate, df_teams, linear_regressor)

# Print the aggregated win/loss record (per-game details are in season_results['results'])
print(f"Simulated Season Record for {team_to_simulate}: {season_results['wins']} Wins, {season_results['losses']} Losses")


# GLOBAL EXPLAINABILITY OF LINEAR REGRESSOR ON POINTS SCORED

In [None]:
import shap
import numpy as np

# SHAP Explainer for MultiOutputRegressor
# SHAP doesn't natively support MultiOutputRegressor, so each fitted
# per-target estimator is explained on its own: index 0 is points_a,
# index 1 is points_b. X_train is passed as the second argument to shap.Explainer.
explainer_points_a = shap.Explainer(linear_regressor.estimators_[0], X_train)
explainer_points_b = shap.Explainer(linear_regressor.estimators_[1], X_train)

# Compute SHAP values for every row of the held-out split, per target
shap_values_points_a = explainer_points_a(X_test)
shap_values_points_b = explainer_points_b(X_test)

# Print the shape of SHAP values
print(f"SHAP values shape for points_a: {np.shape(shap_values_points_a.values)}")
print(f"SHAP values shape for points_b: {np.shape(shap_values_points_b.values)}")

In [None]:

# Summary plot for points_a. show=False keeps the figure open so a title can
# be added before rendering, matching the points_b summary cell.
shap.summary_plot(shap_values_points_a, X_test, show=False)
plt.title("SHAP Summary for Points A")
plt.show()



In [None]:
# Global feature-importance bar chart for the points_a model, built from the
# SHAP values computed on X_test.
shap.plots.bar(shap_values_points_a)

In [None]:

# Summary plot for points_b. show=False keeps the figure open so the title
# can be set before it is rendered with plt.show().
shap.summary_plot(shap_values_points_b, X_test, show=False)
plt.title("SHAP Summary for Points B")
plt.show()

In [None]:
# Global feature-importance bar chart for the points_b model, built from the
# SHAP values computed on X_test.
shap.plots.bar(shap_values_points_b)

In [None]:
# SHAP scatter plot for 'defensive_rebounds_a' (target: points_a).
# The 'three_points_made_b' column is passed as the colouring variable.
shap.plots.scatter(shap_values_points_a[:, 'defensive_rebounds_a'], color=shap_values_points_a[:, 'three_points_made_b'])

# LOCAL EXPLAINABILITY

In [None]:
# Get the last sample of X_test (iloc[-1:] keeps it as a one-row DataFrame)
sample_X = X_test.iloc[-1:]  # Last row: this is the single prediction explained below

# SHAP values of that sample for the points_a model
shap_values_points_a_sample = explainer_points_a(sample_X)

# SHAP values of that sample for the points_b model
shap_values_points_b_sample = explainer_points_b(sample_X)

# Print the shape of SHAP values for the sample
print(f"SHAP values shape for points_a: {np.array(shap_values_points_a_sample.values).shape}")
print(f"SHAP values shape for points_b: {np.array(shap_values_points_b_sample.values).shape}")


In [None]:
# Waterfall plot for points_a (target): per-feature contributions for the
# single sample held in shap_values_points_a_sample.
shap.plots.waterfall(shap_values_points_a_sample[0]) 

In [None]:
# Decision plot for points_a (target): starts from the explainer's expected
# value and applies the SHAP values of the selected sample.
shap.decision_plot(explainer_points_a.expected_value, shap_values_points_a_sample.values[0], feature_names=list(X_test.columns), show=True)

In [None]:
# Waterfall plot for points_b (target): per-feature contributions for the
# single sample held in shap_values_points_b_sample.
shap.plots.waterfall(shap_values_points_b_sample[0])

In [None]:
# Decision plot for points_b (target): starts from the explainer's expected
# value and applies the SHAP values of the selected sample.
shap.decision_plot(explainer_points_b.expected_value, shap_values_points_b_sample.values[0], feature_names=list(X_test.columns), show=True)

# GLOBAL EXPLAINABILITY SIMPLE LOGISTIC MODEL

In [None]:
# SHAP Explainer for Logistic Regression model, built from the classifier and
# its training split (X_train_all), then applied to every row of the matching
# test split (X_test_all).
explainer = shap.Explainer(logistic_model, X_train_all)
shap_values = explainer(X_test_all)

# Print the shape of SHAP values
print(f"SHAP values shape: {np.array(shap_values.values).shape}")


In [None]:
# Probability of class 1 (team A wins) for every test game: column 1 of predict_proba
predicted_probabilities = logistic_model.predict_proba(X_test_all)[:, 1]

# Mean predicted probability across the test split
average_prediction = predicted_probabilities.mean()

print(f"Average prediction (probability of Team A winning): {average_prediction:.4f}")


In [None]:
# Summary plot of the logistic model's SHAP values over X_test_all.
# show=False leaves the figure open instead of rendering it immediately.
shap.summary_plot(shap_values, X_test_all, show=False)


In [None]:
# Global feature-importance bar chart for the logistic model, built from the
# SHAP values computed on X_test_all.
shap.plots.bar(shap_values)

In [None]:
# SHAP scatter plot for 'defensive_rebounds_a' in the logistic model.
# The 'three_points_made_b' column is passed as the colouring variable.
shap.plots.scatter(shap_values[:, 'defensive_rebounds_a'], color=shap_values[:, 'three_points_made_b'])

# LOCAL

In [None]:
# Select the last sample from X_test_all, the held-out split that belongs to
# the logistic model. X_test is the split of the points-regression data set
# and must not be mixed with this explainer.
sample_X = X_test_all.iloc[-1:]

# Compute SHAP values for that single sample
shap_values_sample = explainer(sample_X)

# Waterfall plot for that sample
shap.plots.waterfall(shap_values_sample[0])


In [None]:
# Local decision plot for the logistic model. It uses the SHAP values of the
# selected sample (shap_values_sample) so it explains the same game as the
# local waterfall plot, not row 0 of the global test-set explanation.
shap.decision_plot(explainer.expected_value, shap_values_sample.values[0], feature_names=list(X_test_all.columns), show=True)

# SAVING MODELS PICKLE

In [None]:
import pickle

# Persist both win/loss classifiers to the working directory.
# Each pair is (output file name, fitted estimator).
for _pickle_path, _classifier in (
    ('logistic_model.pkl', logistic_model),  # Logistic Regression model
    ('rf_model.pkl', rf_model),              # Random Forest model
):
    with open(_pickle_path, 'wb') as f:
        pickle.dump(_classifier, f)


In [None]:

# Persist the score regressors to the working directory.
# Each pair is (output file name, fitted estimator).
for _pickle_path, _regressor in (
    ('linear_regressor_points.pkl', linear_regressor),
    ('gradient_boosting_points.pkl', gb_regressor),
    ('svr_regressor_points.pkl', svr_regressor),
):
    with open(_pickle_path, 'wb') as f:
        pickle.dump(_regressor, f)

