<a href="https://colab.research.google.com/github/shishirnarwal/tennis_prediction_model/blob/main/01_data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Exploring Jeff Sackmann's Tennis Dataset

In [None]:
# Loading ATP matches for 2023
import pandas as pd

atp_matches_2023 = 'https://raw.githubusercontent.com/shishirnarwal/tennis_atp_jeff_sackman/refs/heads/master/atp_matches_2023.csv'

# Only the download/parse step is guarded, so a problem while printing the
# preview is not misreported as a load failure.
try:
    matches_df = pd.read_csv(atp_matches_2023)
except Exception as e:
    # Report the failure, then re-raise: the rest of the notebook needs
    # matches_df, so carrying on would only produce a confusing NameError later.
    print(f"Error loading CSV from GitHub: {e}")
    raise
else:
    print("CSV loaded successfully!")
    print(matches_df.head())

In [None]:
# Loading ATP rankings at end of 2023
# NOTE(review): the file requested is 'atp_rankings_current.csv'; confirm that it
# really holds only end-of-2023 rankings, or filter by date before relying on it.
atp_rankings = 'https://raw.githubusercontent.com/shishirnarwal/tennis_atp_jeff_sackman/refs/heads/master/atp_rankings_current.csv'

# Only the download/parse step is guarded, so a problem while printing the
# preview is not misreported as a load failure.
try:
    rank_df = pd.read_csv(atp_rankings)
except Exception as e:
    # Report the failure, then re-raise: later cells need rank_df, so carrying
    # on would only produce a confusing NameError further down.
    print(f"Error loading CSV from GitHub: {e}")
    raise
else:
    print("CSV loaded successfully!")
    print(rank_df.head())

In [None]:
# Loading ATP players details
atp_players = 'https://raw.githubusercontent.com/shishirnarwal/tennis_atp_jeff_sackman/refs/heads/master/atp_players.csv'

# Only the download/parse step is guarded, so a problem while printing the
# preview is not misreported as a load failure.
try:
    players_df = pd.read_csv(atp_players)
except Exception as e:
    # Report the failure, then re-raise: later cells need players_df, so carrying
    # on would only produce a confusing NameError further down.
    print(f"Error loading CSV from GitHub: {e}")
    raise
else:
    print("CSV loaded successfully!")
    print(players_df.head())

In [None]:
# Attach player details to every ranking row. Rankings identify the player via
# their 'player' column, which is matched against 'player_id' in players_df.
# The join is an inner join, so ranking rows without a matching player are dropped.
rank_df_joined = rank_df.merge(
    right=players_df,
    how='inner',
    left_on='player',
    right_on='player_id',
)
rank_df_joined.head()

In [None]:
# Preview the first five match rows as the cell's rich output
matches_df.head(n=5)

Calculating Win Rates by Surface

In [None]:
# Minimum number of matches on a surface for a (player, surface) row to be listed
MIN_MATCHES = 10

# Win count per (surface, player)
player_wins = matches_df[['surface', 'winner_id']].value_counts().reset_index()
player_wins.columns = ['surface', 'player_id', 'wins']

# Loss count per (surface, player)
player_losses = matches_df[['surface', 'loser_id']].value_counts().reset_index()
player_losses.columns = ['surface', 'player_id', 'losses']

# The outer join keeps players who only won or only lost on a surface; the
# missing side comes back as NaN (which also turns the counts into floats), so
# fill just the two count columns and restore an integer dtype.
player_stats = (
    pd.merge(player_wins, player_losses, on=['surface', 'player_id'], how='outer')
    .fillna({'wins': 0, 'losses': 0})
    .astype({'wins': 'int64', 'losses': 'int64'})
)

# Every row has at least one win or one loss, so total_matches is never zero
player_stats['total_matches'] = player_stats['wins'] + player_stats['losses']
player_stats['win_rate'] = player_stats['wins'] / player_stats['total_matches']

# Sort by win rate in descending order and display the top players
top_player_stats = (
    player_stats[player_stats['total_matches'] >= MIN_MATCHES]
    .sort_values(by='win_rate', ascending=False)
    .head(10)
)
print("Player Win Rates (Top 10):\n")
print(top_player_stats)

# Merging with players_df to get player names (left join: stats rows are kept
# even when no matching player record exists)
player_win_rates_with_names = pd.merge(
    player_stats,
    players_df[['player_id', 'name_first', 'name_last']],
    on='player_id',
    how='left',
)
player_win_rates_with_names['player_name'] = (
    player_win_rates_with_names['name_first'] + ' ' + player_win_rates_with_names['name_last']
)

top_named_stats = (
    player_win_rates_with_names[player_win_rates_with_names['total_matches'] >= MIN_MATCHES]
    .sort_values(by='win_rate', ascending=False)
    .head(10)
)
print("\nPlayer Win Rates with Names (Top 10):\n")
print(top_named_stats[['player_name', 'surface', 'wins', 'losses', 'total_matches', 'win_rate']])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep rows backed by at least 10 matches, then take the ten highest win rates.
# Bars are coloured by the row's surface.
top_10_win_rates = (
    player_win_rates_with_names
    .loc[player_win_rates_with_names['total_matches'] >= 10]
    .sort_values(by='win_rate', ascending=False)
    .head(10)
)

# Explicit figure/axes handles instead of the implicit pyplot "current axes"
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    data=top_10_win_rates,
    x='player_name',
    y='win_rate',
    hue='surface',
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Player Win Rates by Surface (Min. 10 Matches)')
ax.set_xlabel('Player Name')
ax.set_ylabel('Win Rate')
# Slant the player names and anchor them on the right so they line up with the bars
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Surface')
fig.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Minimum matches on a surface for a player to be ranked in that surface's chart.
# Used for both the filter and the panel titles so they cannot drift apart.
min_surface_matches = 5

unique_surfaces = player_win_rates_with_names['surface'].unique()
num_surfaces = len(unique_surfaces)

# squeeze=False always yields a 2-D array of Axes; flattening it gives a 1-D
# array that can be indexed even when there is only one surface (by default
# plt.subplots would return a bare Axes object in that case).
fig, axes = plt.subplots(
    num_surfaces, 1, figsize=(12, 6 * num_surfaces), sharex=False, squeeze=False
)
axes = axes.ravel()

for ax, surface in zip(axes, unique_surfaces):
    # Top 10 win rates among players with enough matches on the current surface
    surface_data = player_win_rates_with_names[
        (player_win_rates_with_names['surface'] == surface) &
        (player_win_rates_with_names['total_matches'] >= min_surface_matches)
    ].sort_values(by='win_rate', ascending=False).head(10)

    if not surface_data.empty:
        # Passing `palette` without `hue` is deprecated in recent seaborn
        # releases; colouring by the x variable keeps one colour per player.
        sns.barplot(
            ax=ax,
            data=surface_data,
            x='player_name',
            y='win_rate',
            hue='player_name',
            palette='viridis',
            dodge=False,
        )
        # The hue legend would just repeat the x-axis labels, so drop it if present
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title(f'Top 10 Player Win Rates on {surface} Surface (Min. {min_surface_matches} Matches)')
        ax.set_xlabel('Player Name')
        ax.set_ylabel('Win Rate')
        # Right-anchor the rotated names so they line up with their bars
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    else:
        # Placeholder panel; the title reports the threshold that was actually applied
        ax.set_title(f'No players with >={min_surface_matches} matches on {surface} surface')
        ax.set_xticks([])
        ax.set_yticks([])
        ax.text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)

plt.tight_layout()
plt.show()

Ranking vs Win Rates

In [None]:
# Creating flag for Walkovers
# The flag previously matched 'RET' (a retirement: the match started but was not
# finished), which contradicts both this comment and the column name.
# NOTE(review): assumes walkovers are written as 'W/O' in the score column -
# confirm against the dataset before relying on this flag.
# regex=False makes the pattern a plain substring; na=False flags missing scores as False.
matches_df['Is_Walkover'] = matches_df['score'].str.contains('W/O', na=False, regex=False)

In [None]:
# Count matches where a player's ranking is missing.
# The f-strings use double quotes on the outside: reusing the same quote
# character inside the braces is a SyntaxError before Python 3.12.
print(f"Number of matches with missing winner rank: {matches_df['winner_rank'].isna().sum()}")
print(f"Number of matches with missing loser rank: {matches_df['loser_rank'].isna().sum()}")