In [None]:
import pandas as pd
import requests
from io import StringIO
import networkx as nx
import numpy as np
from pyvis.network import Network

In [None]:
# Directory that holds the cached ATP match data.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
pickle_path = "/Users/saurabhkumar/Desktop/Work/Tennis"
# One-off step that created the cache (kept for reference, disabled):
# atp_data.to_pickle(f"{pickle_path}/ATPdata1968_2024.pkl")
# Read the cached data back from the same location it was written to.
# Only unpickle files you created yourself: loading a pickle can execute
# arbitrary code.
atp_data = pd.read_pickle(f"{pickle_path}/ATPdata1968_2024.pkl")

In [None]:
# Show the column index of the loaded frame to see which match fields are available.
atp_data.columns

In [None]:

def _new_player_attrs(country):
    """Return the starting attribute dict for a freshly added player node."""
    return dict(
        matches_won=0, matches_played=0, country=country,
        surface_wins={}, surface_matches={}, tourney_wins={}, tourney_matches={},
        top_5_wins=0, top_5_matches=0, was_top_5=False,
        win_pct_category='50% or below', opponents=set(),
    )


def _bump(counter, key):
    """Increment ``counter[key]``, starting from zero when the key is new."""
    counter[key] = counter.get(key, 0) + 1


def _win_pct_category(win_pct):
    """Map an overall win percentage to its display bucket label."""
    if win_pct > 70:
        return 'Above 70%'
    if win_pct > 60:
        return '61% - 70%'
    if win_pct > 50:
        return '51% - 60%'
    if win_pct > 40:
        return '41% - 50%'
    return '40% or below'


def _record_head_to_head(G, winner, loser, surface, tourney_name):
    """Create or update the winner-loser edge with the counters for one match."""
    surface_win_key = f'win_count_{surface}_{winner}'
    surface_loss_key = f'win_count_{surface}_{loser}'
    tourney_win_key = f'win_count_{tourney_name}_{winner}'
    tourney_loss_key = f'win_count_{tourney_name}_{loser}'

    if not G.has_edge(winner, loser):
        # First meeting: one add_edge call sets every counter at once.
        G.add_edge(winner, loser, **{
            'matches': 1,
            'surface_matches': {surface: 1},
            surface_win_key: 1,
            surface_loss_key: 0,
            'tourney_matches': {tourney_name: 1},
            tourney_win_key: 1,
            tourney_loss_key: 0,
        })
        return

    edge = G[winner][loser]
    edge['matches'] += 1
    surface_matches = edge.setdefault('surface_matches', {})
    tourney_matches = edge.setdefault('tourney_matches', {})

    # The first time this pair meets on a surface / at a tournament, start the
    # per-player win counters for BOTH players so either can be read later.
    if surface not in surface_matches:
        surface_matches[surface] = 0
        edge[surface_win_key] = 0
        edge[surface_loss_key] = 0
    if tourney_name not in tourney_matches:
        tourney_matches[tourney_name] = 0
        edge[tourney_win_key] = 0
        edge[tourney_loss_key] = 0

    surface_matches[surface] += 1
    edge[surface_win_key] += 1
    tourney_matches[tourney_name] += 1
    edge[tourney_win_key] += 1


def _finalize_player(attr):
    """Derive the percentage and summary attributes for one player node in place."""
    attr['win_pct'] = attr['matches_won'] / attr['matches_played'] * 100
    attr['top_5_win_pct'] = (
        attr['top_5_wins'] / attr['top_5_matches'] * 100
        if attr['top_5_matches'] > 0 else 0
    )
    attr['unique_opponents'] = len(attr['opponents'])
    del attr['opponents']  # Clean up to save memory

    attr['win_pct_category'] = _win_pct_category(attr['win_pct'])

    # Iterate over the *matches* dicts (not the wins dicts) so that a player who
    # played on a surface / at a tournament without ever winning there still
    # gets an explicit 0% entry instead of a missing attribute.
    for surface, played in attr['surface_matches'].items():
        wins = attr['surface_wins'].get(surface, 0)
        attr['win_pct_' + surface] = wins / played * 100 if played > 0 else 0
    for tourney, played in attr['tourney_matches'].items():
        wins = attr['tourney_wins'].get(tourney, 0)
        attr['win_pct_' + tourney] = wins / played * 100 if played > 0 else 0


def build_graph(data):
    """Build an undirected head-to-head graph from a table of matches.

    Each player becomes a node and each pair of players that met becomes an
    edge. Reads the columns ``winner_name``, ``loser_name``, ``surface``,
    ``tourney_name``, ``winner_ioc``, ``loser_ioc``, ``winner_rank`` and
    ``loser_rank`` from ``data``.

    Node attributes: ``matches_won``, ``matches_played``, ``country``,
    ``surface_wins``/``surface_matches`` and ``tourney_wins``/``tourney_matches``
    (dicts of counts), ``top_5_wins``, ``top_5_matches``, ``was_top_5``,
    ``win_pct``, ``top_5_win_pct``, ``unique_opponents``, ``win_pct_category``
    and one ``win_pct_<surface>`` / ``win_pct_<tourney>`` entry per surface and
    tournament the player appeared in.

    Edge attributes: ``matches``, ``surface_matches``, ``tourney_matches`` and
    ``win_count_<surface>_<player>`` / ``win_count_<tourney>_<player>`` counters.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per match.

    Returns
    -------
    networkx.Graph
        The populated graph (empty when ``data`` has no rows).
    """
    G = nx.Graph()

    # Walk the needed columns in lock-step; this avoids building a Series per
    # row the way DataFrame.iterrows does.
    match_fields = zip(
        data['winner_name'], data['loser_name'],
        data['surface'], data['tourney_name'],
        data['winner_ioc'], data['loser_ioc'],
        data['winner_rank'], data['loser_rank'],
    )

    for (winner, loser, surface, tourney_name,
         winner_country, loser_country, winner_rank, loser_rank) in match_fields:
        surface = str(surface)  # Ensure surface is treated as a string
        tourney_name = str(tourney_name)
        winner_top_5 = winner_rank <= 5
        loser_top_5 = loser_rank <= 5

        # Initialize nodes for winner and loser if they don't already exist
        for player, country in ((winner, winner_country), (loser, loser_country)):
            if player not in G:
                G.add_node(player, **_new_player_attrs(country))

        winner_attrs = G.nodes[winner]
        loser_attrs = G.nodes[loser]

        # Overall tallies and opponent sets
        winner_attrs['matches_won'] += 1
        winner_attrs['matches_played'] += 1
        loser_attrs['matches_played'] += 1
        winner_attrs['opponents'].add(loser)
        loser_attrs['opponents'].add(winner)

        # Track wins and matches against top 5 players
        if winner_top_5:
            winner_attrs['was_top_5'] = True
            loser_attrs['top_5_matches'] += 1
        if loser_top_5:
            loser_attrs['was_top_5'] = True
            winner_attrs['top_5_wins'] += 1
            winner_attrs['top_5_matches'] += 1

        # Track wins and matches by surface
        _bump(winner_attrs['surface_wins'], surface)
        _bump(winner_attrs['surface_matches'], surface)
        _bump(loser_attrs['surface_matches'], surface)

        # Track wins and matches by tournament name
        _bump(winner_attrs['tourney_wins'], tourney_name)
        _bump(winner_attrs['tourney_matches'], tourney_name)
        _bump(loser_attrs['tourney_matches'], tourney_name)

        # Add or update the head-to-head edge
        _record_head_to_head(G, winner, loser, surface, tourney_name)

    # Derive win percentages, categories and opponent counts per player
    for _, attr in G.nodes(data=True):
        _finalize_player(attr)

    return G




In [None]:
# One-time preprocessing, kept for reference (currently disabled):
# atp_data['year'] = pd.to_datetime(atp_data['tourney_date'], format='%Y%m%d').dt.year
# atp_data['tourney_name'] = atp_data['tourney_name'].replace('Us Open', 'US Open', regex=False)
grand_slams = ['Australian Open', 'Roland Garros', 'Wimbledon', 'US Open']


def _select_finals(tourney_filter, first_year):
    """Return the final-round rows of ``atp_data`` from ``first_year`` on that match ``tourney_filter``."""
    recent_enough = atp_data['year'] >= first_year
    final_round = atp_data['round'] == 'F'
    return atp_data[recent_enough & tourney_filter & final_round]


# Finals of each individual Grand Slam, 1982 onward
australian_open_finals82 = _select_finals(atp_data['tourney_name'] == 'Australian Open', 1982)
french_open_finals82 = _select_finals(atp_data['tourney_name'] == 'Roland Garros', 1982)
wimbledon_finals82 = _select_finals(atp_data['tourney_name'] == 'Wimbledon', 1982)
us_open_finals82 = _select_finals(atp_data['tourney_name'] == 'US Open', 1982)


# Finals of all four Grand Slams combined.
# NOTE(review): the "00" suffix suggests the year 2000, but the cutoff used here
# is 2003 -- confirm which one is intended.
grand_slam_finals00 = _select_finals(atp_data['tourney_name'].isin(grand_slams), 2003)

In [None]:
# Building graphs for each Grand Slam final data starting from 1982
australian_open_graph82 = build_graph(australian_open_finals82)
french_open_graph82 = build_graph(french_open_finals82)
wimbledon_graph82 = build_graph(wimbledon_finals82)
us_open_graph82 = build_graph(us_open_finals82)
grand_slam_graph00 = build_graph(grand_slam_finals00)

In [None]:

def print_data_quality_checks(data, graph):
    """Print sanity-check statistics comparing match data with its player graph.

    Parameters
    ----------
    data : pandas.DataFrame
        Match table; only the ``winner_name`` and ``loser_name`` columns are read.
    graph : networkx.Graph
        Graph whose nodes carry ``win_pct_category`` and ``was_top_5``
        attributes (as set by ``build_graph``).

    Returns
    -------
    None
        All results are reported via ``print``. Percentages are reported as 0
        when the graph has no nodes or no edges instead of dividing by zero.
    """
    def _share(count, total):
        # Percentage of `total`, defined as 0 for an empty total.
        return (count / total) * 100 if total > 0 else 0

    # Calculate the number of unique players
    unique_players = pd.concat([data['winner_name'], data['loser_name']]).unique()
    print("Number of unique players in data:", len(unique_players))

    # Check the graph structure
    node_total = graph.number_of_nodes()
    edge_total = graph.number_of_edges()
    print("Number of nodes in the graph:", node_total)
    print("Number of edges in the graph:", edge_total)

    # Calculate number and percentage of players by win percentage category
    win_pct_categories = [attrs['win_pct_category'] for _, attrs in graph.nodes(data=True)]
    category_counts = pd.Series(win_pct_categories).value_counts()
    print("\nNumber and percentage of players by win percentage category:")
    for category, count in category_counts.items():
        print(f"{category}: {count} players, {_share(count, node_total):.2f}%")

    # Count and percentage of players who were ever in the top 5
    top_5_count = sum(attrs['was_top_5'] for _, attrs in graph.nodes(data=True))
    top_5_percentage = _share(top_5_count, node_total)
    print(f"\nNumber and percentage of players who were ever in the top 5: {top_5_count}, {top_5_percentage:.2f}%")

    # Calculate number and percentage of edges involving at least one top 5 player
    top_5_edges_count = sum(
        1 for u, v in graph.edges()
        if graph.nodes[u]['was_top_5'] or graph.nodes[v]['was_top_5']
    )
    top_5_edges_percentage = _share(top_5_edges_count, edge_total)
    print(f"\nNumber and percentage of edges involving at least one top 5 player: {top_5_edges_count}, {top_5_edges_percentage:.2f}%")

    # Calculate number and percentage of edges involving both top 5 players
    top_5_both_edges_count = sum(
        1 for u, v in graph.edges()
        if graph.nodes[u]['was_top_5'] and graph.nodes[v]['was_top_5']
    )
    top_5_both_edges_percentage = _share(top_5_both_edges_count, edge_total)
    print(f"\nNumber and percentage of edges involving both top 5 players: {top_5_both_edges_count}, {top_5_both_edges_percentage:.2f}%")

# Usage: build the graph for the full dataset first so this call also works on
# a freshly restarted kernel (no earlier cell defines `graph`).
graph = build_graph(atp_data)
print_data_quality_checks(atp_data, graph)

