In [60]:
import pandas as pd
import requests
from io import StringIO
import networkx as nx
import numpy as np
from pyvis.network import Network

In [None]:
def load_csv_from_github(start_year, end_year):
    base_url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv"
    df_list = []

    for year in range(start_year, end_year + 1):
        url = base_url.format(year)
        response = requests.get(url)
        if response.status_code == 200:  # Check if the request was successful
            data = StringIO(response.text)
            df = pd.read_csv(data)
            df_list.append(df)
        else:
            print(f"Failed to retrieve data for year {year}")

    # Combine all DataFrames into a single DataFrame
    combined_df = pd.concat(df_list, ignore_index=True)
    return combined_df

# Load data from 2000 to 2024
atp_data = load_csv_from_github(2000, 2024)
print(atp_data.head())  # Print the first few rows of the DataFrame

In [374]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full content of each column
pd.set_option('display.width', 1000)  # Set the width of the display for the output
atp_data.head(5)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2000-301,Auckland,Hard,32,A,20000110,1,103163,1.0,,Tommy Haas,R,188.0,GER,21.7,101543,,,Jeff Tarango,L,180.0,USA,31.1,7-5 4-6 7-5,3,R32,108.0,18.0,4.0,96.0,49.0,39.0,28.0,17.0,3.0,5.0,7.0,8.0,106.0,55.0,39.0,29.0,17.0,4.0,7.0,11.0,1612.0,63.0,595.0
1,2000-301,Auckland,Hard,32,A,20000110,2,102607,,Q,Juan Balcells,R,190.0,ESP,24.5,102644,,,Franco Squillari,L,183.0,ARG,24.3,7-5 7-5,3,R32,85.0,5.0,3.0,76.0,52.0,39.0,13.0,12.0,5.0,6.0,5.0,10.0,74.0,32.0,25.0,18.0,12.0,3.0,6.0,211.0,157.0,49.0,723.0
2,2000-301,Auckland,Hard,32,A,20000110,3,103252,,,Alberto Martin,R,175.0,ESP,21.3,102238,,,Alberto Berasategui,R,173.0,ESP,26.5,6-3 6-1,3,R32,56.0,0.0,0.0,55.0,35.0,25.0,12.0,8.0,1.0,1.0,0.0,6.0,56.0,33.0,20.0,7.0,8.0,7.0,11.0,48.0,726.0,59.0,649.0
3,2000-301,Auckland,Hard,32,A,20000110,4,103507,7.0,,Juan Carlos Ferrero,R,183.0,ESP,19.9,103819,,,Roger Federer,R,185.0,SUI,18.4,6-4 6-4,3,R32,68.0,5.0,1.0,53.0,28.0,26.0,15.0,10.0,0.0,0.0,11.0,2.0,70.0,43.0,29.0,14.0,10.0,6.0,8.0,45.0,768.0,61.0,616.0
4,2000-301,Auckland,Hard,32,A,20000110,5,102103,,Q,Michael Sell,R,180.0,USA,27.3,102765,4.0,,Nicolas Escude,R,185.0,FRA,23.7,0-6 7-6(7) 6-1,3,R32,115.0,1.0,2.0,98.0,66.0,39.0,14.0,13.0,6.0,11.0,8.0,8.0,92.0,46.0,34.0,18.0,12.0,5.0,9.0,167.0,219.0,34.0,873.0


In [88]:
atp_data.columns

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [452]:

def build_graph(data):
    G = nx.Graph()
    
    # Iterate through each match in the dataset
    for index, row in data.iterrows():
        winner = row['winner_name']
        loser = row['loser_name']
        surface = str(row['surface'])  # Ensure surface is treated as a string
        winner_country = row['winner_ioc']
        loser_country = row['loser_ioc']
        winner_top_5 = row['winner_rank'] <= 5
        loser_top_5 = row['loser_rank'] <= 5

        # Initialize nodes for winner and loser if they don't already exist
        if winner not in G:
            G.add_node(winner, matches_won=0, matches_played=0, country=winner_country, 
                       surface_wins={}, surface_matches={}, top_5_wins=0, top_5_matches=0, 
                       was_top_5=False, win_pct_category='50% or below', opponents=set())
        if loser not in G:
            G.add_node(loser, matches_won=0, matches_played=0, country=loser_country, 
                       surface_wins={}, surface_matches={}, top_5_wins=0, top_5_matches=0, 
                       was_top_5=False, win_pct_category='50% or below', opponents=set())

        # Update node data
        G.nodes[winner]['matches_won'] += 1
        G.nodes[winner]['matches_played'] += 1
        G.nodes[loser]['matches_played'] += 1
        G.nodes[winner]['opponents'].add(loser)
        G.nodes[loser]['opponents'].add(winner)

        # Track wins and matches against top 5 players
        if winner_top_5:
            G.nodes[winner]['was_top_5'] = True
        if loser_top_5:
            G.nodes[loser]['was_top_5'] = True
            G.nodes[winner]['top_5_wins'] += 1
            G.nodes[winner]['top_5_matches'] += 1
        if winner_top_5:
            G.nodes[loser]['top_5_matches'] += 1

        # Track wins and matches by surface
        G.nodes[winner]['surface_wins'].setdefault(surface, 0)
        G.nodes[winner]['surface_matches'].setdefault(surface, 0)
        G.nodes[winner]['surface_wins'][surface] += 1
        G.nodes[winner]['surface_matches'][surface] += 1
        G.nodes[loser]['surface_matches'].setdefault(surface, 0)
        G.nodes[loser]['surface_matches'][surface] += 1

        # # Add or update edges
        if G.has_edge(winner, loser):
            G[winner][loser]['matches'] += 1
            if 'surface_matches' not in G[winner][loser]:
                G[winner][loser]['surface_matches'] = {}
            if surface not in G[winner][loser]['surface_matches']:
                G[winner][loser]['surface_matches'][surface] = 0
                G[winner][loser][f'win_count_{surface}_{winner}'] = 0  # Initialize for winner
                G[winner][loser][f'win_count_{surface}_{loser}'] = 0   # Initialize for loser
            G[winner][loser]['surface_matches'][surface] += 1
            G[winner][loser][f'win_count_{surface}_{winner}'] += 1
        else:
            # Initializing a new edge with the first match and setting win counts
            G.add_edge(winner, loser, matches=1, surface_matches={surface: 1},
               **{f'win_count_{surface}_{winner}': 1, f'win_count_{surface}_{loser}': 0})

        # Add or update edges
        # if G.has_edge(winner, loser):
        #     G[winner][loser]['matches'] += 1
        #     if 'surface_matches' not in G[winner][loser]:
        #         G[winner][loser]['surface_matches'] = {}
        #     G[winner][loser]['surface_matches'].setdefault(surface, 0)
        #     G[winner][loser]['surface_matches'][surface] += 1
        #     G[winner][loser].setdefault(f'win_count_{surface}_{winner}', 0)  # Initialize if not present
        #     G[winner][loser][f'win_count_{surface}_{winner}'] += 1  # Increment win count for the winner on this surface
        # else:
        #     # Initializing a new edge with the first match and setting initial counts for surface matches
        #     G.add_edge(winner, loser, matches=1, surface_matches={surface: 1},
        #        **{f'win_count_{surface}_{winner}': 1, f'win_count_{surface}_{loser}': 0})



    # Update win percentage categories and surface-specific win percentages
    for node, attr in G.nodes(data=True):
        attr['win_pct'] = attr['matches_won'] / attr['matches_played'] * 100
        attr['top_5_win_pct'] = (attr['top_5_wins'] / attr['top_5_matches'] * 100) if attr['top_5_matches'] > 0 else 0
        attr['unique_opponents'] = len(attr['opponents'])
        del attr['opponents']  # Clean up to save memory

        # Update win percentage categories based on overall win percentage
        if attr['win_pct'] > 70:
            attr['win_pct_category'] = 'Above 70%'
        elif attr['win_pct'] > 60:
            attr['win_pct_category'] = '61% - 70%'
        elif attr['win_pct'] > 50:
            attr['win_pct_category'] = '51% - 60%'
        elif attr['win_pct'] > 40:
            attr['win_pct_category'] = '41% - 50%'
        else:
            attr['win_pct_category'] = '40% or below'

        # Calculate surface-specific win percentages
        for surface, wins in attr['surface_wins'].items():
            matches_on_surface = attr['surface_matches'][surface]
            attr['win_pct_' + surface] = wins / matches_on_surface * 100 if matches_on_surface > 0 else 0

    # for u, v, d in G.edges(data=True):
    #     for surface in d.get('surface_matches', {}):
    #         matches_on_surface = d['surface_matches'][surface]
    #         u_surface_wins = d.get(f'win_count_{surface}_{winner}', 0)  # Access using winner
    #         v_surface_wins = d.get(f'win_count_{surface}_{loser}', 0)  # Access using loser

    #         if matches_on_surface > 0:
    #             u_win_pct = (u_surface_wins / matches_on_surface) * 100
    #             v_win_pct = (v_surface_wins / matches_on_surface) * 100
    #         else:
    #             u_win_pct = v_win_pct = 0  # No matches to avoid division by zero

    #         d[f'win_pct_{surface}_{winner}'] = u_win_pct  # Store using winner
    #         d[f'win_pct_{surface}_{loser}'] = v_win_pct  # Store using loser


    return G

graph = build_graph(atp_data)
# You can now manipulate 'graph' as needed, analyze it, or visualize it.


In [322]:

atp_data.shape[0]
atp_data.head(1)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2000-301,Auckland,Hard,32,A,20000110,1,103163,1.0,,...,55.0,39.0,29.0,17.0,4.0,7.0,11.0,1612.0,63.0,595.0


In [414]:

def print_data_quality_checks(data, graph):
    # Calculate the number of unique players
    unique_players = pd.concat([data['winner_name'], data['loser_name']]).unique()
    print("Number of unique players in data:", len(unique_players))

    # Check the graph structure
    print("Number of nodes in the graph:", graph.number_of_nodes())
    print("Number of edges in the graph:", graph.number_of_edges())

    # Calculate number and percentage of players by win percentage category
    win_pct_categories = [data['win_pct_category'] for _, data in graph.nodes(data=True)]
    category_counts = pd.Series(win_pct_categories).value_counts()
    category_percentages = category_counts / len(graph.nodes()) * 100
    print("\nNumber and percentage of players by win percentage category:")
    for category, count in category_counts.items():
        print(f"{category}: {count} players, {category_percentages[category]:.2f}%")

    # Count and percentage of players who were ever in the top 5
    top_5_count = sum(data['was_top_5'] for _, data in graph.nodes(data=True))
    top_5_percentage = (top_5_count / len(graph.nodes())) * 100
    print(f"\nNumber and percentage of players who were ever in the top 5: {top_5_count}, {top_5_percentage:.2f}%")

    # Calculate number and percentage of edges involving at least one top 5 player
    top_5_edges_count = sum(1 for u, v, d in graph.edges(data=True) if graph.nodes[u]['was_top_5'] or graph.nodes[v]['was_top_5'])
    top_5_edges_percentage = (top_5_edges_count / graph.number_of_edges()) * 100 if graph.number_of_edges() > 0 else 0
    print(f"\nNumber and percentage of edges involving at least one top 5 player: {top_5_edges_count}, {top_5_edges_percentage:.2f}%")

    # Calculate number and percentage of edges involving both top 5 players
    top_5_both_edges_count = sum(1 for u, v, d in graph.edges(data=True) if graph.nodes[u]['was_top_5'] and graph.nodes[v]['was_top_5'])
    top_5_both_edges_percentage = (top_5_both_edges_count / graph.number_of_edges()) * 100 if graph.number_of_edges() > 0 else 0
    print(f"\nNumber and percentage of edges involving both top 5 players: {top_5_both_edges_count}, {top_5_both_edges_percentage:.2f}%")

# Usage
print_data_quality_checks(atp_data, graph)



Number of unique players in data: 2608
Number of nodes in the graph: 2608
Number of edges in the graph: 40797

Number and percentage of players by win percentage category:
40% or below: 1728 players, 66.26%
41% - 50%: 466 players, 17.87%
51% - 60%: 173 players, 6.63%
Above 70%: 128 players, 4.91%
61% - 70%: 113 players, 4.33%

Number and percentage of players who were ever in the top 5: 51, 1.96%

Number and percentage of edges involving at least one top 5 player: 10060, 24.66%

Number and percentage of edges involving both top 5 players: 765, 1.88%


In [494]:
def print_high_win_pct_players(graph):
    # Print header for clarity
    print("Players with win percentage greater than 82%:")
    
    # Iterate over all nodes and their attributes in the graph
    for node, attr in graph.nodes(data=True):
        win_pct = attr.get('win_pct', 0)  # Get win percentage, default to 0 if not found
        matches_played = attr.get('matches_played', 0)  # Get total matches played, default to 0 if not found
        if matches_played < 200 and matches_played > 100:
            print(f"{node}: {win_pct:.2f}% - Total Matches Played: {matches_played}")

# # Usage
# print_high_win_pct_players(graph)

def print_players_by_match_category(graph):
    # Define the categories based on matches played
    categories = {
        'Less than 50': 0,
        '50 to 300': 0,
        '300 to 700': 0,
        '700+': 0
    }
    
    # Iterate over all nodes to categorize by matches played
    for _, attr in graph.nodes(data=True):
        matches_played = attr.get('matches_played', 0)

        if matches_played < 50:
            categories['Less than 50'] += 1
        elif 100 <= matches_played < 300:
            categories['50 to 300'] += 1
        elif 300 <= matches_played < 700:
            categories['300 to 700'] += 1
        else:
            categories['700+'] += 1

    # Print the results
    print("Number of players by matches played category:")
    for category, count in categories.items():
        print(f"{category}: {count} players")

# Usage
print_players_by_match_category(graph)


Number of players by matches played category:
Less than 50: 2103 players
50 to 300: 208 players
300 to 700: 124 players
700+: 173 players


In [484]:
def create_subgraph(graph):
    # Find nodes with more than 400 matches played
    high_activity_players = [
    node for node, attr in graph.nodes(data=True)
    if (attr['win_pct'] > 60 and attr['matches_played'] > 200) or attr['matches_played'] > 800
    ]

    
    # Create subgraph with these nodes
    subgraph = graph.subgraph(high_activity_players).copy()

    partition = community_louvain.best_partition(subgraph)

    # Add the community information as a node attribute
    nx.set_node_attributes(subgraph, partition, 'community')
    
    return subgraph

# Assuming 'graph' is the main graph you built earlier
subgraph = create_subgraph(graph)
print("Number of nodes in subgraph:", subgraph.number_of_nodes())
print("Number of edges in subgraph:", subgraph.number_of_edges())

unique_communities = len(set(data['community'] for _, data in subgraph.nodes(data=True)))

print("Number of communities:", unique_communities)

Number of nodes in subgraph: 54
Number of edges in subgraph: 1026
Number of communities: 2


In [488]:
# # Print 2 nodes with their attributes
# nodes = list(graph.nodes(data=True))
# print("Nodes and their attributes:")
# for node, attr in nodes[:2]:
#     print(node, attr)

# Print 2 edges with their attributes
# edges = list(graph.edges(data=True))
# print("\nEdges and their attributes:")
# for u, v, attr in edges[:5]:
#     print(f"{u} -- {v}", attr)

# Assuming 'edges' is a list or similar iterable of edges from your network graph
# Example: edges = list(G.edges(data=True))

# Filter and print the first five edges where the number of matches is greater than 20
count = 0  # Initialize a counter for tracking the number of printed edges
for u, v, attr in graph.edges(data=True):
    if attr['matches'] > 20:  # Check if matches are greater than 20
        print(f"{u} -- {v}", attr)
        count += 1
        if count >= 2:
            break  # Exit the loop after printing five such edges


Roger Federer -- Lleyton Hewitt {'matches': 26, 'surface_matches': {'Carpet': 3, 'Hard': 16, 'Grass': 6, 'Clay': 1}, 'win_count_Carpet_Roger Federer': 1, 'win_count_Carpet_Lleyton Hewitt': 2, 'win_count_Hard_Lleyton Hewitt': 4, 'win_count_Hard_Roger Federer': 12, 'win_count_Grass_Lleyton Hewitt': 2, 'win_count_Grass_Roger Federer': 4, 'win_count_Clay_Roger Federer': 1, 'win_count_Clay_Lleyton Hewitt': 0}
Roger Federer -- Andy Roddick {'matches': 24, 'surface_matches': {'Carpet': 2, 'Hard': 17, 'Grass': 4, 'Clay': 1}, 'win_count_Carpet_Roger Federer': 2, 'win_count_Carpet_Andy Roddick': 0, 'win_count_Hard_Roger Federer': 14, 'win_count_Hard_Andy Roddick': 3, 'win_count_Grass_Roger Federer': 4, 'win_count_Grass_Andy Roddick': 0, 'win_count_Clay_Roger Federer': 1, 'win_count_Clay_Andy Roddick': 0}


In [648]:
def create_tooltip(node, attr):
    # Create a plain text tooltip with grouped related information
    tooltip_text = (
        f"Player: {node} | Country: {attr['country']} | Was Top 5: {'Yes' if attr['was_top_5'] else 'No'}\n"
        f"Total Matches Played: {attr['matches_played']} | Total Wins: {attr['matches_won']}\n"
        f"Win Percentage: {attr['win_pct']:.2f}% | Top 5 Win Percentage: {attr['top_5_win_pct']:.2f}%\n"
        f"Unique Opponents: {attr['unique_opponents']}\n"
     )

    # Add surface-specific win percentages if they exist
    if 'surface_matches' in attr:
        tooltip_text += "Surface Win Percentages: "
        for surface, matches_on_surface in attr['surface_matches'].items():
            wins = attr['surface_wins'].get(surface, 0)
            win_pct = wins / matches_on_surface * 100 if matches_on_surface > 0 else 0
            tooltip_text += f"{surface.title()}: {win_pct:.2f}% ({wins} wins) | "
        tooltip_text = tooltip_text.rstrip(' | ')  # Remove the last separator
        tooltip_text += "\n"

    return tooltip_text

def create_edge_tooltip(G, u, v, attr):
    tooltip_text = f"{attr['matches']} matches between {u} and {v}\n"
    was_both_top_5 = 'Yes' if G.nodes[u].get('was_top_5', False) and G.nodes[v].get('was_top_5', False) else 'No'
    tooltip_text += f"Both Players were Top 5: {was_both_top_5}\n"

    # Add detailed surface match information grouped by surface
    tooltip_text += "Matches by Surface:\n"
    for surface in attr.get('surface_matches', {}):
        count = attr['surface_matches'][surface]
        # Fetch win counts directly using the keys with specific player names
        u_surface_wins_key = f'win_count_{surface}_{u}'  # Construct the key for u's win counts
        v_surface_wins_key = f'win_count_{surface}_{v}'  # Construct the key for v's win counts
        u_surface_wins = attr.get(u_surface_wins_key, 0)  # Fetch u's win count
        v_surface_wins = attr.get(v_surface_wins_key, 0)  # Fetch v's win count
        tooltip_text += f"{surface.title()}: Total Matches {count}, {u} Wins: {u_surface_wins}, {v} Wins: {v_surface_wins}\n"

    return tooltip_text





# Set options for layout and interaction
def set_network_options(net):
    net.set_options("""
    {
      "edges": {
        "color": {
          "inherit":false,
          "color": "#848484", 
          "highlight": "#ff7f0e",  
          "hover": "#f7b6d2"
        },
        "width": 1,
        "hoverWidth": 2.5,  
        "highlightWidth": 2.5,  
        "smooth": {
          "type": "continuous"
        }
      },
       "interaction": {
        "hover": true
      },
      "physics": {
        "barnesHut": {
          "gravitationalConstant": -1000,
          "centralGravity": 0.2,
          "springLength": 250,
          "springConstant": 0.01,
          "damping": 0.09,
          "avoidOverlap": 0.1
        },
        "minVelocity": 0.75,
        "maxVelocity": 5,
        "solver": "barnesHut"
      }
    }
    """)

In [654]:


def convert_networkx_to_pyvis(nx_graph):
    net = Network(notebook=False, height="1200px", width="100%")
 
    # Define a color map for win percentage categories np.log(attr.get('matches_played', 1) + 1)
    win_pct_color = {
        'Above 70%': '#bcbd22',
        '61% - 70%': '#ffbb78',
        '51% - 60%': '#aec7e8',
        '41% - 50%': '#7f7f7f',
        '40% or below': '#c7c7c7'
    }
    match_threshold = 18
    
    # Add nodes with additional attributes using the create_tooltip function
    for node, attr in nx_graph.nodes(data=True):
        tooltip = create_tooltip(node, attr)
        net.add_node(node, title=tooltip, color = win_pct_color.get(attr['win_pct_category'], 'grey'), size=np.log(attr.get('matches_played', 1) + 1) * 2)


    # Add edges with weights, tooltips, and styles based on 'was_top_5'
    for u, v, attr in nx_graph.edges(data=True):
        edge_tooltip = create_edge_tooltip(nx_graph,u, v, attr)
        # edge_weight = np.log(attr.get('matches', 1) + 1)  

        # Check if any surface match count exceeds the threshold to determine the line style
        max_surface_matches = max(attr['surface_matches'].values()) if 'surface_matches' in attr and attr['surface_matches'] else 0
        
        
        if max_surface_matches < match_threshold:
            # Use dashed line for edges where no surface matches exceed the threshold
            net.add_edge(u, v, title=edge_tooltip, color="#e0e0e0",  dashes=[2, 10])
        else:
            # Use normal line for edges where any surface matches exceed the threshold
            net.add_edge(u, v, title=edge_tooltip)
   

    return net

In [656]:
pyvis_net = convert_networkx_to_pyvis(subgraph)
set_network_options(pyvis_net)
pyvis_net.save_graph("tennis_network_200Match.html")

In [None]:
color="#c7c7c7",  color="#e0e0e0",

          "color": "#848484", 
          "highlight": "#8c564b",  
          "hover": "c49c94" 

In [610]:
def set_network_options(net):
    net.set_options("""
    {

      "physics": {
        "barnesHut": {
          "gravitationalConstant": -1000,
          "centralGravity": 0.2,
          "springLength": 250,
          "springConstant": 0.01,
          "damping": 0.09,
          "avoidOverlap": 0.1
        },
        "minVelocity": 0.75,
        "maxVelocity": 5,
        "solver": "barnesHut"
      }
    }
    """)

pyvis_net = convert_networkx_to_pyvis(subgraph)
set_network_options(pyvis_net)
pyvis_net.save_graph("tennis_network_200Match.html")