# Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import math
import os

# Configuration
pickle_path = "/Users/saurabhkumar/Desktop/Work/Tennis"
output_path = "/Users/saurabhkumar/Desktop/Work/Tennis/top100bycountry"

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

print("Loading ATP data...")
atp_data = pd.read_pickle(f"{pickle_path}/ATPdata1968_2024.pkl")
print(f"Loaded {len(atp_data):,} match records")

Loading ATP data...
Loaded 193,337 match records


# Data Processing and Cleaning

In [2]:
# Extract year and combine winner/loser data into single player dataset
print("Processing player data...")
atp_data['year'] = atp_data['tourney_date'].astype(str).str[:4].astype(int)

players = pd.concat([
    atp_data[['winner_name', 'winner_ioc', 'winner_rank', 'year']].rename(
        columns={'winner_name': 'name', 'winner_ioc': 'country', 'winner_rank': 'rank'}),
    atp_data[['loser_name', 'loser_ioc', 'loser_rank', 'year']].rename(
        columns={'loser_name': 'name', 'loser_ioc': 'country', 'loser_rank': 'rank'})
]).dropna(subset=['name', 'country', 'rank'])

print(f"Total player appearances: {len(players):,}")

# Get each player's best (lowest) rank per year - fixes duplication issue
best_ranks = players.groupby(['year', 'name', 'country'])['rank'].min().reset_index()

# Clean and standardize names and countries
best_ranks['name'] = best_ranks['name'].str.strip().str.title()
best_ranks['country'] = best_ranks['country'].str.strip().str.upper()

print(f"Data from {best_ranks['year'].min()} to {best_ranks['year'].max()}")
print(f"Unique player-year records: {len(best_ranks):,}")

# Compute total unique ATP players per country (across all years)
unique_players_by_country = best_ranks.groupby('country')['name'].nunique().to_dict()
print(f"Computed unique players for {len(unique_players_by_country)} countries")

Processing player data...
Total player appearances: 306,751
Data from 1973 to 2024
Unique player-year records: 19,262
Computed unique players for 114 countries


# Core Data Structure Building

In [3]:
# Build country rankings data structure
print("Building core data structure...")
result = {}

for year in sorted(best_ranks['year'].unique()):
    year_data = best_ranks[best_ranks['year'] == year]
    country_stats = {}
    
    for country in year_data['country'].unique():
        country_players = year_data[year_data['country'] == country]
        
        # Count unique players in top 100 and top 10 during the year
        ever_in_top100 = int((country_players['rank'] <= 100).sum())
        ever_in_top10 = int((country_players['rank'] <= 10).sum())
        unique_atp_player = country_players['name'].nunique()
        
        # Find top player (best rank)
        top_idx = country_players['rank'].idxmin()
        top_player = country_players.loc[top_idx, 'name']
        top_rank = int(country_players.loc[top_idx, 'rank'])
        
        country_stats[country] = {
            'ever_in_top100': ever_in_top100,
            'ever_in_top10': ever_in_top10,
            'top_player': top_player,
            'top_rank': top_rank,
            'unique_atp_player': unique_atp_player
        }
    
    result[year] = country_stats

print(f"Processed {len(result)} years")

# Strategic year sampling for better visualization
selected_years = [1970, 1975, 1980, 1985, 1990, 1995, 2000, 2003, 2006, 2009, 2012, 2015, 2018, 2021, 2024]
filtered_result = {str(year): result[year] for year in selected_years if year in result}

print(f"Filtered to {len(filtered_result)} years: {list(filtered_result.keys())}")

Building core data structure...
Processed 52 years
Filtered to 14 years: ['1975', '1980', '1985', '1990', '1995', '2000', '2003', '2006', '2009', '2012', '2015', '2018', '2021', '2024']


In [4]:
# Convert to array of years structure
years_array = []
for year in sorted(filtered_result.keys(), key=int):
    years_array.append({
        "year": int(year),
        "countries": filtered_result[year]
    })

# Save year-centric JSON
with open("/Users/saurabhkumar/Desktop/Work/Tennis/top100bycountry/top_tennis_players_timeline.json", 'w') as f:
    json.dump(years_array, f, indent=2)

print(f"Saved year-centric JSON with {len(years_array)} years")

Saved year-centric JSON with 14 years


In [5]:
# Build country-centric structure
country_centric = {}
for year in filtered_result:
    for country in filtered_result[year]:
        if country not in country_centric:
            country_centric[country] = {}
        country_centric[country][year] = filtered_result[year][country]

# Save country-centric JSON
with open("/Users/saurabhkumar/Desktop/Work/Tennis/top100bycountry/tennis_country_profiles.json", 'w') as f:
    json.dump(country_centric, f, indent=2)

print(f"Saved country-centric JSON with {len(country_centric)} countries")

Saved country-centric JSON with 107 countries


In [6]:
def calculate_shannon_index(counts):
    """Calculate Shannon diversity index"""
    if not counts or sum(counts) == 0:
        return 0
    
    total = sum(counts)
    proportions = [count/total for count in counts if count > 0]
    return -sum(p * np.log(p) for p in proportions)

# Build global timeline dataset
global_timeline = []

for year in sorted(filtered_result.keys(), key=int):
    year_int = int(year)
    countries_data = filtered_result[year]
    
    # Extract country lists and counts
    countries_top100 = []
    countries_top10 = []
    top100_counts = []
    top10_counts = []
    global_reach_counts = []
    
    total_unique_players = 0
    
    for country, stats in countries_data.items():
        total_unique_players += stats['unique_atp_player']
        global_reach_counts.append(stats['unique_atp_player'])
        
        if stats['ever_in_top100'] > 0:
            countries_top100.append(country)
            top100_counts.append(stats['ever_in_top100'])
        
        if stats['ever_in_top10'] > 0:
            countries_top10.append(country)
            top10_counts.append(stats['ever_in_top10'])
    
    # Calculate metrics
    num_countries_top100 = len(countries_top100)
    num_countries_top10 = len(countries_top10)
    num_countries_with_players = len(countries_data)
    
    # Shannon indices
    shannon_top100 = calculate_shannon_index(top100_counts)
    shannon_top10 = calculate_shannon_index(top10_counts)
    shannon_global_reach = calculate_shannon_index(global_reach_counts)
    
    year_entry = {
        "year": year_int,
        "countries_top100": countries_top100,
        "countries_top10": countries_top10,
        "num_countries_top100": num_countries_top100,
        "num_countries_top10": num_countries_top10,
        "num_countries_with_players": num_countries_with_players,
        "total_unique_players": total_unique_players,
        "shannon_index_top100": round(shannon_top100, 4),
        "shannon_index_top10": round(shannon_top10, 4),
        "shannon_index_global_reach": round(shannon_global_reach, 4)
    }
    
    global_timeline.append(year_entry)

# Save the global timeline dataset
with open("/Users/saurabhkumar/Desktop/Work/Tennis/top100bycountry/global_timeline_dataset.json", 'w') as f:
    json.dump(global_timeline, f, indent=2)

print(f"Created global_timeline_dataset.json with {len(global_timeline)} years")
print(f"Sample entry for {global_timeline[0]['year']}:")
print(f"  Countries in top 100: {global_timeline[0]['num_countries_top100']}")
print(f"  Countries in top 10: {global_timeline[0]['num_countries_top10']}")
print(f"  Total unique players: {global_timeline[0]['total_unique_players']}")
print(f"  Shannon index (global reach): {global_timeline[0]['shannon_index_global_reach']}")

Created global_timeline_dataset.json with 14 years
Sample entry for 1975:
  Countries in top 100: 29
  Countries in top 10: 7
  Total unique players: 295
  Shannon index (global reach): 2.929


In [7]:
print(global_timeline)

[{'year': 1975, 'countries_top100': ['ITA', 'RUS', 'AUS', 'COL', 'EGY', 'IND', 'USA', 'ESP', 'GER', 'HUN', 'CHI', 'RSA', 'SWE', 'CRO', 'NZL', 'GBR', 'BRA', 'FRA', 'CZE', 'AUT', 'ARG', 'PAK', 'ROU', 'POL', 'NED', 'MEX', 'JPN', 'SRB', 'URS'], 'countries_top10': ['AUS', 'USA', 'ESP', 'SWE', 'ARG', 'ROU', 'NED'], 'num_countries_top100': 29, 'num_countries_top10': 7, 'num_countries_with_players': 41, 'total_unique_players': 295, 'shannon_index_top100': 2.7662, 'shannon_index_top10': 1.7987, 'shannon_index_global_reach': 2.929}, {'year': 1980, 'countries_top100': ['ITA', 'CHI', 'COL', 'IND', 'ECU', 'USA', 'ESP', 'HUN', 'BEL', 'RSA', 'AUS', 'SWE', 'NZL', 'GBR', 'ARG', 'BRA', 'FRA', 'MEX', 'AUT', 'PAR', 'ZIM', 'SUI', 'ROU', 'CZE', 'URU', 'GER', 'NED', 'BOL', 'ISR', 'POL', 'CRO'], 'countries_top10': ['USA', 'ESP', 'SWE', 'ARG', 'PAR'], 'num_countries_top100': 31, 'num_countries_top10': 5, 'num_countries_with_players': 31, 'total_unique_players': 183, 'shannon_index_top100': 2.6253, 'shannon_ind