In [1]:
import pandas as pd
import numpy as np

In [2]:
# ----------------- Match Simulation Function -----------------

def simulate_match(rank_a, rank_b):
    """
    Simulates a single match based on rank.
    Probability of A winning: Rank_B / (Rank_A + Rank_B)
    """
    prob_a_wins = rank_b / (rank_a + rank_b)
    # np.random.rand() generates a random float in [0.0, 1.0)
    return 'A' if np.random.rand() < prob_a_wins else 'B'


In [3]:
world_cup_1994_groups = {
    'Group A': ['Romania', 'Switzerland', 'USA', 'Colombia'],
    'Group B': ['Brazil', 'Sweden', 'Russia', 'Cameroon'],
    'Group C': ['Germany', 'Spain', 'Korea Republic', 'Bolivia'],
    'Group D': ['Nigeria', 'Bulgaria', 'Argentina', 'Greece'],
    'Group E': ['Mexico', 'Republic of Ireland', 'Italy', 'Norway'],
    'Group F': ['Netherlands', 'Saudi Arabia', 'Belgium', 'Morocco']
}

world_cup_1998_groups = {
    'Group A': ['Brazil', 'Norway', 'Morocco', 'Scotland'],
    'Group B': ['Italy', 'Chile', 'Austria', 'Cameroon'],
    'Group C': ['France', 'Denmark', 'South Africa', 'Saudi Arabia'],
    'Group D': ['Nigeria', 'Paraguay', 'Spain', 'Bulgaria'],
    'Group E': ['Mexico', 'Netherlands', 'Belgium', 'Korea Republic'],
    'Group F': ['Germany', 'Yugoslavia', 'IR Iran', 'USA'],
    'Group G': ['Romania', 'England', 'Colombia', 'Tunisia'],
    'Group H': ['Argentina', 'Croatia', 'Jamaica', 'Japan']
}

world_cup_2002_groups = {
    'Group A': ['Denmark', 'Senegal', 'Uruguay', 'France'],
    'Group B': ['Spain', 'South Africa', 'Paraguay', 'Slovenia'],
    'Group C': ['Brazil', 'Costa Rica', 'Turkey', 'China PR'],
    'Group D': ['Korea Republic', 'USA', 'Portugal', 'Poland'],
    'Group E': ['Germany', 'Republic of Ireland', 'Cameroon', 'Saudi Arabia'],
    'Group F': ['Sweden', 'England', 'Argentina', 'Nigeria'],
    'Group G': ['Mexico', 'Italy', 'Ecuador', 'Croatia'],
    'Group H': ['Japan', 'Belgium', 'Russia', 'Tunisia']
}

world_cup_2006_groups = {
    'Group A': ['Germany', 'Ecuador', 'Poland', 'Costa Rica'],
    'Group B': ['England', 'Sweden', 'Paraguay', 'Trinidad and Tobago'],
    'Group C': ['Netherlands', 'Argentina', 'Côte d\'Ivoire', 'Serbia'],
    'Group D': ['Portugal', 'Mexico', 'Angola', 'IR Iran'],
    'Group E': ['Italy', 'Ghana', 'Czech Republic', 'USA'],
    'Group F': ['Brazil', 'Australia', 'Croatia', 'Japan'],
    'Group G': ['Switzerland', 'France', 'Korea Republic', 'Togo'],
    'Group H': ['Spain', 'Ukraine', 'Saudi Arabia', 'Tunisia']
}

world_cup_2010_groups = {
    'Group A': ['Uruguay', 'Mexico', 'South Africa', 'France'],
    'Group B': ['Argentina', 'Korea Republic', 'Greece', 'Nigeria'],
    'Group C': ['USA', 'England', 'Slovenia', 'Algeria'],
    'Group D': ['Germany', 'Australia', 'Ghana', 'Serbia'],
    'Group E': ['Netherlands', 'Japan', 'Denmark', 'Cameroon'],
    'Group F': ['Paraguay', 'Slovakia', 'New Zealand', 'Italy'],
    'Group G': ['Brazil', 'Portugal', 'Côte d\'Ivoire', 'Korea DPR'],
    'Group H': ['Spain', 'Chile', 'Switzerland', 'Honduras']
}

world_cup_2014_groups = {
    'Group A': ['Brazil', 'Cameroon', 'Croatia', 'Mexico'],
    'Group B': ['Australia', 'Chile', 'Netherlands', 'Spain'],
    'Group C': ['Colombia', 'Greece', 'Côte d\'Ivoire', 'Japan'],
    'Group D': ['Costa Rica', 'England', 'Italy', 'Uruguay'],
    'Group E': ['Ecuador', 'France', 'Honduras', 'Switzerland'],
    'Group F': ['Argentina', 'Bosnia and Herzegovina', 'IR Iran', 'Nigeria'],
    'Group G': ['Germany', 'Ghana', 'Portugal', 'USA'],
    'Group H': ['Algeria', 'Belgium', 'Russia', 'Korea Republic']
}

world_cup_2018_groups = {
    'Group A': ['Egypt', 'Russia', 'Saudi Arabia', 'Uruguay'],
    'Group B': ['IR Iran', 'Morocco', 'Portugal', 'Spain'],
    'Group C': ['Australia', 'Denmark', 'France', 'Peru'],
    'Group D': ['Argentina', 'Croatia', 'Iceland', 'Nigeria'],
    'Group E': ['Brazil', 'Costa Rica', 'Serbia', 'Switzerland'],
    'Group F': ['Germany', 'Mexico', 'Korea Republic', 'Sweden'],
    'Group G': ['Belgium', 'England', 'Panama', 'Tunisia'],
    'Group H': ['Colombia', 'Japan', 'Poland', 'Senegal']
}

world_cup_2022_groups = {
    'Group A': ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    'Group B': ['England', 'IR Iran', 'USA', 'Wales'],
    'Group C': ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    'Group D': ['France', 'Australia', 'Denmark', 'Tunisia'],
    'Group E': ['Spain', 'Costa Rica', 'Germany', 'Japan'],
    'Group F': ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    'Group G': ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    'Group H': ['Portugal', 'Ghana', 'Uruguay', 'Korea Republic']
}

In [4]:
def get_world_cup_rank_df(world_cup_groups: dict, year: int) -> pd.DataFrame:
    """
    Creates a DataFrame of World Cup teams, their groups, and their FIFA ranking
    for a given tournament year.

    Args:
        world_cup_groups (dict): A dictionary where keys are group names 
                                 (e.g., 'Group A') and values are lists of team names.
        year (int): The World Cup year (e.g., 1994).

    Returns:
        pd.DataFrame: A DataFrame with columns 'team', 'Group', and the 
                      corresponding FIFA rank for that year.
    """
    # 1. Load Ranking Data
    # Assumes 'fifa_mens_rank.csv' is accessible in the current directory
    try:
        df_rank = pd.read_csv("/Users/georgebuck/Documents/NYU/Fall 25/DS 1007 - Programming/Project/worldcup-socioeconomic-analysis/data/raw_data_files/World Cup Data/Rankings/fifa_mens_rank.csv")
    except FileNotFoundError:
        print("Error: 'fifa_mens_rank.csv' not found.")
        return pd.DataFrame()

    # 2. Prepare Group Data
    group_data_list = []
    for group, teams in world_cup_groups.items():
        for team in teams:
            group_data_list.append({'team': team, 'Group': group})

    df_groups = pd.DataFrame(group_data_list)
    teams_to_find = df_groups['team'].tolist()

    # 3. Filter Ranking Data for the specified year and latest semester
    df_year = df_rank[df_rank['date'] == year]
    
    if df_year.empty:
        print(f"Warning: No ranking data found for the year {year}.")
        df_final = df_groups
        df_final[f'{year} FIFA Rank'] = None
        return df_final

    latest_semester = df_year['semester'].max()
    df_latest_ranks = df_year[df_year['semester'] == latest_semester]

    # Select only the participating teams' ranks
    df_wc_ranks = df_latest_ranks[df_latest_ranks['team'].isin(teams_to_find)][['team', 'rank']]

    # 4. Merge and Clean
    df_final = pd.merge(df_groups, df_wc_ranks, on='team', how='left')

    # Rename the rank column
    rank_col_name = f'rank'
    df_final = df_final.rename(columns={'rank': rank_col_name})

    # Sort by group and rank
    df_final = df_final.sort_values(by=['Group', rank_col_name], na_position='last')

    # Note any missing teams
    missing_ranks = df_final[df_final[rank_col_name].isna()]['team'].tolist()
    if missing_ranks:
        print(f"Note: Ranks for the following teams were not found in the data for {year}: {', '.join(missing_ranks)}")
        
    return df_final


In [5]:
df_1994_ranks = get_world_cup_rank_df(world_cup_1994_groups, 1994)
df_1998_ranks = get_world_cup_rank_df(world_cup_1998_groups, 1998)
df_2002_ranks = get_world_cup_rank_df(world_cup_2002_groups, 2002)
df_2006_ranks = get_world_cup_rank_df(world_cup_2006_groups, 2006)
df_2010_ranks = get_world_cup_rank_df(world_cup_2010_groups, 2010)
df_2014_ranks = get_world_cup_rank_df(world_cup_2014_groups, 2014)
df_2018_ranks = get_world_cup_rank_df(world_cup_2018_groups, 2018)
df_2022_ranks = get_world_cup_rank_df(world_cup_2022_groups, 2022)

In [None]:
rank_map_2022 = dict(zip(df_2022_ranks['team'], df_2022_ranks['rank'])) #creates a dict with team and their rank
all_teams_2022 = df_2022_ranks['team'].tolist()
results_2022 = {team: [] for team in df_2022_ranks['team']}

def simulate_match(team_A, team_B, rank_map):
    """
    Simulates a single match based on rank.
    Probability of A winning: Rank_B / (Rank_A + Rank_B)
    """
    rank_A = rank_map.get(team_A)
    rank_B = rank_map.get(team_B)
    
    prob_a_wins = rank_B / (rank_A + rank_B)
    # np.random.rand() generates a random float in [0.0, 1.0)
    return 'A' if np.random.rand() < prob_a_wins else 'B'

def simulate_group_stage(group_teams, rank_map):
    """Simulates a group and returns the top 2 qualifiers."""
    group_table = {team: 0 for team in group_teams}
    
    # All 6 matches in a 4-team group
    matches = [(group_teams[i], group_teams[j]) for i in range(4) for j in range(i + 1, 4)]
    
    for team_A, team_B in matches:
        winner = simulate_match(team_A, team_B, rank_map)
        # Simplified: Winner gets 3 points, loser 0. (No draws)
        group_table[winner] += 3
        
    # Sort teams by points, then by their original FIFA points as a tie-breaker
    sorted_teams = sorted(group_table.keys(), key=lambda t: (group_table[t], rank_map.get(t, 1000)), reverse=True)
    
    # Return the two qualifiers and the two eliminated teams
    return sorted_teams[:2], sorted_teams[2:]

def simulate_group_stage(group_teams, rank_map):
    """Simulates a group and returns the top 2 qualifiers."""
    group_table = {team: 0 for team in group_teams}
    
    # All 6 matches in a 4-team group
    matches = [(group_teams[i], group_teams[j]) for i in range(4) for j in range(i + 1, 4)]
    
    for team_A, team_B in matches:
        winner = simulate_match(team_A, team_B, rank_map)
        # Simplified: Winner gets 3 points, loser 0. (No draws)
        group_table[winner] += 3
        
    # Sort teams by points, then by their original FIFA points as a tie-breaker
    sorted_teams = sorted(group_table.keys(), key=lambda t: (group_table[t], rank_map.get(t)), reverse=True)
    
    # Return the two qualifiers and the two eliminated teams
    return sorted_teams[:2], sorted_teams[2:]

simulate_group_stage(all_teams_2022, rank_map_2022)


SyntaxError: invalid syntax (402238606.py, line 48)

In [158]:
# 1. Simplified Data Setup
# (This is a simplified representation of the 32 teams and their points
# derived from the 'fifa_mens_rank.csv' file for the simulation)
POINTS_MAP = {
    'Argentina': 1867.25, 'France': 1859.78, 'Spain': 1853.27, 'England': 1813.81,
    'Brazil': 1775.85, 'Portugal': 1756.12, 'Netherlands': 1747.55, 'Belgium': 1740.62,
    'Croatia': 1667.68, 'Denmark': 1665.98, 'Germany': 1644.21, 'Switzerland': 1637.2,
    'Uruguay': 1636.56, 'USA': 1618.73, 'Mexico': 1599.4, 'Senegal': 1585.58,
    'Wales': 1582.13, 'Poland': 1566.2, 'Morocco': 1563.83, 'Serbia': 1549.53,
    'IR Iran': 1548.8, 'Japan': 1544.25, 'Canada': 1530.9, 'Korea Republic': 1526.2,
    'Australia': 1500.6, 'Ecuador': 1494.5, 'Tunisia': 1494.06, 'Costa Rica': 1469.69,
    'Cameroon': 1458.47, 'Saudi Arabia': 1431.3, 'Ghana': 1381.25, 'Qatar': 1362.59
}

GROUPS_2022 = {
    'Group A': ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    'Group B': ['England', 'IR Iran', 'USA', 'Wales'],
    'Group C': ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    'Group D': ['France', 'Australia', 'Denmark', 'Tunisia'],
    'Group E': ['Spain', 'Costa Rica', 'Germany', 'Japan'],
    'Group F': ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    'Group G': ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    'Group H': ['Portugal', 'Ghana', 'Uruguay', 'Korea Republic']
}
ALL_TEAMS = list(POINTS_MAP.keys())
NUM_SIMULATIONS = 10000 # Reduced for a quicker example

# Progression Levels: 1=Group Stage, 2=R16, 3=QF, 4=4th, 5=3rd, 6=2nd, 7=Winner
RESULTS_LEVELS = {team: [] for team in ALL_TEAMS}

# 2. Core Simulation Logic

def simulate_match(team_A, team_B, points_map):
    """Determines the match winner based on FIFA points."""
    points_A = points_map.get(team_A, 1000)
    points_B = points_map.get(team_B, 1000)
    
    # Probability of A winning: P(A) = Points_A / (Points_A + Points_B)
    prob_A_win = points_A / (points_A + points_B)
    
    # Use a random number generator to pick the winner
    return team_A if np.random.rand() < prob_A_win else team_B

def simulate_group_stage(group_teams, points_map):
    """Simulates a group and returns the top 2 qualifiers."""
    group_table = {team: 0 for team in group_teams}
    
    # All 6 matches in a 4-team group
    matches = [(group_teams[i], group_teams[j]) for i in range(4) for j in range(i + 1, 4)]
    
    for team_A, team_B in matches:
        winner = simulate_match(team_A, team_B, points_map)
        # Simplified: Winner gets 3 points, loser 0. (No draws)
        group_table[winner] += 3
        
    # Sort teams by points, then by their original FIFA points as a tie-breaker
    sorted_teams = sorted(group_table.keys(), key=lambda t: (group_table[t], points_map.get(t, 1000)), reverse=True)
    
    # Return the two qualifiers and the two eliminated teams
    return sorted_teams[:2], sorted_teams[2:]

def simulate_tournament(groups, points_map, results_levels):
    """Simulates the entire World Cup tournament."""
    
    # --- 1. Group Stage (Determines R16 teams and Level 1 eliminations) ---
    r16_teams = []
    eliminated_group_stage = []
    group_order = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E', 'Group F', 'Group G', 'Group H']
    
    for group_name in group_order:
        qualifiers, eliminated = simulate_group_stage(groups[group_name], points_map)
        r16_teams.extend(qualifiers) # Adds A1, A2, B1, B2, etc.
        eliminated_group_stage.extend(eliminated)
        
    for team in eliminated_group_stage:
        results_levels[team].append(1)
        
    # --- 2. Knockout Stages (R16, QF, SF) ---
    def run_round(teams, stage_level, pairings):
        winners, losers = [], []
        for idx_A, idx_B in pairings:
            team_A, team_B = teams[idx_A], teams[idx_B]
            winner = simulate_match(team_A, team_B, points_map)
            loser = team_A if winner == team_B else team_B
            winners.append(winner)
            losers.append(loser)
        for team in losers:
            results_levels[team].append(stage_level)
        return winners, losers

    # R16 Matchups: (A1 vs B2), (B1 vs A2), (C1 vs D2), etc.
    r16_pairings = [(0, 3), (2, 1), (4, 7), (6, 5), (8, 11), (10, 9), (12, 15), (14, 13)]
    qf_teams, _ = run_round(r16_teams, 2, r16_pairings)

    # QF Matchups: Winner R16(1) vs Winner R16(3), etc.
    qf_pairings = [(0, 2), (1, 3), (4, 6), (5, 7)]
    sf_teams, _ = run_round(qf_teams, 3, qf_pairings)

    # SF Matchups:
    sf_pairings = [(0, 1), (2, 3)]
    finalists, third_place_teams = run_round(sf_teams, 'SKIP', sf_pairings) # Skip level logging here

    # --- 3. Third Place Match and Final ---
    # Third Place
    match_3rd_winner = simulate_match(third_place_teams[0], third_place_teams[1], points_map)
    match_3rd_loser = third_place_teams[0] if match_3rd_winner == third_place_teams[1] else third_place_teams[1]
    results_levels[match_3rd_winner].append(5) # 3rd place
    results_levels[match_3rd_loser].append(4)  # 4th place
    
    # Final
    winner = simulate_match(finalists[0], finalists[1], points_map)
    runner_up = finalists[0] if winner == finalists[1] else finalists[1]
    results_levels[winner].append(7)    # Winner
    results_levels[runner_up].append(6) # Runner-up


# 3. Execution and Aggregation
for _ in range(NUM_SIMULATIONS):
    simulate_tournament(GROUPS_2022, POINTS_MAP, RESULTS_LEVELS)

# Aggregate and display the percentage distribution
all_teams_data = []
for team, results in RESULTS_LEVELS.items():
    if not results: continue
    total_sims = len(results)
    
    distribution = {
        'Team': team,
        'R16 (2)': results.count(2) / total_sims,
        'QF (3)': results.count(3) / total_sims,
        '4th (4)': results.count(4) / total_sims,
        '3rd (5)': results.count(5) / total_sims,
        '2nd (6)': results.count(6) / total_sims,
        'Winner (7)': results.count(7) / total_sims,
        'Group Stage (1)': results.count(1) / total_sims,
    }
    all_teams_data.append(distribution)

final_df = pd.DataFrame(all_teams_data)
# (Formatting/sorting code removed for simplicity)

final_df

Unnamed: 0,Team,R16 (2),QF (3),4th (4),3rd (5),2nd (6),Winner (7),Group Stage (1)
0,Argentina,0.279364,0.146386,0.040316,0.041326,0.042795,0.04803,0.32014
1,France,0.278648,0.146555,0.037122,0.041728,0.042649,0.050203,0.324245
2,Spain,0.28549,0.149064,0.034591,0.042985,0.044,0.048335,0.31796
3,England,0.269463,0.152775,0.040209,0.043873,0.041308,0.037095,0.331196
4,Brazil,0.277783,0.147775,0.03507,0.039604,0.039511,0.043953,0.34163
5,Portugal,0.284331,0.151895,0.037259,0.040487,0.038642,0.042331,0.327308
6,Netherlands,0.292365,0.147062,0.035909,0.038593,0.041092,0.044146,0.32633
7,Belgium,0.277968,0.139861,0.040185,0.036028,0.040092,0.040739,0.348915
8,Croatia,0.250982,0.129605,0.03011,0.034786,0.033851,0.036843,0.418927
9,Denmark,0.254885,0.131744,0.030669,0.034315,0.033006,0.032071,0.418326
