In [23]:
import pandas as pd
import numpy as np
import random

In [24]:
world_cup_1994_groups = {
    'A': ['Romania', 'Switzerland', 'USA', 'Colombia'],
    'B': ['Brazil', 'Sweden', 'Russia', 'Cameroon'],
    'C': ['Germany', 'Spain', 'Korea Republic', 'Bolivia'],
    'D': ['Nigeria', 'Bulgaria', 'Argentina', 'Greece'],
    'E': ['Mexico', 'Republic of Ireland', 'Italy', 'Norway'],
    'F': ['Netherlands', 'Saudi Arabia', 'Belgium', 'Morocco']
}

world_cup_1998_groups = {
    'A': ['Brazil', 'Norway', 'Morocco', 'Scotland'],
    'B': ['Italy', 'Chile', 'Austria', 'Cameroon'],
    'C': ['France', 'Denmark', 'South Africa', 'Saudi Arabia'],
    'D': ['Nigeria', 'Paraguay', 'Spain', 'Bulgaria'],
    'E': ['Mexico', 'Netherlands', 'Belgium', 'Korea Republic'],
    'F': ['Germany', 'Yugoslavia', 'IR Iran', 'USA'],
    'G': ['Romania', 'England', 'Colombia', 'Tunisia'],
    'H': ['Argentina', 'Croatia', 'Jamaica', 'Japan']
}

world_cup_2002_groups = {
    'A': ['Denmark', 'Senegal', 'Uruguay', 'France'],
    'B': ['Spain', 'South Africa', 'Paraguay', 'Slovenia'],
    'C': ['Brazil', 'Costa Rica', 'Turkey', 'China PR'],
    'D': ['Korea Republic', 'USA', 'Portugal', 'Poland'],
    'E': ['Germany', 'Republic of Ireland', 'Cameroon', 'Saudi Arabia'],
    'F': ['Sweden', 'England', 'Argentina', 'Nigeria'],
    'G': ['Mexico', 'Italy', 'Ecuador', 'Croatia'],
    'H': ['Japan', 'Belgium', 'Russia', 'Tunisia']
}

world_cup_2006_groups = {
    'A': ['Germany', 'Ecuador', 'Poland', 'Costa Rica'],
    'B': ['England', 'Sweden', 'Paraguay', 'Trinidad and Tobago'],
    'C': ['Netherlands', 'Argentina', 'Côte d\'Ivoire', 'Serbia'],
    'D': ['Portugal', 'Mexico', 'Angola', 'IR Iran'],
    'E': ['Italy', 'Ghana', 'Czech Republic', 'USA'],
    'F': ['Brazil', 'Australia', 'Croatia', 'Japan'],
    'G': ['Switzerland', 'France', 'Korea Republic', 'Togo'],
    'H': ['Spain', 'Ukraine', 'Saudi Arabia', 'Tunisia']
}

world_cup_2010_groups = {
    'A': ['Uruguay', 'Mexico', 'South Africa', 'France'],
    'B': ['Argentina', 'Korea Republic', 'Greece', 'Nigeria'],
    'C': ['USA', 'England', 'Slovenia', 'Algeria'],
    'D': ['Germany', 'Australia', 'Ghana', 'Serbia'],
    'E': ['Netherlands', 'Japan', 'Denmark', 'Cameroon'],
    'F': ['Paraguay', 'Slovakia', 'New Zealand', 'Italy'],
    'G': ['Brazil', 'Portugal', 'Côte d\'Ivoire', 'Korea DPR'],
    'H': ['Spain', 'Chile', 'Switzerland', 'Honduras']
}

world_cup_2014_groups = {
    'A': ['Brazil', 'Cameroon', 'Croatia', 'Mexico'],
    'B': ['Australia', 'Chile', 'Netherlands', 'Spain'],
    'C': ['Colombia', 'Greece', 'Côte d\'Ivoire', 'Japan'],
    'D': ['Costa Rica', 'England', 'Italy', 'Uruguay'],
    'E': ['Ecuador', 'France', 'Honduras', 'Switzerland'],
    'F': ['Argentina', 'Bosnia and Herzegovina', 'IR Iran', 'Nigeria'],
    'G': ['Germany', 'Ghana', 'Portugal', 'USA'],
    'H': ['Algeria', 'Belgium', 'Russia', 'Korea Republic']
}

world_cup_2018_groups = {
    'A': ['Egypt', 'Russia', 'Saudi Arabia', 'Uruguay'],
    'B': ['IR Iran', 'Morocco', 'Portugal', 'Spain'],
    'C': ['Australia', 'Denmark', 'France', 'Peru'],
    'D': ['Argentina', 'Croatia', 'Iceland', 'Nigeria'],
    'E': ['Brazil', 'Costa Rica', 'Serbia', 'Switzerland'],
    'F': ['Germany', 'Mexico', 'Korea Republic', 'Sweden'],
    'G': ['Belgium', 'England', 'Panama', 'Tunisia'],
    'H': ['Colombia', 'Japan', 'Poland', 'Senegal']
}

world_cup_2022_groups = {
    'A': ['Qatar', 'Ecuador', 'Senegal', 'Netherlands'],
    'B': ['England', 'IR Iran', 'USA', 'Wales'],
    'C': ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland'],
    'D': ['France', 'Australia', 'Denmark', 'Tunisia'],
    'E': ['Spain', 'Costa Rica', 'Germany', 'Japan'],
    'F': ['Belgium', 'Canada', 'Morocco', 'Croatia'],
    'G': ['Brazil', 'Serbia', 'Switzerland', 'Cameroon'],
    'H': ['Portugal', 'Ghana', 'Uruguay', 'Korea Republic']
}

In [25]:
def get_world_cup_rank_df(world_cup_groups: dict, year: int) -> pd.DataFrame:
    """
    Creates a DataFrame of World Cup teams, their groups, and their FIFA ranking
    for a given tournament year.

    Args:
        world_cup_groups (dict): A dictionary where keys are group names 
                                 (e.g., 'Group A') and values are lists of team names.
        year (int): The World Cup year (e.g., 1994).

    Returns:
        pd.DataFrame: A DataFrame with columns 'team', 'Group', and the 
                      corresponding FIFA rank for that year.
    """
    # 1. Load Ranking Data
    # Assumes 'fifa_mens_rank.csv' is accessible in the current directory
    try:
        df_rank = pd.read_csv("/Users/georgebuck/Documents/NYU/Fall 25/DS 1007 - Programming/Project/worldcup-socioeconomic-analysis/data/raw_data_files/World Cup Data/Rankings/fifa_mens_rank.csv")
    except FileNotFoundError:
        print("Error: 'fifa_mens_rank.csv' not found.")
        return pd.DataFrame()

    # 2. Prepare Group Data
    group_data_list = []
    for group, teams in world_cup_groups.items():
        for team in teams:
            group_data_list.append({'team': team, 'group': group})

    df_groups = pd.DataFrame(group_data_list)
    teams_to_find = df_groups['team'].tolist()

    # 3. Filter Ranking Data for the specified year and latest semester
    df_year = df_rank[df_rank['date'] == year]
    
    if df_year.empty:
        print(f"Warning: No ranking data found for the year {year}.")
        df_final = df_groups
        df_final[f'{year} FIFA Rank'] = None
        return df_final

    latest_semester = df_year['semester'].max()
    df_latest_ranks = df_year[df_year['semester'] == latest_semester]

    # Select only the participating teams' ranks
    df_wc_ranks = df_latest_ranks[df_latest_ranks['team'].isin(teams_to_find)][['team', 'rank']]

    # 4. Merge and Clean
    df_final = pd.merge(df_groups, df_wc_ranks, on='team', how='left')

    # Rename the rank column
    rank_col_name = f'rank'
    df_final = df_final.rename(columns={'rank': rank_col_name})

    # Sort by group and rank
    df_final = df_final.sort_values(by=['group', rank_col_name], na_position='last')

    # Note any missing teams
    missing_ranks = df_final[df_final[rank_col_name].isna()]['team'].tolist()
    if missing_ranks:
        print(f"Note: Ranks for the following teams were not found in the data for {year}: {', '.join(missing_ranks)}")
        
    return df_final


In [20]:
get_world_cup_rank_df(world_cup_2022_groups, 2022)

Unnamed: 0,team,group,rank
3,Netherlands,A,6
2,Senegal,A,19
1,Ecuador,A,41
0,Qatar,A,60
4,England,B,5
6,USA,B,13
5,IR Iran,B,24
7,Wales,B,28
8,Argentina,C,2
10,Mexico,C,15


In [None]:
#df should have these columns: 'team', 'rank', 'group'
df = get_world_cup_rank_df(world_cup_2014_groups, 2014)

# Create lookup dictionaries from the DataFrame
team_rank = dict(zip(df['team'], df['rank']))
team_group = dict(zip(df['team'], df['group']))
teams = df['team'].tolist()

# This will store how many times each team reaches each stage
# team_results[team_name][stage_number] = count
team_results = {}
for team in teams:
    team_results[team] = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}


def play_match(team1, team2):
    """
    Simulate one match between two teams.
    Lower rank = better team.
    Returns the name of the winning team.
    """
    # Get the ranks for both teams
    team1_rank = team_rank[team1]
    team2_rank = team_rank[team2]
    
    # Calculate win probability for team1
    # If team1 is rank 5 and team2 is rank 10:
    # team1 wins with probability = 10/(5+10) = 0.67 (67%)
    prob_team1_wins = team2_rank / (team1_rank + team2_rank)
    
    # Generate random number between 0 and 1
    if random.random() < prob_team1_wins:
        return team1  # team1 wins
    else:
        return team2  # team2 wins


def simulate_one_tournament():
    """
    Simulate one complete World Cup tournament.
    Returns a dictionary: {team_name: final_stage_reached}
    """
    # Use the actual groups from the input data
    groups = []
    unique_groups = sorted(df['group'].unique())
    
    for group_name in unique_groups:
        # Get all teams in this group
        teams_in_group = df[df['group'] == group_name]['team'].tolist()
        groups.append(teams_in_group)
    
    # Dictionary to track how far each team got
    stage_reached = {}
    for team in teams:
        stage_reached[team] = 1  # Everyone starts at group stage
    
    # Group Stage
    # Each group: every team plays every other team once
    # Top 2 teams advance to knockout round
    teams_advancing = []
    
    for group in groups:
        # Count points for each team (win = 3 points)
        points = {}
        for team in group:
            points[team] = 0
        
        # Play all matches in this group
        # Team 0 vs Team 1, Team 0 vs Team 2, Team 0 vs Team 3
        # Team 1 vs Team 2, Team 1 vs Team 3
        # Team 2 vs Team 3
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                winner = play_match(group[i], group[j])
                points[winner] += 3
        
        # Sort teams by points (most points first)
        # If tied on points, better ranked team advances
        sorted_group = sorted(group, key=lambda t: (-points[t], team_rank[t]))
        
        # Top 2 teams advance
        teams_advancing.append(sorted_group[0])
        teams_advancing.append(sorted_group[1])
    
    # Knockout
    # Now we have 16 teams
    
    # Update: these teams made it to Round of 16
    for team in teams_advancing:
        stage_reached[team] = 2
    
    # ROUND OF 16: 16 teams -> 8 teams
    quarter_finalists = []
    for i in range(0, 16, 2):  # Match teams in pairs
        winner = play_match(teams_advancing[i], teams_advancing[i+1])
        quarter_finalists.append(winner)
    
    # Update: these 8 teams made it to Quarter Finals
    for team in quarter_finalists:
        stage_reached[team] = 3
    
    # QUARTER FINALS: 8 teams -> 4 teams
    semi_finalists = []
    for i in range(0, 8, 2):
        winner = play_match(quarter_finalists[i], quarter_finalists[i+1])
        semi_finalists.append(winner)
    
    # Update: these 4 teams made it to Semi Finals
    for team in semi_finalists:
        stage_reached[team] = 4  # At least 4th place
    
    # SEMI FINALS: 4 teams -> 2 finalists and 2 third-place teams
    finalists = []
    third_place_teams = []
    
    # Semi final 1
    winner1 = play_match(semi_finalists[0], semi_finalists[1])
    loser1 = semi_finalists[1] if winner1 == semi_finalists[0] else semi_finalists[0]
    finalists.append(winner1)
    third_place_teams.append(loser1)
    
    # Semi final 2
    winner2 = play_match(semi_finalists[2], semi_finalists[3])
    loser2 = semi_finalists[3] if winner2 == semi_finalists[2] else semi_finalists[2]
    finalists.append(winner2)
    third_place_teams.append(loser2)
    
    # THIRD PLACE MATCH
    third_place_winner = play_match(third_place_teams[0], third_place_teams[1])
    fourth_place = third_place_teams[1] if third_place_winner == third_place_teams[0] else third_place_teams[0]
    
    stage_reached[third_place_winner] = 5  # Third place
    stage_reached[fourth_place] = 4        # Fourth place
    
    # FINAL
    champion = play_match(finalists[0], finalists[1])
    runner_up = finalists[1] if champion == finalists[0] else finalists[0]
    
    stage_reached[champion] = 7    # Winner! woohoo
    stage_reached[runner_up] = 6   # Runner up
    
    return stage_reached


#run sim 
num_simulations = 10000
print(f"Running {num_simulations} tournament simulations...")

# Run the tournament n times
for sim in range(num_simulations):
    # Show progress every 1000 simulations
    if (sim + 1) % 1000 == 0:
        print(f"  Completed {sim + 1} simulations...")
    
    # Run one tournament
    results = simulate_one_tournament()
    
    # Record the results
    for team, stage in results.items():
        team_results[team][stage] += 1

#results
print("\n" + "="*110)
print("RESULTS: Percentage of times each team reached each stage")
print("="*110)
print(f"{'Team':<20} {'Rank':<8} {'Group%':<10} {'R16%':<10} {'QF%':<10} {'4th%':<10} {'3rd%':<10} {'Runner%':<10} {'Winner%':<10}")
print("-"*110)

# Sort teams by rank for display
teams_sorted = sorted(teams, key=lambda t: team_rank[t])

for team in teams_sorted:
    # Convert counts to percentages
    group_pct = (team_results[team][1] / num_simulations) * 100
    r16_pct = (team_results[team][2] / num_simulations) * 100
    qf_pct = (team_results[team][3] / num_simulations) * 100
    fourth_pct = (team_results[team][4] / num_simulations) * 100
    third_pct = (team_results[team][5] / num_simulations) * 100
    runner_pct = (team_results[team][6] / num_simulations) * 100
    winner_pct = (team_results[team][7] / num_simulations) * 100
    
    print(f"{team:<20} {team_rank[team]:<8} {group_pct:<10.2f} {r16_pct:<10.2f} {qf_pct:<10.2f} {fourth_pct:<10.2f} {third_pct:<10.2f} {runner_pct:<10.2f} {winner_pct:<10.2f}")

print("\n" + "="*110)

print("\n" + "="*100)
print("Column meanings:")
print("  Group% = Eliminated in group stage")
print("  R16%   = Made it to Round of 16")
print("  QF%    = Made it to Quarter Finals")
print("  4th%   = Finished in 4th place")
print("  3rd%   = Finished in 3rd place")
print("  Runner%= Finished as Runner Up")
print("  Winner%= Won the tournament")
print("="*100)

Running 10000 tournament simulations...
  Completed 1000 simulations...
  Completed 2000 simulations...
  Completed 3000 simulations...
  Completed 4000 simulations...
  Completed 5000 simulations...
  Completed 6000 simulations...
  Completed 7000 simulations...
  Completed 8000 simulations...
  Completed 9000 simulations...
  Completed 10000 simulations...

RESULTS: Percentage of times each team reached each stage
Team                 Rank     Group%     R16%       QF%        4th%       3rd%       Runner%    Winner%   
--------------------------------------------------------------------------------------------------------------
Germany              1        0.46       10.32      15.76      2.81       17.59      10.57      42.49     
Argentina            2        0.47       5.66       17.67      11.19      32.30      10.15      22.56     
Colombia             3        1.54       9.86       18.67      11.64      10.54      30.45      17.30     
Belgium              4        2.90       

In [34]:
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

# Stage information
stage_names = {
    1: 'Group Stage',
    2: 'Round of 16',
    3: 'Quarter Finals',
    4: '4th Place',
    5: '3rd Place',
    6: 'Runner Up',
    7: 'Winner'
}

stage_colors = {
    1: 'red',
    2: 'orange',
    3: 'yellow',
    4: 'green',
    5: 'blue',
    6: 'indigo',
    7: 'violet'
}

def plot_team_results(team_name):
    """Plot results for a single team"""
    
    # Get data for this team
    team_data = team_results[team_name]
    
    # Prepare data
    stages = list(range(1, 8))
    counts = [team_data[s] for s in stages]
    percentages = [(c / num_simulations) * 100 for c in counts]
    colors = [stage_colors[s] for s in stages]
    labels = [stage_names[s] for s in stages]
    
    # Create figure
    fig, ax = plt.subplots(figsize=(14, 6))
    
    # Create bar chart
    bars = ax.bar(labels, percentages, color=colors, alpha=0.8, edgecolor='black', linewidth=2)
    
    # Customize
    ax.set_ylabel('Percentage (%)', fontsize=14, fontweight='bold')
    ax.set_title(f'{team_name} - Tournament Results Distribution\nRank: {team_rank[team_name]} | Group: {team_group[team_name]}', 
                 fontsize=16, fontweight='bold', pad=20)
    ax.tick_params(axis='x', rotation=45, labelsize=11)
    ax.tick_params(axis='y', labelsize=11)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)
    
    # Add percentage labels on bars
    for bar, pct, count in zip(bars, percentages, counts):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 1,
               f'{pct:.1f}%\n({count:,})',
               ha='center', va='bottom', fontsize=10, fontweight='bold')
    
    # Add key stats text
    advanced = ((num_simulations - team_data[1]) / num_simulations * 100)
    won = (team_data[7] / num_simulations * 100)
    
    stats_text = f'Advanced from Group: {advanced:.1f}% | Won Tournament: {won:.1f}%'
    ax.text(0.5, -0.25, stats_text, transform=ax.transAxes, 
            ha='center', fontsize=12, fontweight='bold',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    
    plt.tight_layout()
    plt.show()

# Create dropdown widget
teams_sorted = sorted(teams, key=lambda t: team_rank[t])

dropdown = widgets.Dropdown(
    options=teams_sorted,
    value=teams_sorted[0],
    description='Team:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='300px')
)

# Create interactive plot
interactive_plot = widgets.interactive(plot_team_results, team_name=dropdown)
display(interactive_plot)

interactive(children=(Dropdown(description='Team:', layout=Layout(width='300px'), options=('Germany', 'Argenti…