In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path


In [2]:
# Group-stage draws, one table per tournament. Keys are group letters and each
# value lists the four teams drawn into that group. The spellings are the ones
# matched against the rankings file's "team" column, so they must agree with it.

# 1994 - six groups (A-F).
world_cup_1994_groups = {
    "A": ["Romania", "Switzerland", "USA", "Colombia"],
    "B": ["Brazil", "Sweden", "Russia", "Cameroon"],
    "C": ["Germany", "Spain", "Korea Republic", "Bolivia"],
    "D": ["Nigeria", "Bulgaria", "Argentina", "Greece"],
    "E": ["Mexico", "Republic of Ireland", "Italy", "Norway"],
    "F": ["Netherlands", "Saudi Arabia", "Belgium", "Morocco"],
}

# 1998 onwards - eight groups (A-H).
world_cup_1998_groups = {
    "A": ["Brazil", "Norway", "Morocco", "Scotland"],
    "B": ["Italy", "Chile", "Austria", "Cameroon"],
    "C": ["France", "Denmark", "South Africa", "Saudi Arabia"],
    "D": ["Nigeria", "Paraguay", "Spain", "Bulgaria"],
    "E": ["Mexico", "Netherlands", "Belgium", "Korea Republic"],
    "F": ["Germany", "Yugoslavia", "IR Iran", "USA"],
    "G": ["Romania", "England", "Colombia", "Tunisia"],
    "H": ["Argentina", "Croatia", "Jamaica", "Japan"],
}

world_cup_2002_groups = {
    "A": ["Denmark", "Senegal", "Uruguay", "France"],
    "B": ["Spain", "South Africa", "Paraguay", "Slovenia"],
    "C": ["Brazil", "Costa Rica", "Turkey", "China PR"],
    "D": ["Korea Republic", "USA", "Portugal", "Poland"],
    "E": ["Germany", "Republic of Ireland", "Cameroon", "Saudi Arabia"],
    "F": ["Sweden", "England", "Argentina", "Nigeria"],
    "G": ["Mexico", "Italy", "Ecuador", "Croatia"],
    "H": ["Japan", "Belgium", "Russia", "Tunisia"],
}

world_cup_2006_groups = {
    "A": ["Germany", "Ecuador", "Poland", "Costa Rica"],
    "B": ["England", "Sweden", "Paraguay", "Trinidad and Tobago"],
    "C": ["Netherlands", "Argentina", "Côte d'Ivoire", "Serbia"],
    "D": ["Portugal", "Mexico", "Angola", "IR Iran"],
    "E": ["Italy", "Ghana", "Czech Republic", "USA"],
    "F": ["Brazil", "Australia", "Croatia", "Japan"],
    "G": ["Switzerland", "France", "Korea Republic", "Togo"],
    "H": ["Spain", "Ukraine", "Saudi Arabia", "Tunisia"],
}

world_cup_2010_groups = {
    "A": ["Uruguay", "Mexico", "South Africa", "France"],
    "B": ["Argentina", "Korea Republic", "Greece", "Nigeria"],
    "C": ["USA", "England", "Slovenia", "Algeria"],
    "D": ["Germany", "Australia", "Ghana", "Serbia"],
    "E": ["Netherlands", "Japan", "Denmark", "Cameroon"],
    "F": ["Paraguay", "Slovakia", "New Zealand", "Italy"],
    "G": ["Brazil", "Portugal", "Côte d'Ivoire", "Korea DPR"],
    "H": ["Spain", "Chile", "Switzerland", "Honduras"],
}

world_cup_2014_groups = {
    "A": ["Brazil", "Cameroon", "Croatia", "Mexico"],
    "B": ["Australia", "Chile", "Netherlands", "Spain"],
    "C": ["Colombia", "Greece", "Côte d'Ivoire", "Japan"],
    "D": ["Costa Rica", "England", "Italy", "Uruguay"],
    "E": ["Ecuador", "France", "Honduras", "Switzerland"],
    "F": ["Argentina", "Bosnia and Herzegovina", "IR Iran", "Nigeria"],
    "G": ["Germany", "Ghana", "Portugal", "USA"],
    "H": ["Algeria", "Belgium", "Russia", "Korea Republic"],
}

world_cup_2018_groups = {
    "A": ["Egypt", "Russia", "Saudi Arabia", "Uruguay"],
    "B": ["IR Iran", "Morocco", "Portugal", "Spain"],
    "C": ["Australia", "Denmark", "France", "Peru"],
    "D": ["Argentina", "Croatia", "Iceland", "Nigeria"],
    "E": ["Brazil", "Costa Rica", "Serbia", "Switzerland"],
    "F": ["Germany", "Mexico", "Korea Republic", "Sweden"],
    "G": ["Belgium", "England", "Panama", "Tunisia"],
    "H": ["Colombia", "Japan", "Poland", "Senegal"],
}

world_cup_2022_groups = {
    "A": ["Qatar", "Ecuador", "Senegal", "Netherlands"],
    "B": ["England", "IR Iran", "USA", "Wales"],
    "C": ["Argentina", "Saudi Arabia", "Mexico", "Poland"],
    "D": ["France", "Australia", "Denmark", "Tunisia"],
    "E": ["Spain", "Costa Rica", "Germany", "Japan"],
    "F": ["Belgium", "Canada", "Morocco", "Croatia"],
    "G": ["Brazil", "Serbia", "Switzerland", "Cameroon"],
    "H": ["Portugal", "Ghana", "Uruguay", "Korea Republic"],
}

In [3]:
def find_project_root(start, marker="data"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start (Path): Directory where the upward search begins.
        marker (str): Name of the file or directory that identifies the
            project root. Defaults to ``"data"``.

    Returns:
        Path | None: The first directory (starting with ``start`` itself, then
        each parent in turn) that contains ``marker``, or ``None`` when the
        filesystem root is reached without a match.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


# When ``__file__`` is defined (script execution) start two levels above the
# file, clamped so a shallow path cannot raise IndexError; in a notebook kernel
# start from the working directory.
if "__file__" in globals():
    _ancestors = Path(__file__).resolve().parents
    _search_start = _ancestors[min(2, len(_ancestors) - 1)]
else:
    _search_start = Path.cwd()

PROJECT_ROOT = find_project_root(_search_start)
if PROJECT_ROOT is None:
    # A plain `while not exists(): go to parent` loop never ends here, because
    # the parent of the filesystem root is the root itself. Fall back to the
    # starting directory and let the data loader report the missing file.
    print(f"Warning: no 'data' directory found at or above {_search_start}; "
          "ranking files will not be found.")
    PROJECT_ROOT = _search_start

DATA_DIR = PROJECT_ROOT / "data" / "raw_data_files" / "World Cup Data" / "Rankings"

def get_world_cup_rank_df(world_cup_groups: dict, year: int) -> pd.DataFrame:
    """
    Creates a DataFrame of World Cup teams, their groups, and their FIFA ranking
    for a given tournament year.

    Args:
        world_cup_groups (dict): A dictionary where keys are group names 
        (e.g., 'Group A') and values are lists of team names.
        year (int): The World Cup year (e.g., 1994).

    Returns:
        pd.DataFrame: A DataFrame with columns 'team', 'Group', and the 
        corresponding FIFA rank for that year.
    """
    # 1. Load Ranking Data
    # Assumes 'fifa_mens_rank.csv' is accessible in the current directory
    try:
        #df_rank = pd.read_csv("/Users/georgebuck/Documents/NYU/Fall 25/DS 1007 - Programming/Project/worldcup-socioeconomic-analysis/data/raw_data_files/World Cup Data/Rankings/fifa_mens_rank.csv")
        df_rank = pd.read_csv(DATA_DIR / "fifa_mens_rank.csv")
    except FileNotFoundError:
        print("Error: 'fifa_mens_rank.csv' not found.")
        return pd.DataFrame()

    # 2. Prepare Group Data
    group_data_list = []
    for group, teams in world_cup_groups.items():
        for team in teams:
            group_data_list.append({'team': team, 'group': group})

    df_groups = pd.DataFrame(group_data_list)
    teams_to_find = df_groups['team'].tolist()

    # 3. Filter Ranking Data for the specified year and latest semester
    df_year = df_rank[df_rank['date'] == year]
    
    if df_year.empty:
        print(f"Warning: No ranking data found for the year {year}.")
        df_final = df_groups
        df_final[f'{year} FIFA Rank'] = None
        return df_final

    latest_semester = df_year['semester'].max()
    df_latest_ranks = df_year[df_year['semester'] == latest_semester]

    # Select only the participating teams' ranks
    df_wc_ranks = df_latest_ranks[df_latest_ranks['team'].isin(teams_to_find)][['team', 'rank']]

    # 4. Merge and Clean
    df_final = pd.merge(df_groups, df_wc_ranks, on='team', how='left')

    # Rename the rank column
    rank_col_name = f'rank'
    df_final = df_final.rename(columns={'rank': rank_col_name})

    # Sort by group and rank
    df_final = df_final.sort_values(by=['group', rank_col_name], na_position='last')

    # Note any missing teams
    missing_ranks = df_final[df_final[rank_col_name].isna()]['team'].tolist()
    if missing_ranks:
        print(f"Note: Ranks for the following teams were not found in the data for {year}: {', '.join(missing_ranks)}")
        
    return df_final


In [4]:
# Preview the 2022 table: one row per team with its group and FIFA rank,
# sorted by group and then by rank. Shown as this cell's output.
get_world_cup_rank_df(world_cup_2022_groups, 2022)


Unnamed: 0,team,group,rank
3,Netherlands,A,6
2,Senegal,A,19
1,Ecuador,A,41
0,Qatar,A,60
4,England,B,5
6,USA,B,13
5,IR Iran,B,24
7,Wales,B,28
8,Argentina,C,2
10,Mexico,C,15



 MONTE CARLO SIMULATION - WORLD CUP TOURNAMENTS (1998-2022)

This simulation runs 10,000 tournament scenarios for each World Cup year
to estimate the probability of each team reaching different stages.

 How it works:
 1. Each team's strength is based on the most recent FIFA ranking recorded for the tournament year
 2. Match outcomes use a logistic win-probability formula (a linear formula was also tried, but its results were less realistic):
    - Better ranked teams have higher win probability
    - Upsets can still happen
 3. Tournament structure follows actual World Cup format:
    - Group stage (round-robin, top 2 advance)
    - Knockout rounds (single elimination)
 4. By doing 10,000 simulations, we get reliable probability estimates


In [None]:
# Tournaments to simulate, keyed by year; each value is that year's group table
# (group letter -> list of teams). 1994 is not included: its table has six
# groups, while the knockout simulation below pairs up sixteen advancing teams.
world_cups = {
    1998: world_cup_1998_groups,
    2002: world_cup_2002_groups,
    2006: world_cup_2006_groups,
    2010: world_cup_2010_groups,
    2014: world_cup_2014_groups,
    2018: world_cup_2018_groups,
    2022: world_cup_2022_groups
}

# Stage information.
# Both tables are keyed by the integer stage code 1..7, where 1 is the earliest
# exit (group stage) and 7 is winning the tournament. Entries are listed in
# code order and numbered with enumerate(start=1).
stage_names = dict(enumerate(
    [
        'Group Stage',
        'Round of 16',
        'Quarter Finals',
        '4th Place',
        '3rd Place',
        'Runner Up',
        'Winner',
    ],
    start=1,
))

# Bar colour (hex) used for each stage code in the results chart.
stage_colors = dict(enumerate(
    [
        '#9E1A1A',
        '#FF746C',
        '#023E80',
        '#90D5FF',
        '#CD7F32',
        '#C0C0C0',
        '#D4AF37',
    ],
    start=1,
))

def play_match(team1, team2, team_rank_dict, k=0.1):
    """Simulate a single match between two teams using a logistic win probability.

    The probability that ``team1`` wins is ``1 / (1 + exp(k * (rank1 - rank2)))``,
    so the team with the lower (better) rank number is favoured and equal ranks
    give 50/50. One draw from ``random.random()`` then decides the winner;
    there are no drawn matches.

    Note:
        A linear alternative, ``rank2 / (rank1 + rank2)``, was tried earlier
        and dropped because its results were less realistic.

    Args:
        team1 (str): Name of the first team.
        team2 (str): Name of the second team.
        team_rank_dict (dict): Maps team names (str) to their ranking values
            (float or int).
        k (float, optional): Logistic scaling factor controlling the influence
            of ranking differences on win probability. Defaults to 0.1.

    Returns:
        str: The name of the winning team, either ``team1`` or ``team2``.

    Raises:
        KeyError: If either team is not a key of ``team_rank_dict``.
        ValueError: If the win probability cannot be computed because a rank
            is missing (NaN).
    """
    team1_rank = team_rank_dict[team1]
    team2_rank = team_rank_dict[team2]

    prob_team1_wins = 1 / (1 + np.exp(k * (team1_rank - team2_rank)))

    # NaN is the only value that is not equal to itself. Without this guard a
    # missing rank makes `random.random() < nan` always False, so team2 would
    # silently win every such match.
    if prob_team1_wins != prob_team1_wins:
        raise ValueError(
            f"Cannot simulate {team1} vs {team2}: a rank is missing "
            f"({team1}: {team1_rank}, {team2}: {team2_rank})."
        )

    return team1 if random.random() < prob_team1_wins else team2
def simulate_one_tournament(df, team_rank_dict, teams_list, match_fn=None):
    """Simulate a full 32-team World Cup for a given set of teams and rankings.

    Stages, in order: Group Stage -> Round of 16 -> Quarter Finals ->
    Semi Finals -> Third Place Match -> Final.

    Group stage: every pair in a group plays once; the winner gets 3 points
    (no draws). Teams are ordered by points, ties broken by better rank, and
    the top two advance.

    Knockout bracket: adjacent groups are cross-paired so that a group's winner
    and runner-up land in opposite halves and can only meet again in the final
    or the third-place match:

        half 1: 1A-2B, 1C-2D, 1E-2F, 1G-2H
        half 2: 1B-2A, 1D-2C, 1F-2E, 1H-2G

    Consecutive Round-of-16 winners meet in the quarter finals and consecutive
    quarter-final winners meet in the semi finals. This is the layout of the
    2006-2022 tournaments; NOTE(review): earlier editions may have used a
    different knockout tree.

    Args:
        df: DataFrame containing team and group information. Must have
            columns 'team' and 'group' and describe exactly eight groups.
        team_rank_dict: Dictionary mapping team names (str) to ranking values
            (float or int); passed to the match function and used as the
            group-stage tie-breaker.
        teams_list: List of all participating team names.
        match_fn: Optional callable ``(team1, team2, team_rank_dict) -> winner``.
            Defaults to ``play_match``.

    Returns:
        dict: Maps each team name to the integer stage it reached:
            1: Group stage
            2: Round of 16
            3: Quarter Final
            4: Fourth place (lost semis, lost 3rd place match)
            5: Third place (lost semis, won 3rd place match)
            6: Runner-up in Final
            7: Tournament Champion

    Raises:
        ValueError: If ``df`` does not contain exactly eight groups with at
            least two teams each.
    """
    if match_fn is None:
        match_fn = play_match

    # Rebuild the group rosters: groups in sorted-name order, teams in df order.
    rosters = {}
    for group_name, team in zip(df['group'], df['team']):
        rosters.setdefault(group_name, []).append(team)
    groups = [rosters[group_name] for group_name in sorted(rosters)]

    if len(groups) != 8 or any(len(group) < 2 for group in groups):
        raise ValueError(
            "simulate_one_tournament needs exactly 8 groups with at least 2 teams each; "
            f"got {len(groups)} group(s) with sizes {[len(group) for group in groups]}."
        )

    stage_reached = {team: 1 for team in teams_list}

    # Group Stage
    group_winners = []
    group_runners_up = []
    for group in groups:
        points = {team: 0 for team in group}
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                points[match_fn(group[i], group[j], team_rank_dict)] += 3

        standings = sorted(group, key=lambda t: (-points[t], team_rank_dict[t]))
        group_winners.append(standings[0])
        group_runners_up.append(standings[1])
        stage_reached[standings[0]] = 2
        stage_reached[standings[1]] = 2

    def knockout_round(pairings, stage_code):
        """Play each pairing once; mark and return the winners in bracket order."""
        survivors = []
        for first, second in pairings:
            winner = match_fn(first, second, team_rank_dict)
            stage_reached[winner] = stage_code
            survivors.append(winner)
        return survivors

    # Round of 16: winner of one group against the runner-up of its neighbour.
    pair_starts = range(0, 8, 2)  # A/B, C/D, E/F, G/H
    round_of_16 = (
        [(group_winners[g], group_runners_up[g + 1]) for g in pair_starts]
        + [(group_winners[g + 1], group_runners_up[g]) for g in pair_starts]
    )
    quarter_finalists = knockout_round(round_of_16, 3)

    # Quarter Finals. The provisional 4 is overwritten below once the semi
    # finals, third-place match and final have been played.
    semi_finalists = knockout_round(
        zip(quarter_finalists[0::2], quarter_finalists[1::2]), 4
    )

    # Semi Finals
    finalists = []
    semi_final_losers = []
    for first, second in zip(semi_finalists[0::2], semi_finalists[1::2]):
        winner = match_fn(first, second, team_rank_dict)
        finalists.append(winner)
        semi_final_losers.append(second if winner == first else first)

    # Third Place Match
    third_place = match_fn(semi_final_losers[0], semi_final_losers[1], team_rank_dict)
    fourth_place = (
        semi_final_losers[1] if third_place == semi_final_losers[0] else semi_final_losers[0]
    )
    stage_reached[third_place] = 5
    stage_reached[fourth_place] = 4

    # Final
    champion = match_fn(finalists[0], finalists[1], team_rank_dict)
    runner_up = finalists[1] if champion == finalists[0] else finalists[0]
    stage_reached[champion] = 7
    stage_reached[runner_up] = 6

    return stage_reached

# run simulations for all years
# Seed Python's `random` module, which decides every simulated match, so that
# Restart & Run All reproduces the same probability estimates.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print("Running simulations for all World Cup years...")
num_simulations = 10000
all_results = {}

for year, groups in world_cups.items():
    print(f"\nSimulating {year} World Cup...")

    # Get data for this year
    df = get_world_cup_rank_df(groups, year)
    if df.empty:
        # Fail with a clear message instead of an obscure KeyError on df['team'].
        raise ValueError(
            f"No team/ranking table could be built for {year}; "
            "check that the rankings file is available."
        )

    teams = df['team'].tolist()
    team_rank = dict(zip(df['team'], df['rank']))
    team_group = dict(zip(df['team'], df['group']))

    # Per-team tally of how many simulations ended at each stage code (1..7)
    team_results = {team: {stage: 0 for stage in range(1, 8)} for team in teams}

    # Run simulations
    for sim in range(num_simulations):
        if (sim + 1) % 2000 == 0:
            print(f"  Completed {sim + 1} simulations...")

        results = simulate_one_tournament(df, team_rank, teams)

        for team, stage in results.items():
            team_results[team][stage] += 1

    # Store results for this year
    all_results[year] = {
        'team_results': team_results,
        'team_rank': team_rank,
        'team_group': team_group,
        'teams': teams
    }

print("\n All simulations complete!")

# Create interactive visualization
def plot_team_results(year, team_name):
    """Bar chart of how often ``team_name`` finished at each stage in ``year``.

    Reads the tallies stored in ``all_results[year]`` and converts them to
    percentages of ``num_simulations``. Each bar is annotated with its
    percentage and raw count, and a summary line under the axes gives the
    share of simulations in which the team left the group stage and the share
    in which it won the tournament.

    Args:
        year: Key of ``all_results`` (a simulated World Cup year).
        team_name: Team to plot; must be present in that year's results.
    """
    record = all_results[year]
    tallies = record['team_results'][team_name]
    rank_label = record['team_rank'][team_name]
    group_label = record['team_group'][team_name]

    # One bar per stage code, in code order
    stage_codes = list(range(1, 8))
    counts = [tallies[code] for code in stage_codes]
    percentages = [(count / num_simulations) * 100 for count in counts]

    fig, ax = plt.subplots(figsize=(16, 6))
    bars = ax.bar(
        [stage_names[code] for code in stage_codes],
        percentages,
        color=[stage_colors[code] for code in stage_codes],
        alpha=0.8,
        edgecolor='black',
        linewidth=2,
    )

    # Titles, ticks and grid
    ax.set_ylabel('Percentage (%)', fontsize=14, fontweight='bold')
    ax.set_title(
        f'{team_name} - {year} World Cup Results Distribution\n'
        f'Rank: {rank_label} | Group: {group_label}',
        fontsize=16, fontweight='bold', pad=10,
    )
    ax.tick_params(axis='x', rotation=30, labelsize=11)
    ax.tick_params(axis='y', labelsize=11)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    # Leave headroom above the tallest bar for its label
    ax.set_ylim(0, max(percentages) * 1.15)

    # Percentage and count above every bar
    for bar, pct, count in zip(bars, percentages, counts):
        ax.text(
            bar.get_x() + bar.get_width()/2.,
            bar.get_height() + 1.5,
            f'{pct:.1f}%\n({count:,})',
            ha='center', va='bottom', fontsize=10, fontweight='bold',
            bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='none', alpha=0.9),
        )

    # Summary line below the axes
    advanced = (num_simulations - tallies[1]) / num_simulations * 100
    won = tallies[7] / num_simulations * 100
    ax.text(
        0.5, -0.25,
        f'Advanced from Group: {advanced:.1f}% | Won Tournament: {won:.1f}%',
        transform=ax.transAxes,
        ha='center', fontsize=12, fontweight='bold',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

    plt.tight_layout()
    plt.show()

# Create dropdown widgets
# Year selector: lists every simulated year in ascending order, starting on 2022.
year_dropdown = widgets.Dropdown(
    description='Year:',
    options=sorted(world_cups),
    value=2022,
    layout=widgets.Layout(width='200px'),
    style={'description_width': 'initial'},
)

# Function to update team options based on selected year
def update_team_options(*args):
    """Refill the team dropdown for the year currently selected.

    Reads ``year_dropdown.value``, orders that year's teams by their stored
    rank, replaces the options of ``team_dropdown`` and selects the first
    team in that order. Positional arguments are accepted and ignored.
    """
    selected = all_results[year_dropdown.value]
    rank_of = selected['team_rank']
    ordered_teams = sorted(selected['teams'], key=lambda team: rank_of[team])

    team_dropdown.options = ordered_teams
    team_dropdown.value = ordered_teams[0]

# Team selector, initially filled with the 2022 teams ordered by stored rank;
# the first team in that order is pre-selected.
teams_2022_by_rank = sorted(
    all_results[2022]['teams'],
    key=lambda t: all_results[2022]['team_rank'][t],
)
team_dropdown = widgets.Dropdown(
    description='Team:',
    options=teams_2022_by_rank,
    value=teams_2022_by_rank[0],
    layout=widgets.Layout(width='300px'),
    style={'description_width': 'initial'},
)

# Link year dropdown to update team options
year_dropdown.observe(update_team_options, 'value')

# Create interactive plot: redraws whenever either dropdown changes
interactive_plot = widgets.interactive(
    plot_team_results,
    year=year_dropdown,
    team_name=team_dropdown,
)
display(interactive_plot)

IndentationError: unindent does not match any outer indentation level (<string>, line 51)