# Summary Statistics: The Effect of Winning a World Cup

This notebook provides descriptive statistics for the event study sample used in replicating Mello (OBES).

In [2]:
# Install required packages (run once)
%pip install jinja2

import pandas as pd
import numpy as np
from IPython.display import display, HTML

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Load the data
df = pd.read_csv('paper_replication_event_study_sample.csv')
print(f"Dataset shape: {df.shape[0]:,} observations × {df.shape[1]} variables")

Dataset shape: 8,737 observations × 32 variables


## 1. Panel Structure

In [4]:
# Panel dimensions
n_countries = df['country'].nunique()
n_quarters = df['quarter'].nunique()
year_range = f"{df['year'].min()}–{df['year'].max()}"

panel_info = pd.DataFrame({
    'Metric': ['Countries', 'Quarters', 'Year Range', 'Total Observations'],
    'Value': [n_countries, n_quarters, year_range, f"{len(df):,}"]
})
display(panel_info.style.hide(axis='index').set_caption('Panel Structure'))

# Countries in the sample
print("\nCountries in sample:")
print(", ".join(sorted(df['country'].unique())))

Metric,Value
Countries,48
Quarters,244
Year Range,1961–2021
Total Observations,8737



Countries in sample:
ARG, AUS, AUT, BEL, BGR, BRA, CAN, CHE, CHL, COL, CRI, CZE, DEU, DNK, ESP, EST, FIN, FRA, GBR, GRC, HRV, HUN, IDN, IND, IRL, ISL, ISR, ITA, JPN, KOR, LTU, LUX, LVA, MEX, NLD, NOR, NZL, POL, PRT, ROU, RUS, SAU, SVK, SVN, SWE, TUR, USA, ZAF


## 2. Treatment Status Summary

In [5]:
# World Cup winners (1998-2018)
wc_winners = {
    1998: 'FRA',
    2002: 'BRA', 
    2006: 'ITA',
    2010: 'ESP',
    2014: 'DEU',
    2018: 'FRA'
}

# World Cup hosts
wc_hosts = {
    1998: 'FRA',
    2002: ['JPN', 'KOR'],
    2006: 'DEU',
    2010: 'ZAF',
    2014: 'BRA',
    2018: 'RUS'
}

print("World Cup Winners (SDiD treatment group):")
for year, winner in wc_winners.items():
    print(f"  {year}: {winner}")

print("\nWorld Cup Hosts:")
for year, host in wc_hosts.items():
    host_str = ', '.join(host) if isinstance(host, list) else host
    print(f"  {year}: {host_str}")

World Cup Winners (SDiD treatment group):
  1998: FRA
  2002: BRA
  2006: ITA
  2010: ESP
  2014: DEU
  2018: FRA

World Cup Hosts:
  1998: FRA
  2002: JPN, KOR
  2006: DEU
  2010: ZAF
  2014: BRA
  2018: RUS


In [6]:
# Treatment indicators summary
treatment_summary = pd.DataFrame({
    'Variable': ['Winner (country-quarters)', 'Host (country-quarters)', 'Host & Winner'],
    'Observations': [
        df['winner'].sum(),
        df['host'].sum(),
        df[(df['winner'] == 1) & (df['host'] == 1)].shape[0]
    ],
    'Share (%)': [
        100 * df['winner'].mean(),
        100 * df['host'].mean(),
        100 * ((df['winner'] == 1) & (df['host'] == 1)).mean()
    ]
})
display(treatment_summary.style.hide(axis='index').format({'Share (%)': '{:.2f}'}).set_caption('Treatment Status'))

Variable,Observations,Share (%)
Winner (country-quarters),10,0.11
Host (country-quarters),14,0.16
Host & Winner,3,0.03


## 3. Outcome Variables: Summary Statistics

In [7]:
# Define key outcome variables (YoY percentage changes)
outcome_vars = {
    'gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct': 'GDP Growth (%)',
    'final_consumption_expenditure_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct': 'Consumption Growth (%)',
    'gross_fixed_capital_formation_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct': 'Investment Growth (%)',
    'exports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct': 'Exports Growth (%)',
    'imports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct': 'Imports Growth (%)',
    'population_yoy_pct': 'Population Growth (%)'
}

# Summary statistics
summary_stats = df[list(outcome_vars.keys())].describe().T
summary_stats['N (non-missing)'] = df[list(outcome_vars.keys())].notna().sum()
summary_stats.index = [outcome_vars[col] for col in summary_stats.index]
summary_stats = summary_stats[['N (non-missing)', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
summary_stats.columns = ['N', 'Mean', 'Std', 'Min', 'P25', 'Median', 'P75', 'Max']

display(summary_stats.style.format({
    'N': '{:.0f}',
    'Mean': '{:.2f}',
    'Std': '{:.2f}',
    'Min': '{:.2f}',
    'P25': '{:.2f}',
    'Median': '{:.2f}',
    'P75': '{:.2f}',
    'Max': '{:.2f}'
}).set_caption('Summary Statistics: YoY Growth Rates'))

Unnamed: 0,N,Mean,Std,Min,P25,Median,P75,Max
GDP Growth (%),8737,3.26,3.88,-21.98,1.38,3.21,5.27,28.15
Consumption Growth (%),8693,3.14,3.68,-41.4,1.41,2.98,4.87,41.68
Investment Growth (%),8693,4.34,12.32,-74.77,-1.11,3.96,9.04,296.81
Exports Growth (%),8693,6.05,9.13,-43.74,1.5,5.56,10.23,77.06
Imports Growth (%),8693,6.15,11.07,-63.73,0.83,5.91,11.45,89.95
Population Growth (%),8737,0.75,0.89,-2.52,0.23,0.67,1.23,5.56


## 4. Summary Statistics by Treatment Group

In [8]:
# GDP growth: Winners vs Non-Winners
gdp_col = 'gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct'

winner_stats = df.groupby('winner')[gdp_col].agg(['count', 'mean', 'std', 'median']).round(3)
winner_stats.index = ['Non-Winners', 'Winners']
winner_stats.columns = ['N', 'Mean', 'Std', 'Median']

print("GDP Growth by Winner Status:")
display(winner_stats)

# GDP growth: Hosts vs Non-Hosts
host_stats = df.groupby('host')[gdp_col].agg(['count', 'mean', 'std', 'median']).round(3)
host_stats.index = ['Non-Hosts', 'Hosts']
host_stats.columns = ['N', 'Mean', 'Std', 'Median']

print("\nGDP Growth by Host Status:")
display(host_stats)

GDP Growth by Winner Status:


Unnamed: 0,N,Mean,Std,Median
Non-Winners,8727,3.261,3.884,3.215
Winners,10,1.948,1.354,1.676



GDP Growth by Host Status:


Unnamed: 0,N,Mean,Std,Median
Non-Hosts,8723,3.26,3.884,3.215
Hosts,14,2.678,2.683,2.719


## 5. Country-Level Summary

In [9]:
# Observations per country
country_summary = df.groupby('country').agg({
    'quarter': 'count',
    'year': ['min', 'max'],
    gdp_col: ['mean', 'std'],
    'winner': 'sum',
    'host': 'sum'
}).round(2)

country_summary.columns = ['Obs', 'Start Year', 'End Year', 'GDP Mean', 'GDP Std', 'Winner Qtrs', 'Host Qtrs']
country_summary = country_summary.sort_values('Obs', ascending=False)

print(f"Country-Level Summary ({len(country_summary)} countries):")
display(country_summary)

Country-Level Summary (48 countries):


Unnamed: 0_level_0,Obs,Start Year,End Year,GDP Mean,GDP Std,Winner Qtrs,Host Qtrs
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AUS,244,1961,2021,3.38,2.27,0,0
AUT,244,1961,2021,2.65,2.66,0,0
BEL,244,1961,2021,2.59,2.46,0,0
CHE,244,1961,2021,2.24,2.7,0,0
ESP,244,1961,2021,3.2,3.85,1,1
DEU,244,1961,2021,2.33,2.63,3,2
DNK,244,1961,2021,2.34,2.65,0,0
LUX,244,1961,2021,3.63,3.55,0,0
IRL,244,1961,2021,5.13,4.68,0,0
ISL,244,1961,2021,3.61,4.56,0,0


## 6. Level Variables Summary

In [10]:
# Level variables (in billions USD PPP)
level_vars = {
    'gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted': 'GDP (USD bn PPP)',
    'final_consumption_expenditure_chain_linked_volume_rebased_us_dollars_ppp_converted': 'Consumption (USD bn PPP)',
    'gross_fixed_capital_formation_chain_linked_volume_rebased_us_dollars_ppp_converted': 'Investment (USD bn PPP)',
    'exports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted': 'Exports (USD bn PPP)',
    'imports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted': 'Imports (USD bn PPP)',
    'population': 'Population (millions)'
}

# Convert to billions/millions for readability
df_display = df.copy()
for col in level_vars.keys():
    if col != 'population':
        df_display[col] = df_display[col] / 1e9  # to billions
    else:
        df_display[col] = df_display[col] / 1e6  # to millions

level_stats = df_display[list(level_vars.keys())].describe().T
level_stats['N (non-missing)'] = df_display[list(level_vars.keys())].notna().sum()
level_stats.index = [level_vars[col] for col in level_stats.index]
level_stats = level_stats[['N (non-missing)', 'mean', 'std', 'min', '50%', 'max']]
level_stats.columns = ['N', 'Mean', 'Std', 'Min', 'Median', 'Max']

display(level_stats.style.format({
    'N': '{:.0f}',
    'Mean': '{:.1f}',
    'Std': '{:.1f}',
    'Min': '{:.1f}',
    'Median': '{:.1f}',
    'Max': '{:.1f}'
}).set_caption('Summary Statistics: Level Variables'))

Unnamed: 0,N,Mean,Std,Min,Median,Max
GDP (USD bn PPP),8737,0.0,0.0,0.0,0.0,0.0
Consumption (USD bn PPP),8697,0.0,0.0,0.0,0.0,0.0
Investment (USD bn PPP),8697,0.0,0.0,0.0,0.0,0.0
Exports (USD bn PPP),8697,0.0,0.0,0.0,0.0,0.0
Imports (USD bn PPP),8697,0.0,0.0,0.0,0.0,0.0
Population (millions),8737,52.4,138.4,0.2,12.0,1414.2


## 7. Missing Data Overview

In [11]:
# Missing values by variable
missing_pct = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
missing_df = pd.DataFrame({
    'Variable': missing_pct.index,
    'Missing (%)': missing_pct.values
})
missing_df = missing_df[missing_df['Missing (%)'] > 0]

if len(missing_df) > 0:
    print("Variables with Missing Values:")
    display(missing_df.head(15).style.hide(axis='index').format({'Missing (%)': '{:.1f}'}))
else:
    print("No missing values in the dataset.")

Variables with Missing Values:


Variable,Missing (%)
final_consumption_expenditure_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct,0.5
gross_fixed_capital_formation_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct,0.5
exports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct,0.5
imports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct,0.5
gross_fixed_capital_formation_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_log_4q,0.5
exports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_log_4q,0.5
imports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_log_4q,0.5
final_consumption_expenditure_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_log_4q,0.5
final_consumption_expenditure_chain_linked_volume_rebased_us_dollars_ppp_converted,0.5
imports_of_goods_and_services_chain_linked_volume_rebased_us_dollars_ppp_converted,0.5


## 8. Export Summary Table (LaTeX-ready)

In [12]:
# Create publication-ready summary table
def create_summary_table(df, var_dict):
    """Create a summary statistics table."""
    rows = []
    for col, label in var_dict.items():
        if col in df.columns:
            series = df[col].dropna()
            rows.append({
                'Variable': label,
                'N': len(series),
                'Mean': series.mean(),
                'Std. Dev.': series.std(),
                'Min': series.min(),
                'Max': series.max()
            })
    return pd.DataFrame(rows)

summary_table = create_summary_table(df, outcome_vars)
print("Publication-Ready Summary Table:")
display(summary_table.style.hide(axis='index').format({
    'N': '{:,.0f}',
    'Mean': '{:.2f}',
    'Std. Dev.': '{:.2f}',
    'Min': '{:.2f}',
    'Max': '{:.2f}'
}))

# Export to LaTeX
latex_table = summary_table.to_latex(index=False, float_format='%.2f')
print("\nLaTeX Code:")
print(latex_table)

Publication-Ready Summary Table:


Variable,N,Mean,Std. Dev.,Min,Max
GDP Growth (%),8737,3.26,3.88,-21.98,28.15
Consumption Growth (%),8693,3.14,3.68,-41.4,41.68
Investment Growth (%),8693,4.34,12.32,-74.77,296.81
Exports Growth (%),8693,6.05,9.13,-43.74,77.06
Imports Growth (%),8693,6.15,11.07,-63.73,89.95
Population Growth (%),8737,0.75,0.89,-2.52,5.56



LaTeX Code:
\begin{tabular}{lrrrrr}
\toprule
Variable & N & Mean & Std. Dev. & Min & Max \\
\midrule
GDP Growth (%) & 8737 & 3.26 & 3.88 & -21.98 & 28.15 \\
Consumption Growth (%) & 8693 & 3.14 & 3.68 & -41.40 & 41.68 \\
Investment Growth (%) & 8693 & 4.34 & 12.32 & -74.77 & 296.81 \\
Exports Growth (%) & 8693 & 6.05 & 9.13 & -43.74 & 77.06 \\
Imports Growth (%) & 8693 & 6.15 & 11.07 & -63.73 & 89.95 \\
Population Growth (%) & 8737 & 0.75 & 0.89 & -2.52 & 5.56 \\
\bottomrule
\end{tabular}



In [13]:
# Save summary table to CSV
summary_table.to_csv('summary_statistics_table.csv', index=False)
print("Summary table saved to: summary_statistics_table.csv")

Summary table saved to: summary_statistics_table.csv


## 9. Table 1: Summary Statistics by Period (Paper Replication)

This table replicates Table 1 from Mello (OBES), comparing Winners vs Non-winners across different time periods.

In [14]:
# %pip install scipy
from scipy import stats

# Define columns
gdp_level_col = 'gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted'
gdp_yoy_pct_col = 'gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct'

# Prep data with periods and winner group (country-level: ever won)
df2 = df.copy()

# Assign periods based on year ranges
def assign_period(year):
    if 1960 <= year <= 1980:
        return '1960–80'
    elif 1980 < year <= 2000:
        return '1980–2000'
    elif 2000 < year <= 2020:
        return '2000–20'
    return None

df2['period'] = df2['year'].apply(assign_period)

# Winner status at country level (ever had rank1 == 1)
winner_by_country = df2.groupby('country')['rank1'].apply(lambda x: (x == 1).any()).reset_index()
winner_by_country.columns = ['country', 'is_winner_country']
df2 = df2.merge(winner_by_country, on='country')
df2['winner_group'] = df2['is_winner_country'].map({True: 'Winner', False: 'Non-winner'})

# Create transformed variables (matching paper units)
df2['gdp_tbl'] = df2[gdp_level_col] / 1000  # thousands of (USD millions)
df2['pop_m'] = df2['population'] / 1e6  # millions
df2['gdp_pc'] = df2[gdp_level_col] * 1e6 / df2['population']  # USD per capita
df2['gdp_yoy'] = df2[gdp_yoy_pct_col]  # YoY percent

print("Winner countries:", df2[df2['is_winner_country']]['country'].unique().tolist())

Winner countries: ['BRA', 'DEU', 'ESP', 'FRA', 'GBR', 'ITA']


In [15]:
# Helper functions
def stars(p):
    """Return significance stars based on p-value."""
    if pd.isna(p):
        return ''
    elif p < 0.01:
        return '***'
    elif p < 0.05:
        return '**'
    elif p < 0.10:
        return '*'
    return ''

def fmt_mean_sd(mean, sd, digits=2, big=False):
    """Format mean (SD) string."""
    if big:
        return f"{mean:,.{digits}f} ({sd:,.{digits}f})"
    return f"{mean:.{digits}f} ({sd:.{digits}f})"

def summ_row(data, var, label, digits=2, big=False):
    """Create a summary row for Winner vs Non-winner comparison."""
    d = data.dropna(subset=[var])
    
    w = d[d['winner_group'] == 'Winner'][var]
    n = d[d['winner_group'] == 'Non-winner'][var]
    
    w_mean, w_sd = w.mean(), w.std()
    n_mean, n_sd = n.mean(), n.std()
    
    # t-test
    try:
        tstat, pval = stats.ttest_ind(w, n, nan_policy='omit')
    except:
        tstat, pval = np.nan, np.nan
    
    t_str = f"{tstat:.2f}{stars(pval)}" if not pd.isna(tstat) else ""
    
    return {
        'Row': label,
        'Winner': fmt_mean_sd(w_mean, w_sd, digits, big),
        'Non-winner': fmt_mean_sd(n_mean, n_sd, digits, big),
        't-test': t_str
    }

def make_period_block(data, period_name):
    """Create a block of rows for a given period."""
    if period_name == 'Full sample':
        d = data
    else:
        d = data[data['period'] == period_name]
    
    rows = [
        summ_row(d, 'gdp_tbl', 'GDP (in thousands of 2015 US dollar millions)', digits=2, big=False),
        summ_row(d, 'pop_m', 'Population (in millions)', digits=2, big=False),
        summ_row(d, 'gdp_pc', 'GDP per capita', digits=2, big=True),
        summ_row(d, 'gdp_yoy', 'Year-on-Year GDP growth', digits=2, big=False),
    ]
    
    block = pd.DataFrame(rows)
    block['Period'] = period_name
    
    # Add counts for Full sample
    if period_name == 'Full sample':
        n_cty_w = d[d['winner_group'] == 'Winner']['country'].nunique()
        n_cty_n = d[d['winner_group'] == 'Non-winner']['country'].nunique()
        n_obs_w = len(d[d['winner_group'] == 'Winner'])
        n_obs_n = len(d[d['winner_group'] == 'Non-winner'])
        
        extra_rows = pd.DataFrame([
            {'Period': period_name, 'Row': 'Number of countries', 
             'Winner': str(n_cty_w), 'Non-winner': str(n_cty_n), 't-test': ''},
            {'Period': period_name, 'Row': 'Number of observations', 
             'Winner': str(n_obs_w), 'Non-winner': str(n_obs_n), 't-test': ''}
        ])
        block = pd.concat([block, extra_rows], ignore_index=True)
    
    return block[['Period', 'Row', 'Winner', 'Non-winner', 't-test']]

In [16]:
# Build Table 1
period_levels = ['1960–80', '1980–2000', '2000–20', 'Full sample']

table1_df = pd.concat([make_period_block(df2, p) for p in period_levels], ignore_index=True)

print("Table 1: Summary Statistics (Paper Replication)")
print("=" * 80)
display(table1_df)

Table 1: Summary Statistics (Paper Replication)


Unnamed: 0,Period,Row,Winner,Non-winner,t-test
0,1960–80,GDP (in thousands of 2015 US dollar millions),1292.76 (512.37),536.91 (1236.34),11.99***
1,1960–80,Population (in millions),54.24 (13.93),26.03 (45.84),12.18***
2,1960–80,GDP per capita,"23,187.93 (5,203.63)","22,219.72 (10,419.19)",1.81*
3,1960–80,Year-on-Year GDP growth,4.21 (3.00),4.73 (3.56),-2.69***
4,1980–2000,GDP (in thousands of 2015 US dollar millions),2197.33 (736.26),949.47 (2130.99),11.74***
5,1980–2000,Population (in millions),61.15 (22.37),41.27 (100.62),3.99***
6,1980–2000,GDP per capita,"36,315.83 (7,322.12)","31,503.05 (16,208.15)",5.91***
7,1980–2000,Year-on-Year GDP growth,2.42 (1.79),3.27 (3.63),-4.63***
8,2000–20,GDP (in thousands of 2015 US dollar millions),3109.93 (819.26),1368.24 (3020.17),12.57***
9,2000–20,Population (in millions),84.89 (50.69),66.51 (197.17),2.03**


In [17]:
# Export Table 1 to LaTeX
def table1_to_latex(df):
    """Convert Table 1 DataFrame to LaTeX with period groupings."""
    latex = r"""\begin{table}[htbp]
\centering
\caption{Summary statistics for the event-study sample}
\label{tab:summary_stats}
\begin{tabular}{lcccc}
\toprule
 & \multicolumn{2}{c}{Winner} & \multicolumn{2}{c}{Non-winner} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
 & Mean (SD) & & Mean (SD) & t-test \\
\midrule
"""
    
    current_period = None
    for _, row in df.iterrows():
        if row['Period'] != current_period:
            current_period = row['Period']
            latex += f"\\textbf{{{current_period}}} \\\\\n"
        
        latex += f"\\quad {row['Row']} & {row['Winner']} & & {row['Non-winner']} & {row['t-test']} \\\\\n"
    
    latex += r"""\bottomrule
\end{tabular}
\begin{tablenotes}
\small
\item Notes: Standard deviations in parentheses. *p<0.10, **p<0.05, ***p<0.01.
\end{tablenotes}
\end{table}"""
    
    return latex

latex_table1 = table1_to_latex(table1_df)
print("LaTeX Code for Table 1:")
print(latex_table1)

LaTeX Code for Table 1:
\begin{table}[htbp]
\centering
\caption{Summary statistics for the event-study sample}
\label{tab:summary_stats}
\begin{tabular}{lcccc}
\toprule
 & \multicolumn{2}{c}{Winner} & \multicolumn{2}{c}{Non-winner} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
 & Mean (SD) & & Mean (SD) & t-test \\
\midrule
\textbf{1960–80} \\
\quad GDP (in thousands of 2015 US dollar millions) & 1292.76 (512.37) & & 536.91 (1236.34) & 11.99*** \\
\quad Population (in millions) & 54.24 (13.93) & & 26.03 (45.84) & 12.18*** \\
\quad GDP per capita & 23,187.93 (5,203.63) & & 22,219.72 (10,419.19) & 1.81* \\
\quad Year-on-Year GDP growth & 4.21 (3.00) & & 4.73 (3.56) & -2.69*** \\
\textbf{1980–2000} \\
\quad GDP (in thousands of 2015 US dollar millions) & 2197.33 (736.26) & & 949.47 (2130.99) & 11.74*** \\
\quad Population (in millions) & 61.15 (22.37) & & 41.27 (100.62) & 3.99*** \\
\quad GDP per capita & 36,315.83 (7,322.12) & & 31,503.05 (16,208.15) & 5.91*** \\
\quad Year-on-Year GDP growth

In [None]:
# Save Table 1 to CSV
table1_df.to_csv('table1_summary_statistics.csv', index=False)
print("Table 1 saved to: table1_summary_statistics.csv")

# Also save LaTeX to file
with open('table1_summary_statistics.tex', 'w') as f:
    f.write(latex_table1)
print("LaTeX saved to: table1_summary_statistics.tex")

Table 1 saved to: table1_summary_statistics.csv
LaTeX saved to: table1_summary_statistics.tex


## 10. Table 1 (Alternative): GDP Data Starting from 1961

This version assumes GDP level data only starts in 1961. Since YoY growth requires the prior year as base, YoY GDP growth is only available from **1962 onwards**. The first period is therefore adjusted to **1962–80** and observations from 1961 are excluded.

In [19]:
# Create alternative dataset: GDP data starts 1961, so YoY only available from 1962 onwards
df3 = df.copy()

# Filter to years >= 1962 (since YoY growth needs prior year data, and GDP starts in 1961)
df3 = df3[df3['year'] >= 1962].copy()

# Assign periods based on year ranges (starting from 1962)
def assign_period_1962(year):
    if 1962 <= year <= 1980:
        return '1962–80'
    elif 1980 < year <= 2000:
        return '1980–2000'
    elif 2000 < year <= 2020:
        return '2000–20'
    return None

df3['period'] = df3['year'].apply(assign_period_1962)

# Winner status at country level (ever had rank1 == 1)
winner_by_country = df3.groupby('country')['rank1'].apply(lambda x: (x == 1).any()).reset_index()
winner_by_country.columns = ['country', 'is_winner_country']
df3 = df3.merge(winner_by_country, on='country')
df3['winner_group'] = df3['is_winner_country'].map({True: 'Winner', False: 'Non-winner'})

# Create transformed variables (matching paper units)
df3['gdp_tbl'] = df3[gdp_level_col] / 1000  # thousands of (USD millions)
df3['pop_m'] = df3['population'] / 1e6  # millions
df3['gdp_pc'] = df3[gdp_level_col] * 1e6 / df3['population']  # USD per capita
df3['gdp_yoy'] = df3[gdp_yoy_pct_col]  # YoY percent

# Check year range
print(f"Year range in filtered data: {df3['year'].min()} to {df3['year'].max()}")
print(f"\nOriginal observations: {len(df)}")
print(f"Filtered observations (year >= 1962): {len(df3)}")
print(f"\nObservations by period:")
print(df3.groupby('period').size())

Year range in filtered data: 1962 to 2021

Original observations: 8737
Filtered observations (year >= 1962): 8633

Observations by period:
period
1962–80      2052
1980–2000    2590
2000–20      3800
dtype: int64


In [20]:
# Build Table 1 (Alternative) with 1962-based periods (YoY requires prior year)
period_levels_alt = ['1962–80', '1980–2000', '2000–20', 'Full sample']

# Use the same helper functions but with df3
def make_period_block_alt(data, period_name):
    """Create a block of rows for a given period (alternative version)."""
    if period_name == 'Full sample':
        d = data
    else:
        d = data[data['period'] == period_name]
    
    rows = [
        summ_row(d, 'gdp_tbl', 'GDP (in thousands of 2015 US dollar millions)', digits=2, big=False),
        summ_row(d, 'pop_m', 'Population (in millions)', digits=2, big=False),
        summ_row(d, 'gdp_pc', 'GDP per capita', digits=2, big=True),
        summ_row(d, 'gdp_yoy', 'Year-on-Year GDP growth', digits=2, big=False),
    ]
    
    block = pd.DataFrame(rows)
    block['Period'] = period_name
    
    # Add counts for Full sample
    if period_name == 'Full sample':
        n_cty_w = d[d['winner_group'] == 'Winner']['country'].nunique()
        n_cty_n = d[d['winner_group'] == 'Non-winner']['country'].nunique()
        n_obs_w = len(d[d['winner_group'] == 'Winner'])
        n_obs_n = len(d[d['winner_group'] == 'Non-winner'])
        
        extra_rows = pd.DataFrame([
            {'Period': period_name, 'Row': 'Number of countries', 
             'Winner': str(n_cty_w), 'Non-winner': str(n_cty_n), 't-test': ''},
            {'Period': period_name, 'Row': 'Number of observations', 
             'Winner': str(n_obs_w), 'Non-winner': str(n_obs_n), 't-test': ''}
        ])
        block = pd.concat([block, extra_rows], ignore_index=True)
    
    return block[['Period', 'Row', 'Winner', 'Non-winner', 't-test']]

table1_alt_df = pd.concat([make_period_block_alt(df3, p) for p in period_levels_alt], ignore_index=True)

print("Table 1 (Alternative): Summary Statistics with GDP starting from 1961")
print("=" * 80)
display(table1_alt_df)

Table 1 (Alternative): Summary Statistics with GDP starting from 1961


Unnamed: 0,Period,Row,Winner,Non-winner,t-test
0,1962–80,GDP (in thousands of 2015 US dollar millions),1316.58 (509.59),547.70 (1253.62),11.74***
1,1962–80,Population (in millions),54.43 (13.91),26.20 (46.04),11.83***
2,1962–80,GDP per capita,"23,566.32 (5,027.39)","22,544.44 (10,417.95)",1.86*
3,1962–80,Year-on-Year GDP growth,4.09 (2.93),4.71 (3.57),-3.18***
4,1980–2000,GDP (in thousands of 2015 US dollar millions),2197.33 (736.26),949.47 (2130.99),11.74***
5,1980–2000,Population (in millions),61.15 (22.37),41.27 (100.62),3.99***
6,1980–2000,GDP per capita,"36,315.83 (7,322.12)","31,503.05 (16,208.15)",5.91***
7,1980–2000,Year-on-Year GDP growth,2.42 (1.79),3.27 (3.63),-4.63***
8,2000–20,GDP (in thousands of 2015 US dollar millions),3109.93 (819.26),1368.24 (3020.17),12.57***
9,2000–20,Population (in millions),84.89 (50.69),66.51 (197.17),2.03**


I THINK THE 4 OBS MISSING ARE ARGENTINE. THE PAPER SAYS THE SERIES STARTS IN 1993. THECNICALLY I CAN ONLY GET YOY DATA FROM 1994 which have....or somewwhere one control country starts later than expected


In [21]:
# Check Brazil's data coverage for GDP and YoY
bra_data = df[df['country'] == 'BRA'].copy()

print(f"Brazil data range: {bra_data['year'].min()} to {bra_data['year'].max()}")
print(f"Brazil total observations: {len(bra_data)}")

# Check GDP level data
gdp_notna = bra_data[gdp_level_col].notna().sum()
gdp_na = bra_data[gdp_level_col].isna().sum()
print(f"\nGDP level data:")
print(f"  Non-missing: {gdp_notna}")
print(f"  Missing: {gdp_na}")

# Check YoY GDP data
yoy_notna = bra_data[gdp_yoy_pct_col].notna().sum()
yoy_na = bra_data[gdp_yoy_pct_col].isna().sum()
print(f"\nYoY GDP growth data:")
print(f"  Non-missing: {yoy_notna}")
print(f"  Missing: {yoy_na}")

# Show first few years
print(f"\nFirst years of Brazil data:")
print(bra_data[['year', 'quarter', gdp_level_col, gdp_yoy_pct_col]].head(8).to_string())

Brazil data range: 1998 to 2021
Brazil total observations: 95

GDP level data:
  Non-missing: 95
  Missing: 0

YoY GDP growth data:
  Non-missing: 95
  Missing: 0

First years of Brazil data:
     year  quarter  gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted  gross_domestic_product_chain_linked_volume_rebased_us_dollars_ppp_converted_yoy_pct
944  1998  1998-Q2                                                                  2175834.800                                                                                1.810
945  1998  1998-Q3                                                                  2175182.300                                                                                0.510
946  1998  1998-Q4                                                                  2150199.700                                                                               -1.472
947  1999  1999-Q1                                                                  