In [113]:
import pandas as pd
import numpy as np

In [118]:
def replace_value_with_nan(df, columns, value):
    """
    Replace a placeholder value with NaN in the specified columns of a DataFrame.

    The replacement is performed on a copy, so the frame passed in is left
    untouched and the calling cell can be re-run without side effects.

    Parameters:
    df (pd.DataFrame): The DataFrame to clean.
    columns (list): Column names in which `value` should be replaced with NaN.
    value: The placeholder to treat as missing (e.g. '-' or 'DNF').

    Returns:
    pd.DataFrame: A copy of `df` with `value` replaced by NaN in `columns`.
    """
    cleaned = df.copy()
    # Only the requested columns are touched; every other column keeps its values.
    cleaned[columns] = cleaned[columns].replace(value, np.nan)
    return cleaned

WOMEN

In [114]:
# Women's vs. Men's distance races
# It seems like the men really showed up! Medals in every event from the 1500m to the 10k, whereas the women had NO medals in these events :/

In [115]:
# Women's marathon results: comma-separated text whose first row is the header
file_path = 'womens_marathon_result.txt'
w_marathon_result = pd.read_csv(file_path, header=0, sep=',')

# Preview the first rows
w_marathon_result.head()


Unnamed: 0,pos,bib,country,athlete,parsed_mark_time
0,1,270,NED,Sifan HASSAN,2:22:55
1,2,1790,ETH,Tigst ASSEFA,2:22:58
2,3,2118,KEN,Hellen OBIRI,2:23:10
3,4,2114,KEN,Sharon LOKEDI,2:23:14
4,5,1799,ETH,Amane Beriso SHANKULE,2:23:57


In [116]:
# Women's entry list with pb / sb / WRK columns: comma-separated, first row is the header
file_path = 'womens_marathon_sb.txt'
w_marathon_sb = pd.read_csv(file_path, header=0, sep=',')

# Preview the first rows
w_marathon_sb.head()

Unnamed: 0,bib,country,athlete,pb,sb,WRK
0,1788,ETH,Megertu ALEMU,2:16:34,2:16:34,7
1,1790,ETH,Tigst ASSEFA,2:11:53,2:16:23,1
2,1654,CHN,Li BAI,2:26:33,2:28:17,143
3,2161,MGL,Munkhzaya BAYARTSOGT,2:24:45,2:24:45,133
4,2335,SUI,Helen BEKELE,2:19:44,2:25:25,45


In [120]:
# Entry list: '-' marks a missing pb / sb / WRK; once blanked, WRK can be a nullable integer
w_marathon_sb = replace_value_with_nan(w_marathon_sb, ['pb', 'sb', 'WRK'], '-')
w_marathon_sb['WRK'] = w_marathon_sb['WRK'].astype('Int64')

# Results: 'DNF' in pos becomes missing so the column can be a nullable integer too
w_marathon_result = replace_value_with_nan(w_marathon_result, ['pos'], 'DNF')
w_marathon_result['pos'] = w_marathon_result['pos'].astype('Int64')

In [121]:
# Rank the field by personal best; ties share the lowest rank and missing PBs go last.
# NOTE(review): pb looks like H:MM:SS text here, so this ordering is presumably lexicographic - confirm.
w_marathon_sb['pb_rank'] = (
    w_marathon_sb['pb']
    .rank(method='min', na_option='bottom')
    .astype('Int64')
)

# Rank the field by WRK; a missing WRK is first swapped for the sentinel 10000 so it ranks last.
w_marathon_sb['WRK_rank'] = (
    w_marathon_sb['WRK']
    .replace(np.nan, 10000)
    .rank(method='min', na_option='bottom')
    .astype('Int64')
)

In [122]:
# Attach pb / sb / ranking columns to each result row.
# Inner join: only rows whose athlete, bib and country match in both frames are kept.
w_marathon_result_sb = pd.merge(
    w_marathon_result,
    w_marathon_sb,
    how='inner',
    on=['athlete', 'bib', 'country'],
)

In [123]:
# place_rank_diff = WRK_rank - pos.
# Positive: finish position number is lower than the athlete's WRK rank within the field;
# negative: the opposite.
w_marathon_result_sb['place_rank_diff'] = (
    w_marathon_result_sb['WRK_rank'].sub(w_marathon_result_sb['pos'])
)

In [124]:
# Rows whose country code is USA
w_marathon_result_sb.loc[w_marathon_result_sb['country'].eq('USA')]

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
11,12.0,2450,USA,Dakotah LINDWURM,2:26:44,2:24:40,2:25:31,138,48,28,16.0
22,23.0,2471,USA,Emily SISSON,2:29:53,2:18:29,2:22:42,60,8,16,-7.0
90,,2461,USA,Fiona O'KEEFFE,DNF,2:22:10,2:22:10,187,21,42,


In [125]:
# Five rows with the lowest (most negative) place_rank_diff
(
    w_marathon_result_sb
    .sort_values(by='place_rank_diff', ascending=True)
    .head(n=5)
)

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
71,72,1671,CHN,Yuyu XIA,2:42:10,2:25:45,2:25:45,95,65,22,-50
58,59,1676,CHN,Deshun ZHANG,2:36:47,2:24:05,2:26:53,44,36,11,-48
75,76,1654,CHN,Li BAI,2:44:44,2:26:33,2:28:17,143,79,30,-46
57,58,1746,ERI,Dolshi TESFU,2:36:30,2:20:40,,75,15,18,-40
77,78,1871,GBR,Rose HARVEY,2:51:03,2:23:21,,180,27,41,-37


In [127]:
# Five rows with the highest place_rank_diff
(
    w_marathon_result_sb
    .sort_values(by='place_rank_diff', ascending=False)
    .head(n=5)
)

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
18,19,1584,BEL,Hanne VERBRUGGEN,2:29:03,2:26:32,,302,77,73,54
32,33,2099,KAZ,Zhanna MAMAZHANOVA,2:30:51,2:26:42,2:26:42,482,85,86,53
25,26,1714,CZE,Tereza HROCHOVÁ,2:30:00,2:26:38,2:26:38,348,83,78,52
30,31,2135,LES,Mokulubete Blandina MAKATISI,2:30:20,2:30:54,2:30:54,409,88,81,50
36,37,2314,RSA,Irvette VAN ZYL,2:31:14,2:26:11,,488,73,87,50


In [139]:
# Athletes whose race time beat their listed personal best.
# Both columns are parsed into Timedeltas so the comparison is on actual durations
# rather than character-by-character on the text. Marks that cannot be parsed
# (e.g. 'DNF') and missing PBs become NaT, which never compares as smaller,
# so those rows are excluded.
w_race_time = pd.to_timedelta(w_marathon_result_sb['parsed_mark_time'], errors='coerce')
w_pb_time = pd.to_timedelta(w_marathon_result_sb['pb'], errors='coerce')
w_marathon_result_sb[w_race_time < w_pb_time]

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
2,3,2118,KEN,Hellen OBIRI,2:23:10,2:25:49,,11,67,6,3
3,4,2114,KEN,Sharon LOKEDI,2:23:14,2:23:23,,40,28,10,6
5,6,2092,JPN,Yuka SUZUKI,2:24:02,2:24:09,,78,37,19,13
30,31,2135,LES,Mokulubete Blandina MAKATISI,2:30:20,2:30:54,2:30:54,409,88,81,50


MEN

In [129]:
# Men's marathon results: comma-separated text whose first row is the header
file_path = 'mens_marathon_result.txt'
m_marathon_result = pd.read_csv(file_path, header=0, sep=',')

# Preview the first rows
m_marathon_result.head()


Unnamed: 0,pos,bib,country,athlete,parsed_mark_time
0,1.0,622,ETH,Tamirat TOLA,2:06:26
1,2.0,372,BEL,Bashir ABDI,2:06:47
2,3.0,974,KEN,Benson KIPRUTO,2:07:00
3,4.0,692,GBR,Emile CAIRESS,2:07:29
4,5.0,616,ETH,Deresa GELETA,2:07:31


In [130]:
# Men's entry list with pb / sb / WRK columns: comma-separated, first row is the header
file_path = 'mens_marathon_sb.txt'
m_marathon_sb = pd.read_csv(file_path, header=0, sep=',')

# Preview the first rows
m_marathon_sb.head()

Unnamed: 0,bib,country,athlete,pb,sb,WRK
0,1215,SUD,Yaseen ABDALLA,-,-,-
1,372,BEL,Bashir ABDI,2:03:36,-,12
2,1216,SUI,Tadesse ABRAHAM,2:05:01,2:05:01,44
3,317,AUS,Liam ADAMS,2:08:39,-,214
4,926,JPN,Akira AKASAKI,2:09:01,-,259


In [131]:
# Entry list: '-' marks a missing pb / sb / WRK; once blanked, WRK can be a nullable integer
m_marathon_sb = replace_value_with_nan(m_marathon_sb, ['pb', 'sb', 'WRK'], '-')
m_marathon_sb['WRK'] = m_marathon_sb['WRK'].astype('Int64')

# Results: 'DNF' in pos becomes missing so the column can be a nullable integer too
m_marathon_result = replace_value_with_nan(m_marathon_result, ['pos'], 'DNF')
m_marathon_result['pos'] = m_marathon_result['pos'].astype('Int64')

In [132]:
# Rank the field by personal best; ties share the lowest rank and missing PBs go last.
# NOTE(review): pb looks like H:MM:SS text here, so this ordering is presumably lexicographic - confirm.
m_marathon_sb['pb_rank'] = (
    m_marathon_sb['pb']
    .rank(method='min', na_option='bottom')
    .astype('Int64')
)

# Rank the field by WRK; a missing WRK is first swapped for the sentinel 10000 so it ranks last.
m_marathon_sb['WRK_rank'] = (
    m_marathon_sb['WRK']
    .replace(np.nan, 10000)
    .rank(method='min', na_option='bottom')
    .astype('Int64')
)

In [133]:
# Attach pb / sb / ranking columns to each result row.
# Inner join: only rows whose athlete, bib and country match in both frames are kept.
m_marathon_result_sb = pd.merge(
    m_marathon_result,
    m_marathon_sb,
    how='inner',
    on=['athlete', 'bib', 'country'],
)

In [134]:
# place_rank_diff = WRK_rank - pos.
# Positive: finish position number is lower than the athlete's WRK rank within the field;
# negative: the opposite.
m_marathon_result_sb['place_rank_diff'] = (
    m_marathon_result_sb['WRK_rank'].sub(m_marathon_result_sb['pos'])
)

In [135]:
# Rows whose country code is USA
m_marathon_result_sb.loc[m_marathon_result_sb['country'].eq('USA')]

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
7,8,1330,USA,Conner MANTZ,2:08:12,2:07:47,2:09:05,82,56,26,18
8,9,1357,USA,Clayton YOUNG,2:08:44,2:08:00,2:09:06,103,61,32,23
62,63,1324,USA,Leonard KORIR,2:18:45,2:07:56,2:09:57,236,60,59,-4


In [136]:
# Five rows with the lowest (most negative) place_rank_diff
(
    m_marathon_result_sb
    .sort_values(by='place_rank_diff', ascending=True)
    .head(n=5)
)

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
66,67,486,CHN,Jie HE,2:22:31,2:06:57,2:06:57,73,36,23,-44
38,39,612,ETH,Kenenisa BEKELE,2:12:24,2:01:41,2:04:15,7,2,5,-34
54,55,501,CHN,Shaohui YANG,2:14:48,2:07:09,2:07:26,71,41,22,-33
36,37,1273,UGA,Victor KIPLANGAT,2:11:59,2:05:09,2:07:44,6,15,4,-33
60,61,386,BEL,Koen NAERT,2:16:33,2:06:56,,98,35,30,-31


In [137]:
# Five rows with the highest place_rank_diff
(
    m_marathon_result_sb
    .sort_values(by='place_rank_diff', ascending=False)
    .head(n=5)
)

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
5,6,926,JPN,Akira AKASAKI,2:07:32,2:09:01,,259.0,75,61,55
32,33,1215,SUD,Yaseen ABDALLA,2:11:41,,,,81,80,47
27,28,1236,SWE,Suldan HASSAN,2:11:21,2:07:36,2:07:36,359.0,49,72,44
10,11,1177,RSA,Elroy GELANT,2:09:07,2:08:56,2:08:56,204.0,72,55,44
29,30,1222,SUI,Matthias KYBURZ,2:11:32,2:07:44,2:07:44,355.0,53,71,41


In [138]:
# Athletes whose race time beat their listed personal best.
# Both columns are parsed into Timedeltas so the comparison is on actual durations
# rather than character-by-character on the text. Marks that cannot be parsed
# (e.g. 'DNF') and missing PBs become NaT, which never compares as smaller,
# so those rows are excluded.
m_race_time = pd.to_timedelta(m_marathon_result_sb['parsed_mark_time'], errors='coerce')
m_pb_time = pd.to_timedelta(m_marathon_result_sb['pb'], errors='coerce')
m_marathon_result_sb[m_race_time < m_pb_time]

Unnamed: 0,pos,bib,country,athlete,parsed_mark_time,pb,sb,WRK,pb_rank,WRK_rank,place_rank_diff
5,6,926,JPN,Akira AKASAKI,2:07:32,2:09:01,,259,75,61,55
6,7,1011,LES,Tebello RAMAKONGOANA,2:07:58,2:08:09,2:08:09,58,66,20,13
