### Scraping the data from the IAAF website to build Pandas DataFrames. This part of the code has been adapted from: https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059 ###

In [1]:
import requests
import lxml.html as lh
import pandas as pd

In [2]:
# URLs of the IAAF all-time top lists (100 m and 200 m, outdoor, men, senior)
url1='http://www.iaaf.org/records/all-time-toplists/sprints/100-metres/outdoor/men/senior'
url2='http://www.iaaf.org/records/all-time-toplists/sprints/200-metres/outdoor/men/senior'
# a browser-like User-Agent header was needed to open the page from the author's laptop
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'}

# Fetch both pages. The timeout keeps a stalled connection from blocking the
# notebook indefinitely, and raise_for_status() reports an HTTP error here
# instead of letting an error page be parsed as if it were the results table.
page1 = requests.get(url1, headers=headers, timeout=30)
page1.raise_for_status()
page2 = requests.get(url2, headers=headers, timeout=30)
page2.raise_for_status()

# parse the raw HTML of each response into an lxml document
doc1 = lh.fromstring(page1.content)
doc2 = lh.fromstring(page2.content)

# collect every <tr>..</tr> element of each page; in our case the table
# starts at position 18, so everything before it is dropped
tr1_elements = doc1.xpath('//tr')[18:]
tr2_elements = doc2.xpath('//tr')[18:]

In [3]:
# The first remaining <tr> of each page is used as the table header: for every
# cell in it, store a (stripped title, empty list) pair. The empty lists are
# filled with that column's values afterwards.
col1 = [(cell.text_content().strip(), []) for cell in tr1_elements[0]]
col2 = [(cell.text_content().strip(), []) for cell in tr2_elements[0]]

In [4]:
def _fill_columns(rows, columns, width=11):
    """Append the stripped text of every cell in *rows* to *columns*.

    rows    -- iterable of <tr> elements holding data (header row excluded)
    columns -- list of (title, values) pairs; the i'th cell of each row is
               appended to columns[i][1]
    width   -- number of child cells a row must have to belong to the table

    Processing stops at the first row whose cell count differs from *width*:
    such a row is taken to mean the //tr data is no longer from our table.
    """
    for row in rows:
        if len(row) != width:
            break
        for index, cell in enumerate(row.iterchildren()):
            columns[index][1].append(cell.text_content().strip())


# Since the first row is the header, data is stored from the second row onwards
_fill_columns(tr1_elements[1:], col1)
_fill_columns(tr2_elements[1:], col2)

In [30]:
# number of values collected for each column of the 100 m table
[len(values) for _, values in col1]

[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]

In [78]:
# turn each list of (title, values) pairs into a {title: values} mapping
Dict1 = dict(col1)
Dict2 = dict(col2)

# there's a key with all blank entries so we get rid of it; pop() with a
# default does not raise KeyError if that blank-titled column is absent
Dict1.pop('', None)
Dict2.pop('', None)

# create DataFrames, renaming the generic 'Mark' column after the event so
# the two tables can later sit side by side
df1 = pd.DataFrame(Dict1).rename(columns={'Mark': '100m'})
df2 = pd.DataFrame(Dict2).rename(columns={'Mark': '200m'})

### Creating a combined 100m-200m list and performing analysis ###

In [79]:
# keep only competitors present in both lists, one column per event
marks_100 = df1[['Competitor', '100m']]
marks_200 = df2[['Competitor', '200m']]
df3 = marks_100.merge(marks_200, how='inner', on='Competitor')

In [82]:
# display the merged 100m/200m table as the cell output
df3

Unnamed: 0,Competitor,100m,200m
0,Usain BOLT,9.58,19.19
1,Tyson GAY,9.69,19.58
2,Yohan BLAKE,9.69,19.26
3,Asafa POWELL,9.72,19.9
4,Justin GATLIN,9.74,19.57
5,Maurice GREENE,9.79,19.86
6,Christian COLEMAN,9.79,19.85
7,Steve MULLINGS,9.8,19.98
8,Trayvon BROMELL,9.84,20.03
9,Carl LEWIS,9.86,19.75
