### Scraping the data from the IAAF website to build a Pandas DataFrame. This part of the code has been adapted from: https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059 ###

In [1]:
import requests
import lxml.html as lh
import pandas as pd

In [2]:
# Download the IAAF all-time 100 m toplist page, sending a browser-like
# User-Agent header with the request.
url = 'http://www.iaaf.org/records/all-time-toplists/sprints/100-metres/outdoor/men/senior'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'}
# An explicit timeout keeps the request from blocking forever if the server
# never answers.
page = requests.get(url, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as if it
# were the results table.
page.raise_for_status()

# Parse the raw response body into an lxml HTML tree.
doc = lh.fromstring(page.content)

# Collect every <tr> element found anywhere in the document.
tr_elements = doc.xpath('//tr')

# In our case the table starts at position 18, so drop the rows before it.
# NOTE(review): this offset is tied to the page layout at scraping time;
# re-check it if the header row printed by the next cell looks wrong.
tr_elements = tr_elements[18:]

In [3]:
# Build `col`: one (header name, empty list) pair per column of the table.
# The empty lists are filled with the column values later on.
col = []
i = 0

# The first remaining row is the header row; iterating over it yields its
# cells, numbered from 1 for the printed overview.
for i, t in enumerate(tr_elements[0], 1):
    name = t.text_content().strip()
    # A single parenthesised argument prints the same text under Python 2
    # and Python 3 (the bare print statement is a syntax error on Python 3).
    print('%d:"%s"' % (i, name))
    col.append((name, []))

1:"Rank"
2:"Mark"
3:"WIND"
4:"Competitor"
5:"DOB"
6:"Nat"
7:"Pos"
8:""
9:"Venue"
10:"Date"
11:"Results Score"


In [4]:
# Fill the per-column lists in `col` from the data rows.
# The first row is the header, so data is stored on the second row onwards.
n_cols = len(col)
for T in tr_elements[1:]:
    # A row whose cell count differs from the header row is not from our
    # table: stop reading there.
    if len(T) != n_cols:
        break

    # i is the index of the column the current cell belongs to.
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content().strip()
        # The first column is always kept as text; every other column is
        # stored as an int when the cell text is a whole number.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a whole number: keep the stripped text unchanged.
                pass
        # Append the value to the list of the i'th column.
        col[i][1].append(data)

In [5]:
# Number of values collected for each column, in column order.
[len(values) for (_header, values) in col]

[100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]

In [6]:
# Map each header name to its list of values, then build the DataFrame
# from that mapping.
Dict = dict(col)
df = pd.DataFrame(data=Dict)

In [7]:
# Display the DataFrame (notebook output of the cell's last expression).
df

Unnamed: 0,Unnamed: 1,Competitor,DOB,Date,Mark,Nat,Pos,Rank,Results Score,Venue,WIND
0,,Usain BOLT,21 AUG 1986,16 AUG 2009,9.58,JAM,1,1,1356,Berlin (GER),+0.9
1,,Tyson GAY,09 AUG 1982,20 SEP 2009,9.69,USA,1,2,1316,Shanghai (CHN),+2.0
2,,Yohan BLAKE,26 DEC 1989,23 AUG 2012,9.69,JAM,1,2,1316,Lausanne (SUI),-0.1
3,,Asafa POWELL,23 NOV 1982,02 SEP 2008,9.72,JAM,1f1,4,1305,Lausanne (SUI),+0.2
4,,Justin GATLIN,10 FEB 1982,15 MAY 2015,9.74,USA,1,5,1298,Doha (QAT),+0.9
5,,Nesta CARTER,10 NOV 1985,29 AUG 2010,9.78,JAM,1,6,1283,Rieti (ITA),+0.9
6,,Maurice GREENE,23 JUL 1974,16 JUN 1999,9.79,USA,1f3,7,1280,Athina (GRE),+0.1
7,,Christian COLEMAN,06 MAR 1996,31 AUG 2018,9.79,USA,1,7,1281,Bruxelles (BEL),-0.3
8,,Steve MULLINGS,29 NOV 1982,04 JUN 2011,9.80,JAM,1,9,1276,"Eugene, OR (USA)",+1.3
9,,Richard THOMPSON,07 JUN 1985,21 JUN 2014,9.82,TTO,1f2,10,1269,Port-of-Spain (TTO),+1.7
