## Code for web-scraping without using pandas read_html

### Import libraries

In [1]:
import requests
import lxml.html as lh
import numpy as np
import pandas as pd

### Choose the number of pages and define the HTTP request headers

In [2]:
# Number of result pages to download per event from the World Athletics
# lists; the same page count is used for both genders.
npage = 5

# Request headers sent with every page download. The User-Agent value was
# copied from the browser (Inspect Element -> Network -> Headers).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) '
        'Gecko/20100101 Firefox/67.0'
    ),
}

### Web-scraping

In [3]:
# URL of one page of a World Athletics all-time top list; the placeholders are
# filled with the event slug, the gender and the page number.
url_template = ('https://www.worldathletics.org/records/all-time-toplists/sprints/'
                '{event}/outdoor/{gender}/senior?page={page}')


def fetch_table_rows(event, gender, page):
    """Download one results page and return every <tr> element found in it.

    event  -- event slug used in the URL, e.g. '100-metres'
    gender -- 'men' or 'women'
    page   -- 1-based page number of the list

    Raises requests.HTTPError when the server answers with an error status,
    so that an error page is never parsed as if it were a results table.
    """
    url = url_template.format(event=event, gender=gender, page=page)
    # the timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # parse the HTML and collect everything stored between <tr>..</tr>
    return lh.fromstring(response.content).xpath('//tr')


# final lists of <tr> elements: 100m results in the first, 200m in the second
tr1_elements_final = []
tr2_elements_final = []

# men's results first, then women's, each page by page
for gender in ('men', 'women'):
    for ip in range(1, npage + 1):
        tr1_elements = fetch_table_rows('100-metres', gender, ip)
        tr2_elements = fetch_table_rows('200-metres', gender, ip)

        if gender == 'men' and ip == 1:
            # the table starts at position 0 with the header row; keep it
            # once, for the very first page only
            tr1_elements_final += tr1_elements
            tr2_elements_final += tr2_elements
        else:
            # every later page repeats the header, so drop its first row
            tr1_elements_final += tr1_elements[1:]
            tr2_elements_final += tr2_elements[1:]

In [4]:
# Build one (title, values) pair per cell of the header row (the first element
# of each list). Iterating over a row element yields its child cells; the
# values lists start out empty and are filled from the data rows by a later cell.
col1 = [(cell.text_content().strip(), []) for cell in tr1_elements_final[0]]
col2 = [(cell.text_content().strip(), []) for cell in tr2_elements_final[0]]

In [5]:
def fill_columns(rows, columns, n_cells=11):
    """Append the text of every table row to the matching column lists.

    rows    -- iterable of <tr> elements, header row already removed
    columns -- list of (title, values) pairs; each values list is extended
               in place, one entry per accepted row
    n_cells -- number of cells a row must have to be treated as part of the
               results table (11 for the tables scraped here)
    """
    for row in rows:
        # A row of a different width is not from our table. Skip it rather
        # than stop: the list concatenates several pages, so a `break` here
        # would silently discard every row that follows.
        if len(row) != n_cells:
            continue
        # i is the column index of the current cell
        for i, cell in enumerate(row.iterchildren()):
            columns[i][1].append(cell.text_content().strip())


# the first element of each list is the header row, so data starts at index 1
fill_columns(tr1_elements_final[1:], col1)
fill_columns(tr2_elements_final[1:], col2)

### Create pandas DataFrames

In [6]:
# turn the (title, values) pairs into {title: values} mappings
Dict1 = dict(col1)
Dict2 = dict(col2)

# The table has a column with a blank title and all-blank entries; drop it.
# pop() with a default does not raise if that column is ever missing.
Dict1.pop('', None)
Dict2.pop('', None)

# Create the DataFrames and rename the generic 'Mark' column after the event.
# rename() returns a new frame, so nothing is mutated in place.
df1 = pd.DataFrame(Dict1).rename(columns={'Mark': '100m'})
df2 = pd.DataFrame(Dict2).rename(columns={'Mark': '200m'})

In [7]:
# preview the first rows of the 100m table (head() shows 5 rows by default)
df1.head()

Unnamed: 0,Rank,100m,WIND,Competitor,DOB,Nat,Pos,Venue,Date,Results Score
0,1,9.58,0.9,Usain BOLT,21 AUG 1986,JAM,1,"Olympiastadion, Berlin (GER)",16 AUG 2009,1356
1,2,9.69,2.0,Tyson GAY,09 AUG 1982,USA,1,Shanghai (CHN),20 SEP 2009,1316
2,2,9.69,-0.1,Yohan BLAKE,26 DEC 1989,JAM,1,"Pontaise, Lausanne (SUI)",23 AUG 2012,1316
3,4,9.72,0.2,Asafa POWELL,23 NOV 1982,JAM,1f1,"Pontaise, Lausanne (SUI)",02 SEP 2008,1305
4,5,9.74,0.9,Justin GATLIN,10 FEB 1982,USA,1,"Hamad Bin Suhaim, Doha (QAT)",15 MAY 2015,1298


In [8]:
# preview the last rows of the 200m table (tail() shows 5 rows by default)
df2.tail()

Unnamed: 0,Rank,200m,WIND,Competitor,DOB,Nat,Pos,Venue,Date,Results Score
995,482,22.84,1.3,Jaevin REED,05 FEB 1998,USA,1f3,"San Antonio, TX (USA)",25 MAR 2017,1151
996,497,22.85,,Silvia CHIVÁS,10 SEP 1954,CUB,1,Guadalajara (MEX),12 AUG 1977,1150
997,497,22.85,0.5,Ewa PISIEWICZ,07 MAY 1962,POL,1,Zabrze (POL),25 AUG 1985,1150
998,497,22.85,0.5,Vera ZAYTSEVA,13 OCT 1963,URS,2,Chelyabinsk (URS),21 JUN 1987,1150
999,497,22.85,1.1,Anita HOWARD,22 MAR 1969,USA,1f2,"New York, NY (USA)",22 JUL 1989,1150


## References ###
[1] https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059