# Scraping athletic world records by gender and category


In [1]:
import requests, pandas as pd, numpy as np

In [2]:
tables=pd.read_html('https://www.worldathletics.org/records/by-category/world-records')

In [3]:
## assigning each table a dataframe
fOut=tables[0]
fIn=tables[1]
mOut=tables[2]
mIn=tables[3]
mixOut=tables[4]

In [4]:
## assigning each dataframe a gender
fOut['Gender']='F'
fIn['Gender']='F'
mOut['Gender']='M'
mIn['Gender']='M'
mixOut['Gender']='Mix'

In [5]:
## add together outdoor sports
outdoor_df = fOut.append(mOut).append(mixOut)

In [6]:
## create outdoor category column and remove irrelevant columns
outdoor_df['Category']='Outdoor'
outdoor= outdoor_df[["Country","Gender", "Category"]]
outdoor

Unnamed: 0,Country,Gender,Category
0,USA,F,Outdoor
1,USA,F,Outdoor
2,GDR,F,Outdoor
3,TCH,F,Outdoor
4,RUS,F,Outdoor
...,...,...,...
38,KEN,M,Outdoor
39,KEN,M,Outdoor
40,KEN,M,Outdoor
41,USA,M,Outdoor


In [7]:
# repeat for indoor
indoor_df = fIn.append(mIn)
indoor_df['Category']='Indoor'
indoor= indoor_df[["Country","Gender", "Category"]]
indoor

Unnamed: 0,Country,Gender,Category
0,RUS,F,Indoor
1,RUS,F,Indoor
2,RUS,F,Indoor
3,JAM,F,Indoor
4,TCH,F,Indoor
5,SLO,F,Indoor
6,MOZ,F,Indoor
7,ETH,F,Indoor
8,ETH,F,Indoor
9,ETH,F,Indoor


In [8]:
## append all data together
athletic_records=outdoor.append(indoor)
athletic_records

Unnamed: 0,Country,Gender,Category
0,USA,F,Outdoor
1,USA,F,Outdoor
2,GDR,F,Outdoor
3,TCH,F,Outdoor
4,RUS,F,Outdoor
...,...,...,...
17,USA,M,Indoor
18,RUS,M,Indoor
19,GBR,M,Indoor
20,USA,M,Indoor


In [9]:
## count numbers of records per country
athletic_records['Country'].value_counts()
athletic_records['Gender'].value_counts()

F      72
M      64
Mix     1
Name: Gender, dtype: int64

In [10]:
## add count of records per country as a column
athletic_records['Frequency'] = athletic_records.groupby('Country')['Country'].transform('count')
athletic_records

Unnamed: 0,Country,Gender,Category,Frequency
0,USA,F,Outdoor,25
1,USA,F,Outdoor,25
2,GDR,F,Outdoor,5
3,TCH,F,Outdoor,3
4,RUS,F,Outdoor,9
...,...,...,...,...
17,USA,M,Indoor,25
18,RUS,M,Indoor,9
19,GBR,M,Indoor,3
20,USA,M,Indoor,25


In [11]:
## experimenting with dropping/renaming columns
##athletic_records.drop(['Total records'], axis=1, inplace=True)
athletic_records.rename(columns={'Frequency': 'Total records'}, inplace = True)
athletic_records

Unnamed: 0,Country,Gender,Category,Total records
0,USA,F,Outdoor,25
1,USA,F,Outdoor,25
2,GDR,F,Outdoor,5
3,TCH,F,Outdoor,3
4,RUS,F,Outdoor,9
...,...,...,...,...
17,USA,M,Indoor,25
18,RUS,M,Indoor,9
19,GBR,M,Indoor,3
20,USA,M,Indoor,25


In [12]:
## dataframe for number of records per county per gender
records_country_gender = athletic_records.groupby(['Country','Gender']).count() 

In [13]:
records_country_gender

Unnamed: 0_level_0,Unnamed: 1_level_0,Category,Total records
Country,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1
BDI,F,1,1
BRN,F,1,1
BUL,F,1,1
BUR,M,1,1
CAN,M,2,2
CHN,F,4,4
CUB,M,2,2
CZE,F,1,1
CZE,M,1,1
DEN,M,1,1


In [14]:
records_female=athletic_records.groupby(['Country']).count()
records_female

Unnamed: 0_level_0,Gender,Category,Total records
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BDI,1,1,1
BRN,1,1,1
BUL,1,1,1
BUR,1,1,1
CAN,2,2,2
CHN,4,4,4
CUB,2,2,2
CZE,2,2,2
DEN,1,1,1
DJI,1,1,1


In [15]:
athletic_records.to_csv('athletic_records.csv')