# Web Scraping

## Import supporting modules

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Importing modules required for data extraction

In [31]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [33]:
url="https://www.hubertiming.com/results/2017GPTR10K"
html_page=urlopen(url)

# this can also be done with requests like
"""
import requests
r=requests.get(url)
soup=BeautifulSoup(r.content)
"""

### Practicing bs4

In [34]:
soup=BeautifulSoup(html_page)
type(soup)


bs4.BeautifulSoup

In [35]:
print(soup.title)


<title>2017 Intel Great Place to Run 10K \ Urban Clash Games Race Results</title>


In [36]:
attributes=soup.find_all('a')
attributes

[<a href="mailto:timing@hubertiming.com">timing@hubertiming.com</a>,
 <a href="https://www.hubertiming.com/">Huber Timing Home</a>,
 <a href="#individual">Individual Results</a>,
 <a href="#team">Team Results</a>,
 <a class="btn btn-primary btn-lg" href="/results/2017GPTR" role="button" style="margin: 0px 0px 5px 5px">5K</a>,
 <a class="btn btn-primary btn-lg" href="/results/summary/2017GPTR10K" role="button" style="margin: 0px 0px 5px 5px">Summary</a>,
 <a href="#tabs-1" style="font-size: 18px">10K Results</a>,
 <a name="individual"></a>,
 <a name="team"></a>,
 <a href="https://www.hubertiming.com/"><img height="65" src="/sites/all/themes/hubertiming/images/clockWithFinishSign_small.png" width="50"/>Huber Timing</a>,
 <a href="https://facebook.com/hubertiming/"><img src="/results/FB-f-Logo__blue_50.png"/></a>]

In [37]:
type(attributes[0])

bs4.element.Tag

In [38]:
dir(attributes[0])

['__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_find_all',
 '_find_one',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_should_pretty_print',
 'append',
 'attrs',
 'can_be_empty_element',
 'cdata_list_attributes',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decompose',
 'descendants',
 'encode',
 'encode_contents',
 'extend',
 'extract',
 'fetchNextSiblings',
 'fetchParents',
 'fetchPrevious',
 'fetchPreviousSiblings',
 'find',
 'findAl

In [39]:
attributes[0].getText()

'timing@hubertiming.com'

#### get and getText()

In [41]:
# get and getText functions are to be remmembered.
for attribute in attributes:
    print(attribute.get('href'),"-->",attribute.getText())

mailto:timing@hubertiming.com --> timing@hubertiming.com
https://www.hubertiming.com/ --> Huber Timing Home
#individual --> Individual Results
#team --> Team Results
/results/2017GPTR --> 5K
/results/summary/2017GPTR10K --> Summary
#tabs-1 --> 10K Results
None --> 
None --> 
https://www.hubertiming.com/ --> Huber Timing
https://facebook.com/hubertiming/ --> 


In [44]:
attributes[0].text, attributes[0].getText()

('timing@hubertiming.com', 'timing@hubertiming.com')

In [45]:
rows=soup.find_all('tr')

In [46]:
len(rows)

597

In [47]:
type(rows)

bs4.element.ResultSet

In [48]:
rows[0]

<tr colspan="2"><b>10K:</b></tr>

In [70]:
data=list()
for row in rows:
    temp=tuple()
    for td_list in (row.find_all('td')):
        temp=temp+(td_list.getText(),)
    data.append(temp)

In [60]:
print(data)

[(), ('Finishers:', '577'), ('Male:', '414'), ('Female:', '163'), (), ('1', '814', 'JARED WILSON', 'M', 'TIGARD', 'OR', '36:21', '5:51', '1 of 414', 'M 36-45', '1 of 152', '0:03', '36:24', ''), ('2', '573', 'NATHAN A SUSTERSIC', 'M', 'PORTLAND', 'OR', '36:42', '5:55', '2 of 414', 'M 26-35', '1 of 154', '0:03', '36:45', 'INTEL TEAM F'), ('3', '687', 'FRANCISCO MAYA', 'M', 'PORTLAND', 'OR', '37:44', '6:05', '3 of 414', 'M 46-55', '1 of 64', '0:04', '37:48', ''), ('4', '623', 'PAUL MORROW', 'M', 'BEAVERTON', 'OR', '38:34', '6:13', '4 of 414', 'M 36-45', '2 of 152', '0:03', '38:37', ''), ('5', '569', 'DEREK G OSBORNE', 'M', 'HILLSBORO', 'OR', '39:21', '6:20', '5 of 414', 'M 26-35', '2 of 154', '0:03', '39:24', 'INTEL TEAM F')]


In [71]:
df=pd.DataFrame(data)

In [72]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,,,,,,,,,,,,,,
1,Finishers:,577.0,,,,,,,,,,,,
2,Male:,414.0,,,,,,,,,,,,
3,Female:,163.0,,,,,,,,,,,,
4,,,,,,,,,,,,,,


In [73]:
df.shape

(597, 14)

In [74]:
df.iloc[:][10:20]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
10,6,642,JONATHON TRAN,M,PORTLAND,OR,39:49,6:25,6 of 414,M 18-25,1 of 34,0:06,39:55,
11,7,144,GEORGE TOTONCHY,M,PORTLAND,OR,40:04,6:27,7 of 414,M 36-45,3 of 152,0:13,40:17,
12,8,395,BENJAMIN C CHAFFIN,M,PORTLAND,OR,40:05,6:27,8 of 414,M 36-45,4 of 152,0:04,40:09,
13,9,7,BRANDON THOMAS,M,,,40:17,6:29,9 of 414,M 26-35,3 of 154,0:07,40:24,COLUMBIA TEAM B
14,10,3,ERIK BJORNSTAD,M,,,40:21,6:30,10 of 414,M 36-45,5 of 152,0:04,40:25,COLUMBIA TEAM A
15,11,68,WITALI SPULING,M,PORTLAND,OR,40:28,6:31,11 of 414,M 26-35,4 of 154,0:12,40:40,DTNA1
16,12,788,ANDY WEDAM,M,PORTLAND,OR,40:36,6:33,12 of 414,M 36-45,6 of 152,0:06,40:42,DTNA2
17,13,729,MICHAEL P GEORGE,M,PORTLAND,OR,40:43,6:34,13 of 414,M 18-25,2 of 34,0:07,40:50,
18,14,151,ETHAN JORDAN,M,PORTLAND,OR,41:01,6:37,14 of 414,M 36-45,7 of 152,0:12,41:13,
19,15,758,FLAVIO GRIGGIO,M,PORTLAND,OR,41:19,6:39,15 of 414,M 26-35,5 of 154,0:05,41:24,


In [80]:
soup.find_all('table')[1].get('id')

'individualResults'

## Ordering or getting the things together

In [93]:
# same statements till soup
tables=soup.find_all('table')
for table in tables:
    if(table.get('id')=='individualResults'):
        head_list=table.find_all('th')
        head_tuple=tuple()
        for head_col in head_list:
            head_tuple+=(head_col.text,)
        print(head_tuple)
        rows=table.find_all('tr')
        result=list()
        for row in rows:
            data_list=row.find_all('td')
            row_tuple=tuple()
            for col in data_list:
                row_tuple+=(col.text,)
            if(len(row_tuple)!=0):
                result.append(row_tuple)
        print(result[:10])

('Place', 'Bib', 'Name', 'Gender', 'City', 'State', 'Chip Time', 'Chip Pace', 'Gender Place', 'Age Group', 'Age Group Place', 'Time to Start', 'Gun Time', 'Team')
[('1', '814', 'JARED WILSON', 'M', 'TIGARD', 'OR', '36:21', '5:51', '1 of 414', 'M 36-45', '1 of 152', '0:03', '36:24', ''), ('2', '573', 'NATHAN A SUSTERSIC', 'M', 'PORTLAND', 'OR', '36:42', '5:55', '2 of 414', 'M 26-35', '1 of 154', '0:03', '36:45', 'INTEL TEAM F'), ('3', '687', 'FRANCISCO MAYA', 'M', 'PORTLAND', 'OR', '37:44', '6:05', '3 of 414', 'M 46-55', '1 of 64', '0:04', '37:48', ''), ('4', '623', 'PAUL MORROW', 'M', 'BEAVERTON', 'OR', '38:34', '6:13', '4 of 414', 'M 36-45', '2 of 152', '0:03', '38:37', ''), ('5', '569', 'DEREK G OSBORNE', 'M', 'HILLSBORO', 'OR', '39:21', '6:20', '5 of 414', 'M 26-35', '2 of 154', '0:03', '39:24', 'INTEL TEAM F'), ('6', '642', 'JONATHON TRAN', 'M', 'PORTLAND', 'OR', '39:49', '6:25', '6 of 414', 'M 18-25', '1 of 34', '0:06', '39:55', ''), ('7', '144', 'GEORGE TOTONCHY', 'M', 'PORTLAND'

In [94]:
data=pd.DataFrame(data=result,columns=head_tuple)

In [95]:
data.head()

Unnamed: 0,Place,Bib,Name,Gender,City,State,Chip Time,Chip Pace,Gender Place,Age Group,Age Group Place,Time to Start,Gun Time,Team
0,1,814,JARED WILSON,M,TIGARD,OR,36:21,5:51,1 of 414,M 36-45,1 of 152,0:03,36:24,
1,2,573,NATHAN A SUSTERSIC,M,PORTLAND,OR,36:42,5:55,2 of 414,M 26-35,1 of 154,0:03,36:45,INTEL TEAM F
2,3,687,FRANCISCO MAYA,M,PORTLAND,OR,37:44,6:05,3 of 414,M 46-55,1 of 64,0:04,37:48,
3,4,623,PAUL MORROW,M,BEAVERTON,OR,38:34,6:13,4 of 414,M 36-45,2 of 152,0:03,38:37,
4,5,569,DEREK G OSBORNE,M,HILLSBORO,OR,39:21,6:20,5 of 414,M 26-35,2 of 154,0:03,39:24,INTEL TEAM F


## EDA

In [96]:
data.shape

(577, 14)

In [97]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577 entries, 0 to 576
Data columns (total 14 columns):
Place              577 non-null object
Bib                577 non-null object
Name               577 non-null object
Gender             577 non-null object
City               577 non-null object
State              577 non-null object
Chip Time          577 non-null object
Chip Pace          577 non-null object
Gender Place       577 non-null object
Age Group          577 non-null object
Age Group Place    577 non-null object
Time to Start      577 non-null object
Gun Time           577 non-null object
Team               577 non-null object
dtypes: object(14)
memory usage: 63.2+ KB


In [98]:
dir(data)

['Bib',
 'City',
 'Gender',
 'Name',
 'Place',
 'State',
 'T',
 'Team',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',

In [101]:
data.isnull().sum()

Place              0
Bib                0
Name               0
Gender             0
City               0
State              0
Chip Time          0
Chip Pace          0
Gender Place       0
Age Group          0
Age Group Place    0
Time to Start      0
Gun Time           0
Team               0
dtype: int64

In [102]:
data.describe()

Unnamed: 0,Place,Bib,Name,Gender,City,State,Chip Time,Chip Pace,Gender Place,Age Group,Age Group Place,Time to Start,Gun Time,Team
count,577,577,577,577,577,577,577,577,577,577,577,577,577,577.0
unique,577,577,576,2,20,9,516,301,577,12,577,93,508,24.0
top,138,138,NO NAME PLS EMAIL TIMER,M,PORTLAND,OR,55:00,8:29,235 of 414,M 26-35,84 of 154,0:07,43:52,
freq,1,1,2,414,206,551,5,6,1,154,1,14,3,497.0


In [103]:
data.columns[data.isna().any()]

Index([], dtype='object')

# Data Analysis and Visualization