# Data Scrapping with Beautiful Soup

- Import beautiful Soup
- Make a GET request to fetch Page Data
- Parse HTML
- Filter Relevant Parts

##### PACKAGE INSTALLATION
pip install bs4

### Import url library

In [78]:
from urllib.request import urlopen

#### We as a client will make a Request(Get request) to HTTP server and server will respond with a response

- Specify the url

In [79]:
url = "https://en.wikipedia.org/wiki/Android_version_history"

- Check the type of response we are getting from seerver

In [80]:
data = urlopen(url)
print(type(data))

<class 'http.client.HTTPResponse'>


- HTML code of the page

In [81]:
html = data.read()

- We have received HTML data now we need to parse the data
- Now parsing is dine through Beautiful Soup

### Import Beautiful Soup and Parse Data

In [82]:
from bs4 import BeautifulSoup as soup

In [83]:
android_soup = soup(html,'html.parser')
print(type(android_soup))

<class 'bs4.BeautifulSoup'>


- Get all h1 headings

In [84]:
print(android_soup.findAll('h1',{}))

[<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>]


- Parsing Table
- We will get the Class/id of the table using Inspect element

In [85]:
android_table = android_soup.findAll('table','wikitable')

In [86]:
android_table

[<table class="wikitable">
 <tbody><tr>
 <th>Name
 </th>
 <th>Version number(s)
 </th>
 <th>Initial stable<br/>release date
 </th>
 <th>Supported (security fixes)
 </th>
 <th>API level
 </th>
 <th>References
 </th></tr>
 <tr>
 <td rowspan="2">No official codename
 </td>
 <td>1.0
 </td>
 <td>September 23, 2008
 </td>
 <td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
 </td>
 <td>1
 </td>
 <td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
 </td></tr>
 <tr>
 <td>1.1
 </td>
 <td>February 9, 2009
 </td>
 <td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
 </td>
 <td>2
 </td>
 <td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[14]</a></sup>
 </td></tr>
 <tr>

## Extracting Useful Information

- First Extract table headers

In [146]:
type(android_table)
table = android_table[0].findAll('tr')

In [148]:
headers = android_table[0].findAll('th')
print(len(headers))
print(headers)


6
[<th>Name
</th>, <th>Version number(s)
</th>, <th>Initial stable<br/>release date
</th>, <th>Supported (security fixes)
</th>, <th>API level
</th>, <th>References
</th>]


In [149]:
col_titles = [ct.text[:-1] for ct in headers]
print(col_titles)

['Name', 'Version number(s)', 'Initial stablerelease date', 'Supported (security fixes)', 'API level', 'References']


- Extract rows and save in table_rows list

In [150]:
row_data = android_table[0].findAll('tr')[1:]
print(len(row_data))
first_row = row_data[0].findAll('td')
for d in first_row:
    print(d.text)

18
No official codename

1.0

September 23, 2008

No

1

[9]



In [173]:
table_rows = []
for row in row_data:
    current_row=[]
    single_row_data = row.findAll("td",{})
    if(len(single_row_data)==6):
        for idx,data in enumerate(single_row_data):
            current_row.append(data.text[:-1])
        table_rows.append(current_row)

In [174]:
table_rows

[['No official codename', '1.0', 'September 23, 2008', 'No', '1', '[9]'],
 ['Cupcake', '1.5', 'April 27, 2009', 'No', '3', '[15]'],
 ['Donut', '1.6', 'September 15, 2009', 'No', '4', '[16]'],
 ['Eclair', '2.0 – 2.1', 'October 26, 2009', 'No', '5 – 7', '[17]'],
 ['Froyo', '2.2 – 2.2.3', 'May 20, 2010', 'No', '8', '[18]'],
 ['Gingerbread', '2.3 – 2.3.7', 'December 6, 2010', 'No', '9 – 10', '[19]'],
 ['Honeycomb', '3.0 – 3.2.6', 'February 22, 2011', 'No', '11 – 13', '[20]'],
 ['Ice Cream Sandwich',
  '4.0 – 4.0.4',
  'October 18, 2011',
  'No',
  '14 – 15',
  '[21]'],
 ['Jelly Bean', '4.1 – 4.3.1', 'July 9, 2012', 'No', '16 – 18', '[22]'],
 ['KitKat', '4.4 – 4.4.4', 'October 31, 2013', 'No', '19 – 20', '[23]'],
 ['Lollipop', '5.0 – 5.1.1', 'November 12, 2014', 'No', '21 – 22', '[24]'],
 ['Marshmallow', '6.0 – 6.0.1', 'October 5, 2015', 'No', '23', '[25]'],
 ['Nougat',
  '7.0 – 7.1.2',
  'August 22, 2016',
  'No',
  '24 – 25',
  '[26][27][28][29]'],
 ['Oreo', '8.0 – 8.1', 'August 21, 2017'

# SAVING IN A CSV

In [203]:
filename = "Android_versions_history.csv"
with open(filename,'w',encoding='utf-8') as f:
    #header data
    header_string = ','.join(col_titles)
    header_string +='\n'
    f.write(header_string)
    #row data
    for row in table_rows:
        row_string = ""
        for word in row:
            word = word.replace(',',"")   
            row_string += word+','
        row_string = row_string[:-1]
        row_string+="\n"
        f.write(row_string)
        #print(row_string)
        

# READING USING PANDAS

In [204]:
import pandas as pd
df = pd.read_csv("Android_versions_history.csv")

In [205]:
print(df.shape)
df.head(n=3)

(17, 6)


Unnamed: 0,Name,Version number(s),Initial stablerelease date,Supported (security fixes),API level,References
0,No official codename,1.0,September 23 2008,No,1,[9]
1,Cupcake,1.5,April 27 2009,No,3,[15]
2,Donut,1.6,September 15 2009,No,4,[16]
