# Data Scraping using Beautiful Soup
* Import Beautiful soup
* Make a get request to fetch page data
* Parse HTML
* Filter Relvant Parts

## Installation

In [1]:
# !pip install bs4//

Collecting bs4
  Using cached bs4-0.0.1-py3-none-any.whl
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
     -------------------------------------- 143.0/143.0 kB 2.8 MB/s eta 0:00:00
Collecting soupsieve>1.2
  Using cached soupsieve-2.4-py3-none-any.whl (37 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.12.2 bs4-0.0.1 soupsieve-2.4

[notice] A new release of pip available: 22.1.2 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from urllib.request import urlopen

In [3]:
andriod_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [4]:
andriod_data = urlopen(andriod_url)
print(type(andriod_data))

<class 'http.client.HTTPResponse'>


In [5]:
andriod_html = andriod_data.read()

In [6]:
print(andriod_html)



## Parsing Data

In [7]:
from bs4 import BeautifulSoup as soup

In [8]:
andriod_soup = soup(andriod_html,'html.parser')
print(andriod_soup)

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-page-tools-enabled" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Android version history - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-featu

In [9]:
print(type(andriod_soup))

<class 'bs4.BeautifulSoup'>


In [10]:
andriod_soup.findAll('h1',{})

[<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Android version history</span></h1>]

In [11]:
tables = andriod_soup.find_all('table',{'class':'wikitable'})
print(len(tables))

34


In [12]:
andriod_table = tables[0]
print(andriod_table)

<table class="wikitable">
<tbody><tr>
<th>Name
</th>
<th>Internal codename<sup class="reference" id="cite_ref-:0_11-1"><a href="#cite_note-:0-11">[11]</a></sup>
</th>
<th>Version number(s)
</th>
<th><a href="/wiki/API" title="API">API</a><br/>level
</th>
<th>Initial stable<br/>release date
</th>
<th>Latest security patch date<sup class="reference" id="cite_ref-16"><a href="#cite_note-16">[16]</a></sup>
</th>
<th>Latest <a href="/wiki/Google_Play_Services" title="Google Play Services">Google Play Services</a> version<sup class="reference" id="cite_ref-17"><a href="#cite_note-17">[17]</a></sup><br/>(release date)
</th></tr>
<tr>
<td>Android 1.0
</td>
<td class="table-na" data-sort-value="" style="background: #ececec; color: #2C2C2C; vertical-align: middle; text-align: center;">—
</td>
<td data-sort-value="1.0" style="background-color: #FDB3AB;" title="Old version, no longer maintained"><span style="display: none;">Old version, no longer maintained:</span> 1.0
</td>
<td>1
</td>
<td>Septem

## Extracing Useful Information
* Remove underired tags
* Extract table header & data

In [13]:
headers = andriod_table.findAll('th')
print(len(headers))

7


In [14]:
column_title = [ct.text[:-1] for ct in headers]
print(column_title)

['Name', 'Internal codename[11]', 'Version number(s)', 'APIlevel', 'Initial stablerelease date', 'Latest security patch date[16]', 'Latest Google Play Services version[17](release date)']


In [15]:
rows_data = andriod_table.find_all('tr')[1:]
print(len(rows_data))
first_row = rows_data[0].findAll('td',{})
for d in first_row:
    print(d.text[:-1])

36
Android 1.0
—
Old version, no longer maintained: 1.0
1
September 23, 2008
—
—


In [16]:
table_rows = []
for row in rows_data:
    current_row = []
    row_data = row.findAll('td',{})
    for idx,data in enumerate(row_data):
        if idx==2 or idx==3:
            info = data.text
            
        if idx==1:
            current_row.append(data.text[:-1].split(": ")[-1])
        if idx!=0 and idx!=3:
            current_row.append(data.text[:-1])
        else:
            current_row.append(data.text)
    table_rows.append(current_row)

In [17]:
print(table_rows)

[['Android 1.0\n', '—', '—', 'Old version, no longer maintained: 1.0', '1\n', 'September 23, 2008', '—', '—'], ['Android 1.1\n', 'Petit Four', 'Petit Four', 'Old version, no longer maintained: 1.1', '2\n', 'February 9, 2009'], ['Android Cupcake\n', 'Cupcake', 'Cupcake', 'Old version, no longer maintained: 1.5', '3\n', 'April 27, 2009'], ['Android Donut\n', 'Donut', 'Donut', 'Old version, no longer maintained: 1.6', '4\n', 'September 15, 2009'], ['Android Eclair\n', 'Eclair', 'Eclair', 'Old version, no longer maintained: 2.0', '5\n', 'October 27, 2009'], ['Old version, no longer maintained: 2.0.1\n', '6', '6', 'December 3, 2009'], ['Old version, no longer maintained: 2.1\n', '7', '7', 'January 11, 2010[18]'], ['Android Froyo\n', 'Froyo', 'Froyo', 'Old version, no longer maintained: 2.2 – 2.2.3', '8\n', 'May 20, 2010', '3.2.25 (October 2014)'], ['Android Gingerbread\n', 'Gingerbread', 'Gingerbread', 'Old version, no longer maintained: 2.3 – 2.3.2', '9\n', 'December 6, 2010', '10.0.84 (No

## Writing & Reading CSV Files

In [18]:
filename = 'andriod_version_history.csv'
with open(filename,'w',encoding = 'utf') as f:
    header_string = ','.join(column_title)
    header_string += '\n'
    f.write(header_string)
    
    for row in table_rows[:-1]:
        row_string = ""
        for w in row:
            w = w.replace(',',' ')
            row_string += w +','
        row_string+='\n'
        f.write(row_string)

## Data cleaning
* Remove unwanted commas & symbols
* Undersired Information

In [23]:
import pandas as pd
df = pd.read_csv('andriod_version_history.csv')
df

ModuleNotFoundError: No module named 'pandas'

## Trying another page

In [None]:
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

In [None]:
list_of_citites = 'https://en.wikipedia.org/wiki/List_of_cities_in_India_by_population'

In [None]:
response=requests.get(list_of_citites)
print(response.status_code)

200


In [None]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find('table',{'class':"wikitable"})

In [None]:
print(indiatable)

<table class="wikitable sortable" style="width:100%; text-align:center;">
<caption>
</caption>
<tbody><tr>
<th style="width:5%;">Rank
</th>
<th style="width:15%;">City
</th>
<th style="width:25%;">Population<br/>(2011)<sup class="reference" id="cite_ref-Cities1Lakhandabove_3-0"><a href="#cite_note-Cities1Lakhandabove-3">[3]</a></sup>
</th>
<th style="width:25%;">Population<br/>(2001)
</th>
<th style="width:30%;">State or union territory
</th></tr>
<tr>
<td>1</td>
<td><b><a href="/wiki/Mumbai" title="Mumbai">Mumbai</a></b></td>
<td>12,442,373</td>
<td>11,978,450</td>
<td><a href="/wiki/Maharashtra" title="Maharashtra">Maharashtra</a>
</td></tr>
<tr>
<td>2</td>
<td><b><a href="/wiki/Delhi" title="Delhi">Delhi</a></b></td>
<td>11,007,835</td>
<td>9,879,172</td>
<td><a href="/wiki/Delhi" title="Delhi">Delhi</a>
</td></tr>
<tr>
<td>3</td>
<td><b><a href="/wiki/Bangalore" title="Bangalore">Bangalore</a> </b></td>
<td>8,425,970</td>
<td>4,301,326</td>
<td><a href="/wiki/Karnataka" title="Karn

In [22]:
df=pd.read_html(str(indiatable))
# convert list to dataframe
df=pd.DataFrame(df[0])

NameError: name 'pd' is not defined

In [None]:
# drop the unwanted columns
data = df.drop(["Rank", "Population(2001)"], axis=1)
# rename columns for ease
data = data.rename(columns={"State or union territory": "State","Population(2011)[3]": "Population"})

In [21]:
data

<td colspan="7"><div class="templateVersion l" style="margin-left: -1em;"><div style="float: left; margin-left: 1em;"><span style="white-space: nowrap;"><b>Legend:</b></span></div><div style="float: left; margin-left: 1em;"><span style="border-left: 1.2em solid #FDB3AB; padding-left: 0.3em; white-space: nowrap;" title="Old version, no longer maintained">Old version</span></div><div style="float: left; margin-left: 1em;"><span style="border-left: 1.2em solid #FEF8C6; padding-left: 0.3em; white-space: nowrap;" title="An older version, yet still maintained">Older version, still maintained</span></div><div style="float: left; margin-left: 1em;"><span style="border-left: 1.2em solid #D4F4B4; padding-left: 0.3em; white-space: nowrap;" title="Latest stable version"><b>Latest version</b></span></div><div style="float: left; margin-left: 1em;"><span style="border-left: 1.2em solid #FED1A0; padding-left: 0.3em; white-space: nowrap;" title="Latest preview of a future release">Latest preview versi

In [20]:
data.to_csv('city_population.csv',index=False)

TypeError: 'NoneType' object is not callable

In [None]:
df = pd.read_csv('city_population.csv')

In [None]:
df.shape

(343, 3)

In [None]:
df.isnull().sum()

City          0
Population    0
State         0
dtype: int64

In [None]:
df.hw = df['Population'].str.replace(',','')