In this web scraping project, our primary objective is to extract population-related data from Wikipedia.This project underscores the utility of web scraping in acquiring valuable population insights from openly accessible online platforms like Wikipedia, which can serve as a foundational resource

In [1]:
url = 'https://en.wikipedia.org/wiki/World_population'

In [2]:
import requests

In [3]:
response = requests.get(url)

In [4]:
response

<Response [200]>

In [5]:
html_text = response.text # convert into  response into text

In [6]:
html_text



In [7]:
from bs4 import BeautifulSoup

In [8]:
soap = BeautifulSoup(html_text,'html.parser') # converted text to html file format

In [9]:
soap

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-night-mode-clientpref-0 vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>World population - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled ve

In [10]:
tables = soap.find_all('table') # Extracting all data partaining to tables

In [11]:
tables

[<table class="wikitable" style="text-align:center; float:right; clear:right; margin-left:8px; margin-right:0;">
 <caption>World population milestones in billions<sup class="reference" id="cite_ref-:6_61-0"><a href="#cite_note-:6-61">[61]</a></sup> (Worldometers estimates)
 </caption>
 <tbody><tr>
 <th scope="row">Population
 </th>
 <th scope="col">1
 </th>
 <th scope="col">2
 </th>
 <th scope="col">3
 </th>
 <th scope="col">4
 </th>
 <th scope="col">5
 </th>
 <th scope="col">6
 </th>
 <th scope="col">7
 </th>
 <th scope="col">8
 </th>
 <th scope="col">9
 </th>
 <th scope="col">10
 </th></tr>
 <tr>
 <th scope="row">Year
 </th>
 <td>1804</td>
 <td>1927</td>
 <td>1960</td>
 <td>1974</td>
 <td>1987</td>
 <td>1999</td>
 <td>2011</td>
 <td>2022</td>
 <td><i>2037</i></td>
 <td><i>2057</i>
 </td></tr>
 <tr>
 <th scope="row">Years elapsed
 </th>
 <td>200,000+</td>
 <td>123</td>
 <td>33</td>
 <td>14</td>
 <td>13</td>
 <td>12</td>
 <td>12</td>
 <td>11</td>
 <td><i>15</i></td>
 <td><i>20</i>
 </t

In [12]:
table=tables[4] #indexing to the  table we want to extract data from

In [13]:
table

<table class="sortable wikitable sticky-header static-row-numbers sort-under col1left col5left" style="text-align:right">
<tbody><tr>
<th>Country / <a href="/wiki/Dependent_territory" title="Dependent territory">Dependency</a>
</th>
<th>Population
</th>
<th>% of<br/>world
</th>
<th>Date
</th>
<th><span class="nowrap">Source (official or from</span><br/>the United Nations)
</th></tr>
<tr>
<td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg/23px-Flag_of_India.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg/35px-Flag_of_India.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg/45px-Flag_of_India.svg.png 2x" width="23"/></span></span></span> <a href="/wiki/Demographics_of_India" title="Demographics of India">India</a>
</td>
<

In [14]:
rows = table.find_all('tr') # Extracting all rows from table

In [15]:
rows

[<tr>
 <th>Country / <a href="/wiki/Dependent_territory" title="Dependent territory">Dependency</a>
 </th>
 <th>Population
 </th>
 <th>% of<br/>world
 </th>
 <th>Date
 </th>
 <th><span class="nowrap">Source (official or from</span><br/>the United Nations)
 </th></tr>,
 <tr>
 <td><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg/23px-Flag_of_India.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg/35px-Flag_of_India.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/4/41/Flag_of_India.svg/45px-Flag_of_India.svg.png 2x" width="23"/></span></span></span> <a href="/wiki/Demographics_of_India" title="Demographics of India">India</a>
 </td>
 <td style="text-align:right">1,425,775,850</td>
 <td style="text-align:right;font-size:inherit"><span data-sort-va

extracting table header

In [16]:
header=[]
for row in rows:
    th= row.find_all('th') 
    for h in th:
        header.append(h.text.strip())
header

['Country / Dependency',
 'Population',
 '% ofworld',
 'Date',
 'Source (official or fromthe United Nations)']

extracting table data

In [18]:
final_data=[]
for row in rows:
    td= row.find_all('td')
    table=[]
    for d in td:
        table.append(d.text.strip())
    final_data.append(table)
final_data

[[],
 ['India', '1,425,775,850', '17.6%', '14 Apr 2023', 'UN projection[92]'],
 ['China',
  '1,412,600,000',
  '17.4%',
  '31 Dec 2021',
  'National annual estimate[93]'],
 ['United States',
  '336,243,427',
  '4.15%',
  '26 Mar 2024',
  'National population clock[94]'],
 ['Indonesia',
  '278,696,200',
  '3.44%',
  '1 Jul 2023',
  'National annual estimate[95]'],
 ['Pakistan', '229,488,994', '2.83%', '1 Jul 2022', 'UN projection[96]'],
 ['Nigeria', '216,746,934', '2.68%', '1 Jul 2022', 'UN projection[96]'],
 ['Brazil',
  '217,452,500',
  '2.69%',
  '26 Mar 2024',
  'National population clock[97]'],
 ['Bangladesh',
  '168,220,000',
  '2.08%',
  '1 Jul 2020',
  'Annual Population Estimate[98]'],
 ['Russia',
  '147,190,000',
  '1.82%',
  '1 Oct 2021',
  '2021 preliminary census results[99]'],
 ['Mexico', '128,271,248', '1.58%', '31 Mar 2022', '']]

In [19]:
final_data=final_data[1:]

In [20]:
import pandas as pd

In [21]:
data = pd.DataFrame(final_data)

In [22]:
data

Unnamed: 0,0,1,2,3,4
0,India,1425775850,17.6%,14 Apr 2023,UN projection[92]
1,China,1412600000,17.4%,31 Dec 2021,National annual estimate[93]
2,United States,336243427,4.15%,26 Mar 2024,National population clock[94]
3,Indonesia,278696200,3.44%,1 Jul 2023,National annual estimate[95]
4,Pakistan,229488994,2.83%,1 Jul 2022,UN projection[96]
5,Nigeria,216746934,2.68%,1 Jul 2022,UN projection[96]
6,Brazil,217452500,2.69%,26 Mar 2024,National population clock[97]
7,Bangladesh,168220000,2.08%,1 Jul 2020,Annual Population Estimate[98]
8,Russia,147190000,1.82%,1 Oct 2021,2021 preliminary census results[99]
9,Mexico,128271248,1.58%,31 Mar 2022,


In [23]:
data.columns =header

In [24]:
data

Unnamed: 0,Country / Dependency,Population,% ofworld,Date,Source (official or fromthe United Nations)
0,India,1425775850,17.6%,14 Apr 2023,UN projection[92]
1,China,1412600000,17.4%,31 Dec 2021,National annual estimate[93]
2,United States,336243427,4.15%,26 Mar 2024,National population clock[94]
3,Indonesia,278696200,3.44%,1 Jul 2023,National annual estimate[95]
4,Pakistan,229488994,2.83%,1 Jul 2022,UN projection[96]
5,Nigeria,216746934,2.68%,1 Jul 2022,UN projection[96]
6,Brazil,217452500,2.69%,26 Mar 2024,National population clock[97]
7,Bangladesh,168220000,2.08%,1 Jul 2020,Annual Population Estimate[98]
8,Russia,147190000,1.82%,1 Oct 2021,2021 preliminary census results[99]
9,Mexico,128271248,1.58%,31 Mar 2022,


working on data.

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   Country / Dependency                         10 non-null     object
 1   Population                                   10 non-null     object
 2   % ofworld                                    10 non-null     object
 3   Date                                         10 non-null     object
 4   Source (official or fromthe United Nations)  10 non-null     object
dtypes: object(5)
memory usage: 532.0+ bytes


In [26]:
data['% ofworld'] = data['% ofworld'].str.strip('%').astype(float)

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Country / Dependency                         10 non-null     object 
 1   Population                                   10 non-null     object 
 2   % ofworld                                    10 non-null     float64
 3   Date                                         10 non-null     object 
 4   Source (official or fromthe United Nations)  10 non-null     object 
dtypes: float64(1), object(4)
memory usage: 532.0+ bytes


In [28]:
data['Population'] = data['Population'].str.replace(',','').astype(int)

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Country / Dependency                         10 non-null     object 
 1   Population                                   10 non-null     int32  
 2   % ofworld                                    10 non-null     float64
 3   Date                                         10 non-null     object 
 4   Source (official or fromthe United Nations)  10 non-null     object 
dtypes: float64(1), int32(1), object(3)
memory usage: 492.0+ bytes


In [30]:
data['Date']=pd.to_datetime(data['Date'])

In [31]:
data

Unnamed: 0,Country / Dependency,Population,% ofworld,Date,Source (official or fromthe United Nations)
0,India,1425775850,17.6,2023-04-14,UN projection[92]
1,China,1412600000,17.4,2021-12-31,National annual estimate[93]
2,United States,336243427,4.15,2024-03-26,National population clock[94]
3,Indonesia,278696200,3.44,2023-07-01,National annual estimate[95]
4,Pakistan,229488994,2.83,2022-07-01,UN projection[96]
5,Nigeria,216746934,2.68,2022-07-01,UN projection[96]
6,Brazil,217452500,2.69,2024-03-26,National population clock[97]
7,Bangladesh,168220000,2.08,2020-07-01,Annual Population Estimate[98]
8,Russia,147190000,1.82,2021-10-01,2021 preliminary census results[99]
9,Mexico,128271248,1.58,2022-03-31,


In [32]:
data.index

RangeIndex(start=0, stop=10, step=1)

In [33]:
data.index=data.index.astype(int)

In [34]:
len(data.index)

10

In [35]:
data.index=range(1,len(data.index)+1) #updating index 

In [36]:
data

Unnamed: 0,Country / Dependency,Population,% ofworld,Date,Source (official or fromthe United Nations)
1,India,1425775850,17.6,2023-04-14,UN projection[92]
2,China,1412600000,17.4,2021-12-31,National annual estimate[93]
3,United States,336243427,4.15,2024-03-26,National population clock[94]
4,Indonesia,278696200,3.44,2023-07-01,National annual estimate[95]
5,Pakistan,229488994,2.83,2022-07-01,UN projection[96]
6,Nigeria,216746934,2.68,2022-07-01,UN projection[96]
7,Brazil,217452500,2.69,2024-03-26,National population clock[97]
8,Bangladesh,168220000,2.08,2020-07-01,Annual Population Estimate[98]
9,Russia,147190000,1.82,2021-10-01,2021 preliminary census results[99]
10,Mexico,128271248,1.58,2022-03-31,


In [38]:
data.to_csv("Cleaned populartion.csv")