# Wikipedia page data extraction
In this tutorial, we will learn how to fetch a static web page and turn its HTML into structured, useful information.

We first fetch a Wikipedia page using the `requests` library.

In [1]:
import requests
import re
from bs4 import BeautifulSoup

In [2]:
# Download the article HTML.
# - timeout: without it, requests waits indefinitely on a stalled connection.
# - raise_for_status(): turns a 4xx/5xx response into an exception here,
#   instead of letting an error page be parsed by the cells below.
bigdata = requests.get('https://en.wikipedia.org/wiki/Big_data', timeout=30)
bigdata.raise_for_status()

In [3]:
# Length of the decoded response body, in characters.
len(bigdata.text)

498851

In [4]:
# Preview only the beginning of the raw HTML; the full body is several
# hundred thousand characters and would swamp the notebook output.
bigdata.text[:500]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Big data - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"75f893de-3a94-456d-b25f-36f5c73c7688","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Big_data","wgTitle":"Big data","wgCurRevisionId":1111125151,"wgRevisionId":1111125151,"wgArticleId":27051151,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with bare URLs for citations","Articles with bare URLs for citations from July 2022","Articles with PDF format bare URLs for citations","Webarchive 

## Parsing a Wikipedia page

In [5]:
# Parse the downloaded HTML into a navigable tree using the lxml parser.
soup = BeautifulSoup(bigdata.text, "lxml")
# Print only the start of the pretty-printed markup: enough to see the
# document structure without dumping the entire page into the output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Big data - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"75f893de-3a94-456d-b25f-36f5c73c7688","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Big_data","wgTitle":"Big data","wgCurRevisionId":1111125151,"wgRevisionId":1111125151,"wgArticleId":27051151,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with bare URLs for citations","Articles with bare URLs for citations from July 2022","Articles with PDF format bare URLs for citations",

In [6]:
# Text inside the page's <title> element.
soup.title.string

'Big data - Wikipedia'

In [7]:
# find_all returns the matching <a> tags in document order. `limit` stops the
# search after the first 10 so the cell shows a sample rather than every
# link on the page.
soup.find_all('a', limit=10)

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a href="/wiki/Big_Data_(band)" title="Big Data (band)">Big Data (band)</a>,
 <a href="/wiki/Surveillance_capitalism" title="Surveillance capitalism">Surveillance capitalism</a>,
 <a class="image" href="/wiki/File:Hilbert_InfoGrowth.png"><img alt="" class="thumbimage" data-file-height="720" data-file-width="960" decoding="async" height="300" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/7c/Hilbert_InfoGrowth.png/400px-Hilbert_InfoGrowth.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/7c/Hilbert_InfoGrowth.png/600px-Hilbert_InfoGrowth.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/7c/Hilbert_InfoGrowth.png/800px-Hilbert_InfoGrowth.png 2x" width="400"/></a>,
 <a class="internal" href="/wiki/File:Hilbert_InfoGrowth.png" title="Enlarge"></a>,
 <a href="#cite_note-1">[1]</a>,
 <a href="/wiki/Data_set" title="D

In [8]:
# For the first 15 anchors, show the class list next to the link target.
# Tag.get() yields None when the attribute is absent.
for link in soup.find_all('a', limit=15):
    css_classes = link.get('class')
    target = link.get('href')
    print('{} : {}'.format(css_classes, target))

None : None
['mw-jump-link'] : #mw-head
['mw-jump-link'] : #searchInput
None : /wiki/Big_Data_(band)
None : /wiki/Surveillance_capitalism
['image'] : /wiki/File:Hilbert_InfoGrowth.png
['internal'] : /wiki/File:Hilbert_InfoGrowth.png
None : #cite_note-1
None : /wiki/Data_set
None : /wiki/Data_processing
None : /wiki/Application_software
['mw-redirect'] : /wiki/Statistical_power
None : /wiki/False_discovery_rate
None : #cite_note-2
None : /wiki/Automatic_identification_and_data_capture


## Match with regular expression on attributes

In [9]:
# Used with re.match below, this accepts strings that start with '/wiki/';
# group 1 captures the rest of the line after that prefix.
pattern = re.compile(r'/wiki/(.*)')

In [10]:
# Look at the first 20 anchors that have no class attribute and print the
# hrefs that begin with the /wiki/ prefix.
for link in soup.find_all('a', {'class': None}, limit=20):
    href = link.get('href')
    if href is None:
        # Anchor without a target (e.g. a bare named anchor): nothing to test.
        continue
    if pattern.match(href):
        print(href)

/wiki/Big_Data_(band)
/wiki/Surveillance_capitalism
/wiki/Data_set
/wiki/Data_processing
/wiki/Application_software
/wiki/False_discovery_rate
/wiki/Automatic_identification_and_data_capture
/wiki/Computer_data_storage
/wiki/Data_analysis
/wiki/Data_sharing
/wiki/Query_language
/wiki/Information_privacy
/wiki/Predictive_analytics
/wiki/User_behavior_analytics
/wiki/Data_valuation


## Extract a list of "See also"

In [11]:
# CSS selector: every <a> nested in a <ul> inside a <div class="div-col">.
# NOTE(review): the selector is not tied to the "See also" heading itself, so
# it would also pick up any other div-col list on the page - confirm.
a_list = soup.select('div.div-col ul a')
a_list

[<a href="/wiki/Big_data_ethics" title="Big data ethics">Big data ethics</a>,
 <a class="mw-redirect" href="/wiki/Big_Data_Maturity_Model" title="Big Data Maturity Model">Big Data Maturity Model</a>,
 <a href="/wiki/Big_memory" title="Big memory">Big memory</a>,
 <a href="/wiki/Data_curation" title="Data curation">Data curation</a>,
 <a href="/wiki/Data_defined_storage" title="Data defined storage">Data defined storage</a>,
 <a href="/wiki/Data_engineering" title="Data engineering">Data engineering</a>,
 <a href="/wiki/Data_lineage" title="Data lineage">Data lineage</a>,
 <a href="/wiki/Data_philanthropy" title="Data philanthropy">Data philanthropy</a>,
 <a href="/wiki/Data_science" title="Data science">Data science</a>,
 <a href="/wiki/Datafication" title="Datafication">Datafication</a>,
 <a href="/wiki/Document-oriented_database" title="Document-oriented database">Document-oriented database</a>,
 <a href="/wiki/In-memory_processing" title="In-memory processing">In-memory processing</

In [12]:
# Print the link target of each anchor collected in a_list, one per line.
for anchor in a_list:
    target = anchor['href']
    print(target)

/wiki/Big_data_ethics
/wiki/Big_Data_Maturity_Model
/wiki/Big_memory
/wiki/Data_curation
/wiki/Data_defined_storage
/wiki/Data_engineering
/wiki/Data_lineage
/wiki/Data_philanthropy
/wiki/Data_science
/wiki/Datafication
/wiki/Document-oriented_database
/wiki/In-memory_processing
/wiki/List_of_big_data_companies
/wiki/Very_large_database
/wiki/XLDB


## Convert links in See also into a dataframe

In [13]:
import pandas as pd

In [14]:
# Build one record per link, then create the frame in a single step.
# get_text() is used instead of .string because it returns a plain str:
# .string yields NavigableString objects, which keep a reference to the whole
# parse tree, and is None when an anchor has more than one child node.
data = [{'keyword': e.get_text(), 'link': e['href']} for e in a_list]
# Explicit columns keep the schema stable even if no links were found.
df = pd.DataFrame(data, columns=['keyword', 'link'])

In [15]:
# Display the keyword/link table built in the previous cell.
df

Unnamed: 0,keyword,link
0,Big data ethics,/wiki/Big_data_ethics
1,Big Data Maturity Model,/wiki/Big_Data_Maturity_Model
2,Big memory,/wiki/Big_memory
3,Data curation,/wiki/Data_curation
4,Data defined storage,/wiki/Data_defined_storage
5,Data engineering,/wiki/Data_engineering
6,Data lineage,/wiki/Data_lineage
7,Data philanthropy,/wiki/Data_philanthropy
8,Data science,/wiki/Data_science
9,Datafication,/wiki/Datafication
