In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# using pd.read_html

Use `pd.read_html` to automatically extract tables from a webpage

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'

tables = pd.read_html(url)

# number of tables in the webpage
len(tables)

3

In [3]:
tables[0].head()

Unnamed: 0.1,Unnamed: 0,Country / Dependency,Population,% of world,Date,Source (official or from the United Nations),Unnamed: 6
0,–,World,8070745000,100%,10 Nov 2023,UN projection[3],
1,1,China,1411750000,,31 Dec 2022,Official estimate[4],[b]
2,2,India,1392329000,,1 Mar 2023,Official projection[5],[c]
3,3,United States,335609000,,10 Nov 2023,National population clock[7],[d]
4,4,Indonesia,279118866,,1 Jul 2023,National annual projection[8],


In [4]:
tables[1].head()

Unnamed: 0,".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:""[ ""}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:"" ]""}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vteLists of countries by population statistics",".mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:""[ ""}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:"" ]""}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}vteLists of countries by population statistics.1"
0,Global,Current population United Nations Demographics...
1,Continents/subregions,Africa Antarctica Asia Europe North America Ca...
2,Intercontinental,Americas Arab world Commonwealth of Nations Eu...
3,Cities/urban areas,World cities National capitals Megacities Mega...
4,Past and future,Past and future population World population es...


In [5]:
tables[2].head()

Unnamed: 0,vteWorld,vteWorld.1
0,AfghanistanAlbaniaAlgeriaAndorraAngolaAntigua ...,AfghanistanAlbaniaAlgeriaAndorraAngolaAntigua ...


# using requests and beautiful soup

`requests` and `BeautifulSoup` are useful for scraping static webpages

In [6]:
url = 'https://en.wikipedia.org/wiki/Data_science'

# Send a GET request to the specified URL and store the response
response = requests.get(url)

# Create a BeautifulSoup object to parse the HTML content of the page
soup = bs(response.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Data science - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-lim

In [7]:
# Extracting all the links on the page
links = soup.find_all('a')
links = [link.get('href') for link in links]

# Print first 20 links
links[0:20]

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 '/w/index.php?title=Special:CreateAccount&returnto=Data+science',
 '/w/index.php?title=Special:UserLogin&returnto=Data+science',
 '/w/index.php?title=Special:CreateAccount&returnto=Data+science',
 '/w/index.php?title=Special:UserLogin&returnto=Data+science',
 '/wiki/Help:Introduction']

In [8]:
# Extract main content
main_content_div = soup.find('div', {'id': 'mw-content-text'})

# Extract text from paragraphs within the main content div
paragraphs = main_content_div.find_all('p')

# Combine paragraphs into a single string
main_text = "\n".join([paragraph.get_text() for paragraph in paragraphs])
main_text

'\n\nData science is an interdisciplinary academic field[1] that uses statistics, scientific computing, scientific methods, processes, algorithms and systems to extract or extrapolate knowledge and insights from noisy, structured, and unstructured data.[2]\n\nData science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine).[3] Data science is multifaceted and can be described as a science, a research paradigm, a research method, a discipline, a workflow, and a profession.[4]\n\nData science is a "concept to unify statistics, data analysis, informatics, and their related methods" to "understand and analyze actual phenomena" with data.[5] It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, information science, and domain knowledge.[6] However, data science is different from computer science and information science. Turing Award winner Jim 

# using wikipedia api

docs: https://en.wikipedia.org/w/api.php

In [9]:
def scrape_wikipedia(keyword, srlimit=10):
    loop = True
    sr_off = 0
    content_list = list()

    while loop:
        base_url = "https://en.wikipedia.org/w/api.php"
        params = {
            "action": "query",
            "format": "json",
            "list": "search",
            "srsearch": keyword,
            "utf8": 1,
            "srlimit": srlimit,
            "sroffset": sr_off
        }
        
        # GET request
        response = requests.get(base_url, params=params)
        data = response.json()

        if  "query" not in data or "search" not in data["query"]:
            loop = False
        else:
            # Extracting search results
            search_results = data["query"]["search"]

            content_list = list()
            # Loop through search results and fetch content for each page
            for result in search_results:
                title = result["title"]
                timestamp = result["timestamp"]

                # Fetch content for the current page
                content = result["snippet"]

                # Append to the DataFrame
                content_list.append({"Title": title,
                                     "Timestamp": timestamp,
                                     "Content": content})

            if "continue" in data:
                sr_off += data["continue"]["sroffset"]
            else:
                loop = False

    return pd.DataFrame(content_list)

In [10]:
data = scrape_wikipedia('Python', srlimit=10)
len(data)

10

In [11]:
data

Unnamed: 0,Title,Timestamp,Content
0,MonsterQuest,2023-09-19T02:43:36Z,the Lost Tiger 24. Curse Of The Monkey Man 25....
1,Tamori,2023-07-21T16:49:50Z,Thief! Pleasurable Gag Program! Monty <span cl...
2,Monty Python v. American Broadcasting Companie...,2023-03-26T03:00:40Z,"Monty <span class=""searchmatch"">Python</span> ..."
3,Medcouple,2023-04-11T13:24:05Z,fast medcouple algorithm is implemented in a C...
4,Nanoprobing,2023-04-28T05:30:51Z,Circuits (IPFA) Technical papers on SEM-based ...
5,Itzik Kotler,2023-05-12T18:23:19Z,organizing two additional hackathons proving t...
6,Hollywood Husbands,2023-08-01T15:15:13Z,"Hollywood Wives: The New Generation (2001), an..."
7,Martin Kennedy (rugby league),2023-09-24T05:22:43Z,"and neotropical stingrays from Thailand, and i..."
8,Asterix and the Actress,2022-04-02T14:27:42Z,"outside the tavern in Condatum, a reference to..."
9,VOACAP,2023-10-23T20:37:24Z,"model, available at http://www.voacap.com/pred..."
