In [1]:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

Load the page

In [2]:
# Page to scrape: the documentation of the standard-library `random` module.
url = "https://docs.python.org/3/library/random.html"

# Open the response in a `with` block so the connection is closed as soon as
# the document has been parsed.
with urllib.request.urlopen(url) as page:
    # Name the parser explicitly: without it bs4 picks whichever parser happens
    # to be installed, so the same notebook can parse differently elsewhere.
    soup = bs(page, "html.parser")
print(soup)

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta charset="utf-8"/>
<title>random — Generate pseudo-random numbers — Python 3.9.4 documentation</title>
<link href="../_static/pydoctheme.css" rel="stylesheet" type="text/css"/>
<link href="../_static/pygments.css" rel="stylesheet" type="text/css"/>
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script src="../_static/sidebar.js"></script>
<link href="../_static/opensearch.xml" rel="search" title="Search within Python 3.9.4 documentation" type="application/opensearchdescription+xml"/>
<link href="../about.html" rel="author" title="About these documents"/>
<link href="../genindex.html" rel="index" title="Index"/>
<link href="../search.html" rel="search" title="Search"

Target specific elements. For example, we can extract the *functions* in the page

In [4]:
# Collect every <dt> element under <body>.
# `find_all` is the current bs4 spelling; `findAll` is a deprecated alias.
names = soup.body.find_all('dt')
names

[<dt id="random.seed">
 <code class="sig-prename descclassname">random.</code><code class="sig-name descname">seed</code><span class="sig-paren">(</span><em class="sig-param">a=None</em>, <em class="sig-param">version=2</em><span class="sig-paren">)</span><a class="headerlink" href="#random.seed" title="Permalink to this definition">¶</a></dt>,
 <dt id="random.getstate">
 <code class="sig-prename descclassname">random.</code><code class="sig-name descname">getstate</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#random.getstate" title="Permalink to this definition">¶</a></dt>,
 <dt id="random.setstate">
 <code class="sig-prename descclassname">random.</code><code class="sig-name descname">setstate</code><span class="sig-paren">(</span><em class="sig-param">state</em><span class="sig-paren">)</span><a class="headerlink" href="#random.setstate" title="Permalink to this definition">¶</a></dt>,
 <dt id="random.randbytes">
 <code class="sig

Now we need to extract the functions' names

In [22]:
# Pull the `random.<name>` identifiers out of the stringified <dt> tags.
# - raw string: `\w` in a normal string literal is an invalid escape sequence
# - `\.` matches a literal dot instead of any character
# - the capture group returns the identifier directly, so the `id="` prefix
#   no longer has to be sliced off each match
function_names = re.findall(r'id="(random\.\w+)', str(names))
function_names

['random.seed',
 'random.getstate',
 'random.setstate',
 'random.randbytes',
 'random.randrange',
 'random.randint',
 'random.getrandbits',
 'random.choice',
 'random.choices',
 'random.shuffle',
 'random.sample',
 'random.random',
 'random.uniform',
 'random.triangular',
 'random.betavariate',
 'random.expovariate',
 'random.gammavariate',
 'random.gauss',
 'random.lognormvariate',
 'random.normalvariate',
 'random.vonmisesvariate',
 'random.paretovariate',
 'random.weibullvariate',
 'random.Random',
 'random.SystemRandom']

When we need to extract more complex data, regex can be difficult to use. For example, consider the descriptions (the *dd* elements)

In [24]:
# Collect every <dd> element under <body>.
# `find_all` is the current bs4 spelling; `findAll` is a deprecated alias.
description = soup.body.find_all('dd')
description

[<dd><p>Initialize the random number generator.</p>
 <p>If <em>a</em> is omitted or <code class="docutils literal notranslate"><span class="pre">None</span></code>, the current system time is used.  If
 randomness sources are provided by the operating system, they are used
 instead of the system time (see the <a class="reference internal" href="os.html#os.urandom" title="os.urandom"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.urandom()</span></code></a> function for details
 on availability).</p>
 <p>If <em>a</em> is an int, it is used directly.</p>
 <p>With version 2 (the default), a <a class="reference internal" href="stdtypes.html#str" title="str"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a>, <a class="reference internal" href="stdtypes.html#bytes" title="bytes"><code class="xref py py-class docutils literal notranslate"><span class="pre">bytes</span></code></a>, or <a class="reference interna

Luckily, BeautifulSoup has built-in functions for extracting the text of an element

In [25]:
# Take the `.text` of each <dd> element, in document order. A comprehension
# builds the list in one step and does not leave a loop variable behind in the
# kernel namespace.
function_usage = [item.text for item in description]
function_usage

['Initialize the random number generator.\nIf a is omitted or None, the current system time is used.  If\nrandomness sources are provided by the operating system, they are used\ninstead of the system time (see the os.urandom() function for details\non availability).\nIf a is an int, it is used directly.\nWith version 2 (the default), a str, bytes, or bytearray\nobject gets converted to an int and all of its bits are used.\nWith version 1 (provided for reproducing random sequences from older versions\nof Python), the algorithm for str and bytes generates a\nnarrower range of seeds.\n\nChanged in version 3.2: Moved to the version 2 scheme which uses all of the bits in a string seed.\n\n\nDeprecated since version 3.9: In the future, the seed must be one of the following types:\nNoneType, int, float, str,\nbytes, or bytearray.\n\n',
 'Return an object capturing the current internal state of the generator.  This\nobject can be passed to setstate() to restore the state.\n',
 'state should ha

In [33]:
# The two lists are built by separate queries (<dt> ids and <dd> texts), so
# check that they line up before pairing them row by row. Reporting both counts
# is more useful than the generic length error pandas would raise.
if len(function_names) != len(function_usage):
    raise ValueError(
        f"Cannot pair names with descriptions: {len(function_names)} names "
        f"but {len(function_usage)} descriptions"
    )

# One row per entry: the identifier and the text of its description.
data_function = pd.DataFrame({'name': function_names, 'description': function_usage})
data_function.head()

Unnamed: 0,name,description
0,random.seed,Initialize the random number generator.\nIf a ...
1,random.getstate,Return an object capturing the current interna...
2,random.setstate,state should have been obtained from a previou...
3,random.randbytes,Generate n random bytes.\nThis method should n...
4,random.randrange,Return a randomly selected element from range(...


To target elements with specific attributes, pass an `attrs` dictionary

In [32]:
# Select only the <div> elements whose `id` attribute equals
# "bookkeeping-functions"; `attrs` maps attribute names to required values.
# `find_all` is the current bs4 spelling; `findAll` is a deprecated alias.
example = soup.body.find_all('div', attrs={'id': 'bookkeeping-functions'})
example

[<div class="section" id="bookkeeping-functions">
 <h2>Bookkeeping functions<a class="headerlink" href="#bookkeeping-functions" title="Permalink to this headline">¶</a></h2>
 <dl class="function">
 <dt id="random.seed">
 <code class="sig-prename descclassname">random.</code><code class="sig-name descname">seed</code><span class="sig-paren">(</span><em class="sig-param">a=None</em>, <em class="sig-param">version=2</em><span class="sig-paren">)</span><a class="headerlink" href="#random.seed" title="Permalink to this definition">¶</a></dt>
 <dd><p>Initialize the random number generator.</p>
 <p>If <em>a</em> is omitted or <code class="docutils literal notranslate"><span class="pre">None</span></code>, the current system time is used.  If
 randomness sources are provided by the operating system, they are used
 instead of the system time (see the <a class="reference internal" href="os.html#os.urandom" title="os.urandom"><code class="xref py py-func docutils literal notranslate"><span class=

In [30]:
# NOTE(review): this cell's counter (In [30]) is lower than that of the
# In [32] cell that assigns `example`, and its saved `[]` output differs from
# that cell's output. Presumably a stale result from an earlier run; re-run
# the notebook top to bottom or drop this cell. TODO confirm.
example

[]