<a href="https://colab.research.google.com/github/sabitendu/_portfolio_/blob/main/beginner_web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries

# Standard imports
import pandas as pd

# For web scraping
import requests
import urllib.request
from bs4 import BeautifulSoup

# For performing regex operations
import re

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scrape The Data

In [2]:
# Connect to the website and pull in the data

# URL of the webpage we want to scrape
url = 'https://docs.python.org/3/library/random.html#module-random'

# Browser-like User-Agent sent with the request
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

# Send a GET request and keep the response.
# - headers=header: the header dict above was previously built but never sent
# - timeout: without one, a stalled connection would block the cell forever
response = requests.get(url, headers=header, timeout=30)

# Fail loudly on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()

# Turn the undecoded content into a Beautiful Soup object.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed (and bs4 does not warn about guessing).
soup = BeautifulSoup(response.content, 'html.parser')









In [3]:
# Print the parsed document as text to inspect what was downloaded.
# NOTE(review): this prints the whole page; a slice may be enough for a quick check.
print(soup)

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/><meta content="Docutils 0.17.1: http://docutils.sourceforge.net/" name="generator"/>
<meta content="random — Generate pseudo-random numbers" property="og:title"/>
<meta content="website" property="og:type"/>
<meta content="https://docs.python.org/3/library/random.html" property="og:url"/>
<meta content="Python documentation" property="og:site_name"/>
<meta content="Source code: Lib/random.py This module implements pseudo-random number generators for various distributions. For integers, there is uniform selection from a range. For sequences, there is uniform s..." property="og:description"/>
<meta content="https://docs.python.org/3/_static/og-image.png" property="og:image"/>
<meta content="Python documentation" property="og:image:alt"/>
<meta content="Source code: Lib/random.py This module implements pseudo-random number generators for various distributions. For i

In [4]:
# Find all function signatures: collect every <dt> element inside <body>.
# find_all is the current Beautiful Soup name; findAll is only kept as a legacy alias.

names = soup.body.find_all('dt')

print(names)

[<dt class="sig sig-object py" id="random.seed">
<span class="sig-prename descclassname"><span class="pre">random.</span></span><span class="sig-name descname"><span class="pre">seed</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">a</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">version</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">2</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#random.seed" title="Permalink to this definition">¶</a></dt>, <dt class="sig sig-object py" id="random.getstate">
<span class="sig-prename descclassname"><span class="pre">random.</span></span><span class="sig-name descname"><span class="pre">getstate</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span

In [5]:
# Find all the information we're looking for with regex.
# In this case, that is every substring that starts with id="random.
# followed by the function name ('\w+' matches the name's word characters).

# A raw string is used so that \w reaches the regex engine untouched (in a plain
# string literal '\w' is an invalid escape sequence and triggers a warning), and
# the dot is escaped so it only matches a literal '.'.
function_names = re.findall(r'id="random\.\w+', str(names))

# Print the results
print(function_names)

['id="random.seed', 'id="random.getstate', 'id="random.setstate', 'id="random.randbytes', 'id="random.randrange', 'id="random.randint', 'id="random.getrandbits', 'id="random.choice', 'id="random.choices', 'id="random.shuffle', 'id="random.sample', 'id="random.binomialvariate', 'id="random.random', 'id="random.uniform', 'id="random.triangular', 'id="random.betavariate', 'id="random.expovariate', 'id="random.gammavariate', 'id="random.gauss', 'id="random.lognormvariate', 'id="random.normalvariate', 'id="random.vonmisesvariate', 'id="random.paretovariate', 'id="random.weibullvariate', 'id="random.Random', 'id="random.SystemRandom']


In [6]:
# Use a list comprehension to drop the leading 'id="' from every match,
# leaving just 'random.<name>'.
#
# The prefix is checked before slicing so the cell is idempotent: running it a
# second time leaves already-clean names alone instead of cutting off four
# more characters.
ID_PREFIX = 'id="'

function_names = [
    item[len(ID_PREFIX):] if item.startswith(ID_PREFIX) else item
    for item in function_names
]

# Print the results
print(function_names)


['random.seed', 'random.getstate', 'random.setstate', 'random.randbytes', 'random.randrange', 'random.randint', 'random.getrandbits', 'random.choice', 'random.choices', 'random.shuffle', 'random.sample', 'random.binomialvariate', 'random.random', 'random.uniform', 'random.triangular', 'random.betavariate', 'random.expovariate', 'random.gammavariate', 'random.gauss', 'random.lognormvariate', 'random.normalvariate', 'random.vonmisesvariate', 'random.paretovariate', 'random.weibullvariate', 'random.Random', 'random.SystemRandom']


In [7]:
# Find all the function descriptions: collect every <dd> element inside <body>.
# find_all is the current Beautiful Soup name; findAll is only kept as a legacy alias.

description = soup.body.find_all('dd')

print(description)

[<dd><p>Initialize the random number generator.</p>
<p>If <em>a</em> is omitted or <code class="docutils literal notranslate"><span class="pre">None</span></code>, the current system time is used.  If
randomness sources are provided by the operating system, they are used
instead of the system time (see the <a class="reference internal" href="os.html#os.urandom" title="os.urandom"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.urandom()</span></code></a> function for details
on availability).</p>
<p>If <em>a</em> is an int, it is used directly.</p>
<p>With version 2 (the default), a <a class="reference internal" href="stdtypes.html#str" title="str"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a>, <a class="reference internal" href="stdtypes.html#bytes" title="bytes"><code class="xref py py-class docutils literal notranslate"><span class="pre">bytes</span></code></a>, or <a class="reference internal" hre

In [8]:
# Build the list of plain-text descriptions in one pass:
# take each element's text and swap every newline ('\n') for a space.

function_usage = [
    entry.text.replace('\n', ' ')
    for entry in description
]



In [9]:
# Print every cleaned description string collected in function_usage.
print(function_usage)  # Don't be overwhelmed: these are just the descriptions for the function names above

['Initialize the random number generator. If a is omitted or None, the current system time is used.  If randomness sources are provided by the operating system, they are used instead of the system time (see the os.urandom() function for details on availability). If a is an int, it is used directly. With version 2 (the default), a str, bytes, or bytearray object gets converted to an int and all of its bits are used. With version 1 (provided for reproducing random sequences from older versions of Python), the algorithm for str and bytes generates a narrower range of seeds.  Changed in version 3.2: Moved to the version 2 scheme which uses all of the bits in a string seed.   Changed in version 3.11: The seed must be one of the following types: NoneType, int, float, str, bytes, or bytearray.  ', 'Return an object capturing the current internal state of the generator.  This object can be passed to setstate() to restore the state. ', 'state should have been obtained from a previous call to ge

In [10]:
# Report how many entries each list holds; both counts must match before
# the lists can be paired up column-wise.

for label, values in (
    ('function_names', function_names),
    ('function_usage', function_usage),
):
    print(f' Length of {label}: {len(values)}')

 Length of function_names: 26
 Length of function_usage: 26


In [11]:
# Both lists have the same length, so they can become the two columns
# of a single DataFrame.

data = pd.DataFrame(
    {
        'function name': function_names,
        'function usage': function_usage,
    }
)

data

Unnamed: 0,function name,function usage
0,random.seed,Initialize the random number generator. If a i...
1,random.getstate,Return an object capturing the current interna...
2,random.setstate,state should have been obtained from a previou...
3,random.randbytes,Generate n random bytes. This method should no...
4,random.randrange,Return a randomly selected element from range(...
5,random.randint,Return a random integer N such that a <= N <= ...
6,random.getrandbits,Returns a non-negative Python integer with k r...
7,random.choice,Return a random element from the non-empty seq...
8,random.choices,Return a k sized list of elements chosen from ...
9,random.shuffle,Shuffle the sequence x in place. To shuffle an...


In [12]:
# Write the DataFrame to a CSV file.
# index=False leaves out the automatic 0..n-1 row index; otherwise it is written
# as an unnamed first column that comes back as 'Unnamed: 0' when the file is read.

data.to_csv('random_function.csv', index=False)

# **Web Scraping Using Pandas**


In [13]:
# Method 1: a single season
#
# Per-game player stats page for the 2020 season.
url = "https://www.basketball-reference.com/leagues/NBA_2020_per_game.html"

# Echo the URL as the cell's output
url

'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html'

In [14]:
# Method 2: multiple years

years = ['2016', '2017', '2018', '2019', '2020']

# URL pattern with a '{}' placeholder for the season year.
# This was previously assigned to the name `str`, which replaced the builtin
# str() for the rest of the session (any later or re-run `str(...)` call would
# raise "TypeError: 'str' object is not callable").
url_template = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'

# Fill in the placeholder once per season and show each resulting URL
for year in years:
    url = url_template.format(year)
    print(url)

https://www.basketball-reference.com/leagues/NBA_2016_per_game.html
https://www.basketball-reference.com/leagues/NBA_2017_per_game.html
https://www.basketball-reference.com/leagues/NBA_2018_per_game.html
https://www.basketball-reference.com/leagues/NBA_2019_per_game.html
https://www.basketball-reference.com/leagues/NBA_2020_per_game.html


In [15]:
# Load the 2020 per-game player stats page.
url = "https://www.basketball-reference.com/leagues/NBA_2020_per_game.html"

# pd.read_html() parses the HTML tables found at the URL; header=0 uses the
# first row of each table as its column names. The result is a list of
# DataFrames, not a single DataFrame.
df = pd.read_html(url, header=0)

print(df)

[      Rk                    Player Pos Age   Tm   G  GS    MP   FG   FGA  ...  \
0      1              Steven Adams   C  26  OKC  63  63  26.7  4.5   7.6  ...   
1      2               Bam Adebayo  PF  22  MIA  72  72  33.6  6.1  11.0  ...   
2      3         LaMarcus Aldridge   C  34  SAS  53  53  33.1  7.4  15.0  ...   
3      4            Kyle Alexander   C  23  MIA   2   0   6.5  0.5   1.0  ...   
4      5  Nickeil Alexander-Walker  SG  21  NOP  47   1  12.6  2.1   5.7  ...   
..   ...                       ...  ..  ..  ...  ..  ..   ...  ...   ...  ...   
672  525                Trae Young  PG  21  ATL  60  60  35.3  9.1  20.8  ...   
673  526               Cody Zeller   C  27  CHO  58  39  23.1  4.3   8.3  ...   
674  527              Tyler Zeller   C  30  SAS   2   0   2.0  0.5   2.0  ...   
675  528                Ante Žižić   C  23  CLE  22   0  10.0  1.9   3.3  ...   
676  529               Ivica Zubac   C  22  LAC  72  70  18.4  3.3   5.3  ...   

      FT%  ORB  DRB   TRB 

In [16]:
# How many DataFrames are in the list?
table_count = len(df)
print(f'number of tables in df: {table_count}')

# Visual separator between the count and the preview
print('================')

# The list holds a single table, so take element 0 and preview its first 20 rows
df[0].head(20)

number of tables in df: 1


Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Steven Adams,C,26,OKC,63,63,26.7,4.5,7.6,...,0.582,3.3,6.0,9.3,2.3,0.8,1.1,1.5,1.9,10.9
1,2,Bam Adebayo,PF,22,MIA,72,72,33.6,6.1,11.0,...,0.691,2.4,7.8,10.2,5.1,1.1,1.3,2.8,2.5,15.9
2,3,LaMarcus Aldridge,C,34,SAS,53,53,33.1,7.4,15.0,...,0.827,1.9,5.5,7.4,2.4,0.7,1.6,1.4,2.4,18.9
3,4,Kyle Alexander,C,23,MIA,2,0,6.5,0.5,1.0,...,,1.0,0.5,1.5,0.0,0.0,0.0,0.5,0.5,1.0
4,5,Nickeil Alexander-Walker,SG,21,NOP,47,1,12.6,2.1,5.7,...,0.676,0.2,1.6,1.8,1.9,0.4,0.2,1.1,1.2,5.7
5,6,Grayson Allen,SG,24,MEM,38,0,18.9,3.1,6.6,...,0.867,0.2,2.0,2.2,1.4,0.3,0.1,0.9,1.4,8.7
6,7,Jarrett Allen,C,21,BRK,70,64,26.5,4.3,6.6,...,0.633,3.1,6.5,9.6,1.6,0.6,1.3,1.1,2.3,11.1
7,8,Kadeem Allen,PG,27,NYK,10,0,11.7,1.9,4.4,...,0.636,0.2,0.7,0.9,2.1,0.5,0.2,0.8,0.7,5.0
8,9,Al-Farouq Aminu,PF,29,ORL,18,2,21.1,1.4,4.8,...,0.655,1.3,3.5,4.8,1.2,1.0,0.4,0.9,1.5,4.3
9,10,Justin Anderson,SG,26,BRK,10,1,10.7,1.0,3.8,...,0.5,0.1,2.0,2.1,0.8,0.0,0.6,0.4,1.3,2.8
