## Beautiful Soup

In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Home page of the site whose images we are going to scrape
url = "https://codingblocks.com/"

In [3]:
# Always pass a timeout: without one, requests waits forever on a stalled server
# and the notebook kernel hangs.
result = requests.get(url, timeout=10)

In [4]:
result.status_code

200

In [5]:
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
# installed (emitting GuessedAtParserWarning), and the resulting tree can differ
# from one machine to another.
soup = BeautifulSoup(result.text, "html.parser")

In [6]:
# soup    We now have an object holding the same content as result.text. The advantage is that we can easily pick out specific items of the page from it

In [7]:
soup.title

<title>Best Programming Courses &amp; IT Training Institute in Delhi NCR | Coding Blocks</title>

In [8]:
soup.p   # Gives the first paragraph of the page

<p class="bold lead mb60">Introducing CB Accelerate, a dedicated program to train you in becoming an elite coder | Apply for the selection Test to get enrolled, accepting only 50 students per batch.</p>

In [9]:
soup.h1

<h1 class="uppercase mb16">Live Courses</h1>

In [10]:
soup.find_all("h1")   # Will give all the headings h1 of the page

[<h1 class="uppercase mb16">Live Courses</h1>,
 <h1 class="uppercase mb16">Classroom Courses</h1>,
 <h1 class="uppercase mb16">Online Courses</h1>,
 <h1 class="uppercase mb16">Our Team</h1>]

In [11]:
img_tags = soup.find_all("img")   # Collects all the <img> tags of the page

In [12]:
# Now, we need to get the src of every image from img_tags

In [13]:
# dir(img_tags[0])  # dir gives the methods and states/ behaviours of any object

In [14]:
img_tags[0].get("src")

'https://www.facebook.com/tr?id=1947467048859851&ev=PageView\n    &noscript=1'

In [15]:
# Show the src attribute of every <img> tag, one per line
for tag in img_tags:
    src = tag.get("src")
    print(src)

https://www.facebook.com/tr?id=1947467048859851&ev=PageView
    &noscript=1
https://q.quora.com/_/ad/0e6543f4ed0047658c67c5d04a8ace8d/pixel?tag=ViewContent&noscript=1
/assets/images/logo-light.png
/assets/images/cb/cblogo.png
assets/images/cb/cover/WebbannerCb.jpg
assets/images/cb/cover/summer_batches.jpg
assets/images/cb/cover/online_banner.jpg
assets/images/cb/cover/cover5.jpg
assets/images/cb/cover/cast.jpg
https://minio.codingblocks.com/amoeba/efb2d77c-e577-406d-bb8c-f21a9beeb3a9.svg
https://minio.codingblocks.com/amoeba/538d2648-a9ab-4a20-bac4-14017a014814.svg
https://minio.codingblocks.com/amoeba/8363cfd5-40f7-499a-bd8d-33541baa0a5f.svg
assets/images/cb/logosc/color_java.svg
assets/images/cb/logosc/color_algo.svg
https://minio.codingblocks.com/amoeba/ccfc8d7c-66ac-461e-9ba0-5ded3b1b81ee.png
https://minio.codingblocks.com/amoeba/0ee7474c-de3c-44d7-a704-646491c702e8.svg
https://minio.codingblocks.com/amoeba/587ae53b-6f4e-4afb-a2c5-b328a4dfbdf1.svg
https://minio.codingblocks.com/amo

In [16]:
# We can see that some srcs start with https and some with assets
# This means the https links are complete (external) URLs and the assets links are relative (internal) paths

# We need to add https://codingblocks.com/ before the assets links to turn them into complete URLs

# The links starting with https are not the images we are looking for, hence we discard them

In [17]:
# We are trying to get all the images of the team members

In [18]:
# Collect the src of every image whose path mentions "team".
# A set is used so that an image appearing more than once is stored only once.
set_urls = set()

for img_tag in img_tags:
    member_url = img_tag.get("src")

    # .get() returns None when an <img> has no src attribute; skip those tags
    # instead of failing with TypeError on `"team" in None`.
    if member_url and "team" in member_url:
        set_urls.add(member_url)

In [19]:
set_urls    # We are using a set because there is a possibility that an image appears more than once. Hence, to get only unique urls, we used a set

{'assets/images/cb/team/Garima Final.jpg',
 'assets/images/cb/team/Kartik.JPG',
 'assets/images/cb/team/Varun Final.jpg',
 'assets/images/cb/team/c_badebhayia.jpg',
 'assets/images/cb/team/c_rishab.jpg',
 'assets/images/cb/team/wanushray.jpg',
 'assets/images/cb/team/warnav.jpg',
 'assets/images/cb/team/wmanmohan.jpg',
 'assets/images/cb/team/wprateek.jpg',
 'assets/images/cb/team/wpriyanshu.jpg',
 'assets/images/cb/team/wrajesh.jpg',
 'assets/images/cb/team/wvarun.jpg'}

In [20]:
list(set_urls)

['assets/images/cb/team/c_rishab.jpg',
 'assets/images/cb/team/Kartik.JPG',
 'assets/images/cb/team/Varun Final.jpg',
 'assets/images/cb/team/wprateek.jpg',
 'assets/images/cb/team/Garima Final.jpg',
 'assets/images/cb/team/wrajesh.jpg',
 'assets/images/cb/team/wpriyanshu.jpg',
 'assets/images/cb/team/wvarun.jpg',
 'assets/images/cb/team/wmanmohan.jpg',
 'assets/images/cb/team/wanushray.jpg',
 'assets/images/cb/team/c_badebhayia.jpg',
 'assets/images/cb/team/warnav.jpg']

In [21]:
first = list(set_urls)[0]  # Converted into a list because a set is not indexable

In [22]:
first   # But, this is not the complete url. We need to add https://codingblocks.com/ to make the url complete

'assets/images/cb/team/c_rishab.jpg'

In [23]:
# urljoin resolves the relative path against the site URL; unlike plain string
# concatenation it also copes with paths that start with "/" (no "//" in the result).
comp_url = urljoin(url, first)

In [24]:
comp_url

'https://codingblocks.com/assets/images/cb/team/c_rishab.jpg'

In [25]:
# Cut the url once at its last '/' and keep what follows: the name of the image file
fname = comp_url.rsplit("/", 1)[-1]

In [26]:
fname

'c_rishab.jpg'

In [27]:
# Now performing these operations on all the images of the set_urls

In [29]:
 def save_image(url, dest, timeout=10):
     """Download the resource at `url` and write its bytes to the file `dest`.

     Parameters
     ----------
     url : str
         Address of the image to download.
     dest : str
         Path of the file to write. The folder it lives in must already exist.
     timeout : float, optional
         Seconds to wait for the server before giving up (default 10), so that
         a stalled connection cannot hang the notebook.

     Returns
     -------
     bool
         True when the server answered with status 200 and the file was
         written; False otherwise (nothing is written in that case).
     """
     result = requests.get(url, timeout=timeout)

     # Anything other than 200 is reported to the caller instead of being
     # silently ignored.
     if result.status_code != 200:
         return False

     # "wb": result.content is raw bytes, not text
     with open(dest, "wb") as photo:
         photo.write(result.content)
     return True

In [30]:
# sorted() turns the set into a list (a set cannot be sliced) and gives a stable
# order, so the same two images are picked on every run.
for location in sorted(set_urls)[:2]:
    # urljoin also handles paths that start with "/" without producing "//"
    complete_url = urljoin(url, location)
    fname = location.split("/")[-1]
    save_image(complete_url, fname)

In [31]:
# The above code saves the first two images of set_urls.

# Now, to save all the images, we want all the images to be stored in a single folder

In [38]:
# Try to save every image into the Photos folder.
# When that folder does not exist, open() inside save_image raises
# FileNotFoundError; report it here instead of aborting the notebook run.
try:
    for location in sorted(set_urls):
        complete_url = urljoin(url, location)
        fname = location.split("/")[-1]
        save_image(complete_url, "Photos/" + fname)
except FileNotFoundError as error:
    print("Could not save into Photos/:", error)

In [33]:
# there is no folder named Photos. Hence, first we need to create a folder if the folder does not exist

In [34]:
import os

In [35]:
# if not os.path.exists("Photos"):    This checks whether the Photos folder exists. If it does not exist, the folder Photos is created
#     os.mkdir("Photos")

In [37]:
# Create the destination folder when it is missing. exist_ok=True makes this a
# single call that is safe to re-run, replacing the separate exists() check.
os.makedirs("Photos", exist_ok=True)

# Download every team image into the Photos folder
for location in sorted(set_urls):
    complete_url = urljoin(url, location)
    fname = location.split("/")[-1]
    save_image(complete_url, os.path.join("Photos", fname))