# Web Scraping with Beautiful Soup using Python

#### <i>Website :  [Midas Labs](http://midas.iiitd.edu.in)</i>

This project crawls the HTML content of the Midas Labs website and creates 3 files:
- imglinks.txt : It contains the urls of images in all the pages present in the navigation bar
- textdata.txt : It contains the text data of articles, paragraphs, and headings of the webpages present in the navigation bar
- jsonwebdata.json : It contains both the text data and urls of all images in a json file

In [1]:
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

class WebScrap():
    """Helpers for downloading a page and pulling links, images and text from it.

    Every extraction method takes a parsed BeautifulSoup document (as returned
    by ``makeSoup``) and returns a plain list of strings.
    """

    def __init__(self):
        pass

    def getHTML(self, url, timeout=None):
        """Download ``url`` and return the response body as text.

        ``timeout`` is forwarded to ``requests.get``; the default ``None``
        keeps the previous behaviour of waiting indefinitely.
        """
        return requests.get(url, timeout=timeout).text

    def makeSoup(self, html_content):
        """Parse ``html_content`` with the lxml parser."""
        return BeautifulSoup(html_content, 'lxml')

    def _iterValidLinks(self, html_soup):
        """Yield ``(link, text)`` for each distinct usable anchor, in page order.

        Anchors without an ``href``, with an empty ``href``, or pointing only at
        an in-page fragment (``#...``) are skipped. A trailing ``#fragment`` is
        stripped before de-duplication, and only the first anchor seen for a
        given link is reported.
        """
        seen = set()
        for anchor in html_soup.find_all('a'):
            # .get() avoids a KeyError on anchors that carry no href at all.
            href = anchor.get('href')
            if not href or href.startswith('#'):
                continue
            link = href.split('#', 1)[0]
            if link in seen:
                continue
            seen.add(link)
            yield link, anchor.text

    # This function returns all the links of a webpage, except links that only
    # focus an element of the same page (starting with #)
    def getAllLinks(self, html_soup):
        """Return the distinct link targets of the page, in document order."""
        return [link for link, _ in self._iterValidLinks(html_soup)]

    def getAllLinksText(self, html_soup):
        """Return the anchor texts matching ``getAllLinks`` index for index.

        De-duplication is done on the link target (not on the text), so the
        n-th entry here is the text of the n-th entry of ``getAllLinks``.
        """
        return [text for _, text in self._iterValidLinks(html_soup)]

    #This function will get the urls of all the images in a page
    def getImgLinks(self, html_soup):
        """Return the ``src`` of every ``<img>`` that has one, in document order."""
        sources = (img.get('src') for img in html_soup.find_all('img'))
        # Images without a src attribute are skipped instead of raising KeyError.
        return [src for src in sources if src is not None]

    #This function will get the text of each of the given elements
    def getContent(self, elements):
        """Return the ``.text`` of every element in ``elements`` as a list."""
        return [element.text for element in elements]

    #This function will get the text data in a page
    def getTextData(self, html_soup):
        """Return the text of paragraphs, links, big titles and h1-h5 headings.

        Texts are grouped by element kind in this fixed order: ``p``, ``a``,
        ``div.bigtitle``, then ``h1`` to ``h5``.
        """
        text_data = []
        for tag_name in ('p', 'a'):
            text_data += self.getContent(html_soup.find_all(tag_name))
        text_data += self.getContent(html_soup.find_all('div', class_='bigtitle'))
        for tag_name in ('h1', 'h2', 'h3', 'h4', 'h5'):
            text_data += self.getContent(html_soup.find_all(tag_name))
        return text_data


In [2]:
# Root of the site being scraped; navigation hrefs are appended to it below.
baseURL = 'http://midas.iiitd.edu.in'

# Accumulator that is eventually dumped to jsonwebdata.json.
jsondata = dict(imagesurl=[], text=[])

# Fetch and parse the landing page once, then read its anchors.
midaslab = WebScrap()
web_content = midaslab.getHTML(baseURL)
soup = midaslab.makeSoup(web_content)
links, links_text = midaslab.getAllLinks(soup), midaslab.getAllLinksText(soup)

# Navigation pages URL: the first 8 anchors are taken to be the navigation bar.
nav_pages_links, nav_pages_links_text = links[:8], links_text[:8]

## Extracting all image links

In [3]:
# Visit every navigation page, collect the image URLs found on it, and record
# them in imglinks.txt (human-readable report) and in jsondata['imagesurl'].
with open('imglinks.txt', 'w', encoding="utf-8") as f:
    for pageno, (nav_url, nav_text) in enumerate(
            zip(nav_pages_links, nav_pages_links_text), start=1):
        # urljoin resolves relative hrefs and leaves absolute ones intact,
        # which plain string concatenation with baseURL does not.
        page_url = urljoin(baseURL, nav_url)
        web_content_navpage = midaslab.getHTML(page_url)
        soup_navpage = midaslab.makeSoup(web_content_navpage)
        imglinks = midaslab.getImgLinks(soup_navpage)
        # The first navigation entry is always labelled as the landing page.
        if pageno == 1:
            nav_text = 'Home'
        #Text Data
        f.write('Page No-{}, Name - {}\n'.format(pageno, nav_text))
        f.write('Page Link- {}\n'.format(page_url))
        f.write('Image Links--\n')
        for img_link in imglinks:
            # Resolve against the page itself so page-relative srcs also work.
            f.write("{}\n".format(urljoin(page_url, img_link)))
        f.write('----------------------------------\n')
        #Json data: the image links are stored exactly as found in the page
        rowdict = {"pageno": pageno, "label": nav_text, "Page URL": page_url, "Image Links": list(imglinks)}
        jsondata['imagesurl'].append(rowdict)

In [4]:
# Show the generated report. The file was written as UTF-8, so it is read back
# with the same encoding instead of the platform default.
with open('imglinks.txt', encoding="utf-8") as f:
    print(f.read())

Page No-1, Name - Home
Page Link- http://midas.iiitd.edu.in/
Image Links--
http://midas.iiitd.edu.in/assets/themes/lab/images/logo/LOGO.png
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/banner.jpeg
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/mohit_bansal.jpeg
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/utah.jpeg
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/photo_shoot_1.jpg
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/photo_shoot_2.jpg
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/btp_mtp.jpg
http://midas.iiitd.edu.in/assets/themes/lab/images/banner/diwali_2019.jpg
----------------------------------
Page No-2, Name - Research
Page Link- http://midas.iiitd.edu.in/projects/
Image Links--
http://midas.iiitd.edu.in/assets/themes/lab/images/logo/LOGO.png
----------------------------------
Page No-3, Name - Team
Page Link- http://midas.iiitd.edu.in/team/
Image Links--
http://midas.iiitd.edu.in/assets/themes/lab/ima

## Extracting all texts from page links

In [5]:
# Visit every navigation page, collect its text data, and record it in
# textdata.txt (human-readable report) and in jsondata['text'].
with open('textdata.txt', 'w', encoding="utf-8") as f:
    for pageno, (nav_url, nav_text) in enumerate(
            zip(nav_pages_links, nav_pages_links_text), start=1):
        # urljoin resolves relative hrefs and leaves absolute ones intact,
        # which plain string concatenation with baseURL does not.
        page_url = urljoin(baseURL, nav_url)
        web_content_navpage = midaslab.getHTML(page_url)
        soup_navpage = midaslab.makeSoup(web_content_navpage)
        textdata = midaslab.getTextData(soup_navpage)
        # The first navigation entry is always labelled as the landing page.
        if pageno == 1:
            nav_text = 'Home'
        f.write('Page No-{}, Name - {}\n'.format(pageno, nav_text))
        f.write('Page Link- {}\n'.format(page_url))
        f.write('Text Data--\n')
        for textdata_row in textdata:
            f.write("{}\n".format(textdata_row))
        f.write('----------------------------------\n\n')
        #Json data: json.dump escapes quotes itself, so the rows are stored as-is.
        # Previously this list was left empty and filed under "Image Links".
        textdataarr = list(textdata)
        rowdict = {"pageno": pageno, "label": nav_text, "Page URL": page_url, "Text Data": textdataarr}
        jsondata['text'].append(rowdict)

In [6]:
# Read the whole text report back and echo it for inspection.
with open('textdata.txt', encoding="utf-8") as f:
    report_text = f.read()
print(report_text)

Page No-1, Name - Home
Page Link- http://midas.iiitd.edu.in/
Text Data--
MIDAS is a group of researchers at IIIT-Delhi who study, analyze, and build different multimedia systems for society leveraging multimodal information. MIDAS stands for Multimodal Digital Media Analysis Lab and it is founded by Dr. Rajiv Ratn Shah. Dr. Shah is an assistant professor in the Department of Computer Science and Engineering (jointly appointed with the Department of Human-Centered Design) at IIIT-Delhi. Our work at MIDAS includes Machine Learning, Multimedia Content Processing, Natural Language Processing, Image Processing, Multimodal Computing, Data Science, and Social Media Computing towards AI for Social Good. We believe in multidisciplinary collaborative research and work closely with eminent researchers from the National University of Singapore (NUS), Georgia Institute of Technology, The University of Texas at Austin, National Institute of Informatics (NII), Bloomberg, SLTI, and others.

Congratula

In [7]:
# Serialise the collected records as 2-space indented JSON and save them.
with open('jsonwebdata.json', 'w') as f:
    f.write(json.dumps(jsondata, indent=2))

## Saving all collected data as a JSON file

In [8]:
# Read the saved JSON file back and echo it for inspection.
with open('jsonwebdata.json') as f:
    json_text = f.read()
print(json_text)

{
  "imagesurl": [
    {
      "pageno": 1,
      "label": "Home",
      "Page URL": "http://midas.iiitd.edu.in/",
      "Image Links": [
        "/assets/themes/lab/images/logo/LOGO.png",
        "/assets/themes/lab/images/banner/banner.jpeg",
        "/assets/themes/lab/images/banner/mohit_bansal.jpeg",
        "/assets/themes/lab/images/banner/utah.jpeg",
        "/assets/themes/lab/images/banner/photo_shoot_1.jpg",
        "/assets/themes/lab/images/banner/photo_shoot_2.jpg",
        "/assets/themes/lab/images/banner/btp_mtp.jpg",
        "/assets/themes/lab/images/banner/diwali_2019.jpg"
      ]
    },
    {
      "pageno": 2,
      "label": "Research",
      "Page URL": "http://midas.iiitd.edu.in/projects/",
      "Image Links": [
        "/assets/themes/lab/images/logo/LOGO.png"
      ]
    },
    {
      "pageno": 3,
      "label": "Team",
      "Page URL": "http://midas.iiitd.edu.in/team/",
      "Image Links": [
        "/assets/themes/lab/images/logo/LOGO.png",
        "/ass