<a href="https://colab.research.google.com/github/saishdesai23/Web-Scrapping-from-Project-Gutenberg/blob/main/Web_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Web Scrapping mini Assignment
Author: Saish Desai
references - https://elitedatascience.com/python-web-scraping-libraries

In [145]:
!pip install syllapy



# 1. Installing all the required libraries

In [146]:
# request library
# guide to use request - https://docs.python-requests.org/en/master/user/quickstart/
import requests 
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup # library for pulling out HTML and XML files
from textblob import TextBlob # part of speach tagging
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer # Tf - idf vectorization
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import spacy
import syllapy
import random


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#2. Extracting the link for book data from the "Project Gutenberg" Website.

In [147]:
# Function for extracting book text link
def url_to_html(website_link : str,book_id : int):
    """
    A function to get the website link and book id and return the html parsed format of the chosen book.
    :param website_link: link of the website where books are stored
    :param book_id: id of the book for which the data is to be extracted
    :returns: returns content from the chosen webpage in html format
    """
    ebook_link = website_link + "/ebooks/" + book_id
    headers = {'Saish Desai': 'Web scraper - school project (sbdesai2@illinois.edu)'}
    book = requests.get(ebook_link, params = None) #using get request to get the website link for chosen book
    if book.status_code == 200:
        soup = BeautifulSoup(book.text, 'html.parser')# using the BeaitifulSoup module to parse the html format
        link = soup.find(type=re.compile("text/html")) # selecting tag to extract html format of the book
        text_link  = link.get('href')
        ebook_text_link = website_link + text_link #link for html format of the book
        s = requests.Session()
        book_data = s.get(ebook_text_link)
        print(ebook_text_link)
        soup_data = BeautifulSoup(book_data.text, 'html.parser')
        return soup_data
    else:
        print("Error:",book.status_code)

For this project, I am considering the book- "TWENTY THOUSAND LEAGUES UNDER THE SEA". The code for this book under Project Gutenberg is 164. So the user will have to enter the code - 164.

In [148]:
website_link = "https://www.gutenberg.org"
book_id = input("Enter Book ID: ")
soup_data = url_to_html(website_link,book_id)

Enter Book ID: 164
https://www.gutenberg.org/files/164/164-h/164-h.htm


In [149]:
# Function for converting Roman numerals to integers
# https://www.tutorialspoint.com/roman-to-integer-in-python
def romanToInt(s:str) -> int:
      """
      Function to convert string of roman characters to integers.
      """
      roman = {'I':1,'V':5,'X':10,'L':50,'C':100,'D':500,'M':1000,'IV':4,'IX':9,'XL':40,'XC':90,'CD':400,'CM':900}
      i = 0
      num = 0
      while i < len(s):
         if i+1<len(s) and s[i:i+2] in roman:
            num+=roman[s[i:i+2]]
            i+=2
         else:
            #print(i)
            num+=roman[s[i]]
            i+=1
      return num

#3. Extracting the book title from the html file of the book.
We look at the "h1" tag from the html file of the book. We identify that there are to places in the html file where the tag has been used. We consider the text of the first "h1" tag as the title of the book

In [150]:
l = [] # list to store test of all the "h1" tags from the html file of the book
for ele in soup_data.find_all("h1"):
  ele = (ele.text)
  ele = re.sub("[^a-zA-Z0-9., ]","",ele)
  l.append(ele)
book_title = l[0]
print(" The title of the book is: ", book_title)

 The title of the book is:  TWENTY THOUSAND LEAGUES UNDER THE SEA


#4. Storing the book contents in a dictionary

In [156]:
# Extracting text data from the book
headers_3 = soup_data.find_all("h3")
list_chap = []
for ele in headers_3:
    # print(ele)
    chap_dict = {}
    if "CHAPTER" in ele.text:
        chap_num  = romanToInt(str(ele.text.split()[1])) # storing chapter number
        chap_name = (ele.find_next("h3")).text.strip() # strong chapter name
        chap_dict['CHAPTER '+ str(chap_num)] = chap_name # generating key for the dictionary
        paras = ele.find_all_next('p',limit =5) #storing first 5 paragrahs of each chapter
        paras =  [" ".join(ele.get_text().split()) for ele in paras]
        paras = [ele.replace("—"," ") for ele in paras] # replacing the special character "—" with space " ".
        chap_text = "".join(paras)
        chap_text = re.sub("[^a-zA-Z0-9., ]","",chap_text)
        chap_dict['content'] = chap_text
        list_chap.append(chap_dict)
book = {book_title:list_chap}
# Final parent dictionary storing the book title as the key and values as a list of dictionaries, each pertaining to one chapter and its contents.
book

{'TWENTY THOUSAND LEAGUES UNDER THE SEA': [{'CHAPTER 1': 'A SHIFTING REEF',
   'content': 'The year 1866 was signalised by a remarkable incident, a mysterious and puzzling phenomenon, which doubtless no one has yet forgotten. Not to mention rumours which agitated the maritime population and excited the public mind, even in the interior of continents, seafaring men were particularly excited. Merchants, common sailors, captains of vessels, skippers, both of Europe and America, naval officers of all countries, and the Governments of several States on the two continents, were deeply interested in the matter.For some time past vessels had been met by an enormous thing, a long object, spindleshaped, occasionally phosphorescent, and infinitely larger and more rapid in its movements than a whale.The facts relating to this apparition entered in various logbooks agreed in most respects as to the shape of the object or creature in question, the untiring rapidity of its movements, its surprising p

#5. Creating Poetry from the chapter text, using 5-7-5 syllable pattern

In [152]:
# Gathering all the content present in the scrapped data
text = ""
for ele in list_chap:
    # print(ele['content'])
    text+=ele['content']
    # text.append(ele['content'].split("."))
    text = re.sub("[;]",",",text)
    text = re.sub("[^a-zA-Z0-9.,]"," ",text)
text = text.split(".")
text[:10]

['The year 1866 was signalised by a remarkable incident, a mysterious and puzzling phenomenon, which doubtless no one has yet forgotten',
 ' Not to mention rumours which agitated the maritime population and excited the public mind, even in the interior of continents, seafaring men were particularly excited',
 ' Merchants, common sailors, captains of vessels, skippers, both of Europe and America, naval officers of all countries, and the Governments of several States on the two continents, were deeply interested in the matter',
 'For some time past vessels had been met by an enormous thing, a long object, spindleshaped, occasionally phosphorescent, and infinitely larger and more rapid in its movements than a whale',
 'The facts relating to this apparition entered in various logbooks agreed in most respects as to the shape of the object or creature in question, the untiring rapidity of its movements, its surprising power of locomotion, and the peculiar life with which it seemed endowed',


1) Here we use the spacy library to identify 2-word, 3-word and 4-word pattern in chapter content. Using the matcher module from spacy we identify phrases.
Each word in the phrase extracting using the pos tagging technique od spacy.

2) In the second part of the function we use the syllapy package to identify syllable count of each pharse. Considering the 5-7-5 pattern, we consider three line poetry with 1st line having 5 syllables, 2nd line having 7 syllables and the 3rd line again having 5 syllables. This we only consider the phrases with 5 and 7 syllable count.

In [153]:
# Creating poetry from the book text
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
def phrase_gen(text):
  """
  Function to generate two, three and four word phrases
  """
  # 2-word pattern
  matcher2word = Matcher(nlp.vocab)
  pattern = [{"POS":{"IN": ["NOUN", "ADV", "ADJ", "ADP"]}},
           {"POS":{"IN": ["VERB", "NOUN"]}}]
  matcher2word.add("2words", [pattern])

  # 3-word pattern
  matcher3word = Matcher(nlp.vocab)
  pattern = [{"POS":{"IN": ["NOUN", "ADV", "ADJ", "VERB", "ADP"]}},
           {"IS_ASCII": True, "IS_PUNCT": False},
           {"POS":{"IN": ["VERB", "NOUN", "ADV", "ADJ"]}}]
  matcher3word.add("3words", [pattern])

  # Groups of 4 words
  matcher4word = Matcher(nlp.vocab)
  pattern = [{"POS":{"IN": ["NOUN", "ADV", "ADJ", "VERB", "ADP"]}},
           {"IS_ASCII": True, "IS_PUNCT": False},
           {"IS_ASCII": True, "IS_PUNCT": False},
           {"POS":{"IN": ["VERB", "NOUN", "ADV", "ADJ"]}}]
  matcher4word.add("4words", [pattern])

  # Identify patterns in the text

  doc = nlp(text)
  lines_5_syll = []
  lines_7_syll = []
  matches2word = matcher2word(doc)
  matches3word = matcher3word(doc)
  matches4word = matcher4word(doc)
  for match_id, start, end in matches2word + matches3word + matches4word:
   string_id = nlp.vocab.strings[match_id]
   span = doc[start:end]
   syllable_count = 0
   for token in span:
     syllable_count += syllapy.count(token.text)
     if syllable_count == 5:
       if span.text not in lines_5_syll:
         lines_5_syll.append(span.text)
     if syllable_count == 7:
      if span.text not in lines_7_syll:
        lines_7_syll.append(span.text)
  return (lines_5_syll,lines_7_syll )

In [154]:
t = phrase_gen(" ".join(text))

Now lets print a random poetry and see the creativity of out machine

In [155]:

print("Poetry")
print("{0}\n{1}\n{2}".format(random.choice(t[0]), random.choice(t[1]), random.choice(t[0])))


Poetry
freshened it was most
terrible points so dreaded
beacon lights dying
