Web Scrapping mini Assignment
Author: Saish Desai
references - https://elitedatascience.com/python-web-scraping-libraries

In [1]:
# request library
# guide to use request - https://docs.python-requests.org/en/master/user/quickstart/
import requests 
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup # library for pulling out HTML and XML files
from textblob import TextBlob # part of speach tagging
import re
from sklearn.feature_extraction.text import TfidfVectorizer # Tf - idf vectorization
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer


In [2]:
# Extracting book text link
def url_to_html(website_link : str,book_id : int):
    """
    A function to get the website link and book id and return the html parsed format of the chosen book.
    :param website_link: link of the website where books are stored
    :param book_id: id of the book for which the data is to be extracted
    :returns: returns content from the chosen webpage in html format
    """
    ebook_link = website_link + "/ebooks/" + book_id
    headers = {'Saish Desai': 'Web scraper - school project (sbdesai2@illinois.edu)'}
    book = requests.get(ebook_link, params = None) #using get request to get the website link for chosen book
    if book.status_code == 200:
        soup = BeautifulSoup(book.text, 'html.parser')# using the BeaitifulSoup module to parse the html format
        link = soup.find(type=re.compile("text/html")) # selecting tag to extract html format of the book
        text_link  = link.get('href')
        ebook_text_link = website_link + text_link #link for html format of the book
        s = requests.Session()
        book_data = s.get(ebook_text_link)
        print(ebook_text_link)
        soup_data = BeautifulSoup(book_data.text, 'html.parser')
        return soup_data
    else:
        print("Error:",book.status_code)

In [3]:
website_link = "https://www.gutenberg.org"
book_id = input("Enter Book ID: ")
soup_data = url_to_html(website_link,book_id)
# print(soup_data)

Enter Book ID: 164
https://www.gutenberg.org/files/164/164-h/164-h.htm


In [4]:
# https://www.tutorialspoint.com/roman-to-integer-in-python
def romanToInt(s:str) -> int:
      """
      :type s: str
      :rtype: int
      """
      roman = {'I':1,'V':5,'X':10,'L':50,'C':100,'D':500,'M':1000,'IV':4,'IX':9,'XL':40,'XC':90,'CD':400,'CM':900}
      i = 0
      num = 0
      while i < len(s):
         if i+1<len(s) and s[i:i+2] in roman:
            num+=roman[s[i:i+2]]
            i+=2
         else:
            #print(i)
            num+=roman[s[i]]
            i+=1
      return num

In [5]:
# Extracting text data from the book
# headers_2 = soup_data.find_all("a",href=re.compile("#link"))
# headers_2 = soup_data.find_all("div",class_=re.compile("chapter"))
headers_3 = soup_data.find_all("h3")
list_chap = []
for ele in headers_3:
    # print(ele)
    chap_dict = {}
    if "CHAPTER" in ele.text:
        chap_num  = romanToInt(str(ele.text.split()[1])) # storing chapter number
        chap_name = (ele.find_next("h3")).text.strip() # strong chapter name
        chap_dict['CHAPTER '+ str(chap_num)] = chap_name
        paras = ele.find_all_next('p',limit =10) #storing first 2 paragrahs of each chapter
        paras =  [" ".join(ele.get_text().split()) for ele in paras]
        # print(paras)
        paras = [ele.replace("—"," ") for ele in paras] # replacing the special character "—" with space " ".
        # paras = [re.sub("[\]","",ele) for ele in paras] # replacing the special character "\" with space "".
        chap_dict['content'] = "".join(paras)
        list_chap.append(chap_dict)
print(list_chap)



In [6]:
# Gathering all the content present in the scrapped data
text = ""
for ele in list_chap:
    text+=ele['content']
    # text.append(ele['content'].split("."))
text = text.split(".")

In [7]:
# Applying Part of Speech tagging to each word in the text
prop_noun = [] # list of all proper nouns in the text
for ele in list_chap:
    PoS = TextBlob(ele['content']).tags
    prop_noun+=[ele[0].lower() for ele in PoS if ele[1] =='NNP' and ele[0] not in stopwords.words("english")]
prop_noun = list(set(prop_noun))

In [8]:
# function to clean data
# initializing stemming algorithm
ps = PorterStemmer() # porter stemmer
ss = SnowballStemmer('english') # snowball stemmer
# initializing lemmatizing algorithm
wnl = WordNetLemmatizer() # lemmatization
def text_clean_preprocess(text : str):
    """
    This function will clean the data and perform certain preprocessing step of stemming/lemmatizing the words in 
    the tweet. Finally the function will remove stopwords and will only consider with words with a character length
    ranging from 2 to 10
    
    """
    vowel = ['a','e','i','o','u']
    # text = BeautifulSoup(raw_text) # using the BeautifulSoup library to scrape the HTML tags from the text
    text = re.sub("[^a-zA-Z]"," ",text) # removing all the punctions except the expressions ":)" and ":("
    text = text.lower() # converting all words to lowercase
    text = text.strip() # striping leading and trailing white spaces
    words = text.split()
    meaningful_words = [w for w in words if w not in stopwords.words("english")]
    # meaningful_words = [ss.stem(w) for w in meaningful_words]
    return " ".join(meaningful_words)

In [9]:
data = []
vectorizer = TfidfVectorizer(vocabulary=prop_noun)
for ele in text:
    ele = text_clean_preprocess(ele)
    data.append(ele)
vectorized_data = vectorizer.fit_transform(data)
vector_dataframe = pd.DataFrame(vectorized_data.toarray(), columns = vectorizer.get_feature_names())



In [10]:
vector_dataframe.head()

Unnamed: 0,creator,tell,amongst,mont,nebraska,s.,pomotou,bougainville,magellan,african,...,niger,vanikoro,edwards,europe.our,straits,w.n.w.,hatteras,anderson,ceylon,government
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
vector_dataframe.to_csv("noun.csv")