In [1]:
##################################################################################################
##   Notebook used for extracting text from html files. Some basic preprocessing tasks 
##   v1.0 Reading text using BeautifulSoup   
##   Required Packages: os, BeautifulSoup
##   The html files are not included in the repository
##   They can be downloaded from the following link
##   https://www.rbi.org.in/scripts/SearchResults.aspx?search=rajan&sp=speeches
##################################################################################################

In [2]:
import os 
from bs4 import BeautifulSoup as bs

In [3]:
## Collect the names of all the html files in the speeches directory
##

rootDir = 'E:\\NLP Session\\RBIGovernorSpeeches\\'

# os.listdir returns entries in arbitrary order, so sort the names to make
# htmlFiles[0] (used below) refer to the same file on every run.
htmlFiles = sorted(f for f in os.listdir(rootDir) if f.endswith('.html'))
htmlFiles

['Reserve Bank of India - Speeches_1.html',
 'Reserve Bank of India - Speeches_10.html',
 'Reserve Bank of India - Speeches_11.html',
 'Reserve Bank of India - Speeches_12.html',
 'Reserve Bank of India - Speeches_13.html',
 'Reserve Bank of India - Speeches_14.html',
 'Reserve Bank of India - Speeches_15.html',
 'Reserve Bank of India - Speeches_16.html',
 'Reserve Bank of India - Speeches_17.html',
 'Reserve Bank of India - Speeches_18.html',
 'Reserve Bank of India - Speeches_19.html',
 'Reserve Bank of India - Speeches_2.html',
 'Reserve Bank of India - Speeches_20.html',
 'Reserve Bank of India - Speeches_21.html',
 'Reserve Bank of India - Speeches_22.html',
 'Reserve Bank of India - Speeches_23.html',
 'Reserve Bank of India - Speeches_24.html',
 'Reserve Bank of India - Speeches_25.html',
 'Reserve Bank of India - Speeches_26.html',
 'Reserve Bank of India - Speeches_27.html',
 'Reserve Bank of India - Speeches_28.html',
 'Reserve Bank of India - Speeches_29.html',
 'Reserve Ba

In [4]:
## Build the full path of the first html file in the set
##

# os.path.join inserts a separator only when rootDir does not already end
# with one, so the path stays valid if rootDir is edited later.
fileName = os.path.join(rootDir, htmlFiles[0])

# The print() call form runs under both Python 2 and Python 3.
print(fileName)

E:\NLP Session\RBIGovernorSpeeches\Reserve Bank of India - Speeches_1.html


In [5]:
## Opening the file and converting it to a 'soup' object

# The with-block closes the file handle as soon as parsing is done.
with open(fileName) as htmlFile:
    soup = bs(htmlFile, 'html.parser')
soup



In [6]:
## Show the parsed document as indented, human-readable markup
##
prettyHtml = soup.prettify()
print(prettyHtml)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- saved from url=(0055)https://rbi.org.in/scripts/BS_SpeechesView.aspx?Id=1021 -->
<html class=" js no-flexbox canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers applicationcache svg inlinesvg smil svgclippaths gr__rbi_org_in" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
   <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible">
    <title>
     Reserve Bank of India - Speeches
    </title>
    <meta content="" name="description">
     <meta content="" n

In [7]:
## Show the first <title> element of the page (tag included)
##
soup.find('title')

<title>Reserve Bank of India - Speeches</title>

In [8]:
## Show only the string held inside the page's first <title> element
##
soup.find('title').string

u'Reserve Bank of India - Speeches'

In [9]:
## Show the first <p> element of the page (tag included)
##
soup.find('p')

<p>Good afternoon. It is great to be invited to speak at St. Stephen's College. In 1980, I toyed with the idea of joining my best friends in applying for admission to the BA in Economics here. Because I had worked so hard for the IIT exam, however, I succumbed to the sunk cost fallacy and studied Electrical Engineering. I don\u2019t regret a moment of that misspent youth but I hope you will grant me temporary membership of your club today!</p>

In [10]:
## Show only the string held inside the page's first <p> element
##
soup.find('p').string

u"Good afternoon. It is great to be invited to speak at St. Stephen's College. In 1980, I toyed with the idea of joining my best friends in applying for admission to the BA in Economics here. Because I had worked so hard for the IIT exam, however, I succumbed to the sunk cost fallacy and studied Electrical Engineering. I don\u2019t regret a moment of that misspent youth but I hope you will grant me temporary membership of your club today!"

In [11]:
## Walk over every <a> element and print its href attribute
## (anchors without an href print as None)
##
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    print(href)

https://rbi.org.in/scripts/BS_SpeechesView.aspx?Id=1021#mainsection
javascript:__doPostBack('UsrFontCntr$LinkBtnFontIncrease','')
None
javascript:__doPostBack('UsrFontCntr$LinkBtnFontDecrease','')
javascript:__doPostBack('UsrFontCntr$LinkBtnAccessibilty','')
None
None
https://www.rbi.org.in/hindi/Home.aspx
https://rbi.org.in/home.aspx
https://rbi.org.in/home.aspx
https://rbi.org.in/Scripts/AboutusDisplay.aspx
https://rbi.org.in/Scripts/NotificationUser.aspx
https://rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx
https://rbi.org.in/Scripts/BS_ViewSpeeches.aspx
https://rbi.org.in/scripts/BS_SpeechesView.aspx?Id=1021#
https://rbi.org.in/Scripts/Publications.aspx?publication=Annual
https://rbi.org.in/Scripts/Publications.aspx?publication=HalfYearly
https://rbi.org.in/Scripts/Publications.aspx?publication=Quarterly
https://rbi.org.in/Scripts/Publications.aspx?publication=Bimonthly
https://rbi.org.in/Scripts/Publications.aspx?publication=Monthly
https://rbi.org.in/Scripts/Publications.aspx?pu

In [12]:
## Pull the text content out of the soup object and print it
##
pageText = soup.get_text()
print(pageText)





Reserve Bank of India - Speeches













//<![CDATA[
var theForm = document.forms['form1'];
if (!theForm) {
    theForm = document.form1;
}
function __doPostBack(eventTarget, eventArgument) {
    if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
        theForm.__EVENTTARGET.value = eventTarget;
        theForm.__EVENTARGUMENT.value = eventArgument;
        theForm.submit();
    }
}
//]]>















//Redirect to mobile site start
    function detectmob() {
        if (screen.width <= 599 || screen.Height <= 975) {
            return true;
        } else {
            return false;
        }
    }

    if (detectmob()) {
       window.location.href = "https://m.rbi.org.in" + window.location.pathname;
		
       
    }
	//Redirect to mobile site End
    $(document).ready(function () {
        Search();
    });

    function Search() {
        highlightTextAutocomplete();
        $(".autoSuggest").autocomplete({


            source: function (request, response) {
   

In [14]:
# Parses the file so that html tags can be picked out; the with-block closes
# the file handle as soon as parsing is done
with open(fileName) as htmlFile:
    soup = bs(htmlFile, 'html.parser')

# Removes the scripts, styling and header information from the tree
for tag in soup(["script", "style", "head", "title"]):
    tag.extract()

# Extracts the text from the soup and drops every non-ascii character.
# Decoding the encoded bytes gives back a text string; wrapping them in
# str() would produce the "b'...'" repr under Python 3.
cleaned = soup.get_text(separator=' ').encode('ascii', 'ignore').decode('ascii')

# Strips out the leading and trailing whitespace
cleanedtext = cleaned.strip()
cleanedtext

