In [1]:
# Read every line of the input file into a list (one element per line).
# The speeches contain non-ASCII text (e.g. Devanagari), so the encoding is given
# explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data source.
with open('english_speeches_date_place_title_text.txt', 'r', encoding='utf-8') as myfile:
    data = myfile.readlines()

In [2]:
# At this point `data` is a list with one element per line of the file.
# Concatenate all the elements, i.e. all lines of the file, into a single string.
# NOTE: this rebinds `data` from a list to a str. Re-running this cell after the
# split cell below would glue the speeches together without their delimiter.
data = ''.join(data)

In [3]:
# Speeches are separated by a '#DELIMITER#' line, so we split the string on that keyword.
# This gives a list in which every element is one speech record.
# NOTE: if the text ends with the delimiter, the last element is an empty string.
data = data.split('#DELIMITER#\n')

In [4]:
# Each speech gets some basic preprocessing; the results are collected in this list
speeches = []

In [5]:
# Each element of `data` is one speech record whose lines are separated by '\n'.
# Line 0 is a header line (the date/occasion, per the original note) and is discarded,
# line 1 is kept as the speech title, and the remaining lines are re-joined into a
# single string that becomes the speech text.
# Both lists are (re)initialised here so that re-running this cell cannot append the
# speeches a second time and leave `speeches` and `titles` with different lengths.
titles = []
speeches = []
for speech in data:
    # A trailing delimiter in the input produces an empty chunk; skip it instead of
    # failing with an IndexError on the missing title line.
    if not speech:
        continue
    lines = speech.split('\n')
    titles.append(lines[1])
    speeches.append('\n '.join(lines[2:]))

In [6]:
# Commas and full stops sit directly against the preceding word, so a tokenizer
# would treat e.g. "India." as a single token. Insert a space in front of every
# comma and every full stop so that "India" and "." become separate tokens.
speeches = [
    speech.replace(",", " ,").replace(".", " .")
    for speech in speeches
]

In [7]:
# One sample speech after the punctuation spacing above. The output shows that
# newlines and non-English text are still present at this stage.
# Index 132 is an arbitrary sample; it requires at least 133 speeches.
speeches[132]

'Ladies and Gentlemen ,\n I am delighted that Delhi is hosting the World Diamond Conference . We are especially pleased that President Putin is here with us today .\n He is a leader of India`s key strategic partner and , personally , a great friend of India . In addition , Russia is the source of more than a quarter of the world`s production of diamonds .\n I understand that this is the first conference of its kind in the world . This is a source of great pride for us .\n I want to congratulate and thank Gem and Jewellery Export Promotion Council , the Ministry of Commerce and Industry , and , the World Diamond Mark for organising this event .\n India is the natural venue for this conference .\n For one , it is generally believed that diamond is India`s gift to the world . More than two thousand years ago , diamond was deeply valued in India . It was even traded with China over the Silk Route .\n और अभी पुतिन जी बता रहे थे कि किस प्रकार से भारत का हीरा दुनिया में जगमगा रहा था।\n It was

In [10]:
# A sample title: the entry stored at the same index as the sample speech above
titles[132]

'World Diamond Conference, Delhi'

In [8]:
# Pad every \n with a space on both sides so it stands apart from the adjacent words.
# (The newline characters themselves are not removed by this step.)
speeches = [speech.replace("\n"," \n ") for speech in speeches]

In [9]:
# Drop every character outside the allowed set: ASCII letters, digits, spaces and
# the symbols ^ , ! . / ' + - =
# This removes non-English script, and also newlines and all other punctuation
# (for example ? : ; " and backticks). Each result is then trimmed at both ends.
# NOTE(review): the second ^ in the pattern is a literal caret -- confirm it is intended.
import re
disallowed_chars = re.compile(r"[^A-Za-z0-9 ^,!.\/'+\-=]")
fin_speeches = [disallowed_chars.sub("", text).strip() for text in speeches]

In [10]:
# Collapse every run of whitespace into a single space (this also trims both ends)
fin_speeches = [' '.join(words) for words in map(str.split, fin_speeches)]

In [11]:
import pandas as pd

In [12]:
# Build the DataFrame that will later be written to CSV, starting with the titles
df = pd.DataFrame({'title': titles})
df.head()

Unnamed: 0,title
0,Niti Aayog
1,South Africa backs India's bid to join Nuclear...
2,Banquet speech by Prime Minister during his vi...
3,Mann Ki Baat
4,Afghanistan visit


In [13]:
# Add the cleaned speech texts as a second column next to their titles
df = df.assign(speech=fin_speeches)
df.head()

Unnamed: 0,title,speech
0,Niti Aayog,There was a time when development was believed...
1,South Africa backs India's bid to join Nuclear...,His Excellency President of the Republic of So...
2,Banquet speech by Prime Minister during his vi...,"Your Excellency , President Filipe Nyusi Ladie..."
3,Mann Ki Baat,"My dear countrymen , my greetings namaskar to ..."
4,Afghanistan visit,H .E . President Mohammad Ashraf Ghani Foreign...


In [14]:
# Write the title/speech table to CSV without the row index.
# `index` is a boolean flag, so pass False explicitly rather than relying on
# None happening to be falsy.
df.to_csv('modi_english_speech.csv', index=False)

In [16]:
# Also save the cleaned speeches as plain text, one after another, with a
# '#DELIMITER#' line between consecutive speeches
with open('modi_english_speech.txt', 'w') as out_file:
    joined_speeches = '\n#DELIMITER#\n'.join(fin_speeches)
    out_file.write(joined_speeches)