In [16]:
# Read the raw transcript file as a list of lines.
# The encoding is passed explicitly because the speeches are Hindi (Devanagari)
# text and the platform default encoding (e.g. cp1252 on Windows) cannot be
# relied on to decode it.
# NOTE(review): assumes the file is saved as UTF-8 - confirm.
with open('narendra_modi_hindi.txt', 'r', encoding='utf-8') as myfile:
    data = myfile.readlines()

In [17]:
# readlines() gave us one list element per line of the file.
# Glue those elements back together so the whole file is one single string.
data = "".join(data)

In [18]:
# The '==========\n' marker separates consecutive speeches, so cutting the
# text at that marker yields a list with one element per speech.
data = data.split(sep='==========\n')

In [19]:
# Drop U+200D (zero width joiner), which shows up inside joint letters
data = [chunk.replace("\u200d", "") for chunk in data]

In [20]:
# Container for the speeches once each one has had its basic preprocessing
speeches = list()

In [21]:
# Split every raw chunk into its title and its body.
# Paragraphs are separated by a blank line ('\n\n'): the first paragraph is the
# title, the remaining ones are re-joined with '\n ' to form the speech text.
# Both result lists are (re)created here so that re-running this cell does not
# append the same speeches a second time and leave `titles` and `speeches`
# with different lengths.
titles = []
speeches = []
for raw_speech in data:
    # str.split always returns at least one element, so `title` is always bound
    title, *paragraphs = raw_speech.split('\n\n')
    titles.append(title)
    speeches.append('\n '.join(paragraphs))

In [22]:
# Commas and full stops sit directly against the preceding word, so a tokenizer
# would treat e.g. "India." as one word instead of "India" and "." separately.
# Insert a space in front of every comma and every danda ("।", the Hindi full
# stop) so that these punctuation marks become tokens of their own.
speeches = [text.replace(",", " ,").replace("।", " ।") for text in speeches]

In [23]:
# A sample title: display the first extracted title as a quick sanity check
titles[0]

'Independence Day speech 2019'

In [24]:
# Surround every newline with spaces so that it is never glued to a neighbouring word
speeches = [text.replace("\n", " \n ") for text in speeches]

In [25]:
# It was observed that an English word is often written with no space before
# the Hindi word that immediately follows it, and the same happens with some
# punctuation marks. So we make sure a space exists in such cases.
import re


def _separate_tokens(text):
    """Return one speech with spaces added after Latin-letter runs and around ? ! । ;"""
    # Zero-width match between a Latin letter and a following character that is
    # neither whitespace nor a Latin letter; a space is inserted at that point.
    text = re.sub(r"(?<=[A-Za-z])(?=[^\sA-Za-z])", " ", text).strip()
    for mark, padded in (("?", " ? "), ("!", " ! "), ("।", " । "), (";", " ; ")):
        text = text.replace(mark, padded)
    return text


fin_speeches = [_separate_tokens(stmt) for stmt in speeches]

In [26]:
# Collapse every run of whitespace (newlines included) left over from the
# substitutions into a single space
fin_speeches = [" ".join(text.split()) for text in fin_speeches]

In [27]:
import pandas as pd

In [28]:
# Start a DataFrame that holds the speech titles and preview its first rows
df = pd.DataFrame({'title': titles})
df.head()

Unnamed: 0,title
0,Independence Day speech 2019
1,Article 370
2,Kargil commemoration
3,Book release on former PM
4,Address to party workers in Uttar Pradesh


In [29]:
# Add the preprocessed speeches as a second column next to their titles
# and preview the first rows
df['speech'] = fin_speeches
df.head()

Unnamed: 0,title,speech
0,Independence Day speech 2019,"मेरे प्यारे देशवासियो , स्वतंत्रता के इस पवित्..."
1,Article 370,"मेरे प्यारे देशवासियों , एक राष्ट्र के तौर पर ..."
2,Kargil commemoration,"रक्षामंत्री , श्रीमान राजनाथ सिंह जी ; राज्य र..."
3,Book release on former PM,"आदरणीय उप-राष्ट्रपति जी , लोकसभा के स्पीकर श्र..."
4,Address to party workers in Uttar Pradesh,हर-हर महादेव । भारतीय जनता पार्टी के कार्यकारी...


In [30]:
# Save titles and speeches as CSV. `index` is documented as a boolean, so pass
# False explicitly (instead of relying on None being falsy) to leave the row
# index out of the file.
df.to_csv('modi_hindi_speech.csv', index=False)

In [31]:
# Write all preprocessed speeches into one text file, separated by a
# '#DELIMITER#' line. The encoding is given explicitly: the speeches contain
# Devanagari characters, which the platform default encoding (e.g. cp1252 on
# Windows) cannot encode.
with open('modi_hindi_speech.txt', "w", encoding='utf-8') as w:
    w.write('\n#DELIMITER#\n'.join(fin_speeches))