In [2]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.stem import RegexpStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
# English stop words from the NLTK stopwords corpus, held in a set.
stop_words = set(stopwords.words('english'))

In [241]:
# read a text file and keep only the lines containing a literal substring
def read_file(file_name, pattern):
    """Return the lines of ``file_name`` that contain ``pattern``.

    Args:
        file_name: Path of the text file to read.
        pattern: Plain substring to look for (tested with ``in``, not a regex).

    Returns:
        list[str]: Matching lines in file order, each with its original line
        ending still attached.
    """
    with open(file_name, 'r') as handle:
        return [text for text in handle if pattern in text]

# read a text file and keep only the lines matched by a regular expression
def read_file_regex(file_name, pattern):
    """Return the lines of ``file_name`` in which ``pattern`` is found.

    Args:
        file_name: Path of the text file to read.
        pattern: Regular expression handed to ``re.search`` for every line.

    Returns:
        list[str]: Matching lines in file order, each with its original line
        ending still attached.
    """
    with open(file_name, 'r') as handle:
        return [text for text in handle if re.search(pattern, text)]

def read_file_chunks(file_name, pattern):
    """Split a text file into chunks of lines separated by boundary lines.

    A line in which ``pattern`` is found (via ``re.search``) is a boundary: it
    closes the chunk being collected and is not itself stored. Empty chunks
    (consecutive boundaries, or a boundary on the first line) are never
    emitted.

    Args:
        file_name: Path of the text file to read.
        pattern: Regular expression identifying boundary lines.

    Returns:
        list[list[str]]: The chunks in file order; each chunk is the list of
        its lines with their original line endings.
    """
    chunks = []
    chunk = []
    with open(file_name, 'r') as f:
        for line in f:
            if re.search(pattern, line):
                if chunk:
                    chunks.append(chunk)
                    chunk = []
            else:
                chunk.append(line)

    # Flush whatever follows the last boundary; without this the final chunk
    # (or the whole file, when it contains no boundary) would be lost.
    if chunk:
        chunks.append(chunk)
    return chunks





In [361]:
# --- Patterns describing the layout of the scraped text ---------------------
# A chunk boundary: a line consisting only of three or more '=' characters.
pattern_chunk_boundary = r'^===+$'
# Captures a dotted three-part number found between '#' and '@'
# (presumably kanda.sarga.sloka -- confirm against the scraped file).
pattern_sloka_number = r'^.*#\s*(\d+\.\d+\.\d+)\s*@.*$'
# A line whose first character after optional whitespace is Devanagari
# (U+0900-U+097F) or whitespace, and which contains at least one comma.
pattern_sloka_translate_line = r"^\s*[\u0900-\u097F\s].*,.*$"
# Group 1: leading run of Devanagari/whitespace; group 2: the remainder,
# which must start with a word or whitespace character.
pattern_sloka_translates = r"^\s*([\u0900-\u097F\s]+)([\w\s]+.*)"

chunks = read_file_chunks("./scraped/ramayana-iitk-kanda-1to5.txt", pattern_chunk_boundary)

# For every chunk, all sloka numbers found in its lines (possibly none).
sloka_numbers = [
    [
        re.search(pattern_sloka_number, line).group(1)
        for line in chunk
        if re.search(pattern_sloka_number, line)
    ]
    for chunk in chunks
]

# One entry per translation line; each entry lists [sloka, pada, english]
# triples, one per comma-separated part of that line that matches.
# Only the first sloka number of a chunk is used to label its parts.
sloka_translates = [
    [
        # ASCII ':' is replaced by the Devanagari visarga before the groups
        # are extracted; the filter below is applied to the unmodified part.
        [sloka_number[0]]
        + [
            x.strip()
            for x in re.match(
                pattern_sloka_translates, sloka_line_part.replace(":", "ः")
            ).groups()
        ]
        for sloka_line_part in sloka_line.strip().split(',')
        if re.match(pattern_sloka_translates, sloka_line_part)
    ]
    for chunk, sloka_number in zip(chunks, sloka_numbers) if len(sloka_number) > 0
    for sloka_line in chunk if re.search(pattern_sloka_translate_line, sloka_line)
]

# Flatten to one row per (sloka, pada, english) triple.
iitk_ramayana_translates = pd.DataFrame(
    [x1 for x in sloka_translates for x1 in x],
    columns=['sloka', 'pada', 'english']
)

# POS-tag every sentence of the English gloss: one list of (word, tag) pairs
# per sentence. Raw string for the regex: "\s" in a plain string literal is an
# invalid escape sequence.
iitk_ramayana_translates['speech_part'] = iitk_ramayana_translates.english.apply(
    lambda x: [nltk.pos_tag(re.split(r"\s+", ws)) for ws in sent_tokenize(x)]
)

# Keep the (word, tag) pairs whose tag starts with 'V', looking at the first
# sentence only; an empty gloss yields an empty list.
verb_part = iitk_ramayana_translates.speech_part.apply(
    lambda x: [(w, t) for w, t in x[0] if re.match(r"^V.*", t)] if x else []
)
iitk_ramayana_translates['verb_part'] = verb_part

# Despite the name, `is_verb` is the number of verb-tagged words, not a flag.
is_verb = iitk_ramayana_translates.verb_part.apply(len)
iitk_ramayana_translates['is_verb'] = is_verb
iitk_ramayana_translates.to_pickle("./scraped/iitk_ramayana_translates.pkl")


In [450]:
# Reload the tagged translations from the pickle at this path.
# NOTE: unpickling can execute arbitrary code -- only load files this notebook
# produced itself.
iitk_ramayana_translates = pd.read_pickle("./scraped/iitk_ramayana_translates.pkl")

# Rows with at least one verb-tagged word, de-duplicated on the text columns,
# without missing values, reduced to the columns of interest and ordered by
# pada. Built as a single non-mutating chain (no inplace sort on a derived
# frame).
iitk_ramayana_verbs = (
    iitk_ramayana_translates[iitk_ramayana_translates.is_verb > 0]
    .drop_duplicates(subset=['sloka', 'pada', 'english'], keep='first')
    .dropna()
    .reset_index(drop=True)
    .loc[:, ["sloka", "pada", "english", "verb_part"]]
    .sort_values(by=['pada'])
)

# Discard rows whose pada is empty, then renumber the rows 0..n-1.
iitk_ramayana_verbs = iitk_ramayana_verbs[
    iitk_ramayana_verbs.pada.apply(lambda x: len(x) > 0)
].reset_index(drop=True)

# Round-trip through TSV. After reloading, `verb_part` holds the string form
# of each list (e.g. "[('known', 'VBN')]") rather than a Python list.
iitk_ramayana_verbs.to_csv("./scraped/iitk_ramayana_verbs.tsv", index=False, sep="\t")
iitk_ramayana_verbs = pd.read_csv("./scraped/iitk_ramayana_verbs.tsv", sep="\t")
iitk_ramayana_verbs

Unnamed: 0,sloka,pada,english,verb_part
0,2.71.9,अंशुधाने,at a place known as Anshudhana,"[('known', 'VBN')]"
1,2.91.76,अंशुमतश्चापि,bristled,"[('bristled', 'VBN')]"
2,1.38.22,अंशुमान् नाम,named Anshuman,"[('named', 'VBN')]"
3,5.47.15,अंशुमालिकः,garlanded by rays,"[('garlanded', 'VBN')]"
4,5.54.44,अंशुमाली,surrounded by rays of light,"[('surrounded', 'VBN')]"
...,...,...,...,...
36058,1.4.29,ह्लादयत्,rejoicing,"[('rejoicing', 'VBG')]"
36059,1.34.17,ह्लादयन्,delighting,"[('delighting', 'VBG')]"
36060,2.3.27,ह्लादयन्तम्,gladdening,"[('gladdening', 'VBG')]"
36061,2.112.8,ह्लादितः,was gladdened,"[('was', 'VBD'), ('gladdened', 'VBN')]"


In [451]:
# Display the verbs table that was reloaded from the TSV file.
iitk_ramayana_verbs

Unnamed: 0,sloka,pada,english,verb_part
0,2.71.9,अंशुधाने,at a place known as Anshudhana,"[('known', 'VBN')]"
1,2.91.76,अंशुमतश्चापि,bristled,"[('bristled', 'VBN')]"
2,1.38.22,अंशुमान् नाम,named Anshuman,"[('named', 'VBN')]"
3,5.47.15,अंशुमालिकः,garlanded by rays,"[('garlanded', 'VBN')]"
4,5.54.44,अंशुमाली,surrounded by rays of light,"[('surrounded', 'VBN')]"
...,...,...,...,...
36058,1.4.29,ह्लादयत्,rejoicing,"[('rejoicing', 'VBG')]"
36059,1.34.17,ह्लादयन्,delighting,"[('delighting', 'VBG')]"
36060,2.3.27,ह्लादयन्तम्,gladdening,"[('gladdening', 'VBG')]"
36061,2.112.8,ह्लादितः,was gladdened,"[('was', 'VBD'), ('gladdened', 'VBN')]"
