# Parse Tilaks Kaggle dataset into a csv

## Setup

In [10]:
import glob
import re
import statistics
from pathlib import Path
import pandas as pd
import tiktoken

directory = Path("../text/KaggleTilak")
out_directory = Path("../data")
glossary = directory/'kaggla_tilak_glossary.txt'
stories = directory/'kaggle_tilak_mahabharata.txt'

## Defining Directories

In [11]:
directory = Path('../text/KaggleTilak')
subdirectory = directory/'books'
file = directory/'1-18 books combined.txt'
source = 'https://www.kaggle.com/datasets/tilakd/mahabharata'

## Split the books

In [12]:

book_text = ""
current_book = "NA"
outfile = ""
book_number = 0

with open(file) as books:
     for line in books:
          ## Match start of a book
          if re.match(r".{1,20}Parva *$", line):
               ## Skip initial empty text
               if not (current_book == "NA"):
                    outfile = directory/f"books/{book_number}_{current_book.replace(' ', '_')}.txt"
                    outfile.write_text(book_text.strip())

               # print("Parva", line)
               current_book = line.replace("\n", "").strip()
               book_number += 1
               book_text = ""
               continue
          book_text = book_text + "\n" + line

     ## Last book
     outfile = directory/f"books/{book_number}_{current_book.replace(' ', '_')}.txt"
     outfile.write_text(book_text.strip())



## Read each book and split into chapters

Chapter patterns

`Chapter One -> '^(Chapter) (?!Commentary).* *$' `

Commentary pattern 

`Chapter Commentary -> '^(Chapter) (Commentary).* *$'`

In [13]:
file_list = glob.glob(f"{subdirectory}/*.txt")

chapter_pattern = '^(Chapter) (?!Commentary).* *$'
commentary_patterns = '^(Chapter) (Commentary).* *$'

data = []

In [14]:

for file_name in file_list:

     ## Get book name and number from file name
     file = file_name.split("/")[-1]
     book_number = file[:len(file)-4].split('_')[0]
     book_name = " ".join(file[:len(file)-4].split('_')[1:3])

     next_line = "Chapter Heading"
     current_chapter = "NA"
     commentary = False
     section_number = 0

     with open(file_name) as book:
          for line in book:
               if not line.strip():
                    # print("___Empty Line___")
                    continue
          
               ## Match Chapter
               if re.match(chapter_pattern, line):
                    # print("Chapter", line)
                    current_chapter = line.replace("\n", "").strip()
                    commentary = False
                    next_line = 'Chapter Heading'
                    continue

               ## Match chapter heading
               if next_line == "Chapter Heading":
                    # print("Chapter Heading ->", line)
                    current_chapter_heading = line.replace("\n", "").strip()
                    next_line = "NA"
                    continue

               ## Match Commentary
               if re.match(commentary_patterns, line):
                    commentary = True
               
               ## Every line is read as section. Hence adding section number to maintain the chronology. 
               section_number += 1
               data = data + [[book_number, book_name, current_chapter, current_chapter_heading, commentary, line, section_number]]

## Prepare the Dataframe

In [15]:
df_lines = pd.DataFrame(data, columns=['book_number', 'book_name', 'chapter_name', 'title', 'commentary', 'text', 'section_number'])
df_lines['book_number'] = df_lines['book_number'].astype('int')
df_lines['source'] = source
df_lines.head()

Unnamed: 0,book_number,book_name,chapter_name,title,commentary,text,section_number,source
0,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,"After Abhimanyu's marriage, there was royal fe...",1,https://www.kaggle.com/datasets/tilakd/mahabha...
1,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,After listening to the words of His younger br...,2,https://www.kaggle.com/datasets/tilakd/mahabha...
2,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,After Lord Balarama finished expressing His op...,3,https://www.kaggle.com/datasets/tilakd/mahabha...
3,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,"After Satyaki's speech, the eminent king Drupa...",4,https://www.kaggle.com/datasets/tilakd/mahabha...
4,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,"Lord Krishna then gave the final opinion, King...",5,https://www.kaggle.com/datasets/tilakd/mahabha...


In [16]:
## df_chapters This is a separate dataframe where the entire chapter is concatinated as one row. 
df_chapters = df_lines.groupby(
    ['book_number', 'book_name', 'chapter_name', 'title', 'commentary']
    ).agg({'text': ' '.join}, {'section_number': ','.join}).reset_index()
df_chapters['book_number'] = df_chapters['book_number'].astype('int')
df_chapters['source']= source

df_chapters.head()

Unnamed: 0,book_number,book_name,chapter_name,title,commentary,text,source
0,1,Adi Parva,Chapter Eight,The Preceptor Drona,False,"Seeing the princes enter adolescence, Maharaja...",https://www.kaggle.com/datasets/tilakd/mahabha...
1,1,Adi Parva,Chapter Eight,The Preceptor Drona,True,Chapter Commentary\n After Drona tested Yudhis...,https://www.kaggle.com/datasets/tilakd/mahabha...
2,1,Adi Parva,Chapter Eighteen,Arjuna Goes on Pilgrimage,False,After leaving Indraprastha in the dress of a m...,https://www.kaggle.com/datasets/tilakd/mahabha...
3,1,Adi Parva,Chapter Eighteen,Arjuna Goes on Pilgrimage,True,Chapter Commentary\n One may question how it i...,https://www.kaggle.com/datasets/tilakd/mahabha...
4,1,Adi Parva,Chapter Eleven,Tuition for Drona,False,Drona saw that all his students were now adept...,https://www.kaggle.com/datasets/tilakd/mahabha...


In [17]:
df_lines.to_csv(out_directory/"kaggletilak_complete_text_lines.csv", index=False, sep="|")
df_chapters.to_csv(out_directory/"kaggletilak_complete_text_chapters.csv", index=False, sep="|")
