# Parse Tilaks Kaggle dataset into a csv

## Setup

In [10]:
import glob
import re
import statistics
from pathlib import Path
import pandas as pd
import tiktoken
import uuid

directory = Path("../text/KaggleTilak")
out_directory = Path("../data")
glossary = directory/'kaggla_tilak_glossary.txt'
stories = directory/'kaggle_tilak_mahabharata.txt'

In [11]:
## Calculate number of tokens in a string. 
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens

## Defining Directories

In [12]:
directory = Path('../text/KaggleTilak')
subdirectory = directory/'books'
file = directory/'1-18 books combined.txt'
source = 'https://www.kaggle.com/datasets/tilakd/mahabharata'

## Split the books

In [13]:

book_text = ""
current_book = "NA"
outfile = ""
book_number = 0

with open(file) as books:
     for line in books:
          ## Match start of a book
          if re.match(r".{1,20}Parva *$", line):
               ## Skip initial empty text
               if not (current_book == "NA"):
                    outfile = directory/f"books/{book_number}_{current_book.replace(' ', '_')}.txt"
                    outfile.write_text(book_text.strip())

               # print("Parva", line)
               current_book = line.replace("\n", "").strip()
               book_number += 1
               book_text = ""
               continue
          book_text = book_text + "\n" + line

     ## Last book
     outfile = directory/f"books/{book_number}_{current_book.replace(' ', '_')}.txt"
     outfile.write_text(book_text.strip())



## Read each book and split into chapters

Chapter patterns

`Chapter One -> '^(Chapter) (?!Commentary).* *$' `

Commentary pattern 

`Chapter Commentary -> '^(Chapter) (Commentary).* *$'`

In [14]:
file_list = glob.glob(f"{subdirectory}/*.txt")

chapter_pattern = '^(Chapter) (?!Commentary).* *$'
commentary_patterns = '^(Chapter) (Commentary).* *$'

data = []

In [15]:

for file_name in file_list:

     ## Get book name and number from file name
     file = file_name.split("/")[-1]
     book_number = file[:len(file)-4].split('_')[0]
     book_name = " ".join(file[:len(file)-4].split('_')[1:3])

     next_line = "Chapter Heading"
     current_chapter = "NA"
     commentary = False
     section_number = 0

     with open(file_name) as book:
          for line in book:
               if not line.strip():
                    # print("___Empty Line___")
                    continue
          
               ## Match Chapter
               if re.match(chapter_pattern, line):
                    # print("Chapter", line)
                    current_chapter = line.replace("\n", "").strip()
                    commentary = False
                    next_line = 'Chapter Heading'
                    continue

               ## Match chapter heading
               if next_line == "Chapter Heading":
                    # print("Chapter Heading ->", line)
                    current_chapter_heading = line.replace("\n", "").strip()
                    next_line = "NA"
                    continue

               ## Match Commentary
               if re.match(commentary_patterns, line):
                    commentary = True
               
               ## Every line is read as section. Hence adding section number to maintain the chronology. 
               section_number += 1
               ## Adding chunk_id, unique across all books
               chunk_id = uuid.uuid4().hex
               data = data + [[
                    book_number, 
                    book_name, 
                    current_chapter, 
                    current_chapter_heading, 
                    commentary, 
                    line, 
                    section_number, 
                    chunk_id]]

## Prepare the Dataframe

In [16]:
df_lines = pd.DataFrame(
    data, 
    columns=[
        'book_number', 
        'book_name', 
        'chapter_name', 
        'title', 
        'commentary', 
        'text', 
        'section_number', 
        'chunk_id']
    )
    
df_lines['book_number'] = df_lines['book_number'].astype('int')
df_lines['source'] = source
df_lines.head()


Unnamed: 0,book_number,book_name,chapter_name,title,commentary,text,section_number,chunk_id,source
0,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,"After Abhimanyu's marriage, there was royal fe...",1,749287ce7d7d45ce83aa34cb21d331e3,https://www.kaggle.com/datasets/tilakd/mahabha...
1,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,After listening to the words of His younger br...,2,cec334e39d534944843a1f61869085a0,https://www.kaggle.com/datasets/tilakd/mahabha...
2,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,After Lord Balarama finished expressing His op...,3,6a7037a38f2f4c468af5e180e30db35f,https://www.kaggle.com/datasets/tilakd/mahabha...
3,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,"After Satyaki's speech, the eminent king Drupa...",4,4c009fedc8c6435d970e27aab0a0795f,https://www.kaggle.com/datasets/tilakd/mahabha...
4,5,Udyoga Parva,Chapter One,The Pandavas Prepare for War,False,"Lord Krishna then gave the final opinion, King...",5,f432b5d49cfd48828cb95e9b319e9ccd,https://www.kaggle.com/datasets/tilakd/mahabha...


## Write the dataframe to csv

In [17]:

df_lines['num_tokens'] = df_lines['text'].apply(num_tokens_from_string)

In [18]:
df_lines.to_csv(out_directory/"kaggle_tilak_summaries.csv", index=False, sep="|")
