In [10]:
import os
import re
from pathlib import Path
import pandas as pd
import glob
import tiktoken
import statistics
import uuid

## Directories and files

In [11]:
directory = Path("../text/KMGanguli")
out_directory = Path("../data")
source = "KM Ganguli Translation"
file_list = glob.glob(f"{directory}/*.txt")
file_list.sort()


In [12]:
parva_dictionary = {
    1: "Adi Parva",
    2: "Sabha Parva",
    3: "Vana Parva",
    4: "Virata Parva",
    5: "Udyoga Parva",
    6: "Bhishma Parva",
    7: "Drona Parva",
    8: "Karna Parva",
    9: "Shalya Parva",
    10: "Sauptika Parva",
    11: "Stri Parva",
    12: "Shanti Parva",
    13: "Anushasana Parva",
    14: "Ashwamedha Parva",
    15: "Ashramavasika Parva",
    16: "Mausala Parva",
    17: 'Mahaprasthanika Parva',
    18: "Svargarohana Parva"
}


## Helper functions

In [7]:
# encoder_name = "cl100k_base"
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens


def write_book_to_csv(rows, csv_name): 
    print(f"____ Writing {csv_name} To csv file ... ")
    df_kmg = pd.DataFrame(rows)
    print('Total number of Tokens ->', sum(df_kmg['num_tokens']))
    print("Average token per row ->", statistics.mean(df_kmg['num_tokens']) )
    print("Total number of rows -> ", df_kmg.shape[0])
    ## Write to CSV
    df_kmg.to_csv(out_directory/csv_name, index=False, sep="|")
    print("Done \n\n")


## Parse books into csv

In [13]:
output_csv_prefix = "km_ganguli_translation"

min_tokens = 800
max_tokens = 1000

for file in file_list:
    rows = []
    book_number = int(file.split("/")[-1].replace(".txt", ""))
    parva = parva_dictionary[book_number]
    num_tokens = 0
    
    # print(book_number)
    current_section = "NA"
    current_section_name = "NA"
    text = ""
    para_number = 1 ## The para number within a section or a sub parva.

    with open(file) as book:
        for line in book:
            num_tokens = num_tokens_from_string(text)

            ## New row if tokens of current text are more than 1K
            if ( (line.strip()=="") and (num_tokens > min_tokens) ) or (num_tokens > max_tokens ):
                # print('Row complete')
                rows = rows + [{
                    'book_number': book_number, 
                    'section': current_section, 
                    'section_name': current_section_name, 
                    'text': text, 
                    'para_number': para_number,
                    'book_name': parva,
                    'num_tokens': num_tokens,
                    'chunk_id': uuid.uuid4().hex,
                    }]
                text = ""
                para_number += 1 
                continue

            if re.match('^(SECTION) .+$', line) or re.match('^\d+ *$', line):
                # New section.
                # First write the residual text from previous section to a row.
                rows = rows + [{
                    'book_number': book_number, 
                    'section': current_section, 
                    'section_name': current_section_name, 
                    'text': text, 
                    'para_number': para_number,
                    'book_name': parva,
                    'num_tokens': num_tokens,
                    'chunk_id': uuid.uuid4().hex,
                    }]
                text = ""
                ## Reset Section                 
                current_section = line.strip()
                current_section_name = ""
                para_number = 1
                continue

            if re.match("^\(.+(Parva).*\).*$",  line, flags=re.DOTALL) or re.match(" *\[\(.*(Bhagavad Gita).*\)\].*$",  line, flags=re.DOTALL):
                # print(line)
                current_section_name = line.strip().replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("\n", "")
                continue

            text = text + line

        ## Write the final row with remaining text. 
        rows = rows + [{
                    'book_number': book_number, 
                    'section': current_section, 
                    'section_name': current_section_name, 
                    'text': text, 
                    'para_number': para_number,
                    'book_name': parva,
                    'num_tokens': num_tokens,
                    'chunk_id': uuid.uuid4().hex,
                    }]
        
    ## Write the book to dataframe and csv
    write_book_to_csv(rows, f"{output_csv_prefix}_{book_number}.csv")

##
    



____ Writing km_ganguli_translation_1.csv To csv file ... 
Total number of Tokens -> 339976
Average token per row -> 662.7212475633528
Total number of rows ->  513
Done 


____ Writing km_ganguli_translation_2.csv To csv file ... 
Total number of Tokens -> 112122
Average token per row -> 651.8720930232558
Total number of rows ->  172
Done 


____ Writing km_ganguli_translation_3.csv To csv file ... 
Total number of Tokens -> 477391
Average token per row -> 676.1912181303117
Total number of rows ->  706
Done 


____ Writing km_ganguli_translation_4.csv To csv file ... 
Total number of Tokens -> 92965
Average token per row -> 641.1379310344828
Total number of rows ->  145
Done 


____ Writing km_ganguli_translation_5.csv To csv file ... 
Total number of Tokens -> 282802
Average token per row -> 654.6342592592592
Total number of rows ->  432
Done 


____ Writing km_ganguli_translation_6.csv To csv file ... 
Total number of Tokens -> 249229
Average token per row -> 730.8768328445748
Total 

In [9]:
df = pd.read_csv(out_directory/"km_ganguli_translation_1.csv", sep="|")
df.tail()

Unnamed: 0,book_number,section,section_name,text,para_number,book_name,num_tokens,chunk_id
508,1,SECTION CCXXXIV,Khandava-daha Parva continued,"\n\n""Jaritari said, 'The person that is wise r...",1,Adi Parva,935,23d5e6649cb846c6b7d796f6a496ff77
509,1,SECTION CCXXXIV,Khandava-daha Parva continued,"""Vaisampayana continued, 'Thus addressed by Dr...",2,Adi Parva,261,76fc95399282484fb3c4527742850dce
510,1,SECTION CCXXXV,Khandava-daha Parva continued,"\n\n""Vaisampayana said, 'O thou of Kuru's race...",1,Adi Parva,830,9a3f2082621b4407829b327b6c2b1054
511,1,SECTION CCXXXV,Khandava-daha Parva continued,"""Mandapala then said, 'Who amongst these is th...",2,Adi Parva,448,3e4be01701d44cb8841e6bf3240d697e
512,1,SECTION CCXXXVI,Khandava-daha Parva continued,"\n\n""Vaisampayana said, 'Mandapala then addres...",1,Adi Parva,676,cc06d30133704dd2a614aa152f45aa88
