In [2]:
import os
import re
from pathlib import Path
import pandas as pd
import glob
import tiktoken
import statistics

## Directories and files

In [3]:
directory = Path("../text/KMGanguli")
out_directory = Path("../data")
source = "http://100words.lauragibbs.net/"
file_list = glob.glob(f"{directory}/*.txt")
file_list.sort()


In [4]:
parva_dictionary = {
    1: "Adi Parva",
    2: "Sabha Parva",
    3: "Vana Parva",
    4: "Virata Parva",
    5: "Udyoga Parva",
    6: "Bhishma Parva",
    7: "Drona Parva",
    8: "Karna Parva",
    9: "Shalya Parva",
    10: "Sauptika Parva",
    11: "Stri Parva",
    12: "Shanti Parva",
    13: "Anushasana Parva",
    14: "Ashwamedha Parva",
    15: "Ashramavasika Parva",
    16: "Mausala Parva",
    17: 'Mahaprasthanika Parva',
    18: "Svargarohana Parva"
}


## Helper functions

In [5]:
# encoder_name = "cl100k_base"
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens


def write_book_to_csv(rows, csv_name): 
    print(f"____ Writing {csv_name} To csv file ... ")
    df_kmg = pd.DataFrame(rows)
    print('Total number of Tokens ->', sum(df_kmg['num_tokens']))
    print("Average token per row ->", statistics.mean(df_kmg['num_tokens']) )
    print("Total number of rows -> ", df_kmg.shape[0])
    ## Write to CSV
    df_kmg.to_csv(out_directory/csv_name, index=False, sep="|")
    print("Done \n\n")


## Parse books into csv

In [10]:
output_csv_prefix = "km_ganguli_translation"

min_tokens = 300
max_tokens = 500

for file in file_list:
    rows = []
    book_number = int(file.split("/")[-1].replace(".txt", ""))
    parva = parva_dictionary[book_number]
    num_tokens = 0
    
    # print(book_number)
    current_section = "NA"
    current_section_name = "NA"
    text = ""
    para_number = 1

    with open(file) as book:
        for line in book:
            num_tokens = num_tokens_from_string(text)

            ## New row if tokens of current text are more than 1K
            if ( (line.strip()=="") and (num_tokens > min_tokens) ) or (num_tokens > max_tokens ):
                # print('Row complete')
                rows = rows + [{
                    'book_number': book_number, 
                    'section': current_section, 
                    'section_name': current_section_name, 
                    'text': text, 
                    'para_number': para_number,
                    'book_name': parva,
                    'num_tokens': num_tokens,
                    }]
                text = ""
                para_number += 1
                continue

            if re.match('^(SECTION) .+$', line) or re.match('^\d+ *$', line):
                # First write the residual text to a row
                rows = rows + [{
                    'book_number': book_number, 
                    'section': current_section, 
                    'section_name': current_section_name, 
                    'text': text, 
                    'para_number': para_number,
                    'book_name': parva,
                    'num_tokens': num_tokens,
                    }]
                text = ""
                ## Reset Section                 
                current_section = line.strip()
                current_section_name = ""
                para_number = 1
                continue

            if re.match("^\(.+(Parva).*\).*$",  line, flags=re.DOTALL) or re.match(" *\[\(.*(Bhagavad Gita).*\)\].*$",  line, flags=re.DOTALL):
                # print(line)
                current_section_name = line.strip().replace("(", "").replace(")", "").replace("[", "").replace("]", "").replace("\n", "")
                continue

            text = text + line

        ## Write the final row with remaining text. 
        rows = rows + [{
                    'book_number': book_number, 
                    'section': current_section, 
                    'section_name': current_section_name, 
                    'text': text, 
                    'para_number': para_number,
                    'book_name': parva,
                    'num_tokens': num_tokens,
                    }]
        
    ## Write the book to dataframe and csv
    write_book_to_csv(rows, f"{output_csv_prefix}_{book_number}.csv")

##
    



____ Writing km_ganguli_translation_1.csv To csv file ... 
Total number of Tokens -> 336673
Average token per row -> 354.01997896950576
Total number of rows ->  951
Done 


____ Writing km_ganguli_translation_2.csv To csv file ... 
Total number of Tokens -> 110953
Average token per row -> 359.07119741100325
Total number of rows ->  309
Done 


____ Writing km_ganguli_translation_3.csv To csv file ... 
Total number of Tokens -> 470740
Average token per row -> 373.6031746031746
Total number of rows ->  1260
Done 


____ Writing km_ganguli_translation_4.csv To csv file ... 
Total number of Tokens -> 91801
Average token per row -> 351.72796934865903
Total number of rows ->  261
Done 


____ Writing km_ganguli_translation_5.csv To csv file ... 
Total number of Tokens -> 278653
Average token per row -> 368.58862433862436
Total number of rows ->  756
Done 


____ Writing km_ganguli_translation_6.csv To csv file ... 
Total number of Tokens -> 245842
Average token per row -> 392.09250398724083


In [8]:
df = pd.read_csv(out_directory/"km_ganguli_translation_1.csv", sep="|")
df.tail()

Unnamed: 0,book_number,section,section_name,text,para_number,book_name,num_tokens
946,1,SECTION CCXXXV,Khandava-daha Parva continued,"""Mandapala then said, 'Who amongst these is th...",3,Adi Parva,407
947,1,SECTION CCXXXV,Khandava-daha Parva continued,"""Vaisampayana continued, 'After this, all his ...",4,Adi Parva,41
948,1,SECTION CCXXXVI,Khandava-daha Parva continued,"\n\n""Vaisampayana said, 'Mandapala then addres...",1,Adi Parva,306
949,1,SECTION CCXXXVI,Khandava-daha Parva continued,"""Vaisampayana continued, 'Then Partha asked fr...",2,Adi Parva,359
950,1,SECTION CCXXXVI,Khandava-daha Parva continued,END OF ADI PARVA\n\nFOOTNOTES\n\n1. These are ...,3,Adi Parva,12
