# Wikipedia Summaries of Books (Parvas)

## Setup

In [1]:
import re
from pathlib import Path
import pandas as pd
import wikipedia
import uuid
import tiktoken

In [2]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens a tiktoken encoding produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``. Defaults to
            ``"cl100k_base"``, the value this function previously hard-coded,
            so existing single-argument callers are unaffected.

    Returns:
        The length of the sequence returned by ``encoding.encode(string)``.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))

## Read Metadata File

This metadata is copied from the table in the README of this [GitHub repository](https://github.com/kunjee17/mahabharata).

In [3]:

## Locations of the metadata input and of the generated output
directory = Path("..") / "text" / "KMGanguli"
out_directory = Path("..") / "data"
csv_metadata = directory.joinpath("metadata_books.csv")

The metadata `title` column (named `title_raw` here) follows this pattern:

`[Adi Parva](https://en.wikipedia.org/wiki/Adi_Parva) (The Book of the Beginning)`

In [4]:

## Load the ";"-separated metadata table.
## header=0 together with `names` replaces the file's own header row with the
## explicit column names given here.
df_data = pd.read_csv(
    csv_metadata,
    header=0,
    sep=";",
    names=['book_number', 'title_raw', 'sp', 'description', 'comments'],
)
df_data['book_number'] = df_data['book_number'].astype('int')

## Split title into separate fields
## Splitting on every "[", "]", "(" and ")" turns "[Book](url) (Title)" into
## seven fragments; only the book name, the source URL and the title are kept.
## The remaining fragments are the text around the delimiters.
title_list = [re.split(r"\[|\]|\(|\)", title) for title in df_data['title_raw']]
df_title_splits = pd.DataFrame(
    title_list,
    columns=['_before', 'book', '_gap_1', 'source', '_gap_2', 'title', '_after'],
)[['book', 'source', 'title']]

## The subparva column follows this pattern: start-end
## Split sub parvas into start and end sub parvas.
## The separator may be an en dash or a plain hyphen; the pieces stay strings.
sub_parva_list = [re.split(r"–|-", sp) for sp in df_data['sp']]
df_subparvas = pd.DataFrame(sub_parva_list, columns=['start_chapter', 'end_chapter'])

## Drop the columns that were split out above from the main dataframe
df_data = df_data.drop(columns=['comments', 'sp', 'title_raw'])

## Combine the dataframes side by side (rows align on the default integer index)
mtdt = pd.concat([df_title_splits, df_subparvas, df_data], axis=1)
mtdt.head()


Unnamed: 0,book,source,title,start_chapter,end_chapter,book_number,description
0,Adi Parva,https://en.wikipedia.org/wiki/Adi_Parva,The Book of the Beginning,1,19,1,How the Mahabharata came to be narrated by Sau...
1,Sabha Parva,https://en.wikipedia.org/wiki/Sabha_Parva,The Book of the Assembly Hall,20,28,2,Maya Danava erects the palace and court (sabha...
2,Vana Parva,https://en.wikipedia.org/wiki/Vana_Parva,The Book of the Forest,29,44,3,The twelve years of exile in the forest (aranya).
3,Virata Parva,https://en.wikipedia.org/wiki/Virata_Parva,The Book of Virata,45,48,4,The year spent incognito at the court of Virata.
4,Udyoga Parva,https://en.wikipedia.org/wiki/Udyoga_Parva,The Book of the Effort,49,59,5,Preparations for war and efforts to bring abou...


## Add the Wikipedia summary

In [5]:
def wiki_summary(key):
    """Return whatever ``wikipedia.summary`` yields for ``key``."""
    return wikipedia.summary(key)

# Add, in this order: the Wikipedia summary looked up by book name, a random
# hex identifier per row, and the token count of the summary text.
mtdt = mtdt.assign(
    text=lambda frame: frame['book'].apply(wiki_summary),
    chunk_id=lambda frame: frame.apply(lambda _row: uuid.uuid4().hex, axis=1),
    num_tokens=lambda frame: frame['text'].apply(num_tokens_from_string),
)
mtdt.head()

Unnamed: 0,book,source,title,start_chapter,end_chapter,book_number,description,text,chunk_id,num_tokens
0,Adi Parva,https://en.wikipedia.org/wiki/Adi_Parva,The Book of the Beginning,1,19,1,How the Mahabharata came to be narrated by Sau...,The Adi Parva or The Book of the Beginning is ...,8ca5cdf8c0e14c1da269e27a0d25fb08,208
1,Sabha Parva,https://en.wikipedia.org/wiki/Sabha_Parva,The Book of the Assembly Hall,20,28,2,Maya Danava erects the palace and court (sabha...,"Sabha Parva, also called the ""Book of the Asse...",530dad607a0e4402b857bab6dd605c5a,299
2,Vana Parva,https://en.wikipedia.org/wiki/Vana_Parva,The Book of the Forest,29,44,3,The twelve years of exile in the forest (aranya).,"The Vana Parva, also known as the ""Book of the...",301337a352214d7a94987d6241cd744b,187
3,Virata Parva,https://en.wikipedia.org/wiki/Virata_Parva,The Book of Virata,45,48,4,The year spent incognito at the court of Virata.,"Virata Parva, also known as the “Book of Virat...",9902bcf93a9544648afe9baa86e64f7e,266
4,Udyoga Parva,https://en.wikipedia.org/wiki/Udyoga_Parva,The Book of the Effort,49,59,5,Preparations for war and efforts to bring abou...,"The Udyoga Parva (Sanskrit: उद्योग पर्वः), or ...",b40a341d28b142a8b398d13e68343588,214


## Write the final DataFrame to CSV

In [6]:
# Make sure the output directory exists so the write does not fail with a
# missing-directory error; exist_ok=True makes this a no-op when it is already there.
out_directory.mkdir(parents=True, exist_ok=True)
# Fields are separated by "|" and the DataFrame index is not written.
mtdt.to_csv(out_directory/"wikipedia_parva_summaries.csv", sep="|", index=False)
