# Combine summaries from TinyTales, Wikipedia, and Tilak's Kaggle dataset

In [16]:
import statistics
from pathlib import Path
import pandas as pd
import tiktoken

## Define directories

In [2]:
# Base folder that holds every text source; the combined CSV is written here too.
directory = Path("../text")

# Per-source sub-folders, derived from the base folder.
directory_tilak = directory / "KaggleTilak" / "books"
directory_tinytales = directory / "TinyTales"
directory_wikipedia = directory / "Wikipedia"

## Read files into dataframes

In [4]:
# Load the Kaggle (Tilak) text lines; the file is semicolon-separated.
kaggle_data = pd.read_csv(
    directory_tilak.joinpath("complete_text_lines.csv"),
    sep=";",
)
# Display the column names of the loaded frame.
kaggle_data.columns

Index(['book_number', 'book_name', 'chapter_name', 'title', 'commentary',
       'text', 'section_number', 'source'],
      dtype='object')

In [6]:
# Load the TinyTales stories; the file is semicolon-separated.
tinytales_data = pd.read_csv(
    directory_tinytales.joinpath("mahabharata_tiny_tales_stories.csv"),
    sep=";",
)
# Display the column names of the loaded frame.
tinytales_data.columns

Index(['text', 'section_number', 'title', 'chapter_number', 'chapter_name',
       'source'],
      dtype='object')

In [7]:
# Load the Wikipedia parva summaries (semicolon-separated) and drop the
# chapter-range columns, which are not needed in the combined frame.
wikipedia_data = pd.read_csv(
    directory_wikipedia.joinpath("wikipedia_parva_summary.csv"),
    sep=";",
).drop(columns=["start_chapter", "end_chapter"])

# Display the remaining column names.
wikipedia_data.columns

Index(['book', 'source', 'title', 'book_number', 'description', 'text'], dtype='object')

## Combine the dataframes into one big dataframe

In [11]:
# Stack the three sources row-wise. Columns that a source does not have are
# filled with NaN for its rows. `ignore_index=True` gives the result a fresh
# 0..n-1 index; without it every source keeps its own labels and the combined
# frame ends up with duplicate row labels.
df_combined = pd.concat(
    [kaggle_data, tinytales_data, wikipedia_data],
    ignore_index=True,
)

# Report the shape of each input and of the combined frame.
print(
    "Kaggle data dims",  kaggle_data.shape, "\n",
    "TinyTales data dims", tinytales_data.shape, "\n",
    "Wikipedia data dims", wikipedia_data.shape, "\n",
    "Final data dims", df_combined.shape)

print("Final data columns \n", df_combined.columns)

Kaggle data dims (2376, 8) 
 TinyTales data dims (200, 6) 
 Wikipedia data dims (19, 6) 
 Final data dims (2595, 11)
Final data columns 
 Index(['book_number', 'book_name', 'chapter_name', 'title', 'commentary',
       'text', 'section_number', 'source', 'chapter_number', 'book',
       'description'],
      dtype='object')


## Calculate tokens for each text row

In [18]:

# Name of the tiktoken encoding used for all token counts in this notebook.
encoder_name = "cl100k_base"
encoding = tiktoken.get_encoding(encoder_name)


def num_tokens_from_string(string: str, encoding_name: str = encoder_name) -> int:
    """Return the number of tokens that `string` encodes to.

    Parameters
    ----------
    string : str
        Text to tokenize.
    encoding_name : str, optional
        Name of the tiktoken encoding to use. Defaults to the notebook-wide
        `encoder_name`, so the encoding is configured in exactly one place
        instead of being hard-coded a second time inside the function.

    Returns
    -------
    int
        Length of the token sequence produced by the encoding.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

## Calculate the number of text tokens for every row
token_counts = df_combined["text"].apply(num_tokens_from_string)
df_combined["num_text_tokens"] = token_counts

## Report the mean and the total token count over all rows
average_tokens = statistics.mean(token_counts)
total_tokens = sum(token_counts)
print("Average tokens per row", average_tokens)
print("Totak number of tokens", total_tokens)

Average tokens per row 167.22851637764933
Totak number of tokens 433958


## Write the final dataframe into a csv file

In [19]:

# Persist the combined frame next to the source folders as a
# semicolon-separated CSV, leaving the row index out of the file.
df_combined.to_csv(
    directory.joinpath("combined.csv"),
    sep=";",
    index=False,
)

In [20]:
# Display the dtype of every column in the combined frame.
df_combined.dtypes

book_number        float64
book_name           object
chapter_name        object
title               object
commentary          object
text                object
section_number     float64
source              object
chapter_number     float64
book                object
description         object
num_text_tokens      int64
dtype: object