## Shakespeare - Quicklook

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import numpy as np
import pandas as pd
import altair as alt

In [4]:
# read in shakespeare
# Load the YAML file
with open('../data/shakespeare_char/input.txt', "r") as text_file:
    shakespeare = text_file.read() 

In [9]:
print(type(shakespeare))

<class 'str'>


In [5]:
print(f"{len(shakespeare)} Characters in the Corpus")

1115394 Characters in the Corpus


In [6]:
print(f"{len(shakespeare.split(' '))} words with no cleaning.")

169893 words with no cleaning.


In [23]:
#unconstrained vocabulary
# keep white spaces and special characters
text_vectorization_counts = TextVectorization(
    output_mode="count",
)
# Convert the single string to a tensor
data_tensor = tf.constant([shakespeare])

# Adapt the TextVectorization layer to the single string
text_vectorization_counts.adapt(data_tensor)


2023-11-14 12:40:16.250502: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


## Vocabulary Size: 13000, Total Words: 202646

In [24]:
top_n = 5
vocabulary = text_vectorization_counts.get_vocabulary()
#single_batch = next(iter(dataset))
word_counts = text_vectorization_counts(data_tensor).numpy().sum(axis=0)
top_indices = np.argsort(word_counts)[::-1][:top_n]
top_words = [vocabulary[i] for i in top_indices]
top_counts = word_counts[top_indices]
print(top_words)
print(f"Number of words counted in 1 batch: {np.sum(word_counts)}")
print(f"Vocabulary Size: {len(vocabulary)}")

['the', 'and', 'to', 'i', 'of']
Number of words counted in 1 batch: 202646.0
Vocabulary Size: 12849


## Of the total Vocab, 95% is covered by around 5000 words

In [21]:
total_words = np.sum(word_counts)
word_counts_proportion_list = word_counts / total_words
cdf = np.cumsum(word_counts_proportion_list)

df_tmp = pd.DataFrame(np.array(cdf),columns=['CDF'])
df_tmp = df_tmp.reset_index()
# Create an Altair chart
chart = alt.Chart(df_tmp.sample(5000)).mark_point().encode(
    x=alt.X('index:Q',title="Total Vocabulary"),  # Hide x-axis labels
    y=alt.Y('CDF:Q',title="Percent of All Words in Corpus"),  # Add a custom title to the y-axis
    tooltip = ['index','CDF']
).properties(
    width=800,  # Adjust the width of the chart
    height=400,  # Adjust the height of the chart
    title='Vocabulary'
).configure_title(
    fontSize=16,
    anchor='middle'
)
chart

In [25]:
a = """There ries the hand

    I wither can''t up look his alls that get all the ame

    [Verse 4]"""


In [26]:
len(a)

93