In [1]:
import pathlib

# Folder holding the scraped text files of one book (presumably one file per
# page — confirm), laid out as <project root>/<author>/<book>.
author_name = "steven_king_books"
book_name = "doctor_sleep"
folder = pathlib.Path(r"C:\Users\amrul\programming\various_projects\scribd_parsing") / author_name / book_name
# iterdir() yields entries in arbitrary order, but the files are later joined
# into one text, so sort them to get a deterministic order.
# NOTE(review): this is a lexicographic sort; if the files are numbered without
# zero padding (1.txt, 10.txt, 2.txt) a numeric sort key is needed — confirm
# the naming scheme.
# The suffix is compared exactly (case-insensitively) instead of with a
# substring test, and directories are skipped.
files = sorted(
    file
    for file in folder.iterdir()
    if file.is_file() and file.suffix.lower() == ".txt"
)
print(f"there are {len(files)} txt files in {folder}")

there are 270 txt files in C:\Users\amrul\programming\various_projects\scribd_parsing\steven_king_books\doctor_sleep


In [2]:
def format_number(number):
    """Format a number with thousands separators for display.

    Integers (counts, lengths) are rendered without a fractional part,
    e.g. 931301 -> "931,301". Any other number is rounded to two decimals,
    e.g. 1234.5 -> "1,234.50".
    """
    # Counts are whole numbers; printing them as "1,004.00" is misleading.
    if isinstance(number, int):
        return f"{number:,}"
    return f"{number:,.2f}"

In [3]:
# Read every file as UTF-8 and glue the contents into one string,
# with a single space between consecutive files.
all_text_list = [
    page.read_text(encoding="utf-8")
    for page in files
]
all_text = " ".join(all_text_list)
text_length = len(all_text)
print(f"all_text length is {format_number(text_length)}")

all_text length is 931,301.00


In [4]:
# Rough estimate of what it would cost to convert the whole text into speech
# with an OpenAI model, priced per thousand characters.
# NOTE(review): Whisper is OpenAI's speech-to-text model; a per-character rate
# presumably refers to the text-to-speech API — confirm the model and the rate.
price_per_thousand_chars = 0.015
thousands_of_chars = len(all_text) / 1000
total_price = thousands_of_chars * price_per_thousand_chars
print(f"if you were to whisper all text you would pay : {format_number(total_price)}$")

if you were to whisper all text you would pay : 13.97$


In [5]:
# Some words in the text are split by a hyphen where they run over onto the
# next page. Count how many such cases there are.
import re
# Raw string: "\w", "\-" and "\s" are not valid string escapes, so the non-raw
# literal triggers an invalid-escape warning on recent Python versions.
# The pattern itself is unchanged.
all_hyphen_words = re.findall(r"\w+\-\s+\w+", all_text)
print(f"found {format_number(len(all_hyphen_words))} matches")

found 1,004.00 matches


In [6]:
# Re-join words that were split by a hyphen followed by whitespace.
# Each half of the word is captured in its own group; the replacement writes
# the two groups back to back, dropping the hyphen and the whitespace.
split_word_pattern = re.compile(r"(\w+)\-\s+(\w+)")
repaired_text = split_word_pattern.sub(r"\1\2", all_text)

In [None]:
# Collapse every run of whitespace characters into a single space.
whitespace_run = re.compile(r"\s+")
repaired_text = whitespace_run.sub(" ", repaired_text)

In [None]:
# Report how long the text is after the repairs.
repaired_length = len(repaired_text)
print(f"repaired text length is {format_number(repaired_length)}")

In [None]:
# Split the repaired text into sentences with NLTK's sentence tokenizer.
import nltk
from nltk.tokenize import sent_tokenize

# Newer NLTK releases load the sentence tokenizer from the "punkt_tab"
# package instead of the older "punkt" one; fetch both so the cell works
# with either version. download() reports a package it cannot fetch
# without raising.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

sentences = sent_tokenize(repaired_text)

print(f"tokenized into total of {format_number(len(sentences))} sentences")

In [None]:
# Show the sentences at indices 200-299, one per line.
print(*sentences[200:300], sep="\n")

# Let's extract only the nouns with NLTK and look at their frequencies in the text

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# NLTK data needed below: sentence/word tokenizer models, the POS tagger
# model and the stop-word lists.
# Newer NLTK releases look for "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead of the older package names, so
# both variants are fetched. download() reports a package it cannot fetch
# without raising.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(resource)

In [None]:
# Tokenize each sentence into words, then flatten everything into one list
# of lower-cased tokens, keeping only the purely alphabetic ones.
list_of_words = list(map(word_tokenize, sentences))
flat_words = [
    token.lower()
    for sentence_tokens in list_of_words
    for token in sentence_tokens
    if token.isalpha()
]
print(f"extracted total of {format_number(len(flat_words))} flat words")

In [None]:
# Drop the English stop words from the flattened word list.
stop_words = set(stopwords.words('english'))
print(f"there are total of {len(stop_words)} stop words in english")
filtered_words = list(filter(lambda token: token not in stop_words, flat_words))
print(f"I filtered {format_number(len(filtered_words))} non stopwords")

In [None]:
# Peek at the first ten words that survived the stop-word filter.
preview = filtered_words[:10]
print(f"some filtered words : {preview}")

In [None]:
# Attach a part-of-speech tag to every filtered word.
# NOTE(review): the tagger receives one flat, lower-cased, stop-word-free list
# instead of whole sentences — presumably this costs tagging accuracy; verify.
tagged_words = nltk.pos_tag(filtered_words)
tagged_count = len(tagged_words)
print(f"tagged words have length {format_number(tagged_count)}")

In [None]:
# Keep only the words whose tag is one of the four noun tags.
noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
nouns = [token for token, pos in tagged_words if pos in noun_tags]
print(f"extracted {format_number(len(nouns))} nouns")

In [None]:
from collections import Counter

# Count how often each noun occurs and list the most frequent ones.
noun_freq = Counter(nouns)
num = 100
top_nouns = noun_freq.most_common(num)
print(f"{num} most frequent nouns : {top_nouns}")

In [None]:

# Frequencies of the whitespace-separated tokens of the raw (unrepaired) text,
# with case and punctuation left as they are, plus the share of distinct tokens.
all_words = all_text.split()
word_counter = Counter(all_words)
num = 100
distinct_count = len(word_counter)
total_count = len(all_words)
print(f"counter has {distinct_count} elements vs all words length of {total_count} with ratio {distinct_count/total_count}")
print(f"most frequent {num} words : {word_counter.most_common(num)}")

In [None]:
# Count the pieces obtained by splitting the raw text at newline characters.
new_line_splits = all_text.split("\n")
split_count = len(new_line_splits)
print(f"new line splits count : {split_count}")

In [None]:
import re
# Naive sentence split of the raw text on ".", "!" and "?".
# NOTE: this rebinds `sentences`, replacing the NLTK sentence list built earlier.
sentence_end = re.compile("[.!?]")
sentences = sentence_end.split(all_text)
print(f"total of {len(sentences)} sentences")

In [None]:
# Collect the distinct characters that occur in the raw text.
all_chars = list(set(all_text))
distinct_char_count = len(all_chars)
print(f"total of {distinct_char_count} distinct characters in the text")

In [None]:
# Print every character of the raw text with its number of occurrences,
# most frequent first.
allcharcounter = Counter(all_text)
for char, count in allcharcounter.most_common(len(allcharcounter)):
    print(f"{char} : {count}")