# In [37]:
from collections import Counter
import os
import pandas as pd
import matplotlib.pyplot as plt

def count_words_appearances(file_contents):
    """Count how often each whitespace-separated word occurs in a text.

    The punctuation characters ``. , : ; ' "`` are deleted before the text
    is split, so "word," and "word" are counted as the same word.

    Args:
        file_contents: The text to analyse, as a single string.

    Returns:
        A ``Counter`` mapping each word to its number of appearances.
    """
    # Delete the punctuation characters themselves. Removing the two-character
    # sequence ", " would glue the words around every comma together
    # ("hello, world" -> "helloworld") and leave other commas in place.
    strip_punctuation = str.maketrans("", "", ".,:;'\"")
    cleaned = file_contents.translate(strip_punctuation)
    return Counter(cleaned.split())


def get_word_count(word_counts):
    """Summarise a word-frequency mapping.

    Args:
        word_counts: Mapping of word -> number of appearances.

    Returns:
        A tuple ``(num_unique, counts)``: the number of distinct words (the
        size of the mapping) and the mapping's values view, i.e. the number
        of appearances of each distinct word.
    """
    return len(word_counts), word_counts.values()


def read_book(title_path):
    """Read a book file and return its text as one lower-cased line.

    Args:
        title_path: Path of the text file to read.

    Returns:
        The file's contents, lower-cased, with every line break replaced by
        a single space.
    """
    # NOTE(review): "unicode_escape" decodes bytes as Latin-1 and interprets
    # backslash escapes; if the books are UTF-8, accented characters come out
    # garbled. Left unchanged here - confirm the corpus encoding first.
    with open(title_path, "r", encoding="unicode_escape") as current_file:
        file_contents = current_file.read()
    # Turn line breaks into spaces (not into nothing) so the last word of one
    # line does not fuse with the first word of the next line.
    file_contents = file_contents.replace("\n", " ").replace("\r", " ")
    return file_contents.lower()

def get_title_stats(title_path):
    """Compute the word statistics of a single book file.

    Chains the three steps read -> count words -> summarise.

    Args:
        title_path: Path of the book's text file.

    Returns:
        Whatever ``get_word_count`` returns for the book's word counts: a
        tuple of the number of distinct words and the per-word counts.
    """
    return get_word_count(count_words_appearances(read_book(title_path)))

def back_one_directory(path):
    """Drop the last "/"-separated component of a path.

    Args:
        path: A path string using "/" as separator.

    Returns:
        Everything before the final "/". If ``path`` contains no "/",
        the empty string is returned.
    """
    parent, _separator, _last_component = path.rpartition("/")
    return parent

# Reads information from the given directory and stores it in a list.
# Only for the following structure: books > Language > Author > book file
def get_books_information(path, depth, current_info, final_information):
    """Walk ``path`` recursively and collect one record per file found.

    Each record is a copy of ``current_info`` in which slot ``depth`` holds
    the entry's name, slot 3 the number of unique words and slot 4 the
    per-word counts of the file (both taken from ``get_title_stats``). With
    the expected layout this gives
    ``[language, author, file name, num_unique_words, word_counts]``.

    Args:
        path: Directory to scan, using "/" as separator.
        depth: Nesting level of ``path`` below the root directory; also the
            record slot that receives the name of each entry in ``path``.
        current_info: Five-slot list with the names collected on the way
            down. It is copied for every entry and never modified.
        final_information: List the records are appended to (in place).

    Returns:
        A tuple ``(final_information, path)``: the list that was passed in,
        now holding the new records, and the unchanged ``path``.
    """
    for entry_name in os.listdir(path):
        record = current_info.copy()
        record[depth] = entry_name
        # Build the child path separately instead of extending and later
        # trimming ``path``; the directory being scanned never changes.
        entry_path = path + "/" + entry_name
        if os.path.isfile(entry_path):
            num_unique_words, word_count = get_title_stats(entry_path)
            record[3] = num_unique_words
            record[4] = word_count
            final_information.append(record)
        else:
            # Records found below are appended to the shared list. The
            # context of this level (``current_info``) is deliberately left
            # untouched so later siblings keep the right language/author.
            get_books_information(entry_path, depth + 1, record, final_information)
    return final_information, path

def prepare_stats(final_information):
    """Turn the collected book records into a pandas table.

    Args:
        final_information: Iterable of records indexable as
            ``[language, author, file name, num_unique_words, word_counts]``.

    Returns:
        A ``DataFrame`` with the columns "language", "author", "title",
        "length" and "unique", one row per record, labelled 1, 2, 3, ...
        "author" is capitalised, "title" is the file name with ".txt"
        removed, "length" is the sum of the word counts and "unique" is the
        number of distinct words.
    """
    column_names = ("language", "author", "title", "length", "unique")
    library_stats = pd.DataFrame(columns=column_names)
    # Row labels start at 1, matching the order of the incoming records.
    for row_label, book_info in enumerate(final_information, start=1):
        language = book_info[0]
        author = book_info[1].capitalize()
        title = str(book_info[2]).replace(".txt", "")
        total_words = sum(book_info[4])
        unique_words = book_info[3]
        library_stats.loc[row_label] = language, author, title, total_words, unique_words
    return library_stats

# Root folder of the corpus; expected layout: books > Language > Author > book file
books_directory = "./books"
# Result is discarded (presumably a leftover notebook display of the folder);
# the call still raises early if the directory does not exist.
os.listdir(books_directory)

# Collect one record per book, starting from an empty five-slot template.
initial_current_info = [None] * 5
final_info = []
books_final_info, final_path = get_books_information(
    books_directory, 0, initial_current_info, final_info
)

stats = prepare_stats(books_final_info)

# All books: first on linear axes, then on log-log axes.
plt.plot(stats.length, stats.unique, "bo-")

plt.loglog(stats.length, stats.unique, "ro")

# One log-log scatter series per language, each in its own colour.
plt.figure(figsize=(10, 10))
for language, color in (
    ("English", "crimson"),
    ("French", "forestgreen"),
    ("German", "orange"),
    ("Portuguese", "blueviolet"),
):
    subset = stats[stats.language == language]
    plt.loglog(subset.length, subset.unique, "o", label=language, color=color)

plt.legend()
plt.xlabel("Book Length")
plt.ylabel("Number of Unique words")
plt.show()
