In [None]:
"""Word Frequency Counter by Book Chapter   Schyuler Lujan   June 24, 2020"""

In [33]:
# Import relevant libraries
import pandas as pd
import os

In [34]:
def read_toc(name):
    """Read a table-of-contents text file and return its titles as dictionary keys.

    Args:
        name: Path of the table-of-contents file without the ".txt" extension.

    Returns:
        dict: One key per line of the file, in file order, each mapped to a
        new empty list. Periods are removed from each title and em dashes
        are replaced with a space. Lines that end up empty are dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    toc_dictionary = {}

    # The context manager closes the file even if reading fails part-way
    with open(name + ".txt", 'r', encoding="utf8") as toc_file:
        for line in toc_file:
            title = line.replace("\n", "").replace(".", "").replace("—", " ")
            toc_dictionary[title] = []

    # Blank lines (and lines made only of periods) collapse to the empty key
    toc_dictionary.pop("", None)

    return toc_dictionary

In [68]:
def read_file(name):
    """Read a text file and return its contents as one cleaned, lower-cased string.

    Args:
        name: Path of the text file without the ".txt" extension.

    Returns:
        str: The file contents in lower case, with the characters listed in
        ``special_characters`` removed and every hyphen replaced by a space.
        Line breaks are kept.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file as soon as it has been read
    with open(name + ".txt", 'r', encoding="utf8") as text_file:
        text = text_file.read().lower()

    # Remove punctuation and other special characters in a single pass
    special_characters = '.,;:!?"_()[]&‘`'
    text = text.translate(str.maketrans("", "", special_characters))

    # Dashes become spaces so hyphenated words are not glued together.
    # Replacing each "-" also covers "--" (it turns into two spaces).
    text = text.replace("-", " ")

    return text

In [69]:
def count_words(file):
    """Count how often each word occurs in a string.

    Args:
        file: The text to analyse, as a single string.

    Returns:
        dict: Maps each word to the number of times it occurs, in order of
        first appearance. The characters listed in ``special_characters``
        are removed before counting. An empty or whitespace-only string
        gives an empty dictionary.
    """
    # Remove punctuation and other special characters in a single pass
    special_characters = '.,;:!?"_()[]&‘`'
    cleaned = file.translate(str.maketrans("", "", special_characters))

    # split() with no argument splits on any run of whitespace, so line
    # breaks separate words and repeated spaces do not produce "" entries
    word_dictionary = {}
    for word in cleaned.split():
        word_dictionary[word] = word_dictionary.get(word, 0) + 1

    return word_dictionary

In [70]:
def split_by_chapter(file, toc):
    """Split a text into lists of words, one list per chapter.

    A chapter starts at any line that, once leading and trailing whitespace
    is stripped, equals a title from ``toc``. It runs up to the next such
    line, or to the end of the text for the last chapter.

    Args:
        file: The full text as one string with "\\n" line breaks.
        toc: Dictionary whose keys are the chapter titles. Titles are
            compared in lower case, both as given and with periods removed
            and em dashes replaced by a space.

    Returns:
        dict: Maps each matched title line to the list of non-empty words
        in that chapter, split on spaces. The title's own words come first.
        Text before the first title is not included. If the same title line
        occurs more than once, the last occurrence wins.
    """
    # Build the set of titles to look for. The raw lower-cased title is kept
    # for backward compatibility; the normalized form is added as well.
    chapter_titles = set()
    for key in toc.keys():
        lowered = key.lower()
        chapter_titles.add(lowered)
        normalized = lowered.replace(".", "").replace("—", " ")
        # Skip an empty normalized title, otherwise every blank line would match
        if normalized:
            chapter_titles.add(normalized)

    # Split on line breaks and trim each line so titles can match exactly
    lines = [line.strip() for line in file.split("\n")]

    # Line numbers at which a chapter title occurs
    chapter_positions = [
        index for index, line in enumerate(lines) if line in chapter_titles
    ]

    words_by_chap = {}
    for order, start in enumerate(chapter_positions):
        # The chapter ends where the next one starts, or at the end of the text
        if order + 1 < len(chapter_positions):
            end = chapter_positions[order + 1]
        else:
            end = len(lines)

        # Join the chapter's lines and drop the empty strings that blank
        # lines and repeated spaces leave behind after splitting
        chapter_text = " ".join(lines[start:end])
        words_by_chap[lines[start]] = [
            word for word in chapter_text.split(" ") if word
        ]

    return words_by_chap

In [71]:
def df_by_chapter(dictionary):
    """Turn a chapter-to-words dictionary into a table of word counts per chapter.

    Args:
        dictionary: Maps each chapter title to the list of words in that
            chapter.

    Returns:
        pandas.DataFrame: Columns 'Chapter', 'Words' and 'Word_Count', with
        one row per distinct word in each chapter. Chapters keep the
        dictionary's order and words keep their order of first appearance.
        An empty dictionary gives an empty frame with the same columns.
    """
    columns = ['Chapter', 'Words', 'Word_Count']

    # Collect every row first and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0, and growing a frame
    # inside a loop copies it on every iteration.
    rows = []
    for chapter, chapter_words in dictionary.items():
        # A dict keeps insertion order, so words stay in first-seen order
        counts = {}
        for word in chapter_words:
            counts[word] = counts.get(word, 0) + 1
        rows.extend((chapter, word, count) for word, count in counts.items())

    return pd.DataFrame(rows, columns=columns)

In [72]:
# Load the chapter titles from plain_toc_count_of_monte_cristo.txt
cmc_toc = read_toc('plain_toc_count_of_monte_cristo')

In [73]:
# Load the lower-cased, punctuation-stripped text of full_text_count_of_monte_cristov3.txt
cmc = read_file("full_text_count_of_monte_cristov3")

In [74]:
# Group the words of the text under the chapter title line that precedes them
cmc_chap = split_by_chapter(cmc, cmc_toc)

In [75]:
cmc_chap

{'chapter 1': ['chapter',
  '1',
  'marseilles',
  'the',
  'arrival',
  'on',
  'the',
  '24th',
  'of',
  'february',
  '1810',
  'the',
  'look',
  'out',
  'at',
  'notre',
  'dame',
  'de',
  'la',
  'garde',
  'signalled',
  'the',
  'three',
  'master',
  'the',
  'pharaon',
  'from',
  'smyrna',
  'trieste',
  'and',
  'naples',
  'as',
  'usual',
  'a',
  'pilot',
  'put',
  'off',
  'immediately',
  'and',
  'rounding',
  'the',
  'chateau',
  "d'if",
  'got',
  'on',
  'board',
  'the',
  'vessel',
  'between',
  'cape',
  'morgion',
  'and',
  'rion',
  'island',
  'immediately',
  'and',
  'according',
  'to',
  'custom',
  'the',
  'ramparts',
  'of',
  'fort',
  'saint',
  'jean',
  'were',
  'covered',
  'with',
  'spectators',
  'it',
  'is',
  'always',
  'an',
  'event',
  'at',
  'marseilles',
  'for',
  'a',
  'ship',
  'to',
  'come',
  'into',
  'port',
  'especially',
  'when',
  'this',
  'ship',
  'like',
  'the',
  'pharaon',
  'has',
  'been',
  'built',
  '

In [76]:
# Tabulate word frequencies: one row per (chapter, word) pair
cmc_df = df_by_chapter(cmc_chap)

In [65]:
cmc_df

Unnamed: 0,Chapter,Words,Word_Count
0,chapter 1,chapter,1
1,chapter 1,1,1
2,chapter 1,marseilles,5
3,chapter 1,the,214
4,chapter 1,arrival,2
5,chapter 1,on,18
6,chapter 1,24th,1
7,chapter 1,of,66
8,chapter 1,february,1
9,chapter 1,1810,1


In [77]:
# Save the table to cmc.csv; by default to_csv also writes the row index as an unnamed first column
cmc_df.to_csv('cmc.csv', encoding="utf8")

**Resources**

Create dataframe from multiple lists:
    https://cmdlinetips.com/2018/01/how-to-create-pandas-dataframe-from-multiple-lists/

Create dataframe from dictionary:
    https://thispointer.com/python-pandas-how-to-create-dataframe-from-dictionary/

Create empty dataframe and append:
    https://thispointer.com/pandas-how-to-create-an-empty-dataframe-and-append-rows-columns-to-it-in-python/

Pandas version of rbind (from R):
    https://stackoverflow.com/questions/14988480/pandas-version-of-rbind

Remove item from list:
    https://note.nkmk.me/en/python-list-clear-pop-remove-del/