In [36]:
import numpy as  np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import re, time, os

In [37]:
class FrequentCounter:
    """Frequency counting over a stream: a Misra-Gries summary and an exact tally.

    The class stores no stream itself. `misraGries` reads `self.new_element`
    and `getExactFrequent` reads `self.stream`; both attributes must be set on
    the instance before the respective method is called.
    """

    def __init__(self):
        # element -> counter value, re-ordered by descending count on every update
        self.register = {}

    def misraGries(self, k):
        """Feed `self.new_element` into the summary, using at most k-1 counters.

        Args:
            k (int): counter budget; the register never grows beyond k-1 entries.
        """
        item = self.new_element
        counters = self.register

        if item in counters:
            # already tracked: bump its counter
            counters[item] += 1
        elif len(counters) < k - 1:
            # a free counter is available: start tracking the element
            counters[item] = 1
        else:
            # no free counter: decrement every counter and drop those that hit zero
            # (iterate over a snapshot of the keys because entries are deleted)
            for tracked in list(counters):
                counters[tracked] -= 1
                if counters[tracked] == 0:
                    del counters[tracked]

        # Re-order by descending count after every single update. The sort is
        # stable, so the order among equal counts depends on the previous order.
        ranked = sorted(counters.items(), key=lambda pair: pair[1], reverse=True)
        self.register = dict(ranked)

    def getMisraGries(self):
        """Return the current register (element -> counter value)."""
        return self.register

    def getExactFrequent(self):
        """Count every element of `self.stream` exactly.

        Returns:
            dict: element -> number of occurrences, ordered by descending
                count; elements with equal counts keep first-seen order.
        """
        tally = {}
        for token in self.stream:
            tally[token] = tally.get(token, 0) + 1

        # order by frequency (the dict value), most frequent first
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked)

In [38]:
class StreamAlg(FrequentCounter):
    """Runs the Misra-Gries update of `FrequentCounter` over a stored stream."""

    def __init__(self, stream):
        """
        Parameters:
            stream [float, int, string]: The list of values to represent a stream.
        """
        self.stream = stream
        FrequentCounter.__init__(self)
        self.init = False  # NOTE(review): not read anywhere in the visible code; kept for compatibility

    def exec(self, k, reset=True):
        """
        Passes each value of the data stream to `misraGries` for register update.

        Parameters:
            k [int]: counter budget; at most k-1 candidates are kept at the same time.
            reset [bool]: when True (default) the register is emptied before the
                    pass, so the result depends only on the stream and this k.
                    Pass False to keep accumulating into the existing register
                    (the behaviour before this parameter existed).
        """
        if reset:
            # Without this, a second call with another k would start from the
            # counters left behind by the previous call.
            self.register = dict()

        for element in self.stream:
            self.new_element = element
            self.misraGries(k=k)


In [39]:
def filePreprocessor(file_path):
    """formats file content and remove all stop words in the respective language.

    The language is the run of word characters captured after the last '-' in
    `file_path` that is followed by a word character, e.g.
    'aMidsummerNightsDream-english.txt' -> 'english'.

    Args:
        file_path (str): path to literary work

    Returns:
        list: tokenized valid words in the literary work: lower-cased tokens
            longer than two characters that are not stop words.

    Raises:
        ValueError: if `file_path` contains no '-<language>' marker.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape sequence.
    language_match = re.search(r'.*-(\w+).*', file_path)
    if language_match is None:
        # Fail loudly instead of carrying on without a language for stop-word lookup.
        raise ValueError(
            f"cannot determine language from {file_path!r}; expected a name like '<title>-<language>.txt'"
        )
    file_language = language_match.group(1)  # a single language name, not a list

    with open(file_path, 'r', encoding="utf-8") as f:
        file_content = f.read().lower()

    stopset = set(stopwords.words(file_language))  # get all stopwords in respective language
    tokens = word_tokenize(file_content)  # convert string to list of tokens
    return [token.lower() for token in tokens if token not in stopset and len(token) > 2]  # perform actual cleanup

In [40]:
# For every literary work: rank its tokens exactly, rank them with Misra-Gries
# for a range of counter budgets k, and write all rankings to one CSV per work.
literary_works_dir = 'shakespeare'
literary_works_path = os.path.join(os.getcwd(), literary_works_dir)
literary_works = os.listdir(literary_works_path)

output_dir = './output/'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

for literary_work in literary_works:
    print('Working on', literary_work)

    file_path = os.path.join(literary_works_dir, literary_work)  # construct the file path
    file_tokenized = filePreprocessor(file_path)
    print('length of token', len(file_tokenized))

    FC = StreamAlg(file_tokenized)

    start_exact = time.time()
    exact = FC.getExactFrequent()
    print('number of unique tokens:', len(exact), 'it took:', (time.time()-start_exact), 'secs')

    register_df = pd.DataFrame(exact.keys(), columns=['exact'])  # exact ranking, most frequent first

    # k values: every even k from 2 to 200, plus half the vocabulary size and vocabulary size + 1
    n_ks = list(range(2, 202, 2)) + [len(exact)//2, len(exact)+1]
    print('number of k considered:', len(n_ks))

    start_misragries = time.time()
    reg = {}
    for k in n_ks:
        # each k must start from an empty register so the runs do not contaminate each other
        FC.register = dict()
        FC.exec(k)
        misra_gries = FC.getMisraGries()

        if len(misra_gries) > 0:
            reg[f'k{k}'] = pd.Series(misra_gries.keys())
    print('It took:', (time.time()-start_misragries), 'secs')

    # columns of different length are padded with NaN by the index alignment
    register_df = pd.concat([register_df, pd.DataFrame(reg)], axis=1)
    register_df.index = range(1, len(register_df)+1)  # reformat index to depict ranking order
    register_df.to_csv(output_dir+literary_work.split('.')[0]+'.csv', index_label='rank')

    print('-'*40)
print('Done!!!')


Working on aMidsummerNightsDream-english.txt
length of token 9060
number of unique tokens: 2883 it took: 0.01563286781311035 secs
number of k considered: 102
It took: 17.40912127494812 secs
--------------------------------------
Done!!!
