In [None]:
import random
import numpy as np
import os
import collections
import pandas as pd
import time
import enum

In [None]:
class ApproximateCounter:
    """
    Count how often each letter occurs in a text, exactly or approximately.

    Three counting strategies are provided:

    * ``ExactCounter`` - every occurrence is counted.
    * ``FixedProbCounter`` - the first occurrence of a letter is always
      counted, every later occurrence with probability 1/2.
    * ``DecreasingProbCounter`` - an occurrence is counted with probability
      1/sqrt(2)**k, where k is the letter's current counter value.

    Every strategy is repeated ``n_experiments`` times.  The per-experiment
    counters (and the occurrence estimates derived from them) are stored in
    pandas DataFrames, extended with per-letter metrics (minimum, maximum,
    average, average percentage, standard deviation, variance and mean
    absolute error) and written to CSV files.
    """

    # Base of the decreasing-probability counter: with current counter value
    # k, the next occurrence is counted with probability 1 / _DECREASE_BASE**k.
    _DECREASE_BASE = float(np.sqrt(2))

    def __init__(self):
        self.student_counter = 106382
        # random.seed must be *called*; assigning to it would replace the
        # function with an int and leave the generator unseeded.
        random.seed(self.student_counter)
        self.textFilesDirectory = os.path.join(os.getcwd(), 'TextFiles')

        # Not used by any method of this class; kept for compatibility.
        self.counter = 0

        self.filename = None
        self.text_data = None
        self.count_method = None
        self.n_experiments = 1

        self.save_estimate = None
        self.save_counter = None

        self.counter_register = None
        self.estimate_register = None

    def InitializeRegisters(self):
        """Create an empty counter register: 'Letter' plus one 'Exp i' column per experiment."""
        columns = ['Letter'] + [f'Exp {exp_number}' for exp_number in range(1, self.n_experiments + 1)]
        self.counter_register = pd.DataFrame(columns=columns)

    def LoadTextFile(self, filename: str = 'sample', subdirectory: str = 'shakespeare') -> None:
        """
        Load ``<textFilesDirectory>/<subdirectory>/<filename>.txt`` into ``text_data``.

        Args:
            filename [str] - name of the text file without the '.txt' extension.
            subdirectory [str] - folder inside textFilesDirectory holding the file.
        Updates:
            filename [str] - remembered as the default name for the output CSVs.
            text_data [str] - the text with every non-alphabetic character
                removed and the remaining letters upper-cased.
        """
        self.filename = filename
        text_file_path = os.path.join(self.textFilesDirectory, subdirectory, filename + '.txt')
        with open(text_file_path, 'r', encoding="utf8") as file:
            raw_text = file.read()

        # keep letters only (digits, punctuation and whitespace are dropped);
        # upper-casing makes the counts case-insensitive
        self.text_data = ''.join(filter(str.isalpha, raw_text)).upper()

    def Save2CSV(self, filename=None, dir_name: str = 'output') -> None:
        """
        Save the registers to CSV, honouring the save_estimate / save_counter
        flags set by the counting method that was run last.

        Args:
            filename [str] - file name prefix without extension. Defaults to
                the name of the loaded text file.
            dir_name [str] - folder inside textFilesDirectory to write to.
                Default 'output'; created if missing.
        Writes:
            <filename>_<count_method>_estimate.csv - if save_estimate is set.
            <filename>_<count_method>_counter.csv - if save_counter is set.
        Raises:
            ValueError - if no filename is given and no text file was loaded.
        """
        if filename is None:
            filename = self.filename
        if filename is None:
            raise ValueError('No filename given and no text file has been loaded.')
        filename = str(filename)

        # makedirs also creates missing parents and tolerates an existing folder
        output_directory = os.path.join(self.textFilesDirectory, dir_name)
        os.makedirs(output_directory, exist_ok=True)

        if self.save_estimate:
            estimate_filename = f'{filename}_{self.count_method}_estimate.csv'
            estimate_file_path = os.path.join(output_directory, estimate_filename)
            self.estimate_register.to_csv(estimate_file_path, index=False, header=True)

        if self.save_counter:
            counter_filename = f'{filename}_{self.count_method}_counter.csv'
            counter_file_path = os.path.join(output_directory, counter_filename)
            self.counter_register.to_csv(counter_file_path, index=False, header=True)

    def RegisterFormat(self, register):
        """
        Return a formatted copy of a register.

        The first column is taken to be the letter, every other column an
        experiment.  Missing and infinite values become 0 and the experiment
        columns are cast to int.  The metric columns 'Minimum', 'Maximum',
        'Average' (rounded up), 'Average %', 'STD', 'Variance' and 'MAE' are
        appended, and rows are sorted by 'Average' in descending order.

        Args:
            register [DataFrame] - 'Letter' column followed by experiment columns.
        Returns:
            DataFrame - the formatted register; the argument is not modified.
        """
        register = register.copy()
        exp_columns = list(register.columns[1:])

        for column in exp_columns:
            cleaned = register[column].astype(float).replace([np.inf, -np.inf], np.nan).fillna(0)
            register[column] = cleaned.astype(int)

        experiments = register[exp_columns]
        minimum = experiments.min(axis=1)
        maximum = experiments.max(axis=1)
        average = np.ceil(experiments.mean(axis=1)).astype(int)
        # sample statistics: both are NaN when there is a single experiment
        std = experiments.std(axis=1).round(4)
        variance = experiments.var(axis=1).round(4)

        # share of each letter in the sum of all averages, in percent
        total = average.sum()
        if total:
            average_percent = ((average / total) * 100).round(4)
        else:
            average_percent = pd.Series(0.0, index=average.index)

        # mean absolute error of the experiments around the (rounded-up) average
        mae = experiments.sub(average, axis=0).abs().mean(axis=1)

        # the dict keys label each metric directly, so values and headers cannot drift apart
        metrics_df = pd.DataFrame({
            'Minimum': minimum,
            'Maximum': maximum,
            'Average': average,
            'Average %': average_percent,
            'STD': std,
            'Variance': variance,
            'MAE': mae,
        })

        register = pd.concat((register, metrics_df), axis=1)
        # stable sort keeps tied letters in first-seen order
        return register.sort_values(by='Average', ascending=False, kind='mergesort')

    def FormatRegisters(self):
        """
        Build the estimate register from the raw counters and format the
        registers that are flagged for saving.

        Estimates per counting method, for a counter value k:
            exact    - k
            fixed    - 2 * k
            decrease - (sqrt(2)**k - 1) / (sqrt(2) - 1), rounded to the
                       nearest integer
        """
        if self.save_estimate:
            counts = self.counter_register.iloc[:, 1:self.n_experiments + 1].astype(float)

            if self.count_method == 'exact':
                estimates = counts
            elif self.count_method == 'fixed':
                # NOTE(review): the first occurrence is always counted, so 2*k
                # overshoots by one in expectation; kept as the reported value.
                estimates = counts * 2
            else:  # decreasing probability
                base = self._DECREASE_BASE
                estimates = ((base ** counts - 1) / (base - 1)).round()

            letters = self.counter_register[['Letter']].copy()
            self.estimate_register = self.RegisterFormat(letters.join(estimates))

        # must run after the estimates: formatting appends metric columns
        if self.save_counter:
            self.counter_register = self.RegisterFormat(self.counter_register)

    def Add2Register(self, exp_number, counter=None):
        """
        Store the counts of one experiment in the counter register.

        Letters not yet in the register get a new row; letters already
        present only have their 'Exp <exp_number>' cell updated.

        Args:
            exp_number [int] - current experiment number
            counter [dict] - letter -> count mapping; None or empty is a no-op
        Updates:
            counter_register [DataFrame] - column 'Exp <exp_number>'
        """
        if not counter:
            return

        column = f'Exp {exp_number}'

        known_letters = set(self.counter_register['Letter'])
        unseen = [letter for letter in counter if letter not in known_letters]
        if unseen:
            new_rows = pd.DataFrame({'Letter': unseen}, columns=self.counter_register.columns)
            if self.counter_register.empty:
                self.counter_register = new_rows
            else:
                self.counter_register = pd.concat([self.counter_register, new_rows], ignore_index=True)

        for letter, count in counter.items():
            self.counter_register.loc[self.counter_register['Letter'] == letter, column] = int(count)

    def _require_text(self):
        """Return the loaded text, or raise RuntimeError if none was loaded."""
        if self.text_data is None:
            raise RuntimeError('No text loaded; call LoadTextFile() first.')
        return self.text_data

    def _begin_run(self, count_method, n_experiments, save_estimate, save_counter):
        """Record the settings of a counting run and reset the counter register."""
        self.n_experiments = n_experiments
        self.save_estimate = save_estimate
        self.save_counter = save_counter
        self.count_method = count_method
        self.InitializeRegisters()

    def _finish_run(self):
        """Format the registers and write the requested CSV files."""
        self.FormatRegisters()
        self.Save2CSV()

    def _run_probabilistic(self, text, increment_probability):
        """
        Run n_experiments probabilistic counting passes over text.

        increment_probability(k) gives the probability of counting an
        occurrence of a letter whose counter currently holds k.
        """
        for exp_number in range(1, self.n_experiments + 1):
            counters = {}
            for letter in text:
                k = counters.get(letter, 0)
                # one draw per letter, first occurrences included
                if random.random() > 1 - increment_probability(k):
                    counters[letter] = k + 1

            self.Add2Register(exp_number, counter=counters)

    def ExactCounter(self, n_experiments=1, save_estimate=True, save_counter=True):
        """
        Count every occurrence of every letter, once per experiment, then
        format and save the registers.

        Raises:
            RuntimeError - if no text has been loaded.
        """
        text = self._require_text()
        self._begin_run('exact', n_experiments, save_estimate, save_counter)

        # exact counting is deterministic, so one pass serves every experiment
        counts = collections.Counter(text)
        for exp_number in range(1, self.n_experiments + 1):
            self.Add2Register(exp_number, counter=counts)

        self._finish_run()

    def FixedProbCounter(self, n_experiments=1, save_estimate=True, save_counter=True):
        """
        Count letters approximately: the first occurrence of a letter is
        always counted, each later occurrence with probability 1/2.  Then
        format and save the registers.

        Raises:
            RuntimeError - if no text has been loaded.
        """
        text = self._require_text()
        self._begin_run('fixed', n_experiments, save_estimate, save_counter)

        self._run_probabilistic(text, lambda k: 1.0 if k == 0 else 0.5)

        self._finish_run()

    def DecreasingProbCounter(self, n_experiments: int = 1, save_estimate: bool = True, save_counter=True):
        """
        Count letters approximately with a decreasing probability: when a
        letter's counter holds k, its next occurrence is counted with
        probability 1/sqrt(2)**k (so the first occurrence is always counted).
        Then format and save the registers.

        Raises:
            RuntimeError - if no text has been loaded.
        """
        text = self._require_text()
        self._begin_run('decrease', n_experiments, save_estimate, save_counter)

        base = self._DECREASE_BASE
        self._run_probabilistic(text, lambda k: 1 / (base ** k))

        self._finish_run()
            

    


In [None]:
# Base names (without '.txt') of the texts to process.
# Uncomment an entry to include that text in the run.
literary_works = [
    # 'sample',
    # 'aMidsummerNightsDream-english',
    # 'aMidsummerNightsDream-french',
    # 'aMidsummerNightsDream-german',
    'hamlet-english',
    # 'hamlet-french',
    # 'hamlet-german',
    # 'juliusCaesar-english',
    # 'juliusCaesar-french',
    # 'juliusCaesar-german',
    # 'kingRichardIII-english',
    # 'kingRichardIII-french',
    # 'kingRichardIII-german',
    # 'macbeth-english',
    # 'macbeth-french',
    # 'macbeth-german',
    # 'merchantOfVenice-english',
    # 'merchantOfVenice-french',
    # 'merchantOfVenice-german',
    # 'othello-english',
    # 'othello-french',
    # 'othello-german',
    # 'romeoAndJuliet-english',
    # 'romeoAndJuliet-french',
    # 'romeoAndJuliet-german',
    # 'theTempest-english',
    # 'theTempest-french',
    # 'theTempest-german',
]

# One entry per counting strategy, in run order:
# (banner printed before the run, method to call, number of experiments)
counting_runs = (
    ('started ExactCounter', 'ExactCounter', 1),
    ('started FixedProbCounter', 'FixedProbCounter', 5),
    ('started DecreasingProbCounter', 'DecreasingProbCounter', 5),
)

for literary_work in literary_works:
    # a fresh counter per text; each run below writes its own CSV files
    AC = ApproximateCounter()
    AC.LoadTextFile(filename=literary_work)
    print('Working on', literary_work)

    for banner, method_name, experiments in counting_runs:
        print(banner)
        run_started = time.time()
        getattr(AC, method_name)(n_experiments=experiments, save_estimate=True, save_counter=True)
        print('It took (', time.time() - run_started, ') secs')

    print('-'*50)

print('Done!!!')

