In [686]:
import random
import numpy as np
import os
import collections
import pandas as pd
import time

In [684]:
class ApproximateCounter:
    """
    Counts letter occurrences in a text file exactly and with two
    probabilistic (approximate) counters, collecting the per-experiment
    counts in a pandas DataFrame called the register.

    106 words in sample.txt
    899641 words in shakespeare without license by word2016.

    Register layout (after a counter method has run):
        'Letter'                        - the counted character
        'Exp 1' ... 'Exp n'             - counter value of each experiment
        'Minimum', 'Maximum', 'Average' - metrics over the experiment columns
    """

    def __init__(self):
        self.student_counter = 274356
        # random.seed must be *called*; assigning to it would replace the
        # function with an int and leave the generator unseeded.
        random.seed(self.student_counter)
        self.textFilesDirectory = os.path.join(os.getcwd(), 'TextFiles')

        self.counter = 0  # not read by any method of this class
        self.prob = 1/2 # probability of the event happening

        self.filename = None      # set by LoadTextFile
        self.text_data = None     # set by LoadTextFile
        self.register = None      # set by the counter methods
        self.count_method = None  # set by the counter methods


    def LoadTextFile(self, filename='sample'):
        """
        Reads '<filename>.txt' from the text files directory into text_data.
        Args:
            filename [str] - text file name without extension
        Updates:
            filename [str] - remembered for the default csv name
            text_data [str] - file content with every non-alphanumeric
                              character removed, converted to upper case
        """
        self.filename = filename
        text_file_path = os.path.join(self.textFilesDirectory, filename + '.txt')
        with open(text_file_path, 'r') as file:
            raw_text = file.read()

        # remove all non-alphanumerics characters, and converts to upper(most imp)
        self.text_data = ''.join(filter(str.isalnum, raw_text)).upper()


    def Save2CSV(self, filename=None, dir_name='Output'):
        """
        Saves register df to csv
        Args:
            filename [str] - filename to save csv without extension. default original textfile name
                             followed by '_' and the counting method name
            dir_name [str] - output folder (inside the text files directory) to save csv in.
                             default 'Output'
        """
        if filename is None:
            filename = self.filename + '_' + self.count_method + '.csv'
        else:
            filename = str(filename)
            # add the extension once, even if the caller already supplied it
            if not filename.lower().endswith('.csv'):
                filename += '.csv'

        output_directory = os.path.join(self.textFilesDirectory, dir_name)
        output_file_path = os.path.join(output_directory, filename)

        # creates missing parent folders too and is a no-op if the folder exists
        os.makedirs(output_directory, exist_ok=True)

        self.register.to_csv(output_file_path, index=False, header=True)
        print('Save Complete!!')


    def FormatRegister(self):
        """
        extract the compiled register.
        counts formatted as integer.
        NaN values replaced by 0's.
        Obtain metrics (min, max, average)
        Updates:
            register [df] - experiment columns cast to int64, columns
                            'Minimum', 'Maximum', 'Average' added, rows
                            sorted by 'Average'
        """
        # Only the experiment columns take part, so calling this method again
        # does not fold the metric columns into the metrics.
        non_experiment = ('Letter', 'Minimum', 'Maximum', 'Average')
        exp_columns = [column for column in self.register.columns
                       if column not in non_experiment]

        for column in exp_columns:
            # a letter that was not counted in an experiment is NaN until here
            self.register[column] = self.register[column].fillna(0).astype('int64')

        experiments = self.register[exp_columns]
        Min = experiments.min(axis=1, skipna=True)
        Max = experiments.max(axis=1, skipna=True)
        Average = experiments.mean(axis=1, skipna=True)

        # change average from float to int rounded up
        self.register['Minimum'] = Min
        self.register['Maximum'] = Max
        self.register['Average'] = Average.apply(np.ceil).astype(int)

        self.register = self.register.sort_values(by='Average')


    def AddCounts2Register(self, exp_number, rows):
        """
        Implementation:
            Adds letter:frequency to register. If letter already exists, just add frequency to current exp_number.
            Letters of the register that are missing from rows keep the value
            they already have in that experiment column.
        Args:
            exp_number [int] - current experiment number
            rows [dict] - letter -> count obtained in this experiment
        Updates:
            register [df] - add letter frequency/count to register
        """
        column = f'Exp {exp_number}'
        columns = list(self.register.columns)
        if column not in columns:
            columns.append(column)

        # The frame is rebuilt from plain records because DataFrame.append
        # no longer exists in pandas >= 2.0.
        records = self.register.to_dict('records')
        known_letters = set()
        for record in records:
            letter = record['Letter']
            known_letters.add(letter)
            if letter in rows:
                record[column] = int(rows[letter])

        for letter, count in rows.items():
            if letter not in known_letters:
                records.append({'Letter': letter, column: int(count)})

        self.register = pd.DataFrame(records, columns=columns)


    def _run_experiments(self, method_name, n_experiments, count_letters):
        """
        Shared experiment loop of the three counter methods.
        Args:
            method_name [str] - stored in count_method
            n_experiments [int] - number of experiments to run
            count_letters [callable] - returns the letter -> count dict of one experiment
        Raises:
            ValueError - if no text has been loaded yet
        """
        if self.text_data is None:
            raise ValueError('No text loaded, call LoadTextFile first')

        self.register = pd.DataFrame(columns=['Letter'])
        self.count_method = method_name

        for exp_number in range(1, n_experiments + 1):
            # letters already in the register start this experiment at 0
            self.register[f'Exp {exp_number}'] = 0

            rows = count_letters()

            print('Experiment (', exp_number, ') for', self.count_method,'Done!')

            self.AddCounts2Register(exp_number, rows)
        self.FormatRegister()


    def FixedProbCounter(self, n_experiments=1):
        """
        Approximate counter with fixed probability: every occurrence of a
        letter increments that letter's counter with probability self.prob.
        The register stores the raw counter values (they are not scaled
        back by 1/prob).
        Args:
            n_experiments [int] - number of independent experiments
        """
        def sample_counts():
            rows = collections.Counter()
            for letter in self.text_data:
                # random() is uniform on [0, 1): "< p" is true with probability p
                if random.random() < self.prob:
                    rows[letter] += 1
            return rows

        self._run_experiments('FixedProbCounter', n_experiments, sample_counts)


    def DecreasingProbCounter(self, n_experiments=1):
        """
        Approximate counter with decreasing probability: an occurrence of a
        letter increments that letter's counter with probability
        sqrt(self.prob) ** k, where k is the current value of the counter.
        The first occurrence of a letter is therefore always counted.
        The register stores the raw counter values.
        Args:
            n_experiments [int] - number of independent experiments
        """
        base = float(np.sqrt(self.prob))  # loop invariant

        def sample_counts():
            rows = dict()
            for letter in self.text_data:
                current = rows.get(letter, 0)
                # The exponent is the counter value reached so far, not a
                # position in the text, so the probability shrinks as the
                # counter grows.
                if random.random() < base ** current:
                    rows[letter] = current + 1
            return rows

        self._run_experiments('DecreasingProbCounter', n_experiments, sample_counts)


    def ExactCounter(self, n_experiments=1):
        """
        Get all letter, frequency for each experiment.
        Add letter, frequency to register
        The exact count is deterministic, so every experiment column holds
        the same values.
        Args:
            n_experiments [int] - number of experiments (columns) to produce
        """
        def exact_counts():
            return collections.Counter(self.text_data)

        self._run_experiments('ExactCounter', n_experiments, exact_counts)



In [688]:
# Driver: load the Shakespeare text, run each counter for 50 experiments,
# save every register to csv and report how long each counter took.
AC = ApproximateCounter()
AC.LoadTextFile(filename='shakespeare_without_lic')

# time.perf_counter() is monotonic and high resolution, so the measured
# interval cannot be distorted by a system clock adjustment the way a
# difference of time.time() values can.
start_exact = time.perf_counter()
AC.ExactCounter(n_experiments=50)
AC.Save2CSV()
print('It took (', time.perf_counter() - start_exact, ') secs to complete ExactCounter')

start_fixed = time.perf_counter()
AC.FixedProbCounter(n_experiments=50)
AC.Save2CSV()
print('It took (', time.perf_counter() - start_fixed, ') secs to complete FixedProbCounter')

start_decreased = time.perf_counter()
AC.DecreasingProbCounter(n_experiments=50)
AC.Save2CSV()
print('It took (', time.perf_counter() - start_decreased, ') secs to complete DecreasingProbCounter')



Experiment ( 1 ) for ExactCounter Done!
Experiment ( 2 ) for ExactCounter Done!
Experiment ( 3 ) for ExactCounter Done!
Experiment ( 4 ) for ExactCounter Done!
Experiment ( 5 ) for ExactCounter Done!
Experiment ( 6 ) for ExactCounter Done!
Experiment ( 7 ) for ExactCounter Done!
Experiment ( 8 ) for ExactCounter Done!
Experiment ( 9 ) for ExactCounter Done!
Experiment ( 10 ) for ExactCounter Done!
Experiment ( 11 ) for ExactCounter Done!
Experiment ( 12 ) for ExactCounter Done!
Experiment ( 13 ) for ExactCounter Done!
Experiment ( 14 ) for ExactCounter Done!
Experiment ( 15 ) for ExactCounter Done!
Experiment ( 16 ) for ExactCounter Done!
Experiment ( 17 ) for ExactCounter Done!
Experiment ( 18 ) for ExactCounter Done!
Experiment ( 19 ) for ExactCounter Done!
Experiment ( 20 ) for ExactCounter Done!
Experiment ( 21 ) for ExactCounter Done!
Experiment ( 22 ) for ExactCounter Done!
Experiment ( 23 ) for ExactCounter Done!
Experiment ( 24 ) for ExactCounter Done!
Experiment ( 25 ) for Exa