In [1]:
import os

# Root folder of the datasets; create_path appends the dataset folder, the
# dataset-type folder and the CSV file name to it.
# The location is machine-specific, so it can be overridden with the
# DATASET_FOLDER environment variable. The original path stays the fallback,
# which keeps existing runs on the author's machine unchanged.
PATH_TO_DATASET_FOLDER = os.environ.get(
    "DATASET_FOLDER", "C:\\Users\\Vojta\\Desktop\\diploma\\data"
)

In [2]:
from enum import Enum
import os.path
import tensorflow as tf
import pandas as pd
from tqdm import tqdm

In [3]:
class DataSet(Enum):
    """Datasets available to this notebook.

    The value of a member is the name of the dataset's folder, which
    ``create_path`` joins onto the data directory.
    """
    Gutenberg = "gutenberg"

In [4]:
class DataSetType(Enum):
    """Dataset variants that ``create_path`` distinguishes.

    The value of a member is the folder name used inside the dataset folder;
    for ``Sentence`` the ``k`` argument of ``create_path`` is appended to it
    (e.g. ``Sentence3``), for ``Article`` the value is used as is.
    """
    Sentence = "Sentence"
    Article = "Article"

In [5]:
# File name that create_path appends as the last component of every dataset path.
DATA_NAME = 'data.csv'

In [6]:
def create_path(directory, dataset, dataset_type, k=None):
    """Build the path of a dataset's CSV file.

    The result is ``directory/<dataset.value>/<type folder>/<DATA_NAME>``.
    The type folder is ``dataset_type.value`` with ``k`` appended for
    ``DataSetType.Sentence`` (e.g. ``Sentence3``) and plain
    ``dataset_type.value`` for every other type.

    Args:
        directory: Root folder that contains the dataset folders.
        dataset: ``DataSet`` member; its value names the dataset folder.
        dataset_type: ``DataSetType`` member selecting the type folder.
        k: Suffix of the sentence folder. Required for
            ``DataSetType.Sentence`` and ignored for other types.
            NOTE(review): presumably the number of sentences per sample -
            confirm against the code that produced the folders.

    Returns:
        The joined path as a string.

    Raises:
        ValueError: If ``dataset_type`` is ``DataSetType.Sentence`` and ``k``
            is ``None``.
    """
    type_folder = dataset_type.value
    if dataset_type == DataSetType.Sentence:
        if k is None:
            # ValueError is a subclass of Exception, so callers that caught the
            # generic Exception raised previously keep working.
            raise ValueError("Sentence should be specified with k argument!")
        type_folder += str(k)

    return os.path.join(directory, dataset.value, type_folder, DATA_NAME)

In [7]:
# Example: path of the Gutenberg `Sentence10` CSV.
create_path(
    directory=PATH_TO_DATASET_FOLDER,
    dataset=DataSet.Gutenberg,
    dataset_type=DataSetType.Sentence,
    k=10,
)

'C:\\Users\\Vojta\\Desktop\\diploma\\data\\gutenberg\\Sentence10\\data.csv'

In [8]:
def process_text(text):
    """Normalise ``text`` by converting it to lower case and return the result."""
    lowered = text.lower()
    return lowered

In [9]:
def get_dataset_object_from_path(csv_filename, delim, text_pipeline_func=None):
    """Create a ``tf.data`` dataset of ``(text, author)`` string pairs.

    The file is read line by line. Every line is expected to have the form
    ``<text><delim><author>`` and is split on the *last* occurrence of
    ``delim``, so a text that itself contains the delimiter is kept intact
    instead of failing with "too many values to unpack".
    NOTE(review): this assumes the author never contains ``delim`` and that the
    file has no header line - confirm against the data.

    Args:
        csv_filename: Path (or paths) handed to ``tf.data.TextLineDataset``.
        delim: Separator between the text and the author.
        text_pipeline_func: Optional callable applied to the decoded text
            before it is returned; the author is never transformed.

    Returns:
        The mapped dataset; each element is a ``(text, author)`` pair of
        ``tf.string`` tensors.

    Raises:
        ValueError: From the parser (while the dataset is iterated) when a
            line does not contain ``delim``.
    """
    def parse_csv(line):
        # tf.py_function calls this eagerly, which is why .numpy() is available.
        csv_line = line.numpy().decode("utf-8")
        fields = csv_line.rsplit(delim, 1)
        if len(fields) != 2:
            raise ValueError(
                f"Expected '<text>{delim}<author>' but got: {csv_line!r}"
            )
        text, author = fields
        if text_pipeline_func is not None:
            text = text_pipeline_func(text)
        return text, author

    lines = tf.data.TextLineDataset(filenames=csv_filename)
    # Plain Python string handling cannot run in graph mode, so the parser is
    # wrapped in tf.py_function with both outputs declared as strings.
    return lines.map(
        lambda line: tf.py_function(parse_csv, [line], [tf.string, tf.string])
    )

In [10]:
# (text, author) dataset of the Gutenberg `Sentence3` CSV; process_text is
# applied to every text.
ds = get_dataset_object_from_path(
    csv_filename=create_path(
        directory=PATH_TO_DATASET_FOLDER,
        dataset=DataSet.Gutenberg,
        dataset_type=DataSetType.Sentence,
        k=3,
    ),
    delim=';',
    text_pipeline_func=process_text,
)

# Create statistics for the specified dataset

In [20]:
# Prefix of the statistics workbook's file name. Despite the constant's name it
# is only the prefix: create_stats_file_name puts the source file name and
# '.xlsx' after it.
NAME_OF_STATISTICS_FILE = 'stats_'

In [21]:
def create_stats_file_name(name_of_file):
    """Return the file name of the statistics workbook for ``name_of_file``.

    The name consists of the ``NAME_OF_STATISTICS_FILE`` prefix, then
    ``name_of_file`` exactly as given (an extension it carries is kept), then
    ``.xlsx``.
    """
    return '{}{}.xlsx'.format(NAME_OF_STATISTICS_FILE, name_of_file)

In [22]:
def build_input_for_statistics(path, sep, metric_instances=None, text_pipeline_func=None, save=True):
    """Prepare the dataset and the metric wrapper for a statistics run.

    Args:
        path: Path of the CSV file to analyse.
        sep: Separator between the text and the label in that file.
        metric_instances: Metric objects handed to ``MetricWrapper``. ``None``
            (the default) means a new empty list; a shared ``[]`` default
            would be reused by every call and every wrapper built from it.
        text_pipeline_func: Optional callable forwarded to
            ``get_dataset_object_from_path``.
        save: When true, the wrapper receives the path of the statistics
            workbook, located in the same folder as ``path`` and named by
            ``create_stats_file_name``; otherwise it receives ``None``.

    Returns:
        A ``(dataset, metric_wrapper)`` tuple, in the argument order of
        ``create_statistics_from``.
    """
    if metric_instances is None:
        metric_instances = []

    path_to_save = None
    if save:
        # os.path.split understands every separator valid on the platform
        # (both '\\' and '/' on Windows), unlike splitting on os.path.sep.
        folder, name_of_file = os.path.split(path)
        path_to_save = os.path.join(folder, create_stats_file_name(name_of_file))

    dataset = get_dataset_object_from_path(path, sep, text_pipeline_func)
    return dataset, MetricWrapper(metric_instances, path_to_save)

In [23]:
# NOTE: build_input_for_statistics needs MetricWrapper, which is defined in a
# later cell of this notebook; on a fresh kernel run that cell first.
# process_text is passed by keyword because the third positional parameter of
# build_input_for_statistics is metric_instances: passed positionally, the
# function would be stored as the metric list and never applied to the text.
build_input_for_statistics(
    create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3),
    ';',
    text_pipeline_func=process_text,
)

(<MapDataset shapes: (<unknown>, <unknown>), types: (tf.string, tf.string)>,
 <__main__.MetricWrapper at 0x22005ca2430>)

### TODO: create more metrics


In [15]:
class LabelMetric:
    """Metric that counts how many records were seen for each label."""

    def __init__(self):
        # Maps a label to the number of records observed with it.
        self.state = dict()

    def update_state(self, text, label):
        """Count one more record for ``label``; ``text`` is not used."""
        if label in self.state:
            self.state[label] += 1
        else:
            self.state[label] = 1

    def get_dataframe(self):
        """Return the counts as a DataFrame with one row per label."""
        counts = self.state
        return pd.DataFrame.from_dict(counts, orient='index')

In [16]:
class MetricWrapper:
    """Feeds dataset records to a group of metrics and exports their results.

    Every metric must provide ``update_state(text, label)`` and
    ``get_dataframe()``.
    """

    def __init__(self, metric_instances=None, path_to_save=None):
        """Store the metrics and the optional export path.

        Args:
            metric_instances: Metric objects to update. ``None`` (the default)
                creates a new empty list for this wrapper; a ``[]`` default
                would be one list shared by every wrapper built without
                metrics. A list passed by the caller is stored as is, not
                copied.
            path_to_save: Path of the Excel workbook written by ``save``, or
                ``None`` to disable saving.
        """
        self.path_to_save = path_to_save
        self.metric_instances = [] if metric_instances is None else metric_instances

    def update_state(self, text, label):
        """Forward one decoded ``(text, label)`` record to every metric."""
        # TODO: subclasses can override __init__ and update_state
        for instance in self.metric_instances:
            instance.update_state(text, label)

    def process_row(self, record):
        """Decode one dataset record and update every metric with it.

        ``record`` is a ``(text, label)`` pair whose items expose ``.numpy()``
        returning ``bytes``; both are decoded with the default UTF-8 codec.
        """
        text, label = record
        text = bytes.decode(text.numpy())
        label = bytes.decode(label.numpy())
        self.update_state(text, label)

    def save(self):
        """Write every metric's DataFrame to its own sheet of the workbook.

        Sheets are named after the metric's class. When no path was given a
        notice is printed and nothing is written.
        """
        if self.path_to_save is None:
            print('Saving path is not specified!')
            return

        print(f'Saving to {self.path_to_save}')
        with pd.ExcelWriter(self.path_to_save, engine='xlsxwriter') as writer:
            for metric_instance in self.metric_instances:
                metric_instance.get_dataframe().to_excel(
                    writer, sheet_name=type(metric_instance).__name__
                )

    def __str__(self):
        """Return each metric's class name directly followed by its table."""
        return ''.join(
            f'{type(instance).__name__}{instance.get_dataframe().to_string()}\n'
            for instance in self.metric_instances
        )

In [17]:
def create_statistics_from(dataset, metric_instance):
    """Run every record of ``dataset`` through the metrics and save the result.

    Args:
        dataset: Iterable of records accepted by ``metric_instance.process_row``.
        metric_instance: Object providing ``process_row(record)`` and
            ``save()`` (a ``MetricWrapper`` in this notebook).

    Returns:
        The same ``metric_instance``, after all records were processed and
        ``save()`` was called once.
    """
    # The record index was never used, so iterate over the dataset directly.
    for record in dataset:
        metric_instance.process_row(record)
    metric_instance.save()
    return metric_instance

In [18]:
# Count the labels of the Gutenberg `Sentence3` CSV and save the result as a
# workbook in the same folder.
metric_instance = create_statistics_from(
    *build_input_for_statistics(
        path=create_path(
            PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, k=3
        ),
        sep=';',
        metric_instances=[LabelMetric()],
        text_pipeline_func=process_text,
        save=True,
    )
)

Saving to C:\Users\Vojta\Desktop\diploma\data\gutenberg\Sentence3\stats_data.csv.xlsx


In [19]:
# print() goes through MetricWrapper.__str__, which renders each metric's class
# name directly followed by its DataFrame as text.
print(metric_instance)

LabelMetric                                    0
Twain, Mark                      3933
Meredith, George                  374
Jacobs, W. W. (William Wymark)  18356
Fenn, George Manville             302
Balzac, Honoré de                 495

