In [2]:
import tensorflow as tf
from os import listdir
from os.path import isfile, join
import os
import json
from enum import Enum
from nltk import tokenize
import csv
from tqdm import tqdm
import pandas as pd
import numpy as np
import math
import random

In [3]:
PROJECT_CSV_DELIMITER = ';'

In [4]:
AUTHOR_DIRECTORY_NAME = "Authors"

In [5]:
class GutenbergJsonAttributes(Enum):
    Title = "Title"
    Author = "Author"
    Subject = "Subject"
    Alias = "Alias"
    Birthdate = "Birthdate"
    Deathdate = "Deathdate"
    Aliases = "Aliases"
    Text = "Text"
    Id = "Id"
    AuthorId = "AuthorId"

In [6]:
pattern = "*.json"

In [7]:
raw_path = 'C:\\Users\\Vojta\\Desktop\\diploma\\gutenberg_json\\' + pattern

In [8]:
def load_files(path):
    return tf.data.Dataset.list_files(path, shuffle=False)

In [9]:
def load_json(path):
    with open(path) as f:
        return json.load(f)

In [10]:
def iterate_over_gutenberg(files_path, process_func):
    print(f"Loading files from {files_path}")
    for path_to_file in tqdm(load_files(files_path)):
        path = bytes.decode(path_to_file.numpy())
        data = load_json(path)
        process_func(data)

In [11]:
PATH_TO_SAVE_GENERAL_GUTENBERG = "C:\\Users\\Vojta\\Desktop\\diploma\\gutenberg_from_raw"
PATH_TO_SAVE_DATASET = ""

In [17]:
authors = pd.read_csv(PATH_TO_ALL_AUTHORS, sep=AUTHORS_CSV_DELIMITER, encoding='utf-8')

In [18]:
authors.head()

Unnamed: 0,gutenberg_author_id,author.y,n
0,761,"Lytton, Edward Bulwer Lytton, Baron",215
1,1800,"Ebers, Georg",164
2,53,"Twain, Mark",144
3,8659,"Kingston, William Henry Giles",132
4,1285,"Parker, Gilbert",131


In [19]:
def process_data_as_raw_row(data):
    current_id = data[GutenbergJsonAttributes.Id.value]
    copied = data.copy()
    
    #TODO: Maybe some kind of general processing
    processed_text = copied[GutenbergJsonAttributes.Text.value][0]

    copied[GutenbergJsonAttributes.Text.value] = [processed_text]
    
    path = PATH_TO_SAVE_GENERAL_GUTENBERG + "\\" + str(current_id[0]) + ".json"
    
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(copied, f)

In [22]:
iterate_over_gutenberg(raw_path, process_data_as_raw_row)

NameError: name 'process_data_as_raw_row' is not defined

In [23]:
def preprocess_newlines(document):
    return document.replace('\n', '')

In [24]:
def preprocess_project_delimiter(document):
    return document.replace(PROJECT_CSV_DELIMITER, '')

In [13]:
def chunk_document_by_sentence(document, k):
    preprocessed_document_for_sentences = preprocess_newlines(document)
    preprocessed_document_for_sentences = preprocess_project_delimiter(preprocessed_document_for_sentences)
    sentences = tokenize.sent_tokenize(preprocessed_document_for_sentences)
    chunked_sentences = [' '.join(sentences[sent_index:sent_index+k]) for sent_index in range(0, len(sentences), k)]
    return chunked_sentences

In [14]:
def build_process_func(k, name, path, authors_tuple):
    number_of_authors = len(authors_tuple)
    authors_ids, authors_names = zip(*authors_tuple)
    
    directory_for_file = os.sep.join([path, f'{number_of_authors}Authors', name])
    
    if not os.path.exists(directory_for_file):
        os.makedirs(directory_for_file)
    
    authors_dataframe = pd.DataFrame.from_dict({
        GutenbergJsonAttributes.AuthorId.value: authors_ids, 
        GutenbergJsonAttributes.Author.value: authors_names
    },)
    
    
    name_of_authors_file = "authors.csv"
    authors_save = os.sep.join([directory_for_file, name_of_authors_file])
                               
    print(f'Saving authors csv to {authors_save}')
                               
    authors_dataframe.to_csv(authors_save, index=False, sep=';')
    
    def process_to_create_dataset(data):
        current_author = data[GutenbergJsonAttributes.Author.value][0] 
        current_text = data[GutenbergJsonAttributes.Text.value][0]
        current_author_id = data[GutenbergJsonAttributes.AuthorId.value][0] 
        
        is_required_author = current_author_id in authors_ids
        
        if is_required_author:
            
            chunked_sentences = chunk_document_by_sentence(current_text, k)

            name_of_file = "data.csv"
            with open(os.sep.join([directory_for_file, name_of_file]), 'a', newline='') as f:
                writer = csv.writer(f, delimiter=';')

                for chunk in chunked_sentences:
                    value = [chunk, current_author_id]
                    writer.writerow(value)

            return chunked_sentences
        
        return []

    return process_to_create_dataset

In [15]:
DIRECTORY_TO_SAVE = "C:\\Users\\Vojta\\Desktop\\diploma\\data\\gutenberg"

In [16]:
PATH_TO_ALL_AUTHORS = "C:\\Users\\Vojta\\Desktop\\diploma\\gutenberg_downloaded\\authors\\authors.csv"
AUTHORS_CSV_DELIMITER = ","

In [20]:
class AuthorsGenerator:
    def __init__(self, path: str, sep: str):
        self.path = path
        self.sep = sep
    
    def generate_top_k(self, k: int):
        data = pd.read_csv(self.path, sep=self.sep)
        rows = data.shape[0]
        authors_ids = data.iloc[0:rows-1, 0].astype(int).values[0:k]
        authors_names = data.iloc[0:rows-1, 1].astype(str).values[0:k]
        authors_tuple = list(zip(authors_ids, authors_names))
        del data
        return authors_tuple

In [21]:
authors_generator = AuthorsGenerator(PATH_TO_ALL_AUTHORS, AUTHORS_CSV_DELIMITER)

In [22]:
PATH_TO_SAVE_TEST = "C:\\Users\\Vojta\\Desktop\\diploma\\gutenberg_test"

In [23]:
test_path = os.sep.join([PATH_TO_SAVE_TEST, pattern])

In [24]:
iterate_over_gutenberg(test_path, build_process_func(3, "Sentence" + str(3), DIRECTORY_TO_SAVE, authors_generator.generate_top_k(10)))

Saving authors csv to C:\Users\Vojta\Desktop\diploma\data\gutenberg\10Authors\Sentence3\authors.csv
Loading files from C:\Users\Vojta\Desktop\diploma\gutenberg_test\*.json


100%|██████████| 1/1 [00:00<00:00, 90.93it/s]


# Have big csv ... iterate over csv and create test train valid sets deps on statistcs

In [48]:
PATH_TO_DATASET_FOLDER = "C:\\Users\\Vojta\\Desktop\\diploma\\data"

In [49]:
from enum import Enum
import os.path
import tensorflow as tf
import pandas as pd
from tqdm import tqdm

In [50]:
class DataSet(Enum):
    Gutenberg = "gutenberg"

In [51]:
class DataSetType(Enum):
    Sentence = "Sentence"
    Article = "Article"

In [52]:
DATA_NAME = 'data.csv'

In [70]:
def create_path(directory, dataset, authors_directory, dataset_type, k=None):
    is_sentence_type = dataset_type == DataSetType.Sentence 
    if is_sentence_type and k is None:
        raise Exception(f"Sentence should be specified with k argument!")
    
    return os.path.join(directory, dataset.value, authors_directory, dataset_type.value + str(k), DATA_NAME) if is_sentence_type else os.path.join(directory, dataset.value, authors_directory, dataset_type.value, DATA_NAME)

In [73]:
create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, create_author_directory(10), DataSetType.Sentence, 10)

'C:\\Users\\Vojta\\Desktop\\diploma\\data\\gutenberg\\10Authors\\Sentence10\\data.csv'

In [74]:
def process_text(text):
    return text.lower()

In [56]:
def get_dataset_object_from_path(csv_filename, delim, text_pipeline_func=None):
    dataset = tf.data.TextLineDataset(filenames=csv_filename)
    
    def parse_csv(line):
        csv_line = bytes.decode(line.numpy())
        text, author = csv_line.split(delim)
        if text_pipeline_func is not None:
            text = text_pipeline_func(text)
        return text, author 

    dataset = dataset.map(lambda tpl: tf.py_function(parse_csv, [tpl], [tf.string, tf.string]))
    return dataset

In [57]:
ds = get_dataset_object_from_path(create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), ';', process_text)

# Create statistics for specified dataset 

In [103]:
NAME_OF_STATISTICS_FILE = 'stats_'

In [104]:
def create_stats_file_name(name_of_file):
    return f'{NAME_OF_STATISTICS_FILE}{name_of_file}.xlsx'

In [105]:
def build_input_for_statistics(path, sep, metric_instances = [] ,text_pipeline_func = None, save = True):
    path_parts = path.split(os.path.sep)
    name_of_file = path_parts[-1]
    del path_parts[-1]
    path_parts.append(create_stats_file_name(name_of_file))
    path_to_save = os.path.sep.join(path_parts)
    return get_dataset_object_from_path(path, sep, text_pipeline_func), MetricWrapper(metric_instances, path_to_save if save else None)

In [107]:
#build_input_for_statistics(create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), ';', process_text)

### TODO: create more metrics


In [108]:
def create_author_directory(k):
    return f'{k}{AUTHOR_DIRECTORY_NAME}'

In [109]:
class LabelMetric:
    def __init__(self):
        self.state = {}
    
    def update_state(self, text, label):
        self.state[label] = self.state.get(label, 0) + 1
        
    def get_dataframe(self):
        return pd.DataFrame.from_dict(self.state, orient='index')

In [121]:
class MetricWrapper:
    def __init__(self, metric_instances = [], path_to_save = None):
        self.path_to_save = path_to_save
        self.metric_instances = metric_instances
    
    def update_state(self, text, label):
        #TODO: Can derive from this class and update __init__ and update_states
        for instance in self.metric_instances:
            instance.update_state(text, label)
    
    def process_row(self, record):
        text, label = record
        text = bytes.decode(text.numpy())
        label = bytes.decode(label.numpy())
        self.update_state(text, label)
    
    def save(self):
        if self.path_to_save is not None:
            print(f'Saving to {self.path_to_save}')
            with pd.ExcelWriter(self.path_to_save, engine='xlsxwriter') as writer:
                for metric_instance in self.metric_instances:
                    metric_instance.get_dataframe().to_excel(writer, sheet_name=type(metric_instance).__name__)            
        else:
            print('Saving path is not specified!')
            
    def __str__(self):
        string = ''
        for instance in self.metric_instances:
            string += type(instance).__name__
            string += instance.get_dataframe().to_string()
            string += '\n'
        return string

In [122]:
def create_statistics_from(dataset, metric_instance):
    for i, record in enumerate(dataset):
        metric_instance.process_row(record)
    metric_instance.save()
    return metric_instance

In [123]:
metric_instance = create_statistics_from(
    *build_input_for_statistics(
        create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), 
        ';', 
        [LabelMetric()], 
        process_text,
        True
    )
)

AttributeError: 'int' object has no attribute 'value'

In [124]:
print(metric_instance)

NameError: name 'metric_instance' is not defined

In [125]:
def split_file_to_train_test_valid(path_to_file, train_size=1, test_size=0, valid_size=0, min_label_size=None):
    print(train_size, test_size, valid_size, path_to_file, min_label_size)

In [126]:
split_file_to_train_test_valid(
    create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, create_author_directory(10), DataSetType.Sentence, 3),
)

1 0 0 C:\Users\Vojta\Desktop\diploma\data\gutenberg\10Authors\Sentence3\data.csv None


In [1]:
PATH_TO_DATASET_FOLDER = "C:\\Users\\Vojta\\Desktop\\diploma\\data"

In [2]:
from enum import Enum
import os.path
import tensorflow as tf
import pandas as pd
from tqdm import tqdm

In [3]:
class DataSet(Enum):
    Gutenberg = "gutenberg"

In [4]:
class DataSetType(Enum):
    Sentence = "Sentence"
    Article = "Article"

In [5]:
DATA_NAME = 'data.csv'

In [6]:
def create_path(directory, dataset, dataset_type, k=None):
    is_sentence_type = dataset_type == DataSetType.Sentence 
    if is_sentence_type and k is None:
        raise Exception(f"Sentence should be specified with k argument!")
    
    return os.path.join(directory, dataset.value, dataset_type.value + str(k), DATA_NAME) if is_sentence_type else os.path.join(directory, dataset.value, dataset_type.value, DATA_NAME)

In [7]:
create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 10)

'C:\\Users\\Vojta\\Desktop\\diploma\\data\\gutenberg\\Sentence10\\data.csv'

In [8]:
def process_text(text):
    return text.lower()

In [9]:
def get_dataset_object_from_path(csv_filename, delim, text_pipeline_func=None):
    dataset = tf.data.TextLineDataset(filenames=csv_filename)
    
    def parse_csv(line):
        csv_line = bytes.decode(line.numpy())
        text, author = csv_line.split(delim)
        if text_pipeline_func is not None:
            text = text_pipeline_func(text)
        return text, author 

    dataset = dataset.map(lambda tpl: tf.py_function(parse_csv, [tpl], [tf.string, tf.string]))
    return dataset

In [10]:
ds = get_dataset_object_from_path(create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), ';', process_text)

# Create statistics for specified dataset 

In [20]:
NAME_OF_STATISTICS_FILE = 'stats_'

In [21]:
def create_stats_file_name(name_of_file):
    return f'{NAME_OF_STATISTICS_FILE}{name_of_file}.xlsx'

In [22]:
def build_input_for_statistics(path, sep, metric_instances = [] ,text_pipeline_func = None, save = True):
    path_parts = path.split(os.path.sep)
    name_of_file = path_parts[-1]
    del path_parts[-1]
    path_parts.append(create_stats_file_name(name_of_file))
    path_to_save = os.path.sep.join(path_parts)
    return get_dataset_object_from_path(path, sep, text_pipeline_func), MetricWrapper(metric_instances, path_to_save if save else None)

In [23]:
build_input_for_statistics(create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), ';', process_text)

(<MapDataset shapes: (<unknown>, <unknown>), types: (tf.string, tf.string)>,
 <__main__.MetricWrapper at 0x22005ca2430>)

### TODO: create more metrics


In [15]:
class LabelMetric:
    def __init__(self):
        self.state = {}
    
    def update_state(self, text, label):
        self.state[label] = self.state.get(label, 0) + 1
        
    def get_dataframe(self):
        return pd.DataFrame.from_dict(self.state, orient='index')

In [16]:
class MetricWrapper:
    def __init__(self, metric_instances = [], path_to_save = None):
        self.path_to_save = path_to_save
        self.metric_instances = metric_instances
    
    def update_state(self, text, label):
        #TODO: Can derive from this class and update __init__ and update_states
        for instance in self.metric_instances:
            instance.update_state(text, label)
    
    def process_row(self, record):
        text, label = record
        text = bytes.decode(text.numpy())
        label = bytes.decode(label.numpy())
        self.update_state(text, label)
    
    def save(self):
        if self.path_to_save is not None:
            print(f'Saving to {self.path_to_save}')
            with pd.ExcelWriter(self.path_to_save, engine='xlsxwriter') as writer:
                for metric_instance in self.metric_instances:
                    metric_instance.get_dataframe().to_excel(writer, sheet_name=type(metric_instance).__name__)            
        else:
            print('Saving path is not specified!')
            
    def __str__(self):
        string = ''
        for instance in self.metric_instances:
            string += type(instance).__name__
            string += instance.get_dataframe().to_string()
            string += '\n'
        return string

In [17]:
def create_statistics_from(dataset, metric_instance):
    for i, record in enumerate(dataset):
        metric_instance.process_row(record)
    metric_instance.save()
    return metric_instance

In [18]:
metric_instance = create_statistics_from(
    *build_input_for_statistics(
        create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), 
        ';', 
        [LabelMetric()], 
        process_text,
        True
    )
)

Saving to C:\Users\Vojta\Desktop\diploma\data\gutenberg\Sentence3\stats_data.csv.xlsx


In [19]:
print(metric_instance)

LabelMetric                                    0
Twain, Mark                      3933
Meredith, George                  374
Jacobs, W. W. (William Wymark)  18356
Fenn, George Manville             302
Balzac, Honoré de                 495



In [175]:
TRAIN_SIZE = 0.7
VALIDATION_SIZE = 0.15
TEST_SIZE = 0.15

In [190]:
TRAIN_NAME = 'train.csv'
TEST_NAME = 'test.csv'
VALIDATION_NAME = 'valid.csv'

In [191]:
def check_size(train_size=1, test_size=0, valid_size=0):
    if np.sum([train_size, test_size, valid_size]) != 1:
        assert Exception("Is not valid split!")

In [361]:
def delete_from(path):
    if os.path.exists(path):
        os.remove(path)
        print(f'Deleting file {path}')
    else:
        print(f'File {path} do not exists')

In [426]:
class DataSetSplitter:
    def __init__(self, path_to_save, train_size, test_size, valid_size, label_counter, normalization_size):
        self.directory_path = path_to_save
        self.size_file_name = 'subset_sizes.csv'
        self.create_state(train_size, test_size, valid_size)
        self.delete_files_in_directory()
        self.counter_dataframe = label_counter.copy()
        self.normalize(normalization_size)  
        self.prepare_counters()
        self.save_counters()

        
    def create_state(self, train_size, test_size, valid_size):
        self.state = {
            'train': {
                'path': os.path.sep.join([self.directory_path, TRAIN_NAME]),
                'size': train_size
            
            },
            'test': {
                'path': os.path.sep.join([self.directory_path, TEST_NAME]),
                'size': test_size
            
            },
            'valid': {
                'path': os.path.sep.join([self.directory_path, VALIDATION_NAME]),
                'size': valid_size
            
            },
        }
        
    def normalize(self, normalization_size):
        if normalization_size is not None:
            self.counter_dataframe.iloc[:, 0] =  normalization_size
               
    def prepare_counters(self):
        dataset_counters = {name:{} for name in self.state.keys()}
    
        for key in dataset_counters.keys():
            set_key_counter = {}
            for author_id, row in self.counter_dataframe.iterrows():
                count = row[0]
                set_key_counter[author_id] = math.floor(count * self.state[key]['size'])
            dataset_counters[key] = set_key_counter
    
        
        self.dataset_counters = dataset_counters
        
    def get_path(self, label):
        picks = []
    
        for key in self.dataset_counters.keys():
            if self.dataset_counters[key][label] > 0:
                picks.append(key)
        
        if len(picks) == 0:
            return None
        #choice one
        pick = random.choice(picks)
        #subtract
        self.dataset_counters[pick][label] -= 1
        #return path according to pick
        return self.state[pick]['path']
    
    def save_counters(self):
        counters = pd.DataFrame.from_dict(self.dataset_counters, orient='index')
        counters.to_csv(os.path.sep.join([self.directory_path, self.size_file_name]), sep=';')
        
        
    def delete_files_in_directory(self):
        for v in self.state.values():
            current_subset_path = v['path']
            delete_from(current_subset_path)
        delete_from(os.path.sep.join([self.directory_path, self.size_file_name]))
                
                
    def build_subsets(self, dataset):
        for line in tqdm(dataset.shuffle(10000).as_numpy_iterator()):
            #label = author Id
            text, label = line
            text = bytes.decode(text)
            
            #TODO: DELETE!
            label = bytes.decode(label)
            
            path = self.get_path(label)
            
            if path is None:
                continue
                
            with open(path, 'a', newline='') as f:
                writer = csv.writer(f, delimiter=';')
                value = [text, label]
                writer.writerow(value)

In [427]:
def split_file_to_train_test_valid(
    path_to_load, 
    path_to_save, 
    label_metric=None,
    normalization_size=None,
    train_size=TRAIN_SIZE, 
    test_size=TEST_SIZE, 
    valid_size=VALIDATION_SIZE,
):
    check_size(train_size, test_size, valid_size)
    splitter = DataSetSplitter(path_to_save, train_size, test_size, valid_size, label_metric, normalization_size)
    
    dataset = get_dataset_object_from_path(path_to_load, ';', None)
    splitter.build_subsets(dataset)

In [428]:
def run_split_deps_on_stats(path_to_load, path_to_save, normalization = True, train_size=TRAIN_SIZE, test_size=TEST_SIZE, valid_size=VALIDATION_SIZE):
    check_size(train_size, test_size, valid_size)
    
    metric_instance = create_statistics_from(
        *build_input_for_statistics(
            path_to_load,
            ';', 
            [LabelMetric()], 
            process_text,
            False
        )
    )
    
    number_of_min_label = None
    label_metric = metric_instance.metric_instances[0].get_dataframe()
    
    if normalization:
        sorted_label_metric_frame = label_metric.sort_values(by=0)
        id_of_min_label = sorted_label_metric_frame.iloc[0].name
        number_of_min_label = sorted_label_metric_frame.iloc[0][0]
    
    split_file_to_train_test_valid(path_to_load, path_to_save, label_metric, number_of_min_label)
    return label_metric

In [429]:
p = create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, create_author_directory(10), DataSetType.Sentence, 3)
p

'C:\\Users\\Vojta\\Desktop\\diploma\\data\\gutenberg\\10Authors\\Sentence3\\data.csv'

In [430]:
res = run_split_deps_on_stats(
    p,
    os.path.sep.join(p.split(os.path.sep)[0:-1]),
    False
)

Saving path is not specified!
Deleting file C:\Users\Vojta\Desktop\diploma\data\gutenberg\10Authors\Sentence3\train.csv
Deleting file C:\Users\Vojta\Desktop\diploma\data\gutenberg\10Authors\Sentence3\test.csv
Deleting file C:\Users\Vojta\Desktop\diploma\data\gutenberg\10Authors\Sentence3\valid.csv
Deleting file C:\Users\Vojta\Desktop\diploma\data\gutenberg\10Authors\Sentence3\subset_sizes.csv


23460it [00:23, 999.02it/s] 
