# Creating a sentence-author dataset

In [None]:
import glob
from nltk import tokenize, download
import numpy as np
import random
import pandas as pd

from typing import List

In [None]:
# Fetch NLTK's 'punkt' resource up front. Presumably this is the data that
# tokenize.sent_tokenize needs - confirm the resource name for the installed NLTK version.
download('punkt')

To help the sentence tokenizer find sentence boundaries, some character combinations are replaced before tokenizing.

In [None]:
def split_text(filepath: str, min_char: int = 5) -> List[str]:
    """Read a UTF-8 text file and split its content into sentences.

    Before tokenizing, a fixed series of literal replacements is applied to
    the raw text (line breaks become '. ', closing quotes are moved in front
    of the sentence-ending punctuation, and '--', '. . .' and '_' are dropped
    or replaced by a space).

    Args:
        filepath: Path of the text file to read.
        min_char: Minimum length, in characters, a sentence must have to be kept.

    Returns:
        The sentences found by ``tokenize.sent_tokenize`` whose length is at
        least ``min_char``, in document order.
    """
    with open(filepath, "r", encoding="utf8") as file:
        cleaned = file.read()

    # Applied strictly in this order: later patterns operate on the output of
    # earlier ones (e.g. the '. ' inserted for every line break).
    substitutions = (
        ('\n', '. '),
        ('.”', '”.'),
        ('."', '".'),
        ('?”', '”?'),
        ('!”', '”!'),
        ('--', ' '),
        ('. . .', ''),
        ('_', ''),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    return [
        candidate
        for candidate in tokenize.sent_tokenize(cleaned)
        if len(candidate) >= min_char
    ]

In [None]:
PROSE_DIR = '../input/russian-literature/prose'


def load_author_sentences(author: str, prose_dir: str = PROSE_DIR) -> List[str]:
    """Collect the sentences of every ``*.txt`` file in one author's folder.

    Args:
        author: Name of the author's sub-directory inside ``prose_dir``.
        prose_dir: Directory that holds one sub-directory per author.

    Returns:
        The sentences returned by ``split_text`` for each matching file,
        concatenated in sorted file-path order. Empty if no file matches.
    """
    sentences: List[str] = []
    # glob yields paths in arbitrary order; sorting keeps the sentence order
    # (and therefore the seeded sampling done later) stable across machines.
    for path in sorted(glob.glob(f'{prose_dir}/{author}/*.txt')):
        sentences += split_text(path)
    return sentences


chekhov = load_author_sentences('Chekhov')
dostoevsky = load_author_sentences('Dostoevsky')
tolstoy = load_author_sentences('Tolstoy')
gogol = load_author_sentences('Gogol')
gorky = load_author_sentences('Gorky')
# Fixed: this list used to be read from the Gorky folder, so the sentences
# labelled "Turgenev" were actually a second copy of Gorky's.
turgenev = load_author_sentences('Turgenev')

In [None]:
# Preview the first ten Chekhov sentences to eyeball the tokenization result.
chekhov[:10]

In [None]:
# Map each author's name to the list of sentences loaded for that author.
text_dict = {
    'Chekhov': chekhov,
    'Dostoevsky': dostoevsky,
    'Tolstoy': tolstoy,
    'Gogol': gogol,
    'Gorky': gorky,
    'Turgenev': turgenev,
}

# Report how many sentences are available per author.
for author, author_sentences in text_dict.items():
    print(author, ':', len(author_sentences), ' sentences')

Each author has between 11516 and 77817 sentences. Let's sample 10_000 sentences from each author for the dataset.

## Combine mixed sentences

In [None]:
# Seed NumPy's global generator so the per-author samples are repeatable.
np.random.seed(1)

# Number of sentences drawn from each author.
max_len = 10_000

# One sentence list per author; the order here fixes the author order inside `combined`.
names = [chekhov, dostoevsky, tolstoy, gogol, gorky, turgenev]

# Draw max_len distinct sentences per author (replace=False), one author after another.
samples = [
    np.random.choice(author_sentences, max_len, replace=False)
    for author_sentences in names
]

# Flatten the per-author samples into one list, keeping the authors' blocks contiguous.
combined = [sentence for sample in samples for sentence in sample]

print('Length of combo and internally shuffled list:', len(combined))

## Create a labeled list

In [None]:
# One label per sampled sentence: max_len copies of each author's name, in the
# same author order that was used to build `combined`.
labels = [
    author
    for author in ('Chekhov', 'Dostoevsky', 'Tolstoy', 'Gogol', 'Gorky', 'Turgenev')
    for _ in range(max_len)
]

print('Length of the labeled list:', len(labels))

Check that the data and the labels have the same length:

In [None]:
# Displays True when there are exactly as many labels as sentences.
len(combined) == len(labels)

## Randomly shuffle the data

In [None]:
# Seed the stdlib generator so the shuffle is repeatable.
random.seed(3)

# Shuffle (sentence, author) pairs together so every sentence keeps its own label.
pairs = list(zip(combined, labels))
random.shuffle(pairs)

# Split the shuffled pairs back into two parallel tuples.
combined, labels = zip(*pairs)

## Exporting the resulting dataset

In [None]:
# Two-column frame: the sentence text and the author it was sampled from.
out_data = pd.DataFrame({
    'text': combined,
    'author': labels,
})

In [None]:
# Show the first and the last rows of the dataset, one table after the other.
print(out_data.head(), out_data.tail(), sep='\n')

In [None]:
# Write the dataset to author_data.csv (path is relative to the working directory);
# index=False keeps the DataFrame's row index out of the file.
out_data.to_csv('author_data.csv', index=False)