In [145]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split

In [146]:
# Path to the plain-text dump of Pushkin's collected poems (royallib.ru export),
# relative to the notebook's working directory.
filename = './Пушкин Александр. Полное собрание стихотворений - royallib.ru.txt'

In [147]:
# Read the raw bytes first and decode explicitly: binary mode keeps the CRLF
# line endings intact, and the parsing code splits on '\r\n' sequences.
with open(filename, mode='rb') as stream:
    raw_bytes = stream.read()

data = raw_bytes.decode('cp1251')

In [148]:
# Title used for poems that have no usable title of their own.
UNTITLED = '* * *'

# Separators found in the dump: poems are divided by two blank lines, stanzas by
# a blank line followed by two tabs, and verse lines by CRLF plus two tabs.
POEM_SEP = '\r\n\r\n\r\n'
STANZA_SEP = '\r\n\r\n\t\t'
VERSE_LINE_SEP = '\r\n\t\t'

# Exactly-four-digit runs are treated as years inside a section heading.
YEAR_RE = re.compile(r'(?<!\d)\d{4}(?!\d)')
# Explicit ranges: 'А-я' would miss 'Ё'/'ё', and 'A-z' would also match the
# punctuation that sits between 'Z' and 'a' in ASCII ('[', '\', ']', '^', '_', '`').
CYRILLIC_WORD_RE = re.compile(r'[А-Яа-яЁё]+')
LATIN_WORD_RE = re.compile(r'[A-Za-z]+')


def is_period_heading(chunk):
    """Return True if the chunk contains both heading markers the dump uses.

    The markers are the word 'Стихотворения' and a non-breaking space followed
    by 'г' (presumably the 'г.'/'гг.' year abbreviation -- verify against the text).
    """
    return '\xa0г' in chunk and 'Стихотворения' in chunk


def parse_period(heading, fallback=''):
    """Return the year named in a heading as a string.

    A range such as '1813–1817' is collapsed to its integer average. Years are
    extracted directly, so surrounding punctuation or a different dash
    character cannot break the conversion. If the heading contains no
    four-digit number, ``fallback`` is returned.
    """
    years = [int(year) for year in YEAR_RE.findall(heading)]
    if not years:
        return fallback
    return str(sum(years) // len(years))


def clean_title(raw_title):
    """Normalise the first part of a poem chunk into a one-line title.

    Returns ``UNTITLED`` when the text does not look like a title: it neither
    starts with a line break nor contains an asterisk, it spans more than three
    blank-line-separated parts, or nothing is left after cleaning.
    """
    # Drop trailing annotations that open with a parenthesis or a bracket.
    title = raw_title.partition('(')[0]
    title = title.partition('[')[0]

    if not title.startswith('\r\n') and '*' not in title:
        return UNTITLED
    if len(title.split('\r\n\r\n')) > 3:
        return UNTITLED

    # Replace line breaks, non-breaking spaces and angle brackets with spaces,
    # then collapse runs of whitespace. '|' is deliberately not in the class:
    # in the old pattern it was a literal pipe, not an alternation.
    title = re.sub('[\r\n\xa0<>]', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()

    if not title or '***' in title:
        return UNTITLED
    return title


def is_mostly_russian(text):
    """Return True if ``text`` has more Cyrillic words than Latin-script words."""
    cyrillic_count = len(CYRILLIC_WORD_RE.findall(text))
    latin_count = len(LATIN_WORD_RE.findall(text))
    return cyrillic_count > latin_count


def build_library(chunks):
    """Convert raw poem chunks into a list of {'year', 'name', 'poem'} records.

    'poem' is always a list of stanza strings. The year is carried forward from
    the most recent period heading; it is an empty string until the first
    heading is seen, so the loop never reads an undefined variable.
    """
    records = []
    period = ''

    for chunk in chunks:
        if is_period_heading(chunk):
            period = parse_period(chunk, fallback=period)

        head, *stanzas = chunk.split(STANZA_SEP)

        if len(head.split(VERSE_LINE_SEP)) > 3:
            # More than three verse-line parts: the head is a stanza, not a title.
            # NOTE(review): this stores the opening stanza as its own record,
            # separate from the remaining stanzas appended below -- confirm
            # that splitting an untitled poem in two is intended.
            records.append({'year': period, 'name': UNTITLED, 'poem': [head]})
            title = UNTITLED
        else:
            title = clean_title(head)

        # Keep only stanzas that are predominantly Russian text.
        russian_stanzas = [stanza for stanza in stanzas if is_mostly_russian(stanza)]
        if not russian_stanzas:
            continue

        records.append({'year': period, 'name': title, 'poem': russian_stanzas})

    return records


poems = data.split(POEM_SEP)
library = build_library(poems)

In [149]:
# Separator the parsing cell split stanzas on; re-inserting it keeps stanza
# boundaries in the exported text instead of gluing the last line of one
# stanza directly to the first line of the next.
STANZA_JOINER = '\r\n\r\n\t\t'


def join_stanzas(poem):
    """Return the poem as one string.

    A plain string is passed through unchanged (joining it would interleave the
    separator between its characters); a list of stanzas is joined with
    ``STANZA_JOINER``.
    """
    if isinstance(poem, str):
        return poem
    return STANZA_JOINER.join(poem)


df = pd.DataFrame(library)
df['poem'] = df['poem'].map(join_stanzas)
# Prefix each title with its year label, then drop the separate year column.
df['name'] = df['year'].str.cat(df['name'], sep=' ')
df = df.drop(columns=['year'])

In [150]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# shuffled split identical across runs.
split_options = dict(test_size=0.20, random_state=42, shuffle=True)
train, valid = train_test_split(df, **split_options)

In [151]:
# Write each split to its own CSV file, without the DataFrame index column.
for split_frame, csv_path in (
    (train, 'pushkin_train.csv'),
    (valid, 'pushkin_valid.csv'),
):
    split_frame.to_csv(csv_path, index=False)