In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymorphy2
import os
import string
import tempfile
import tensorflow as tf

from gensim.utils import tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing import sequence
from tensorboard import summary as summary_lib

tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
esenin = pd.read_csv('esenin.csv')
random = pd.read_csv('random.csv')

In [7]:
esenin.head()

Unnamed: 0,id,poem
0,0,"Есть музыка, стихи и танцы,\r\r\nЕсть ложь и л..."
1,1,"Радость, как плотвица быстрая,\r\r\nЮрко свети..."
2,2,Алый мрак в небесной черни\r\r\nНачертил пожар...
3,3,"Сестре Шуре\r\r\n\r\r\nАх, как мн..."
4,4,"Ах, метель такая, просто черт возьми!\r\r\nЗаб..."


## Data preprocessing

In [0]:
vocab_size = 5000
sentence_size = 200
embedding_size = 50
pad_id = 0

stops = stopwords.words('russian')
stops.extend(list(string.punctuation))
morph = pymorphy2.MorphAnalyzer()
model_dir = tempfile.mkdtemp()

def text_preprocessor(str_input):
    '''Dropping stop-words and normalization.'''
    output = []
    for token in list(tokenize(str_input, lowercase=True)):
        if token in stops:
            continue
        else:
            output.append(morph.parse(token)[0].normal_form)
    return ' '.join(output)

esenin['preprocessed_poem'] = esenin['poem'].apply(text_preprocessor)
random['preprocessed_poem'] = random['poem'].apply(text_preprocessor)

tf_idf = TfidfVectorizer(stop_words=stops, max_features=vocab_size)
tf_idf.fit(esenin['preprocessed_poem'])

esenin['numerical_poem'] = esenin['preprocessed_poem'].apply(
    lambda x: [tf_idf.vocabulary_[w] if w in tf_idf.vocabulary_ else 4999 for w in x.split()])
random['numerical_poem'] = random['preprocessed_poem'].apply(
    lambda x: [tf_idf.vocabulary_[w] if w in tf_idf.vocabulary_ else 4999 for w in x.split()])

In [0]:
df = pd.concat((esenin, random))
df['target'] = [1] * esenin.shape[0] + [0] * random.shape[0]
del esenin, random

In [10]:
df.head()

Unnamed: 0,id,poem,preprocessed_poem,numerical_poem,target
0,0,"Есть музыка, стихи и танцы,\r\r\nЕсть ложь и л...",музыка стих танец ложь лесть пускай бранить ст...,"[1833, 3947, 4133, 1681, 4999, 2881, 133, 3907...",1
1,1,"Радость, как плотвица быстрая,\r\r\nЮрко свети...",радость плотвица быстрый юрко светить вода рук...,"[2958, 4999, 169, 4962, 3369, 287, 3252, 1823,...",1
2,2,Алый мрак в небесной черни\r\r\nНачертил пожар...,алый мрак небесный чернь начертить пожар грань...,"[10, 1826, 1904, 4776, 1900, 2336, 4999, 2610,...",1
3,3,"Сестре Шуре\r\r\n\r\r\nАх, как мн...",сестра шура ах свет кошка мы ты счесть сердце ...,"[3482, 4917, 23, 3364, 1527, 4999, 4999, 4088,...",1
4,4,"Ах, метель такая, просто черт возьми!\r\r\nЗаб...",ах метель такой просто черта взять забивать кр...,"[23, 1760, 4999, 2801, 4780, 248, 998, 1577, 5...",1


In [0]:
X_train, X_test, y_train, y_test = train_test_split(
    df['numerical_poem'], df['target'], random_state=42, test_size=0.2)

x_train = sequence.pad_sequences(X_train,
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
x_test = sequence.pad_sequences(X_test, 
                                maxlen=sentence_size,
                                truncating='post',
                                padding='post', 
                                value=pad_id)

In [12]:
x_train[0]

array([1942, 4999, 4999, 1993, 4999, 4999, 4999, 1744, 4999, 4999,  114,
       2889, 1284,   46, 3205,   46, 4999, 4999, 4999, 4999,  218, 2640,
        864, 2135, 3716, 4999,  561, 3679,  651,  174, 4999, 2091, 4260,
       2958, 4999, 3417, 4999, 3465, 3398, 4999, 2745, 4714, 4999, 4999,
       1989, 4759, 2228, 2939, 4047,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [0]:
x_len_train = np.array([min(len(x), sentence_size) for x in X_train])
x_len_test = np.array([min(len(x), sentence_size) for x in X_test])

def parser(x, length, y):
    features = {"num_poem": x, "len": length}
    return features, y

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(X_train))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

## Building LSTM

In [0]:
def train_and_evaluate(classifier):
    classifier.train(input_fn=train_input_fn, steps=5000)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    predictions = np.array([p['logistic'][0] for p in classifier.predict(input_fn=eval_input_fn)])
        
    # Reset the graph to be able to reuse name scopes
    tf.reset_default_graph() 
    # Add a PR summary in addition to the summaries that the classifier writes
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, 
                              labels=y_test.astype(bool), num_thresholds=21)
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
        writer.add_summary(sess.run(pr), global_step=0)
        writer.close()

In [15]:
head = tf.contrib.estimator.binary_classification_head()

def lstm_model_fn(features, labels, mode):    
    # [batch_size x sentence_size x embedding_size]
    inputs = tf.contrib.layers.embed_sequence(
        features['num_poem'], vocab_size, embedding_size,
        initializer=tf.random_uniform_initializer(-1.0, 1.0))

    # create an LSTM cell of size 100
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(100)
    
    # create the complete LSTM
    _, final_states = tf.nn.dynamic_rnn(
        lstm_cell, inputs, sequence_length=features['len'], dtype=tf.float32)

    # get the final hidden states of dimensionality [batch_size x sentence_size]
    outputs = final_states.h

    logits = tf.layers.dense(inputs=outputs, units=1)

    # This will be None when predicting
    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        return optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits,
        train_op_fn=_train_op_fn)


lstm_classifier = tf.estimator.Estimator(model_fn=lstm_model_fn,
                                         model_dir=os.path.join(model_dir, 'lstm'))
train_and_evaluate(lstm_classifier)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp_a61vz4_/lstm', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0d3bf08240>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp_a61vz4_/lstm/model.ckpt