In [53]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from razdel import sentenize, tokenize
from collections import Counter, OrderedDict
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab

<h2>Read raw data</h2>

In [92]:
import re
import os

dir_path = "/Users/igorpostoev/Projects/Postgraduate/author_classification_task/sources/"

def get_raw_text_from_dir(path):
    contents = []
    
    for filename in os.listdir(path):
        if not filename.endswith(".txt"):
            continue
        filepath = path + filename
        
        # read input, remove redudant escape chars
        with open(filepath) as f:
            contents += [re.sub('[^А-Яа-я0-9 .,!?:;()]+,-', '', s) for s in f.readlines()]
    contents = ' '.join(contents)
    
    return contents

def get_raw_text_from(path):
    contents = []
    if not path.endswith(".txt"):
        return contents
        # read input, remove redudant escape chars
    with open(path) as f:
        contents += [re.sub('[^А-Яа-я0-9 .,!?:;()]+,-', '', s) for s in f.readlines()]
    contents = ' '.join(contents)
    return contents

bulgakov_raw_data = get_raw_text_from_dir(dir_path + "bulgakov/")
gogol_raw_data = get_raw_text_from_dir(dir_path + "gogol/")

# input

In [93]:
def create_datadarame(raw_text, sentenizer, label):
    sentences = [s.text.replace('\xa0', '').replace('\n', '') for s in sentenizer(raw_text)]
    labels = [label for item in range(0, len(sentences))]
    df = pd.DataFrame({
        "sentence": sentences,
        "label": labels,
    })
    return df

In [94]:
bul_df = create_datadarame(bulgakov_raw_data, sentenize, 0)
gog_df = create_datadarame(gogol_raw_data, sentenize, 1)
bul_df = bul_df.iloc[:int(len(bul_df) * 0.7)]
merged_df = pd.merge(bul_df, gog_df, how='outer')
#shuffle here
merged_df = merged_df.sample(frac=1)
all_items = merged_df.values
num_validate = int(len(all_items) * 0.2)
num_test = int(len(all_items) * 0.1)

train_items = all_items[:len(all_items)- num_test - num_validate]
val_items = all_items[len(train_items): len(train_items) + num_validate]
test_items = all_items[len(train_items) + num_test: len(train_items) + num_test + num_validate]

# text metrics

In [None]:
sum([len(list(tokenize(sentence))) for sentence in bul_df["sentence"]])

In [None]:
sum([len(list(tokenize(sentence))) for sentence in gog_df["sentence"]])

In [None]:
len(bul_df)

In [None]:
len(gog_df)

In [57]:
np.array([np.array([len(w.text) for w in list(tokenize(sentence))]).mean() for sentence in gog_df["sentence"]]).mean()

4.099675450218777

In [None]:
tokens = np.array([np.array([w.text for w in list(tokenize(sentence))]) for sentence in bul_df["sentence"]], dtype=object)
tokens = np.concatenate(tokens)

In [None]:
from nltk.probability import FreqDist
dist = FreqDist(tokens)
toks = [pair[0] for pair in dist.most_common(30)]
nums = [pair[1] for pair in dist.most_common(30)]

In [65]:
names = ['group_a', 'group_b', 'group_c']
values = [1, 10, 100]

plt.figure(figsize=(20, 10), dpi=100)

plt.bar(toks, nums)

NameError: name 'plt' is not defined

<h2>text processing pipelines</h2>

In [7]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def cut_sentence(substrings, min_length, max_length):
    return [t for t in substrings if len(t) > min_length and len(t) < max_length]

def normalize(sentence, tokenizer, min_length=0, max_length=float("inf")):
    tokenized = list(tokenizer(sentence))
    tokenized = [morph.parse(t.text)[0].normal_form for t in tokenized]
    tokenized = cut_sentence(tokenized, min_length, max_length)
    return tokenized

In [8]:
#BOW

import nltk
#--------#

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

#Create lemmatizer and stopwords list
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

#Preprocess function
def ntlk_preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token.replace('\n', ' ').replace(' ', '') for token in tokens]
    tokens = [token for token in tokens if len(token) > 0
              # token not in russian_stopwords\
              #and token != " "]
              and token.strip() not in punctuation]
    
    return tokens

def create_counter(dfs, tokenizer):
    sentences = [df["sentence"].to_numpy().tolist() for df in dfs]
    sentences = sum(sentences, [])
    tokens = sum([[w for w in ntlk_preprocess_text(s)] for s in sentences], [])
    counter = Counter(tokens)
    return counter, len(tokens)

def yield_bow_tokens(data_iter):
    for _text, _ in data_iter:
        yield iter([w for w in ntlk_preprocess_text(_text)])

In [9]:
tokens_counter, total = create_counter([bul_df, gog_df], tokenize)

In [78]:
vocab_size = 4000
most_common_tokens = tokens_counter.most_common(vocab_size)
token_to_tf = dict([(token, count / total) for (token, count) in most_common_tokens])

In [79]:
bow_vocab = vocab(OrderedDict(most_common_tokens), specials=["<unk>"])
bow_vocab.set_default_index(bow_vocab["<unk>"])

In [12]:
def bow_pipeline(sentence):
    tokenized_sentence =  list(tokenize(sentence))
    return [token in tokenized_sentence for token in most_common_tokens]

def bow_norm_pipeline(sentence, vocabulary):
    tokenized_sentence_tfs = [(vocabulary[t], token_to_tf[t] if t in token_to_tf.keys() else 0) for t in ntlk_preprocess_text(sentence)]
    indeces = [item[0] for item in tokenized_sentence_tfs]
    tfs = [1 for item in tokenized_sentence_tfs]
    bow_vector = np.zeros((1,len(vocabulary)))
    bow_vector[0][indeces] = tfs
    return bow_vector

In [33]:
#EMB
def yield_tokens(data_iter):
    for _text, _ in data_iter:
        yield iter([w.text for w in tokenize(_text)])
        
def yield_norm_tokens(data_iter):
    for _text, _ in data_iter:
        yield iter([w for w in normalize(_text, tokenize)])

def emb_text_pipeline(sentence):
    return list(([bow_vocab[w.text] for w in tokenize(sentence)]))

In [58]:
#NAVEC

from navec import Navec
import torch
from slovnet.model.emb import NavecEmbedding

In [59]:
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')

In [76]:
def navec_text_pipeline(sentence):
    return list([navec[w] if w in navec else navec['<unk>'] for w in ntlk_preprocess_text(sentence)])

# models

In [77]:
class TextClassificationModel(nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text_bow):
        return self.fc(text_bow)

In [26]:
class TextClassificationModelEmbedding(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModelEmbedding, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [None]:
class TextClassificationModel2Layer(nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TextClassificationModel2Layer, self).__init__()
        self.fc1 = nn.Linear(embed_dim, 10000)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(10000, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()

    def forward(self, text_bow):
        fc1_out = self.fc1(text_bow)
        relu_out = self.relu(fc1_out)
        return self.fc2(relu_out)

In [80]:
model = TextClassificationModel(300, 2)

# prepare dataloaders

In [81]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 16

def collate_batch_bow(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(bow_norm_pipeline(_text, bow_vocab), dtype=torch.float32)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list).sum(1)
    return text_list.to(device), label_list.to(device)

def collate_batch_emb(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(emb_text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list.to(device), label_list.to(device), offsets.to(device)


def collate_batch_navec(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(navec_text_pipeline(_text), dtype=torch.float32)
        if len(processed_text) == 0:
            processed_text = torch.tensor(navec_text_pipeline('<unk>')[0], dtype=torch.float32)
        else:
            processed_text = processed_text.sum(0)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return text_list.to(device), label_list.to(device)

train_dataloader = DataLoader(train_items, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_navec)
valid_dataloader = DataLoader(val_items, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_navec)

In [103]:
x, y = train_items[0]

In [106]:
[(x,y) for (x,y) in train_items[:1]]

[('Лишь только доктор повернулся, изумление выросло в глазах преследователя, и доктору показалось, что это монгольские раскосые глаза.',
  0)]

In [113]:
collate_batch_navec(train_items[:2])[0].size()

torch.Size([2, 300])

# train

In [116]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [118]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
import time
# Hyperparameters
EPOCHS = 20  # epoch
LR = 10e-3#5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)
#split_train_, split_valid_ = \
   # random_split(train_dataset, [num_train, len(train_dataset) - num_train])

| epoch   1 |   500/ 1007 batches | accuracy    0.660
| epoch   1 |  1000/ 1007 batches | accuracy    0.664
-----------------------------------------------------------
| end of epoch   1 | time: 19.96s | valid accuracy    0.656 
-----------------------------------------------------------
| epoch   2 |   500/ 1007 batches | accuracy    0.669
| epoch   2 |  1000/ 1007 batches | accuracy    0.670
-----------------------------------------------------------
| end of epoch   2 | time: 17.89s | valid accuracy    0.662 
-----------------------------------------------------------
| epoch   3 |   500/ 1007 batches | accuracy    0.671
| epoch   3 |  1000/ 1007 batches | accuracy    0.670
-----------------------------------------------------------
| end of epoch   3 | time: 18.20s | valid accuracy    0.662 
-----------------------------------------------------------
| epoch   4 |   500/ 1007 batches | accuracy    0.668
| epoch   4 |  1000/ 1007 batches | accuracy    0.684
-------------------------

<h2>Save results</h2>

In [None]:
def append_log(embedding_desc, model, learning_desc, validation):
    date_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    data = [[embedding_desc, model.__str__(), learning_desc, validation, date_time]]
    df = pandas.read_csv(dir_path + "results.csv")
    df.loc[len(df.index)] = [''] + data[0]
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(dir_path + "results.csv")

In [None]:
m = nn.Linear(vocab_size, 2)
#input = torch.tensor(bow_tensor_bul[0:1].clone().detach(), dtype=torch.float32)
output = m(bow_tensor_bul[0:5])
print(output)
print(output.size())

# Reslut plot 

In [None]:
equal = predicted[:100].argmax(1).numpy() != np.array(true[:100])

In [None]:
len(test_items)

In [None]:
softmax(model(text))

In [None]:
items_to_eval = train_items[:1000]
col = collate_batch_bow(items_to_eval)
true = [item[1] for item in items_to_eval]
softmax = nn.Softmax(dim=1)
result = model(col[0])
predicted = softmax(result)

In [None]:
data_to_draw = torch.tensor([p[0] - p[1] if t == 0 else p[1] - p[0] for t, p in zip(true, predicted)])
data_to_draw = data_to_draw.numpy().tolist()

In [None]:
np.arange(100, step=10)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

rang = range(len(data_to_draw))
negative_data_min = [x if x < 0 and x >= -0.5 else 0 for x in data_to_draw]
negative_data_max = [x if x < -0.5 else 0 for x in data_to_draw]
positive_data_min = [x if x > 0 and x <= 0.5 else 0 for x in data_to_draw]
positive_data_max = [x if x > 0.5 else 0 for x in data_to_draw]

fig = plt.figure(figsize=(20, 15), dpi=100)
plt.xticks(np.arange(len(data_to_draw), step=50))
ax = plt.subplot(111)
ax.bar(rang, negative_data_min, width=1, color='r', alpha=0.4)
ax.bar(rang, negative_data_max, width=1, color='r', alpha=0.8)
ax.bar(rang, positive_data_min, width=1, color='g', alpha=0.4)
ax.bar(rang, positive_data_max, width=1, color='g', alpha=0.8)


In [None]:
mu, sigma = 100, 15
x = mu + sigma * np.random.randn(10000)

# the histogram of the data
n, bins, patches = plt.hist(x, 6, density=True, facecolor='g', alpha=1)

plt.xlabel('Inputs')
plt.ylabel('Distance')
plt.title('BoW + FF 1')
#plt.text(60, .025, r'$\mu=100,\ \sigma=15$')
plt.axis([30, 500, -2, 2])
plt.grid(True)
plt.show()

In [None]:
gog_text = "Не только в вашей стране, но и в других странах будут сочинять подражания его пьесам и писать переделки этих пьес. Ученые различных стран напишут подробные исследования его произведений и шаг за шагом постараются проследить его таинственную жизнь. Они докажут вам, что этот человек, который сейчас у вас в руках подает лишь слабые признаки жизни, будет влиять на многих писателей будущих столетий, в том числе на таких, неизвестных вам, но известных мне, как соотечественники мои Грибоедов, Пушкин и Гоголь."

In [None]:
gog_input = collate_batch_bow([(gog_text, 0)])

In [None]:
model(gog_input[0])

In [None]:
len(bins)

<h2>SVM</h2>

In [None]:
from sklearn import svm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from numpy import mean
from numpy import std

In [None]:
X = df['X']
y = df['y']

In [None]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = svm.SVC(C=1, decision_function_shape='ovo')

# evaluate results

In [None]:
bow_norm_tokenize(te)

<h2>Embeddings option</h2>