In [126]:
import torch
import torch.nn as nn
import pandas as pd
from razdel import sentenize, tokenize
from collections import Counter, OrderedDict
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import vocab

<h2>Read raw data</h2>

In [2]:
import re
import os

dir_path = "/Users/igorpostoev/Projects/Postgraduate/author_classification_task/sources/"

def get_raw_text_from(path):
    contents = []
    
    for filename in os.listdir(path):
        if not filename.endswith(".txt"):
            continue
        filepath = path + filename
        
        # read input, remove redudant escape chars
        with open(filepath) as f:
            contents += [re.sub('[^А-Яа-я0-9 .,!?:;()]+,-', '', s) for s in f.readlines()]
    contents = ' '.join(contents)
    
    return contents

bulgakov_raw_data = get_raw_text_from(dir_path + "bulgakov/")
gogol_raw_data = get_raw_text_from(dir_path + "gogol/")

<h2>Create dataframe</h2>

In [84]:
def create_datadarame(raw_text, sentenizer, label):
    sentences = [s.text.replace('\xa0', '').replace('\n', '') for s in sentenizer(raw_text)]
    labels = [label for item in range(0, len(sentences))]
    df = pd.DataFrame({
        "sentence": sentences,
        "label": labels,
    })
    return df

bul_df = create_datadarame(bulgakov_raw_data, sentenize, 0)
gog_df = create_datadarame(gogol_raw_data, sentenize, 1)

# input

In [85]:
bul_df = bul_df.iloc[:len(bul_df) // 2]
merged_df = pd.merge(bul_df, gog_df, how='outer')
merged_df = merged_df.sample(frac=1)
all_items = merged_df.values
num_validate = int(len(all_items) * 0.1)
num_test = int(len(all_items) * 0.1)

train_items = all_items[:len(all_items)- num_test - num_validate]
val_items = all_items[len(train_items): len(train_items) + num_validate]
test_items = all_items[len(train_items) + num_test: len(train_items) + num_test + num_validate]

#train_items = train_items[:len(train_items) // 2]
#test_items = train_items[:len(test_items) // 2]
#val_items = val_items[:len(val_items) // 2]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

<h2>text processing pipelines</h2>

In [None]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def cut_sentence(substrings, min_length, max_length):
    return [t for t in substrings if len(t) > min_length and len(t) < max_length]

def normalize(sentence, tokenizer, min_length=0, max_length=float("inf")):
    tokenized = list(tokenizer(sentence))
    tokenized = [morph.parse(t.text)[0].normal_form for t in tokenized]
    tokenized = cut_sentence(tokenized, min_length, max_length)
    return tokenized

In [117]:
#BOW

import nltk
#--------#

from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

#Create lemmatizer and stopwords list
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")

#Preprocess function
def ntlk_preprocess_text(text):
    tokens = mystem.lemmatize(text.lower())
    tokens = [token.replace('\n', ' ').replace(' ', '') for token in tokens]
    tokens = [token for token in tokens if len(token) > 0]
              # token not in russian_stopwords\
              #and token != " "]
              #and token.strip() not in punctuation]
    
    return tokens

def create_counter(dfs, tokenizer):
    sentences = [df["sentence"].to_numpy().tolist() for df in dfs]
    sentences = sum(sentences, [])
    tokens = sum([[w for w in ntlk_preprocess_text(s)] for s in sentences], [])
    counter = Counter(tokens)
    return counter, len(tokens)

def yield_bow_tokens(data_iter):
    for _text, _ in data_iter:
        yield iter([w for w in ntlk_preprocess_text(_text)])

In [101]:
tokens_counter, total = create_counter([bul_df, gog_df], tokenize)

In [192]:
vocab_size = 3000
most_common_tokens = tokens_counter.most_common(vocab_size)
token_to_tf = dict([(token, count / total) for (token, count) in most_common_tokens])
bow_vocab = vocab(OrderedDict(most_common_tokens))

In [185]:
import numpy as np
arr = np.zeros((2,1))

In [189]:
train_items[5,0]

'Отец любит свое дитя, мать любит свое дитя, дитя любит отца и мать.'

In [211]:
bow_norm_pipeline(train_items[5,0], bow_vocab)

[184, 326, 32, 788, 0, 239, 326, 32, 788, 0, 788, 326, 184, 2, 239, 1]
[0.0005369330384841784, 0.00030625069602430914, 0.00344830329159839, 0.00013522758006268197, 0.10190193614076395, 0.00042159186725424376, 0.00030625069602430914, 0.00344830329159839, 0.00013522758006268197, 0.10190193614076395, 0.00013522758006268197, 0.00030625069602430914, 0.0005369330384841784, 0.036348378064495604, 0.00042159186725424376, 0.04623590053613758]


array([[0.10190194, 0.0462359 , 0.03634838, ..., 0.        , 0.        ,
        0.        ]])

In [206]:
bow_vector = np.zeros((1,3))
bow_vector[0][1] = 2

In [207]:
bow_vector

array([[0., 2., 0.]])

In [210]:
def bow_pipeline(sentence):
    tokenized_sentence =  list(tokenize(sentence))
    return [token in tokenized_sentence for token in most_common_tokens]

def bow_norm_pipeline(sentence, vocabulary):
    tokenized_sentence_tfs = [(vocabulary[t], token_to_tf[t]) for t in ntlk_preprocess_text(sentence)]
    indeces = [item[0] for item in tokenized_sentence_tfs]
    tfs = [item[1] for item in tokenized_sentence_tfs]
    bow_vector = np.zeros((1,len(vocabulary)))
    bow_vector[0][indeces] = tfs
    return bow_vector

In [113]:
#EMB
def yield_tokens(data_iter):
    for _text, _ in data_iter:
        yield iter([w.text for w in tokenize(_text)])
        
def yield_norm_tokens(data_iter):
    for _text, _ in data_iter:
        yield iter([w for w in normalize(_text, tokenize)])
        
def emb_text_pipeline(sentence):
    return list(([vocab[w.text] for w in tokenize(sentence, tokenize)]))

In [114]:
vocab = build_vocab_from_iterator(yield_tokens(iter(train_items)), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def emb_text_pipeline(sentence):
    return list(([vocab[w.text] for w in tokenize(sentence)]))

In [None]:
#NAVEC

from navec import Navec
import torch
from slovnet.model.emb import NavecEmbedding

In [None]:
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')

In [None]:
def navec_text_pipeline(sentence):
    return list([navec[w] if w in navec else navec['<unk>'] for w in normalize(sentence, tokenize, min_length=2, max_length=25)])

# models

In [None]:
class TextClassificationModel(nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text_bow):
        return self.fc(text_bow)

In [None]:
class TextClassificationModelEmbedding(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModelEmbedding, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [None]:
class TextClassificationModel2Layer(nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TextClassificationModel2Layer, self).__init__()
        self.fc1 = nn.Linear(embed_dim, 1000)
        self.fc2 = nn.Linear(1000, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc2.bias.data.zero_()

    def forward(self, text_bow):
        return self.fc2(self.fc1(text_bow))

In [None]:
model = TextClassificationModel2Layer(300, 2)

# prepare dataloaders

In [None]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BATCH_SIZE = 8

def collate_batch_emb(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(navec_text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list.to(device), label_list.to(device), offsets.to(device)


def collate_batch_navec(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(navec_text_pipeline(_text), dtype=torch.float32)
        if len(processed_text) == 0:
            processed_text = torch.tensor(navec_text_pipeline('<unk>')[0], dtype=torch.float32)
        else:
            processed_text = processed_text.sum(0)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return text_list.to(device), label_list.to(device)

train_dataloader = DataLoader(train_items, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_navec)
valid_dataloader = DataLoader(val_items, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch_navec)

# train

In [None]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
import time
# Hyperparameters
EPOCHS = 4 # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)
#split_train_, split_valid_ = \
   # random_split(train_dataset, [num_train, len(train_dataset) - num_train])

<h2>Save results</h2>

In [None]:
def append_log(embedding_desc, model, learning_desc, validation):
    date_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    data = [[embedding_desc, model.__str__(), learning_desc, validation, date_time]]
    df = pandas.read_csv(dir_path + "results.csv")
    df.loc[len(df.index)] = [''] + data[0]
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(dir_path + "results.csv")

In [None]:
m = nn.Linear(vocab_size, 2)
#input = torch.tensor(bow_tensor_bul[0:1].clone().detach(), dtype=torch.float32)
output = m(bow_tensor_bul[0:5])
print(output)
print(output.size())

<h2>SVM</h2>

In [None]:
from sklearn import svm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from numpy import mean
from numpy import std

In [None]:
X = df['X']
y = df['y']

In [None]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = svm.SVC(C=1, decision_function_shape='ovo')

# evaluate results

In [None]:
bow_norm_tokenize(te)

<h2>Embeddings option</h2>