In [1]:
import torch

import pandas as pd
from razdel import sentenize, tokenize
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

<h2>Read raw data</h2>

In [10]:
import re
import os

dir_path = "/Users/igorpostoev/Projects/Postgraduate/author_classification_task/sources/"

def get_raw_text_from(path):
    contents = []
    
    for filename in os.listdir(path):
        if not filename.endswith(".txt"):
            continue
        filepath = path + filename
        
        # read input, remove redudant escape chars
        with open(filepath) as f:
            contents += [re.sub('[^А-Яа-я0-9 .,!?:;()]+,-', '', s) for s in f.readlines()]
    contents = ' '.join(contents)
    
    return contents

bulgakov_raw_data = get_raw_text_from(dir_path + "bulgakov/")
gogol_raw_data = get_raw_text_from(dir_path + "gogol/")

<h2>Create dataframe</h2>

In [11]:
def create_datadarame(raw_text, sentenizer, label):
    sentences = [s.text for s in sentenizer(raw_text)]
    labels = [label for item in range(0, len(sentences))]
    df = pd.DataFrame({
        "sentence": sentences,
        "label": labels,
    })
    df = df.sample(frac=1)
    return df

bul_df = create_datadarame(bulgakov_raw_data, sentenize, 0)
gog_df = create_datadarame(gogol_raw_data, sentenize, 1)

<h2>Text processing pipelines</h2>

In [33]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def cut_sentence(substrings, min_length, max_length):
    return [t for t in substrings if len(t.text) > min_length and len(t.text) < max_length]

def basic_token_pipeline(sentence, tokenizer):
    tokenized = list(tokenizer(sentence))
    return tokenized
        
def basic_label_pipeline(label):
    return label

def normalized_token_pipeline(sentence, tokenizer, translator, min_length=0, max_length=float("inf")):
    tokenized = list(tokenizer(sentence))
    tokenized = [morph.parse(t.text)[0].normal_form for t in tokenized]
    tokenized = cut_sentence(tokenized, min_length, max_length)
    return tokenized

# make bow tensors by sentences

In [70]:
def create_counter(dfs, tokenizer):
    sentences = [df["sentence"] for df in dfs]
    tokens = sum([[w.text for w in tokenizer(s)] for s in sentences], [])
    counter = Counter(tokens)
    return counter

tokens_counter = create_counter([bul_df, gog_df], tokenize)

In [75]:
vocab_size = 5000
most_common_tokens = list(dict(c.most_common(vocab_size)))

In [76]:
contained_matrix_bul = [[w in s for w in most_common_tokens] for s in tokenized_sentences_bul]
contained_matrix_gog = [[w in s for w in most_common_tokens] for s in tokenized_sentences_gogol]

In [77]:
bow_tensor_bul = torch.tensor(contained_matrix_bul, dtype=torch.float32)
bow_tensor_gog = torch.tensor(contained_matrix_gog, dtype=torch.float32)

# setup model

In [83]:
import torch.nn as nn

In [84]:
class TextClassificationModel(nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text_bow):
        return self.fc(text_bow)

In [None]:
class TextClassificationModelWithEmbedding(nn.Module):

    def __init__(self, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text_bow):
        return self.fc(text_bow)

In [85]:
text_bow = torch.tensor([x for x in df['X'].values], dtype=torch.float32)
labels = torch.tensor([x for x in df['y'].values], dtype=torch.float32)
num_class = len(set([label for label in labels.tolist()]))
emsize = text_bow.shape[1]
model = TextClassificationModel(emsize, num_class)

# train model

In [92]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
import time
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 16 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

all_items = df.values
num_validate = int(len(all_items) * 0.1)
num_test = int(len(all_items) * 0.1)

train_items = all_items[:len(all_items) - num_test - num_validate]
test_items = all_items[len(train_items): len(train_items) + num_test]
val_items = all_items[len(train_items) + num_test: len(all_items) + num_test + num_validate]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#split_train_, split_valid_ = \
   # random_split(train_dataset, [num_train, len(train_dataset) - num_train])
    
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        text_list.append(_text)
        label_list.append(_label)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.tensor(text_list, dtype=torch.float32)
    return text_list.to(device), label_list.to(device)

train_dataloader = DataLoader(train_items, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(val_items, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_items, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

In [97]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
            

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [98]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time:  0.36s | valid accuracy    0.856 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.36s | valid accuracy    0.862 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  0.34s | valid accuracy    0.877 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  0.33s | valid accuracy    0.881 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  0.34s | valid accuracy    0.852 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  0.33s |

In [108]:
m = nn.Linear(vocab_size, 2)
#input = torch.tensor(bow_tensor_bul[0:1].clone().detach(), dtype=torch.float32)
output = m(bow_tensor_bul[0:5])
print(output)
print(output.size())

tensor([[-0.3074, -0.1691],
        [-0.3074, -0.1691],
        [-0.3074, -0.1691],
        [-0.0512, -0.1470],
        [-0.3074, -0.1691]], grad_fn=<AddmmBackward0>)
torch.Size([5, 2])


In [161]:
from sklearn import svm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from numpy import mean
from numpy import std

In [162]:
X = df['X']
y = df['y']

In [169]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
model = svm.SVC(C=1, decision_function_shape='ovo')

# evaluate results

In [170]:
metrics = cross_validate(model, X, y, scoring=['precision_macro', 'recall_macro'], cv=cv, n_jobs=-1)

print('Precision: %.3f (%.3f)' % (mean(metrics["test_precision_macro"]), std(metrics["test_precision_macro"])))
print('Recall: %.3f (%.3f)' % (mean(metrics["test_recall_macro"]), -std(metrics["test_recall_macro"])))

Precision: nan (nan)
Recall: nan (nan)


In [24]:
def divide_chunks(l, n):
    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]
        
n = int(len(sentences) / 5)
 
divided_sentences = list(divide_chunks(sentences, n))
#frequencies
counted_tokens = Counter(normalized_tokens).most_common()
tokens_by_frequencies = dict([(w, n / len(counted_tokens)) for w, n in counted_tokens])
freq_sentences_bul = [[tokens_by_frequencies[t] for t in s] for s in tokenized_sentences_bul]
freq_sentences_gogol = [[tokens_by_frequencies[t] for t in s] for s in tokenized_sentences_gogol]

torch.Size([45912, 45912])