In [2]:
import os

import numpy as np
import tokenize_uk
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

##### Load messages

In [3]:
dir_name = '1551'
# Collect the paths of all regular *.txt files directly inside `dir_name`.
# NOTE: os.scandir yields entries in arbitrary order, so the position of a
# file in `files` (used below as the category index) depends on the filesystem.
files = [
    item.path
    for item in os.scandir(dir_name)
    if item.name.endswith('.txt') and item.is_file()
]

In [60]:
# Parallel lists: messages[k] is a message text and messages_category[k] is the
# index (into `files` / `category_names`) of the category it was read from.
category_names, messages, messages_category = [], [], []

for i, f in enumerate(files):
    print(f'\r{i+1} / {len(files)}', end='')
    # Category name = file name without directory and extension, dashes -> spaces.
    category_names.append(os.path.splitext(os.path.basename(f))[0].replace('-', ' '))

    # Explicit encoding so decoding does not depend on the platform's default locale.
    # Assumes the message files are UTF-8 - TODO confirm.
    with open(f, encoding='utf-8') as fr:
        records = fr.read().split('\n' * 3)  # records are separated by two blank lines

    for record in records:
        try:
            # First line of a record is its id, the remainder is the message text.
            _, text_ = record.split('\n', 1)
            # NOTE(review): langdetect is documented as non-deterministic unless
            # DetectorFactory.seed is set; borderline messages may flip between runs.
            is_ukrainian = detect(text_) == 'uk'
        except (ValueError, LangDetectException):
            # ValueError: the record has no newline, so there is no text to unpack.
            # LangDetectException: langdetect could not classify the text.
            # Such records are skipped on purpose; any other error should surface.
            continue
        if is_ukrainian:
            messages.append(text_)
            messages_category.append(i)

188 / 188

##### Split data: training / test

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
# 70 / 30 split. The seed is fixed so that the split, and therefore every score
# reported below, is reproducible on a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(
    messages, messages_category, test_size=0.3, random_state=42)

##### Load word vectors

In [64]:
import bz2
from gensim.models import KeyedVectors

In [65]:
# Load word vectors from a bz2-compressed word2vec text file (binary=False).
# The stream is opened in text mode, so the encoding is given explicitly instead
# of relying on the platform default. Assumes the file is UTF-8 - TODO confirm.
with bz2.open('news.lowercased.tokenized.word2vec.300d.bz2', 'rt', encoding='utf-8') as fr:
    uk_vectors = KeyedVectors.load_word2vec_format(fr, binary=False)

##### Compute vectors for a given list of texts

In [253]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [246]:
def weighted_average(vectors, weights):
    """
    Compute weighted averages of the rows of ``vectors``.

    Parameters
    ----------
    vectors : array-like, shape (n_items, dim)
        The vectors to be averaged.
    weights : array-like, shape (n_outputs, n_items)
        Row ``k`` holds the weights used to build output vector ``k``.

    Returns
    -------
    numpy.ndarray, shape (n_outputs, dim)
        ``weights @ vectors`` with each row divided by the sum of that row's
        weights. A row whose weights sum to zero yields a zero vector instead
        of the NaN row that a plain 0/0 division would produce.
    """
    weights = np.asarray(weights)
    vectors = np.asarray(vectors)

    totals = weights.sum(axis=1, keepdims=True)
    empty = totals == 0
    # Divide by 1 where the total is zero to avoid a 0/0 warning, then force
    # those rows to zero explicitly.
    averaged = (weights @ vectors) / np.where(empty, 1, totals)
    result = np.where(empty, 0.0, averaged)
    return result # / np.linalg.norm(result) # vector normalization doesn't improve results much

In [283]:
def messages_vectors(doc, tokenizer=tokenize_uk.tokenize_words, vectorizer=CountVectorizer):
    """
    Returns a list of vectors corresponding to a list of texts.
    Vectors are weighted averages of vectors of words comprising the texts.

    Parameters
    ----------
    doc : iterable of str
        The texts to embed.
    tokenizer : callable
        Passed to the vectorizer as its ``tokenizer`` argument.
    vectorizer : class
        Vectorizer class (e.g. CountVectorizer or TfidfVectorizer). It is
        instantiated here and its per-document term weights are used as the
        averaging weights.

    Returns
    -------
    The result of ``weighted_average``: one row per text, one column per
    word-vector dimension.
    """
    v = vectorizer(tokenizer=tokenizer)
    # NOTE(review): the vectorizer is re-fitted on `doc` on every call, so with
    # TF-IDF the weights for train and test texts come from different fits.
    words_count = v.fit_transform( doc ).toarray()

    # get_feature_names() was removed in scikit-learn 1.2. Use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(v, 'get_feature_names_out'):
        vocabulary = v.get_feature_names_out()
    else:
        vocabulary = v.get_feature_names()

    # Check vocabulary membership once and reuse it for both the vectors and the weights.
    known = np.array([word in uk_vectors for word in vocabulary], dtype=bool)

    # Words without a vector get an all-zero placeholder of the model's own dimension.
    dim = uk_vectors.vector_size
    vocabulary_vectors = np.array([uk_vectors.get_vector(word) if found else np.zeros(dim)
                                   for word, found in zip(vocabulary, known)])

    # Unknown words get a negligible (but non-zero) weight, so a text made only
    # of unknown words does not end up with a zero weight total.
    vectors_found = np.where(known, 1.0, 10**-10)  # 10^-10 is used to avoid div/0

    return weighted_average(vocabulary_vectors, words_count * vectors_found )

##### Classification report

In [171]:
from sklearn.metrics import classification_report

In [173]:
def print_report(y_test, y_predict):
    """
    Print a two-line summary of sklearn's classification report: its first
    line (the column header) and its second-to-last line (the 'weighted avg'
    row in the outputs shown in this notebook).
    """
    rows = classification_report(y_test, y_predict).split('\n')
    header, summary = rows[0], rows[-2]
    print(header, summary, sep='\n')

#### 1. Baseline - KNN, no TF-IDF, no lemmatization

In [79]:
from sklearn.neighbors import KNeighborsClassifier

In [309]:
def evaluate_KNN(tokenizer=tokenize_uk.tokenize_words, vectorizer=CountVectorizer):
    """
    Embed the train and test messages with ``messages_vectors``, fit a
    Euclidean k-nearest-neighbours classifier on the training embeddings and
    print the summary report for its predictions on the test embeddings.
    """
    features_train, features_test = (
        messages_vectors( texts, tokenizer, vectorizer )
        for texts in (x_train, x_test)
    )

    classifier = KNeighborsClassifier(metric='euclidean').fit(features_train, y_train)

    print_report(y_test, classifier.predict(features_test))

In [310]:
evaluate_KNN()

              precision    recall  f1-score   support
weighted avg       0.37      0.34      0.33     18739


#### 2. Class average vectors

In [284]:
train_vectors = messages_vectors( x_train )

In [285]:
test_vectors = messages_vectors( x_test )

In [230]:
# Membership matrix: weights[c][j] is 1 when training sample j is labelled
# with category c, and 0 otherwise.
weights = [
    [int(label == category) for label in y_train]
    for category in range(len(category_names))
]

In [247]:
# One row per category: with 0/1 membership weights, the weighted average is the
# mean of the training-message vectors labelled with that category.
category_vectors = weighted_average(train_vectors, weights)

In [239]:
def find_most_similar(vector, fallback_category=158) -> int:
    """
    Return the index of the category whose average vector is most similar
    (by cosine similarity) to ``vector``.

    Parameters
    ----------
    vector : array-like
        Embedding of a single message.
    fallback_category : int, optional
        Category returned when ``vector`` is all zeros. The default, 158, is the
        index the author identified as the most common category
        ("Відсутність-ГВП"). That index depends on the order in which the
        category files were listed, so verify it against ``category_names``.

    Returns
    -------
    int
        Index into ``category_vectors`` / ``category_names``.
    """
    if not np.any(vector):
        # can happen when none of the words from the message are found in a vocabulary
        # for example "Відсутне опаленненя"
        # Checked before computing similarities, which would be 0/0 for a zero
        # vector. np.any is used instead of sum() == 0 so that a non-zero vector
        # whose components cancel out is not mistaken for an empty one.
        return fallback_category

    similarity = KeyedVectors.cosine_similarities(vector, category_vectors)
    # nanargmax skips NaN similarities and returns the first index of the maximum.
    return int(np.nanargmax(similarity))

In [248]:
y_predict = [ find_most_similar( v ) for v in test_vectors]

In [249]:
print_report(y_test, y_predict)

              precision    recall  f1-score   support
weighted avg       0.44      0.21      0.24     18739


##### Observations:

1. Aggregating message vectors into a single vector per class results in higher precision but lower recall than kNN on vectors of individual messages.
2. We can clearly see a correlation between precision and the number of class instances in our training samples:

<img src='Precision.png' width=350> 

The highest precision (93%) is seen for the category with the largest number of training samples.

3. Alternatively, high precision is also seen in categories with very distinctive features. For example, category \#178 "Заміна та експлуатація поштових скриньок", which has only 142 training samples, reaches a precision of 90%: every message in it contains the very distinctive phrase "поштових скриньок".

#### 3. Apply TF-IDF

In [311]:
evaluate_KNN( vectorizer=TfidfVectorizer )

              precision    recall  f1-score   support
weighted avg       0.40      0.36      0.35     18739


  'precision', 'predicted', average, warn_for)


##### Observation
TF-IDF resulted in slightly higher precision and recall than the baseline.

#### 4. Apply lemmatization

In [297]:
import pymorphy2

In [298]:
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [302]:
def stemmed_tokenizer(doc):
    """
    Tokenize ``doc`` with ``tokenize_uk.tokenize_words`` and replace every
    token with the normal form of its first ``morph.parse`` result.
    """
    lemmas = []
    for token in tokenize_uk.tokenize_words(doc):
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

In [325]:
evaluate_KNN(vectorizer=TfidfVectorizer, tokenizer=stemmed_tokenizer)

              precision    recall  f1-score   support
weighted avg       0.39      0.36      0.36     18739


  'precision', 'predicted', average, warn_for)


##### Observations
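1. Lemmatization had only a marginal effect: at the two-decimal precision reported above, recall is unchanged (0.36), precision is 0.39 versus 0.40, and F1 is 0.36 versus 0.35.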
2. Most messages contain numerous typos. Correcting spelling errors would presumably increase precision.

#### 5. Logistic regression on message vectors

In [328]:
from sklearn.linear_model import LogisticRegression

In [326]:
train_vectors = messages_vectors( x_train, vectorizer=TfidfVectorizer, tokenizer=stemmed_tokenizer )

In [327]:
test_vectors = messages_vectors( x_test, vectorizer=TfidfVectorizer, tokenizer=stemmed_tokenizer )

In [341]:
# Multinomial logistic regression on the message vectors; max_iter is raised to 500.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases - confirm against the installed version before re-running.
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=500)

In [None]:
logreg.fit(train_vectors, y_train)

In [None]:
y_predict = logreg.predict(test_vectors)

In [340]:
print_report(y_test, y_predict)

              precision    recall  f1-score   support
weighted avg       0.56      0.57      0.56     18739


##### Observation

1. Linear classifiers seem to be able to pick up some extra information contained in the vectors, which substantially improves precision.