# 1551TextsClassifier

## Get Test and Train texts

Read

In [1]:
import glob
import numpy as np
import re

def get_claims_texts(fname, encoding="utf-8"):
    """Read a claims file and split it into individual claim texts.

    The whole file is read into memory and split on every run of seven
    consecutive digits (presumably the claim ids -- confirm against the data).
    The digit runs themselves are discarded. The first element of the result
    is whatever precedes the first match, so it may be an empty string.

    Args:
        fname: path of the text file to read.
        encoding: text encoding used to decode the file. Pinned to UTF-8 by
            default so the result does not depend on the platform's locale
            (assumes the dumps are UTF-8 -- pass another value if they are not).

    Returns:
        A list of strings, one per chunk between digit runs.
    """
    with open(fname, encoding=encoding) as f:
        all_claims_text = f.read()
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.split(r"\d{7}", all_claims_text)

# Map category name -> list of claim texts, one category per file in "1551/".
claims_texts = {}
# glob returns paths in a filesystem-dependent order. Sorting makes the dict
# order deterministic, which matters because the integer class labels are
# later derived from the order of the keys.
for claim_fname in sorted(glob.glob("1551/*.txt")):
    # Drop the 5-character "1551/" prefix and the 4-character ".txt" suffix.
    cat_name = claim_fname[5:-4]
    claims_texts[cat_name] = list(get_claims_texts(claim_fname))

Keep only Ukrainian-language texts

In [2]:
import langid

def is_uk(text):
    """Return True when langid's top guess for `text` is the code 'uk'."""
    top_lang = langid.classify(text)[0]
    return top_lang == 'uk'

# Replace each category's list with only the texts that pass `is_uk`.
# Re-assigning existing keys while iterating is safe: the dict size is unchanged.
for cat, cat_texts in claims_texts.items():
    prev_len = len(cat_texts)  # only needed by the optional report below
    claims_texts[cat] = list(filter(is_uk, cat_texts))
    #print(f'{cat}: {len(claims_texts[cat])}/{prev_len}')

Split into train and test sets

In [3]:
y2cat = list(claims_texts.keys())
cat2y = lambda cat : y2cat.index(cat)

In [4]:
# Per-category split: every category contributes the same fraction of its
# texts to the train and test sets, so class distributions match in both.
# Within each output list the texts end up grouped by category.

import random

TRAIN_SIZE = 0.7   # fraction of each category that goes to the training set
RANDOM_SEED = 42   # fixed so a fresh "Restart & Run All" reproduces the split

# A dedicated generator keeps the split independent of any other use of the
# global `random` state in the notebook.
split_rng = random.Random(RANDOM_SEED)

X_train_texts, Y_train = [], []
X_test_texts, Y_test = [], []
for cat, cat_texts in claims_texts.items():
    y = cat2y(cat)
    # Shuffle a copy: `claims_texts` stays untouched, so re-running this cell
    # gives the same split instead of reshuffling already-shuffled data.
    shuffled = list(cat_texts)
    split_rng.shuffle(shuffled)
    train_size = int(len(shuffled) * TRAIN_SIZE)
    X_train_texts.extend(shuffled[:train_size])
    Y_train.extend([y] * train_size)
    X_test_texts.extend(shuffled[train_size:])
    Y_test.extend([y] * (len(shuffled) - train_size))

## Baseline solution

### Get vectors

In [5]:
import gensim
from gensim.models import KeyedVectors

# Load the pre-trained word vectors from a text-format word2vec file.
uk_vectors_file = 'news.lowercased.tokenized.word2vec.300d'
uk_vectors = KeyedVectors.load_word2vec_format(uk_vectors_file, binary=False)

# Sanity checks on the loaded embeddings.
# 1. Vectors have the expected dimensionality.
assert len(uk_vectors["слово"]) == 300
# 2. "слово" is closer to "слова" than to "сова".
dist_to_inflection = uk_vectors.distance("слово", "слова")
dist_to_unrelated = uk_vectors.distance("слово", "сова")
assert dist_to_inflection < dist_to_unrelated
# 3. The classic analogy: король - чоловік + жінка should land on королева.
analogy_vec = uk_vectors["король"] - uk_vectors["чоловік"] + uk_vectors["жінка"]
nearest_word = uk_vectors.similar_by_vector(analogy_vec)[0][0]
assert nearest_word == 'королева'



In [6]:
def baseline_vector(text):
    """Embed a text as the sum of the word vectors of its tokens.

    The text is split on runs of non-word characters, with no other
    preprocessing. Tokens that are not in `uk_vectors` contribute a zero
    vector.

    Args:
        text: raw claim text.

    Returns:
        A 1-D numpy array of length `uk_vectors.vector_size`.
    """
    # Take the dimensionality from the model instead of hard-coding 300.
    # One shared zero vector is enough: np.array() below copies the rows.
    zero_vec = np.zeros(uk_vectors.vector_size)
    to_vec = lambda word: uk_vectors[word] if word in uk_vectors else zero_vec
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    vectors = [to_vec(word_str) for word_str in re.split(r'\W+', text)]
    return np.sum(np.array(vectors), axis=0)

# Baseline features: one summed word-vector per text, in the same order as the labels.
X_train = list(map(baseline_vector, X_train_texts))
X_test = list(map(baseline_vector, X_test_texts))

### Classify vectors

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import scipy

def cos_dist(x, y):
    """Return the cosine distance between vectors `x` and `y`.

    Thin wrapper around `scipy.spatial.distance.cosine`, intended to be
    passed as a callable `metric`.

    Args:
        x: first 1-D vector.
        y: second 1-D vector.

    Returns:
        The cosine distance as computed by SciPy.
    """
    # Import the function explicitly: a bare `import scipy` is not guaranteed
    # to load the `scipy.spatial` subpackage on every SciPy version, so the
    # attribute chain could otherwise work only by accident.
    from scipy.spatial.distance import cosine
    return cosine(x, y)

# Baseline classifier: k-nearest-neighbours on the summed word vectors.
# The cosine-distance variant is left disabled; per the original note it ran
# for more than an hour with the Python-level `cos_dist` callable.
#knn = KNeighborsClassifier(metric=cos_dist) # worked > 1 hour
# Euclidean metric, 10 parallel jobs; everything else is left at library defaults.
knn = KNeighborsClassifier(metric='euclidean', n_jobs=10)
# Last expression of the cell: the fitted estimator's repr is the cell output.
knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=10, n_neighbors=5, p=2,
           weights='uniform')

In [8]:
# Predict a category label for every held-out text with the baseline kNN.
Y_predict = knn.predict(X_test)

def print_short_report(Y_test, Y_predict):
    """Print a two-line summary of scikit-learn's classification report.

    Only the first line of the report (the column header) and its
    second-to-last line are printed. In the outputs recorded in this
    notebook the latter is the "weighted avg" row.

    Args:
        Y_test: true labels.
        Y_predict: predicted labels, aligned with `Y_test`.
    """
    full_report = classification_report(Y_test, Y_predict)
    lines = full_report.split('\n')
    # Index 0 is the header; index -2 is the last row before the trailing
    # empty string that split() yields.
    for line_idx in (0, -2):
        print(lines[line_idx])

# Summarise the baseline (summed vectors + kNN) on the held-out set.
print_short_report(Y_test, Y_predict)

              precision    recall  f1-score   support
weighted avg       0.32      0.32      0.30     18886


  'precision', 'predicted', average, warn_for)


## Baseline improvements

### Improve text preprocessing
* regexp -> tokenize_uk
* lemmatization 
* only alphabetical words
* stopwords

### Improve vector generation
* sum(vectors) -> avg(vectors)

In [9]:
#!pip install stop-words

In [10]:
from stop_words import get_stop_words
import tokenize_uk
import pymorphy2
import re

# Morphological analyzer for Ukrainian; used below to lemmatize tokens.
morph = pymorphy2.MorphAnalyzer(lang='uk')
# Stored as a set: the collection is only used for `in` membership tests
# (once per token of every text), which a set answers in constant time
# instead of scanning a list.
stop_words = set(get_stop_words('ukrainian'))

def text2norm_words(text):
    """Tokenize a text and return its normalized words.

    Steps, in order:
      1. tokenize with `tokenize_uk`;
      2. keep tokens longer than 3 characters that are purely alphabetic,
         then lower-case them (author's note: f1 0.3 -> 0.36);
      3. drop stop words and replace each remaining token with the normal
         form of its first pymorphy2 parse (author's note: f1 0.36 -> 0.39).

    Args:
        text: raw claim text.

    Returns:
        A non-empty list of strings. When nothing survives the filters the
        list is [''] so callers can always divide by its length.
    """
    tokens = tokenize_uk.tokenize_uk.tokenize_words(text)

    # Length and alphabetic checks run on the original token, before lower-casing.
    lowered = (tok.lower() for tok in tokens if len(tok) > 3 and tok.isalpha())

    # Stop-word filtering happens before lemmatization, on the lower-cased token.
    lemmas = [
        morph.parse(tok)[0].normal_form
        for tok in lowered
        if tok not in stop_words
    ]
    #TODO: filter by POS

    # Author's note: de-duplicating the words (list(set(...))) cost 0.04 f1,
    # so repeated words are deliberately kept.
    return lemmas or ['']

def normalized_text_vector(text):
    """Embed a text as the average word vector of its normalized words.

    Words come from `text2norm_words`. Words missing from `uk_vectors`
    contribute a zero vector and still count in the denominator.

    Args:
        text: raw claim text.

    Returns:
        A 1-D numpy array of length `uk_vectors.vector_size`.
    """
    words = text2norm_words(text)
    # Take the dimensionality from the model instead of hard-coding 300.
    # One shared zero vector is enough: np.array() below copies the rows.
    zero_vec = np.zeros(uk_vectors.vector_size)
    vectors = [uk_vectors[word] if word in uk_vectors else zero_vec for word in words]
    # `text2norm_words` never returns an empty list, so the division is safe.
    return np.sum(np.array(vectors), axis=0) / len(vectors)

In [11]:
# Improved features: averaged vectors of the normalized words.
# NOTE: this rebinds X_train / X_test, replacing the baseline features.
X_train = list(map(normalized_text_vector, X_train_texts))
X_test = list(map(normalized_text_vector, X_test_texts))

In [12]:
# Same kNN configuration as the baseline, refit on whatever X_train holds now
# (the averaged, normalized vectors when cells run top to bottom), so any
# score change comes from the features rather than the classifier.
knn = KNeighborsClassifier(metric='euclidean', n_jobs=10)
# Last expression of the cell: the fitted estimator's repr is the cell output.
knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=10, n_neighbors=5, p=2,
           weights='uniform')

In [13]:
# Evaluate the refit kNN on the held-out set (rebinds Y_predict).
Y_predict = knn.predict(X_test)
print_short_report(Y_test, Y_predict)

              precision    recall  f1-score   support
weighted avg       0.42      0.39      0.39     18886


  'precision', 'predicted', average, warn_for)


### Improve classifier
* knn -> logreg
* TODO: tune params

In [14]:
from sklearn.linear_model import LogisticRegression

# Swap kNN for logistic regression on the same features. No arguments are
# passed, so every hyper-parameter is the library default; tuning is still a
# TODO (see the list above).
clf = LogisticRegression()
# Last expression of the cell: the fitted estimator's repr is the cell output.
clf.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
# Evaluate logistic regression on the held-out set (rebinds Y_predict).
Y_predict = clf.predict(X_test)
print_short_report(Y_test, Y_predict)

              precision    recall  f1-score   support
weighted avg       0.55      0.56      0.54     18886


# Drafts

## doc2vec

### Train model

In [131]:
def to_doc2vec_format(X, Y):
    """Wrap texts and labels as gensim `TaggedDocument` objects.

    Each text is tokenized with `gensim.utils.simple_preprocess` and tagged
    with a single-element list holding its label. An alternative that was
    tried and left disabled is tokenizing with `text2norm_words` instead.

    Args:
        X: indexable collection of raw texts.
        Y: sequence of labels; `Y[i]` is the tag for `X[i]`. Its length
            decides how many documents are produced.

    Returns:
        A list of `TaggedDocument`, one per label in `Y`.
    """
    make_doc = gensim.models.doc2vec.TaggedDocument
    tokenize = gensim.utils.simple_preprocess
    return [make_doc(tokenize(X[idx]), [label]) for idx, label in enumerate(Y)]

# Training corpus: the train texts, each tagged with its class label.
train_set = to_doc2vec_format(X_train_texts, Y_train)

# Small Doc2Vec model: 10-dimensional vectors, min_count=2, 40 epochs.
# NOTE(review): no seed is passed, so results presumably differ between runs -- confirm.
model = gensim.models.doc2vec.Doc2Vec(vector_size=10, min_count=2, epochs=40)
# The vocabulary must be built before training.
model.build_vocab(train_set)
# Corpus size and epoch count are read back from the model itself.
model.train(train_set, total_examples=model.corpus_count, epochs=model.epochs)

def doc2vec(text):
    """Infer a Doc2Vec vector for `text` using the trained `model`.

    The text is tokenized with `gensim.utils.simple_preprocess`, the same
    tokenizer `to_doc2vec_format` applies to the training documents.
    """
    tokens = gensim.utils.simple_preprocess(text)
    return model.infer_vector(tokens)

### Train classifier

In [132]:
# Doc2Vec features: one inferred vector per text.
# NOTE: this rebinds X_train / X_test (and knn below), replacing the
# word2vec-based features and classifier from the earlier sections.
X_train = [doc2vec(text) for text in X_train_texts]
X_test  = [doc2vec(text) for text in X_test_texts]

# Same kNN configuration as in the earlier sections.
knn = KNeighborsClassifier(metric='euclidean', n_jobs=10)
# Last expression of the cell: the fitted estimator's repr is the cell output.
knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=10, n_neighbors=5, p=2,
           weights='uniform')

### Test

In [133]:
# Evaluate kNN on the Doc2Vec features (rebinds Y_predict).
Y_predict = knn.predict(X_test)
print_short_report(Y_test, Y_predict)

              precision    recall  f1-score   support
weighted avg       0.07      0.05      0.05     18886


  'precision', 'predicted', average, warn_for)
