In [1]:
import json, os
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared pymorphy2 analyzer; normalize() uses it to lemmatize and POS-tag tokens.
morph = MorphAnalyzer()
# NLTK's Russian stop-word list, stored as a set; normalize() tests every token against it.
stops = set(stopwords.words('russian'))

In [2]:
import numpy as np

In [3]:
# Let pandas display up to 1000 characters per cell so long text columns are not truncated.
pd.set_option('display.max_colwidth', 1000)

In [4]:

# Directory that is scanned for the *jsonlines input files.
PATH_TO_DATA = './'

In [5]:
# Paths of every *jsonlines file in PATH_TO_DATA.
# os.listdir() returns names in arbitrary order and the loading cell keeps only
# the first few paths, so sort them to load the same files on every run/machine.
files = sorted(
    os.path.join(PATH_TO_DATA, file)
    for file in os.listdir(PATH_TO_DATA)
    if file.endswith('jsonlines')
)

In [6]:
# Load at most the first five files (one JSON object per line) into a single
# frame with a fresh 0..n-1 index. The path list is sliced *before* reading so
# the remaining files are never parsed.
data = pd.concat(
    [pd.read_json(file, lines=True) for file in files[:5]],
    axis=0,
    ignore_index=True,
)

In [7]:
data.shape

(1987, 5)

In [24]:
def evaluate(true_kws, predicted_kws):
    """Print macro-averaged precision, recall, F1 and Jaccard for keyword sets.

    Each document's gold and predicted keywords are compared as sets; the four
    scores are computed per document and then averaged over all documents.

    Args:
        true_kws: sized iterable of gold keyword collections, one per document.
        predicted_kws: sized iterable of predicted keyword collections, in the
            same order and of the same length as ``true_kws``.

    Returns:
        None. The averaged scores, rounded to two decimals, are printed.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(true_kws) == len(predicted_kws)

    precisions = []
    recalls = []
    f1s = []
    jaccards = []

    # Pair documents positionally, so a pandas Series is matched by position
    # and not by index label.
    for true_item, predicted_item in zip(true_kws, predicted_kws):
        true_kw = set(true_item)
        predicted_kw = set(predicted_item)

        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)

        # Every ratio falls back to 0 when its denominator is 0.
        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        f1 = (2 * (prec * rec)) / (prec + rec) if (prec + rec) else 0
        # Both sets empty -> empty union; score 0 instead of dividing by zero.
        jac = tp / union if union else 0

        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)

    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))
    
    
        

In [8]:
from string import punctuation
from nltk.corpus import stopwords
# Characters stripped from both ends of a token: ASCII punctuation plus extra quotes, dashes, ellipsis, '*' and '№'.
punct = punctuation+'«»—…“”*№–'
# NOTE(review): same expression as the `stops` assignment in the first cell; rebuilt here.
stops = set(stopwords.words('russian'))

def normalize(text):
    """Lower-case, tokenize and lemmatize ``text``.

    The text is split on whitespace and each token has leading/trailing
    ``punct`` characters stripped. Tokens that end up empty or that are in
    ``stops`` are dropped; the stop-word check runs on the stripped surface
    form, before lemmatization. Every remaining token is replaced by the
    normal form of its first ``morph.parse`` result.

    NOTE(review): a later cell redefines ``normalize`` with a noun-only variant.

    Returns:
        list of lemma strings, in text order.
    """
    lemmas = []
    for raw_token in text.lower().split():
        token = raw_token.strip(punct)
        if not token or token in stops:
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return lemmas

In [9]:
# NOTE(review): this column is assigned again further down, after normalize() is redefined (noun-only variant), so this result gets overwritten.
data['content_norm'] = data['content'].apply(normalize)

In [10]:
# Lemmatized titles. NOTE(review): no later use of 'title_norm' is visible in this notebook.
data['title_norm'] = data['title'].apply(normalize)

In [11]:
def normalize(text):
    """Return the lemmas of the nouns found in ``text``.

    The text is lower-cased and split on whitespace; each token has
    leading/trailing ``punct`` characters stripped. Empty tokens and tokens in
    ``stops`` are skipped. For the rest, the first ``morph.parse`` result is
    taken and its normal form is kept only when its part of speech is 'NOUN'.

    Returns:
        list of noun lemma strings, in text order.
    """
    nouns = []
    for raw_token in text.lower().split():
        token = raw_token.strip(punct)
        if not token or token in stops:
            continue
        parse = morph.parse(token)[0]
        if parse.tag.POS == 'NOUN':
            nouns.append(parse.normal_form)
    return nouns

In [12]:
# Recompute the column with the normalize() defined in the preceding cell (noun lemmas only); replaces the earlier value.
data['content_norm'] = data['content'].apply(normalize)

In [13]:
# Space-joined lemma strings; this is the form passed to the TF-IDF vectorizer below.
data['content_norm_str'] = data['content_norm'].apply(' '.join)

In [14]:
# n-grams can be added while we are at it: unigrams and bigrams, keeping only terms found in at least 2 documents
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2)

In [15]:
# Fit the vectorizer on the joined noun-lemma strings.
tfidf.fit(data['content_norm_str'])

TfidfVectorizer(min_df=2, ngram_range=(1, 2))

In [16]:
# Column index -> term lookup for the fitted vectorizer.
# Built from `vocabulary_` (term -> column index) rather than
# `get_feature_names()`, which was deprecated in scikit-learn 1.0 and removed
# in 1.2; `vocabulary_` gives the same mapping in every version.
id2word = {int(index): term for term, index in tfidf.vocabulary_.items()}

In [17]:
# TF-IDF matrix (sparse), one row per document.
texts_vectors = tfidf.transform(data['content_norm_str'])

## BASELINE

In [22]:
## The TF-IDF matrix is sparse, so it cannot simply be sorted,
## and converting all of it to a dense array would not fit in memory.
## So go row by row: densify a single row and sort just that row.
keywords = []

for row in range(texts_vectors.shape[0]):
    weights = texts_vectors.getrow(row).toarray()[0]
    # Ten highest-weighted columns, best first.
    top_inds = weights.argsort()[:-11:-1]
    # Drop zero-weight columns: a document with fewer than ten scored terms
    # would otherwise be padded with vocabulary entries it does not contain.
    keywords.append([id2word[w] for w in top_inds if weights[w] > 0])

In [49]:
evaluate(data['keywords'], keywords)

Precision -  0.13
Recall -  0.25
F1 -  0.16
Jaccard -  0.09


Precision -  0.13
Recall -  0.25
F1 -  0.16
Jaccard -  0.09

## ОПЫТ 1: добавление ЛДА

In [26]:
import gensim

In [27]:
# Token <-> id mapping for LDA, built from the per-document noun-lemma lists.
dictionary = gensim.corpora.Dictionary(data['content_norm'])
# Drop tokens that occur in fewer than 25 documents or in more than 7.5% of all documents.
dictionary.filter_extremes(no_above=0.075, no_below=25)
# Reassign ids to close gaps. NOTE(review): filter_extremes may already do this — confirm whether the call is needed.
dictionary.compactify()

In [28]:
# Bag-of-words form of every document, produced by the filtered dictionary.
corpus = data["content_norm"].apply(dictionary.doc2bow)

In [29]:
# 200-topic LDA over the bag-of-words corpus.
# The seed is fixed so repeated runs start from the same initialisation.
# NOTE(review): multi-worker training may still vary slightly from run to run.
lda = gensim.models.LdaMulticore(
    corpus,
    200,
    alpha=0.11,
    eta=0.66,
    id2word=dictionary,
    eval_every=0,
    passes=15,
    random_state=42,
)

In [30]:
def get_topic_tokens(model, corpus):
    """Return, for every document, the top tokens of its most probable topic.

    Args:
        model: object where ``model[bow]`` yields ``(topic_id, probability)``
            pairs and ``model.show_topic(topic_id)`` yields ``(token, weight)``
            pairs (e.g. a trained gensim LDA model).
        corpus: iterable of bag-of-words documents. It is walked positionally,
            so a pandas Series works whatever its index labels are.

    Returns:
        list with one entry per document: the tokens of the topic with the
        highest probability, or an empty list when the model assigns the
        document no topics. Every entry is an independent list.
    """
    # show_topic() depends only on the topic id, so compute it once per topic
    # instead of once per document.
    tokens_by_topic = {}
    tokens = []
    for bow in corpus:
        probs = model[bow]
        if len(probs) == 0:
            tokens.append(list())
            continue
        relevant_key = max(probs, key=lambda x: x[1])[0]
        if relevant_key not in tokens_by_topic:
            tokens_by_topic[relevant_key] = [i[0] for i in model.show_topic(relevant_key)]
        # Copy so callers mutating one document's list do not affect the others.
        tokens.append(list(tokens_by_topic[relevant_key]))
    return tokens

In [31]:
# Per document: tokens of its most probable LDA topic (empty list when the model returns no topics for it).
t_t = get_topic_tokens(lda, corpus)

In [86]:
# Candidate pool: the 20 most frequent lemmas of each document.
frequent = data['content_norm'].apply(
    lambda lemmas: [lemma for lemma, _count in Counter(lemmas).most_common(20)]
)
filtered = []
for idx, candidates in enumerate(frequent):
    # The 20 highest-weighted TF-IDF terms of this document.
    dense_row = texts_vectors.getrow(idx).toarray()
    top_words = [id2word[col] for col in dense_row.argsort()[0, :-21:-1]]
    # Keep a candidate when the document's LDA topic tokens or its TF-IDF top
    # list contains it. An empty topic list never matches, so it needs no
    # separate branch.
    topic_words = t_t[idx]
    filtered.append([w for w in candidates if w in topic_words or w in top_words])
       

### Выше реколл и эф-1

In [87]:
evaluate(data['keywords'], filtered)

Precision -  0.14
Recall -  0.29
F1 -  0.18
Jaccard -  0.1


## ОПЫТ 2: добавление эмбеддингов с русвекторес

In [32]:
# Word vectors loaded from fasttext/model.model, a path relative to the working directory (the section heading says they come from RusVectores).
fasttext_model = gensim.models.KeyedVectors.load("fasttext/model.model")

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
def embed_list(lst:list, model:object):
    """Average the embedding vectors of the tokens in ``lst``.

    Args:
        lst: tokens to embed.
        model: object exposing ``get_vector(token)``; for an empty ``lst`` its
            ``vector_size`` attribute is read as well (gensim KeyedVectors
            provide both).

    Returns:
        1-D numpy array: the element-wise mean of the token vectors, or an
        all-zero vector of length ``model.vector_size`` when ``lst`` is empty
        (averaging an empty array would give NaN instead of a vector).
    """
    if len(lst) == 0:
        return np.zeros(model.vector_size)
    arr = np.array([model.get_vector(n) for n in lst])
    return np.average(arr, axis=0)

In [35]:
# One averaged embedding per document, computed over its normalized lemma list.
averaged_embeddings = [embed_list(item, fasttext_model) for item in data["content_norm"]]

In [36]:
# The 30 most frequent lemmas of each document; these are the candidates ranked by embedding similarity below.
most_common = data['content_norm'].apply(
    lambda lemmas: [lemma for lemma, _count in Counter(lemmas).most_common(30)]
)

In [37]:
# For every document keep the 15 candidates whose word vector is closest
# (cosine similarity) to the document's averaged embedding, most similar first.
relevant = []
for idx in range(len(most_common)):
    # The document vector does not change per word: make it 2-D once.
    doc_vector = np.atleast_2d(averaged_embeddings[idx])
    ranked = sorted(
        most_common[idx],
        reverse=True,
        # cosine_similarity returns a (1, 1) matrix; take the scalar so the
        # sort compares plain numbers instead of one-element arrays.
        key=lambda word: cosine_similarity(
            np.atleast_2d(fasttext_model[word]), doc_vector
        )[0, 0],
    )
    relevant.append(ranked[:15])

In [38]:
# Candidate pool: the 30 most frequent lemmas of each document.
frequent = data['content_norm'].apply(
    lambda lemmas: [lemma for lemma, _count in Counter(lemmas).most_common(30)]
)
filtered = []
for idx, candidates in enumerate(frequent):
    # The 20 highest-weighted TF-IDF terms of this document.
    dense_row = texts_vectors.getrow(idx).toarray()
    top_words = [id2word[col] for col in dense_row.argsort()[0, :-21:-1]]
    # Keep a candidate only when it is both among the embedding-ranked words
    # and among the TF-IDF top terms.
    embedding_words = relevant[idx]
    filtered.append([w for w in candidates if w in embedding_words and w in top_words])

### Выше пресижн и ф-1

In [146]:
evaluate(data['keywords'], filtered)

Precision -  0.16
Recall -  0.24
F1 -  0.18
Jaccard -  0.11


## ОПЫТ 2.5: комбинация

In [39]:
filtered = []
for idx, candidates in enumerate(frequent):
    # The 20 highest-weighted TF-IDF terms of this document.
    dense_row = texts_vectors.getrow(idx).toarray()
    top_words = [id2word[col] for col in dense_row.argsort()[0, :-21:-1]]
    topic_words = t_t[idx]
    embedding_words = relevant[idx]
    # Keep a candidate when the LDA topic tokens contain it, or when both the
    # embedding ranking and the TF-IDF top list contain it. An empty topic
    # list never matches, so it needs no separate branch.
    filtered.append([
        w for w in candidates
        if w in topic_words or (w in embedding_words and w in top_words)
    ])

### Тот же пресижн, больше реколл

In [152]:
evaluate(data['keywords'], filtered)

Precision -  0.16
Recall -  0.26
F1 -  0.18
Jaccard -  0.11


## ОПЫТ 3: то же + другая метрика для графа

In [18]:
from itertools import combinations

In [41]:
from tqdm import tqdm
# Enable tqdm's pandas integration. NOTE(review): no progress_apply call is visible in this notebook.
tqdm.pandas()

  from pandas import Panel


Попробуем теперь считать важность слов с помощью какой-нибудь метрики из networkx.

In [42]:
import networkx as nx

In [50]:
def build_matrix(text, window_size=5):
    """Build a symmetric word co-occurrence matrix for one tokenized document.

    The token sequence is cut into consecutive, non-overlapping chunks of
    ``window_size`` tokens (the last chunk may be shorter). Every pair of
    positions inside a chunk adds 1 to both ``m[j][k]`` and ``m[k][j]``; a word
    repeated within a chunk therefore adds 2 to its own diagonal entry.

    Args:
        text: sequence of tokens (it is traversed more than once).
        window_size: number of tokens per chunk.

    Returns:
        (m, id2word): ``m`` is a square float array with one row/column per
        distinct word; ``id2word`` maps row/column index -> word. Ids follow
        the order in which words first appear in ``text``.
    """
    # First-occurrence order gives every word a reproducible id. Iterating a
    # set of strings can change order between interpreter runs (string hash
    # randomization), which would reorder graph nodes and change how ties are
    # broken by callers that sort nodes by score.
    word2id = {w: i for i, w in enumerate(dict.fromkeys(text))}
    id2word = {i: w for w, i in word2id.items()}
    # Work with integer ids instead of the words themselves.
    ids = [word2id[word] for word in text]

    # Co-occurrence matrix, initially all zeros.
    m = np.zeros((len(word2id), len(word2id)))

    # Walk over the text chunk by chunk.
    for i in range(0, len(ids), window_size):
        window = ids[i:i+window_size]
        # Add one for every pair of words in this chunk...
        for j, k in combinations(window, 2):
            # ...in both directions, so the resulting graph is undirected.
            m[j][k] += 1
            m[k][j] += 1

    return m, id2word

def some_centrality_measure(text, window_size=5, topn=5):
    """Rank the words of ``text`` by graph centrality and return the best ones.

    A co-occurrence matrix is built with ``build_matrix`` and turned into a
    networkx graph; each node is scored with betweenness centrality and the
    words of the ``topn`` highest-scoring nodes are returned, best first.

    Args:
        text: sequence of tokens for one document.
        window_size: passed through to ``build_matrix``.
        topn: maximum number of words to return.

    Returns:
        list of words ordered by descending score.
    """
    cooccurrence, index_to_word = build_matrix(text, window_size)
    graph = nx.from_numpy_array(cooccurrence)
    # Any networkx node-centrality function can be used instead;
    # this is the line to change.
    scores = dict(nx.betweenness_centrality(graph))

    ranked = sorted(scores.items(), key=lambda pair: -pair[1])
    return [index_to_word[node] for node, _score in ranked[:topn]]

In [51]:
%%time
# Top 20 words per document by centrality, using window_size=10. Slow: the recorded run took close to 9 minutes.
keyword_nx = data['content_norm'].apply(lambda x: some_centrality_measure(x, 10, 20))

Wall time: 8min 52s


In [55]:
# Keep only the centrality-ranked words that are also in `filtered` for the same document.
# NOTE(review): `filtered` is whatever the most recently executed experiment cell left behind.
keyword_nx_filtered = [
    [item for item in keyword_nx[idx] if item in filtered[idx]]
    for idx in range(len(keyword_nx))
]

### Выше пресижн, эф-1 та же

In [59]:
evaluate(data['keywords'], keyword_nx_filtered)

Precision -  0.17
Recall -  0.23
F1 -  0.18
Jaccard -  0.11
