# Композициональность

Imports of all neccessary modules.

In [391]:
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
from gensim.models import KeyedVectors
from scipy.spatial import distance
from tqdm import tqdm
import math
import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_text
from nltk import word_tokenize
import copy
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.cluster import AgglomerativeClustering
from catboost import CatBoostClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import FillMaskPipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score


pd.options.mode.chained_assignment = None

## Preprocessing of data

In [None]:
m = MorphAnalyzer()

In [78]:
df = pd.read_csv('compounds_AN_top10000.csv')
df = df.drop(labels=['Unnamed: 0'], axis=1)
df = df.drop(labels=[5593], axis=0)
df['Композициональность'] = pd.to_numeric(df['Композициональность'])
df = pd.concat([df[df['Композициональность'] == 0], 
                df[df['Композициональность'] == 1][:248],
                df[df['Композициональность'] == 2][:248]])
df = df.iloc[:, :4]

Lemmatization and PoS tagging via `pymorphy2`.

In [None]:
def analysis(word, pos):
    variants = m.parse(word.lower())
    for var in variants:
        if pos in var.tag:
            if pos == 'NOUN':
                return var.normal_form + '_' + pos
            else:
                return var.normal_form + '_' + 'ADJ'
    return np.nan

In [348]:
def creating_bigram(lemma1, lemma2):
    lemma1 = lemma1.split('_')
    lemma2 = lemma2.split('_')
    return lemma1[0] + '::' + lemma2[0] + '_' + lemma1[1] + lemma2[1]

In [97]:
df['Lemma 1'] = df['Часть 1'].apply(lambda x: analysis(x, 'ADJF'))
df['Lemma 2'] = df['Часть 2'].apply(lambda x: analysis(x, 'NOUN'))
df = df.dropna(subset=['Lemma 1', 'Lemma 2'])
df['Bigram'] = df.apply(lambda row: creating_bigram(row['Lemma 1'], row['Lemma 2']), axis=1)
df['Phrase'] = df['Часть 1'] + ' ' + df['Часть 2']

Vectorization of expressions and their constituents.

In [None]:
wv = KeyedVectors.load_word2vec_format('ruwikiruscorpora_superbigrams_2_1_2.vec')

In [None]:
def vectorization(w):
    try:
        return wv[w]
    except:
        return None

In [None]:
col_embs = ['Lemma 1', 'Lemma 2', 'Bigram']
for col in col_embs:
    df['w2v_'+ col] = df[col].apply(lambda w: vectorization(w))
w2v_df = df.dropna(subset=['w2v_Lemma 1', 'w2v_Lemma 2', 'w2v_Bigram']) # ???

Calculating of cosine similarities between vectors of expressions and their constituents.

In [None]:
w2v_df['w2v_sim_1'] = w2v_df.apply(lambda row: 1 - distance.cosine(row['w2v_Lemma 1'], row['w2v_Bigram']),                           
                                   axis=1)
w2v_df['w2v_sim_2'] = w2v_df.apply(lambda row: 1 - distance.cosine(row['w2v_Lemma 2'], row['w2v_Bigram']), 
                                   axis=1)
df = pd.concat([df, w2v_df[['w2v_sim_1', 'w2v_sim_2']]], axis=1)

Creating of the frequency dictionary from RNC (https://ruscorpora.ru/new/corpora-freq.html).

In [349]:
def freq_dict(): 
    with open('2grams-3.txt') as fh:
        bigrams = fh.readlines()

    main_dict = {}
    for bi in tqdm(bigrams):
        bigr = bi.strip('\n').split('\t')
        if bigr[2] != '':
            continue
        lemma1 = analysis(bigr[1], 'ADJF')
        if type(lemma1) == float:
            continue
        lemma2 = analysis(bigr[3], 'NOUN')
        if type(lemma2) == float:
            continue
        bigram = creating_bigram(lemma1, lemma2)
        main_dict[bigram] = int(bigr[0])
        return main_dict

100%|██████████| 6750525/6750525 [02:58<00:00, 37887.03it/s]


In [None]:
main_dict = freq_dict()
main_dict[:10]

In [354]:
mi = min(main_dict.values())
ma = max(main_dict.values())
df['Frequency_norm'] = df['Bigram'].apply(lambda k: (main_dict[k] - mi) / (ma - mi)\
    if k in main_dict else 0)

{'его::век_ADJNOUN': 3,
 'самый::дело_ADJNOUN': 3,
 'один::иза_ADJNOUN': 3,
 'тот::исполняющий_ADJNOUN': 3,
 'сей::пора_ADJNOUN': 7,
 'этот::время_ADJNOUN': 3,
 'тот::время_ADJNOUN': 3,
 'тот::число_ADJNOUN': 3,
 'российский::федерация_ADJNOUN': 3,
 'такой::образ_ADJNOUN': 3,
 'весь::время_ADJNOUN': 3,
 'тот::пора_ADJNOUN': 3,
 'крайний::мера_ADJNOUN': 3,
 'их::век_ADJNOUN': 4,
 'его::исполняющий_ADJNOUN': 3,
 'её::век_ADJNOUN': 3,
 'всякий::случай_ADJNOUN': 3,
 'тот::век_ADJNOUN': 3,
 'этот::раз_ADJNOUN': 3,
 'весь::жизнь_ADJNOUN': 3,
 'тот::ли_ADJNOUN': 3,
 'этот::исполняющий_ADJNOUN': 5,
 'весь::век_ADJNOUN': 3,
 'другой::сторона_ADJNOUN': 3,
 'один::раз_ADJNOUN': 3,
 'настоящий::время_ADJNOUN': 3,
 'один::сторона_ADJNOUN': 3,
 'каждый::день_ADJNOUN': 4,
 'последний::время_ADJNOUN': 3,
 'его::секунда_ADJNOUN': 3,
 'свой::очередь_ADJNOUN': 8,
 'её::исполняющий_ADJNOUN': 9,
 'некоторый::время_ADJNOUN': 5,
 'его::кандидат_ADJNOUN': 3,
 'свой::время_ADJNOUN': 4,
 'другой::день_ADJNOUN':

In [165]:
df.to_csv('main_dataset.csv')

## Substitution of expressions' constituents

Masking each of constituents in contexts -> 2 variants for each context.

In [311]:
def masking(phrase, context):
    phrase_tokens = word_tokenize(phrase)
    context_tokens = word_tokenize(context)
    context_tokens_2 = copy.deepcopy(context_tokens)
    phrase_lemmas = [m.parse(w)[0].normal_form for w in phrase_tokens]
    context_lemmas = [m.parse(w.lower())[0].normal_form for w in context_tokens]
    if phrase_lemmas[0] in context_lemmas and phrase_lemmas[1] in context_lemmas:
        context_tokens[context_lemmas.index(phrase_lemmas[0])] = '<mask>'
        context_tokens_2[context_lemmas.index(phrase_lemmas[1])] = '<mask>'
    else:
        return np.nan, np.nan
    result1 = ' '.join(context_tokens)
    result2 = ' '.join(context_tokens_2)
    return result1, result2


In [None]:
masked = df.apply(lambda row: masking(row['Phrase'], row['Контекст 1']), axis=1)

In [314]:
df = pd.concat([df, masked], axis=1)
df = df.rename(columns={0:'Masked'})

In [315]:
df['Left_masked'] = df['Masked'].apply(lambda k: k[0])
df['Right_masked'] = df['Masked'].apply(lambda k: k[1])
df = df.drop(columns=['Masked'])
df = df.dropna(subset=['Left_masked', 'Right_masked'])

Substitution via `sberbank-ai/ruRoberta-large`.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/ruRoberta-large")
model = AutoModelForMaskedLM.from_pretrained("sberbank-ai/ruRoberta-large")
nlp_fill = FillMaskPipeline(model, tokenizer)

In [None]:
def substitution(word, context):
    if context == np.nan:
        return np.nan
    substs = nlp_fill(context, top_k=5)
    substs_str = [s['token_str'].strip() for s in substs]
    for s in substs_str:
        if m.parse(s)[0].normal_form == m.parse(word[0])[0].normal_form:
            continue
    else:
        return context.replace('<mask>', s)

In [None]:
df['Left_var'] = df.apply(lambda row: substitution(row['Часть 1'], row['Left_masked']), axis=1)
df['Right_var'] = df.apply(lambda row: substitution(row['Часть 2'], row['Right_masked']), axis=1)

## Universal Sentence Encoder (USE)

Selecting values that shoud be vectorized by USE.

In [None]:
for_use = df[['Часть 1', 'Часть 2', 'Phrase', 'Контекст 1', 'Left_var', 'Right_var']].to_dict('records')

In [None]:
all_for_use = []
for f in tqdm(for_use_sub):
    all_for_use.extend(list(f.values()))

Vectorizing via `google/universal-sentence-encoder-multilingual-large/2`.

In [None]:
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/2')
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embeddings = session.run(embed(all_for_use))['outputs']

In [None]:
embs_from_use = []
embeddings = list(embeddings)
for i in range(0, len(embeddings), 6):
    use_dict = {}
    use_dict['USE_Часть 1'] = embeddings[i]
    use_dict['USE_Часть 2'] = embeddings[i+1]
    use_dict['USE_Phrase'] = embeddings[i+2]
    use_dict['USE_Контекст 1'] = embeddings[i+3]
    use_dict['USE_Left_var'] = embeddings[i+4]
    use_dict['USE_Right_var'] = embeddings[i+5]
    embs_from_use.append(use_dict)
use_df = pd.DataFrame(embs_from_use)
df = pd.concat([df, use_df], axis=1)
# df.to_csv('main_dataset_final.csv')

## Experiments

In [322]:
df = pd.read_csv('main_dataset_final.csv')

In [323]:
def converter(vec_str):
    if type(vec_str) != float:
        vec_str = vec_str.strip('[ ')
        vec_str = vec_str.strip('] ')
        return np.array([float(n) for n in vec_str.split()])
    else:
        return vec_str

In [324]:
for c in df.columns:
    if c.startswith('USE') or c.startswith('w2v_L') or c.startswith('w2v_B'):
        df[c] = df[c].apply(lambda k: converter(k))

Calculating of missing cosine similarities.

In [325]:
df['USE_sim_1'] = df.apply(lambda row: 1 - distance.cosine(row['USE_Часть 1'], row['USE_Phrase']), axis=1)
df['USE_sim_2'] = df.apply(lambda row: 1 - distance.cosine(row['USE_Часть 2'], row['USE_Phrase']), axis=1)
df['USE+BERT_sim_1'] = df.apply(lambda row: 1 - distance.cosine(row['USE_Left_var'], row['USE_Контекст 1']), axis=1)
df['USE+BERT_sim_2'] = df.apply(lambda row: 1 - distance.cosine(row['USE_Right_var'], row['USE_Контекст 1']), axis=1)

Sampling.

In [326]:
new_df = df[df['Композициональность'] < 2]

In [327]:
feats_w2v_sim = new_df.dropna().loc[:, ['w2v_sim_1', 'w2v_sim_2']].values.tolist()

In [347]:
feats_use_sim = new_df.dropna().loc[:, ['USE_sim_1','USE_sim_2']].values.tolist()

In [348]:
def embeddings_feats(embeddings):
    feats_new = []
    for f in embeddings:
        lst = []
        for fe in f:
            if type(fe) == str:
                lst.append(fe)
            else:
                try:
                    lst.extend(fe)
                except:
                    lst.append(fe)
        feats_new.append(lst)
    return feats_new

In [349]:
feats_w2v = embeddings_feats(new_df.dropna().loc[:, ['w2v_Lemma 1', 
                                                     'w2v_Lemma 2', 
                                                     'w2v_Bigram']].values.tolist())

feats_use = embeddings_feats(new_df.dropna().loc[:, ['USE_Часть 1', 'USE_Часть 2', 
                                            'USE_Phrase', 'USE_Контекст 1']].values.tolist())

feats_use_wo_context = embeddings_feats(new_df.dropna().loc[:, ['USE_Часть 1', 
                                                       'USE_Часть 2', 
                                                       'USE_Phrase']].values.tolist())

In [350]:
targets_w2v = new_df.dropna()['Композициональность'].values.tolist()
targets_use = new_df.dropna()['Композициональность'].values.tolist()

### Classification

Function for using ML: `train_test_split`, initializing of ML method, fitting, calcucating classification metrics.

In [370]:
def classify(data, targets, fi=False):  
    train_x, test_x, train_y, test_y = train_test_split(data, targets, 
                                                        test_size=0.15, random_state=3)
    train_words, train_x = get_words(train_x)
    test_words, test_x = get_words(test_x)
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(train_x, train_y)
    print(classification_report(test_y, tree.predict(test_x)))
    if fi:
        feat_imps = tree.feature_importances_
        for i in range(4):
            print(sum(feat_imps[int(len(feat_imps) / 4 * i):int(len(feat_imps) / 4 * (i + 1))]))

#### Features: cosine similarities

CS: Word2Vec embeddings

In [371]:
classify(feats_w2v_sim, targets_w2v)

              precision    recall  f1-score   support

         0.0       0.62      0.67      0.64        24
         1.0       0.70      0.66      0.68        29

    accuracy                           0.66        53
   macro avg       0.66      0.66      0.66        53
weighted avg       0.66      0.66      0.66        53



CS: USE embeddings

In [372]:
classify(feats_use_sim, targets_use)

              precision    recall  f1-score   support

         0.0       0.38      0.46      0.42        24
         1.0       0.46      0.38      0.42        29

    accuracy                           0.42        53
   macro avg       0.42      0.42      0.42        53
weighted avg       0.42      0.42      0.42        53



#### Features: embeddings

Word2Vec embeddings

In [373]:
classify(feats_w2v, targets_w2v)

              precision    recall  f1-score   support

         0.0       0.75      0.88      0.81        24
         1.0       0.88      0.76      0.81        29

    accuracy                           0.81        53
   macro avg       0.81      0.82      0.81        53
weighted avg       0.82      0.81      0.81        53



USE embeddings with context

In [374]:
classify(feats_use, targets_use)

              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83        24
         1.0       0.86      0.86      0.86        29

    accuracy                           0.85        53
   macro avg       0.85      0.85      0.85        53
weighted avg       0.85      0.85      0.85        53



USE embeddings without context

In [375]:
classify(feats_use_wo_context, targets_use)

              precision    recall  f1-score   support

         0.0       0.72      0.75      0.73        24
         1.0       0.79      0.76      0.77        29

    accuracy                           0.75        53
   macro avg       0.75      0.75      0.75        53
weighted avg       0.76      0.75      0.76        53



### Linear Regression

CS: Word2Vec embeddings

In [378]:
clf = LinearRegression(positive=True)
clf.fit(feats_w2v_sim, targets_w2v)
print('MSE:', round(mean_squared_error(targets_w2v, clf.predict(feats_w2v_sim)), 2), 
      'MAE:', round(mean_absolute_error(targets_w2v, clf.predict(feats_w2v_sim)), 2))

MSE: 0.22 MAE: 0.45


In [379]:
phrases = new_df.dropna()['Phrase'].reset_index(drop=True)

Table with scores

In [382]:
lr = pd.concat([pd.Series(clf.predict(feats_w2v_sim)), 
           phrases], axis=1).sort_values(by=[0], ascending=False)
lr[lr[0] < 1]

Unnamed: 0,0,Phrase
276,0.993950,апелляционная инстанция
226,0.973234,акционерное общество
283,0.935329,арбузная корка
320,0.932819,беспилотный аппарат
186,0.921375,авторитарная власть
...,...,...
80,0.219837,мировая судья
2,0.205752,бархатная революция
105,0.193058,открытое письмо
97,0.172262,оранжевая революция


In [383]:
lr.to_csv('lr.csv')

CS: USE embeddings

In [386]:
clf2 = LinearRegression(positive=True)
clf2.fit(feats_use_sim, targets_use)
print('MSE:', round(mean_squared_error(targets_use, clf2.predict(feats_use_sim)), 2), 
      'MAE:', round( mean_absolute_error(targets_use, clf2.predict(feats_use_sim)), 2))

MSE: 0.24 MAE: 0.49


### Clustering

USE embeddings with context

In [388]:
kmeans = KMeans(n_clusters=3, random_state=3).fit(feats_use)
labels = kmeans.labels_
print('ARI:', adjusted_rand_score(targets_use, labels))
phrases2 = new_df['Phrase']


ARI: 0.14553640780295107


Table of clusters

In [389]:
cl = pd.concat([pd.Series(labels), pd.Series(phrases2), pd.Series(targets_use)], axis=1)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(cl.sort_values(by=0))

       0                           Phrase    1
0    0.0           абсолютное большинство  0.0
141  0.0                      первая леди  0.0
142  0.0                     первое время  0.0
144  0.0                     первый канал  0.0
145  0.0                переходный период  0.0
146  0.0                переходный металл  0.0
147  0.0                 пограничный слой  0.0
148  0.0                политическая сила  0.0
149  0.0                   полная катушка  0.0
150  0.0                    почтовый ящик  0.0
140  0.0                  пенсионный фонд  0.0
151  0.0                      правый рука  0.0
155  0.0             промышленный шпионаж  0.0
158  0.0                пулемётная трасса  0.0
159  0.0                     рабочая сила  0.0
160  0.0                   разная область  0.0
161  0.0                   реальное время  0.0
162  0.0                  римская империя  0.0
163  0.0                     римский папа  0.0
164  0.0               российская сторона  0.0
165  0.0     