In [2]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
%config InlineBackend.figure_formats = ['retina']
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import fbeta_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

## Scripts for classification and scoring

Originally, we tried 11 different classifiers, but eventually decided to report scores only for the top 3: Logistic Regression, Linear SVM and Neural Network. Uncomment lines with classifiers and their names if you wou want to try more than three classifiers.

In [124]:
names = ["Logistic Regression", 
         #"Logisitic Regression SGD", 
         "Linear SVM", 
         #"RBF SVM", 
         #"Naive Bayes", 
         #"Gaussian Process", 
         #"Decision Tree", 
         #"Random Forest", 
         "Neural Net", 
         #"AdaBoost", 
         #"Nearest Heighbors"
         ]

classifiers = [LogisticRegression(class_weight='balanced', solver='liblinear', fit_intercept=True, max_iter=10000),
               #linear_model.SGDClassifier(max_iter=50000, tol=1e-3, loss='log', class_weight='balanced'),
               SVC(kernel="linear", C=0.025, max_iter=10000),
               #SVC(gamma=2), 
               #GaussianNB(),
               #GaussianProcessClassifier(1.0 * RBF(1.0)),
               #DecisionTreeClassifier(max_depth=10),
               #RandomForestClassifier(max_depth=10, n_estimators=10),
               MLPClassifier(alpha=0.1, max_iter=5000, learning_rate='adaptive'),
               #AdaBoostClassifier(),
               #KNeighborsClassifier(n_neighbors=2)
               ]

In [168]:
def train(X, y, k_fold=5):
    X = np.array(X)
    y = np.array(y)

    train_results = {}    # {'clf': {'accuracy':[], 'precision':[], 'recall':[], 'f1':[])}

    #kf = KFold(k_fold, shuffle=True, random_state=42)
    kf = StratifiedKFold(k_fold, shuffle=True, random_state=42)

    fold_num = 1

    for train_ind, val_ind in kf.split(X, y):
        # Assign CV IDX
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]
        
        # Scale Data
        # scaler = StandardScaler()
        scaler = MinMaxScaler()
        X_train_scale = scaler.fit_transform(X_train)
        X_val_scale = scaler.transform(X_val)
        
        # print("Fold num: ", fold_num)

        for name, clf in zip(names, classifiers):

            if name not in train_results:
                train_results[name] = {'accuracy':[], 'precision':[], 'recall':[], 'f1':[]} # 'train_time':[]}

            # print("Training: ", name)
            #start_time = time.time()

            model = clf.fit(X_train_scale, y_train)
            y_pred = model.predict(X_val_scale)

            train_results[name]['accuracy'].append(accuracy_score(y_val, y_pred))
            train_results[name]['precision'].append(precision_score(y_val, y_pred))
            train_results[name]['recall'].append(recall_score(y_val, y_pred))
            train_results[name]['f1'].append(f1_score(y_val, y_pred))
            #train_results[name]['train_time'].append(time.time() - start_time)

            #print("Run Time: ", time.time() - start_time)
        
        fold_num += 1
        # print()
    return train_results

In [96]:
 def scores(results, mode='print'):   
    '''Print or return metric report for all tested classifiers
    '''
    if mode == 'return':
        result = ""
        for clf, scores in results.items():
            result += "Scores for {}\n".format(clf)
            result += f"\t Train accuracy: {np.mean(scores['accuracy']):.3f} +- {np.std(scores['accuracy']):.3f}\n"
            result += f"\t Train precision: {np.mean(scores['precision']):.3f} +- {np.std(scores['precision']):.3f}\n"
            result += f"\t Train recall: {np.mean(scores['recall']):.3f} +- {np.std(scores['recall']):.3f}\n"
            result += f"\t Train f1-score: {np.mean(scores['f1']):.3f} +- {np.std(scores['f1']):.3f}\n"
        return result
        
    elif mode == 'print':
        for clf, scores in results.items():

            print("Scores for ", clf)
            print(f"\t Train accuracy: {np.mean(scores['accuracy']):.3f} +- {np.std(scores['accuracy']):.3f}")
            print(f"\t Train precision: {np.mean(scores['precision']):.3f} +- {np.std(scores['precision']):.3f}")
            print(f"\t Train recall: {np.mean(scores['recall']):.3f} +- {np.std(scores['recall']):.3f}")
            print(f"\t Train f1-score: {np.mean(scores['f1']):.3f} +- {np.std(scores['f1']):.3f}")


In [35]:
def save_log(model_name, results):
    # Print metric report for all tested classifiers
    with open(model_name+"train.txt", "w") as f:
        print(model_name, file=f)
        for clf, scores in results.items():
            print("Scores for ", clf, file=f)
            print(f"\t Train accuracy: {np.mean(scores['accuracy']):.3f} +- {np.std(scores['accuracy']):.3f}", file=f)
            print(f"\t Train precision: {np.mean(scores['precision']):.3f} +- {np.std(scores['precision']):.3f}", file=f)
            print(f"\t Train recall: {np.mean(scores['recall']):.3f} +- {np.std(scores['recall']):.3f}", file=f)
            print(f"\t Train f1-score: {np.mean(scores['f1']):.3f} +- {np.std(scores['f1']):.3f}", file=f)
            #print(f"\t Train time: {np.sum(scores['train_time']):.3f}", file=f)
#save_log(model_name, train_results)

## Loading Dataset with Topic Features from ARTM

In [85]:
df = pd.read_csv('./artm/thetas/metcorp_tm_dense_280.csv', index_col=None)
df.head(3)

Unnamed: 0.1,Unnamed: 0,sents,targets,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,topic_70,topic_71,topic_72,topic_73,topic_74,topic_75,topic_76,topic_77,topic_78,topic_79
0,0,нужно_PRED весь_ADJF время_NOUN бомбардировать...,1,0.011593,0.0117,0.012471,0.011675,0.025298,0.011624,0.0114,...,0.017775,0.011436,0.011364,0.011415,0.012276,0.011373,0.011364,0.01199,0.011616,0.012746
1,1,добрынин_NOUN говорить_VERB шевченко_NOUN цент...,1,0.01057,0.010208,0.010947,0.010107,0.014802,0.011982,0.010549,...,0.010908,0.009851,0.013713,0.01023,0.012501,0.010489,0.009813,0.01072,0.014215,0.010244
2,2,принять_INFN внимание_NOUN настойчиво_ADVB гру...,1,0.009617,0.008274,0.007861,0.007788,0.018796,0.010173,0.008079,...,0.008647,0.007684,0.007576,0.009606,0.009331,0.008243,0.008101,0.007904,0.008173,0.007644


In [164]:
X = df.iloc[:, 3:53].values.tolist()
y = df['targets']
len(X), len(y)

(7077, 7077)

In [169]:
train_results = train(X, y)
scores(train_results)
save_log(model_name="Metcorp_clf_sklearn", results=train_results)

Scores for  Logistic Regression
	 Train accuracy: 0.699 +- 0.008
	 Train precision: 0.691 +- 0.008
	 Train recall: 0.721 +- 0.018
	 Train f1-score: 0.705 +- 0.010
Scores for  Linear SVM
	 Train accuracy: 0.670 +- 0.006
	 Train precision: 0.631 +- 0.004
	 Train recall: 0.819 +- 0.015
	 Train f1-score: 0.713 +- 0.007
Scores for  Neural Net
	 Train accuracy: 0.698 +- 0.008
	 Train precision: 0.691 +- 0.009
	 Train recall: 0.715 +- 0.015
	 Train f1-score: 0.703 +- 0.009


## Bulk train on multiple ARTM features

In [3]:
def clear_stdin():
    if hasattr(tqdm, '_instances'):
        for instance in list(tqdm._instances):
            tqdm._decr_instances(instance)

In [214]:
def bulk_train(theta_table_names, theta_dir='', save_log_name='bulk_train_output.txt'):
    '''
    Train and evaluate classifiers on different feature tables.
    
    Args:
        theta_table_names:      list of all theta dataframe names, where features are in [3:52] columns,
                                column 1 - sentences, column 2 - labels.
        theta_dir:              set root dir for theta tables, if they are not in the same dir 
                                as the notebook. Otherwise, leave as is.
        save_log_name:          filename to save the logs. You can mention the relative/absolute path as well,
                                but make sure the directory exists on disk or elsewise an error might pop up
    '''
    clear_stdin()
    bulk_results = []
    
    for theta in tqdm(theta_table_names):
        features = pd.read_csv(theta_dir+theta, index_col=None)
        
        X = features.iloc[:, 3:53].values.tolist()
        y = features['targets']
        # X = df.values.tolist()    # if it is only theta table
        
        train_results = train(X, y)
        model_results = scores(train_results, mode='return')
        bulk_results.append(f'{theta}\n{model_results}')
        
    
    with open(save_log_name, 'wt', encoding='utf-8') as f:
        for i in bulk_results:
            print(i, file=f)
            print('='*30, file=f)
    

In [215]:
topic_nums=[40,50,60,70,80,90]

In [216]:
bulk_train(theta_table_names=['metcorp_tm_sparse{}.csv'.format(i) for i in topic_nums], theta_dir='./artm/thetas/')

100%|██████████| 6/6 [01:08<00:00, 11.46s/it]


## Milti-feature Classifier
### TM + lex / morph / concr-absrt

In [4]:
from tqdm import tqdm

import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import ast, csv
from statistics import mean
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from metcorp_utils import compute_statistics, assign_scores, freq_table   # custom

In [5]:
# indexes that were eliminated from metaphor corpus
emptied_indexes = [325, 379, 417, 909, 914, 1067, 1146, 1193, 1214, 1301, 1325, 1393, 1398, 1412, 1826,
                   1830, 1864, 1891, 2015, 2016, 2017, 2051, 2078, 2080, 2081, 2086, 2138, 2154, 2178, 
                   2229, 2296, 2425, 2945, 3116, 3128, 3437, 3685, 4036, 4182, 4183, 4770, 4809, 4928, 
                   4984, 5039, 5134, 5136, 5228, 5248, 5322, 5493, 5543, 6059, 6067, 6093, 6109, 6218, 
                   6232, 6288, 6301, 6461, 6663, 6769, 6924, 7136]

In [89]:
# Load and Delete rows that were eliminated when retrieving features for the metcorp (1-2 word ones).
infile = r'lex_pos140420.csv'
df = pd.read_csv(infile, sep = '\t', index_col = 0)
df = df.drop([df.index[i] for i in emptied_indexes]).reset_index()
print(len(df))
df.head(3)

7077


Unnamed: 0,index,class,sentid,lemmas,pos
0,0,1,бомбардировать#1,"['время', 'ребенок', 'музыка']","['ADV', 'PART', ',', 'SPRO', 'PART', 'ADV', 'A..."
1,1,1,бомбардировать#2,"['добрынин', 'говорить', 'шевченко', 'центр', ...","['S anim nom', 'ADV', 'V ipf praet indic', 'S ..."
2,2,1,бомбардировать#3,"['принять', 'внимание', 'настойчиво', 'группа'...","['CONJ', 'V pf - inf', 'ВО', 'S inan acc', ','..."


## Compute Concreteness / Abstractness features in Metaphor corpus

In [7]:
concr = pd.read_csv('concretness5.csv', sep='\t')

In [8]:
concr.head()

Unnamed: 0.1,Unnamed: 0,words_perf_verbs,words_imperf_verbs,things_concr_k10,abstr_concr_k10,mean
0,0,гребенка_NOUN,гребенка_NOUN,0.300831,0.186778,0.243805
1,1,бюджет_NOUN,бюджет_NOUN,0.170042,0.259276,0.214659
2,2,письмо_NOUN,письмо_NOUN,0.260119,0.216631,0.238375
3,3,правительство_NOUN,правительство_NOUN,0.142006,0.263686,0.202846
4,4,зарплата_NOUN,зарплата_NOUN,0.166051,0.247007,0.206529


In [9]:
'письмо_NOUN' in concr['words_perf_verbs'].values

True

In [21]:
pos_lemma = pd.read_csv('metcorp.csv')

In [22]:
import re
def remove_tags(text):
    text = re.sub('_[A-Z]+', '', text)
    return text

In [23]:
pos_lemma['sents'] = [remove_tags(i) for i in pos_lemma['sents']]
pos_lemma.head()

Unnamed: 0,sents,targets
0,нужно весь время бомбардировать ребёнок музыка,1
1,добрынин говорить шевченко центр бомбардироват...,1
2,принять внимание настойчиво группа особый инте...,1
3,кроме покупка рука сохранять природный ресурс ...,1
4,сигнал настойчиво бомбардировать день придать ...,1


In [24]:
pos_lemma.to_csv('lemma_targ.csv')

In [107]:
concr['words_perf_verbs'] = [remove_tags(i) for i in concr['words_perf_verbs']]
concr['words_imperf_verbs'] = [remove_tags(i) for i in concr['words_imperf_verbs']]

In [108]:
concr.head()

Unnamed: 0.1,Unnamed: 0,words_perf_verbs,words_imperf_verbs,things_concr_k10,abstr_concr_k10,mean
0,0,гребенка,гребенка,0.300831,0.186778,0.243805
1,1,бюджет,бюджет,0.170042,0.259276,0.214659
2,2,письмо,письмо,0.260119,0.216631,0.238375
3,3,правительство,правительство,0.142006,0.263686,0.202846
4,4,зарплата,зарплата,0.166051,0.247007,0.206529


In [117]:
concr.loc[concr['words_perf_verbs'] == 'бюджет', 'things_concr_k10'].values

array([0.17004171])

In [123]:
def count_concr(corpus, df):
    
    global_concr = []
    global_abstr = []
    
    for seq in corpus:
        local_concr = []
        local_abstr = []
        for word in seq.split():
            if word in df['words_perf_verbs'].values:
                local_concr.append(df.loc[df['words_perf_verbs'] == word, 'things_concr_k10'].values[0])
                local_abstr.append(df.loc[df['words_perf_verbs'] == word, 'abstr_concr_k10'].values[0])
            
            elif word in df['words_imperf_verbs'].values:
                local_concr.append(df.loc[df['words_imperf_verbs'] == word, 'things_concr_k10'].values[0])
                local_abstr.append(df.loc[df['words_imperf_verbs'] == word, 'abstr_concr_k10'].values[0])
                
        global_concr.append(np.mean(local_concr))  
        global_abstr.append(np.mean(local_abstr))
    
    return global_concr, global_abstr

In [125]:
global_concr, global_abstr = count_concr(list(pos_lemma['sents']), concr)

In [130]:
df['concr'] = global_concr
df['abstr'] = global_abstr

In [131]:
df.to_csv('pos-lex-abstr.csv')

In [134]:
df.head(3)

Unnamed: 0,index,class,sentid,lemmas,pos,concr,abstr
0,0,1,бомбардировать#1,"['время', 'ребенок', 'музыка']","['ADV', 'PART', ',', 'SPRO', 'PART', 'ADV', 'A...",0.246517,0.213053
1,1,1,бомбардировать#2,"['добрынин', 'говорить', 'шевченко', 'центр', ...","['S anim nom', 'ADV', 'V ipf praet indic', 'S ...",0.181383,0.199301
2,2,1,бомбардировать#3,"['принять', 'внимание', 'настойчиво', 'группа'...","['CONJ', 'V pf - inf', 'ВО', 'S inan acc', ','...",0.152103,0.233698


## Embeddings from BERT

In [14]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.nn.utils.rnn import pad_sequence

In [15]:
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer.vocab_size

119547

In [17]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=1)

In [18]:
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', config=config).to(device)

In [25]:
input_ids = [torch.tensor(tokenizer.encode(i, add_special_tokens=True)) 
                  for i in list(pos_lemma['sents'])]

In [26]:
clear_stdin()
sentence_embeddings = []

for i in tqdm(input_ids):
    _, outputs = model(i.unsqueeze(0).to(device))
    sentence_embeddings.append(torch.mean(outputs[0], dim=1))

100%|██████████| 7077/7077 [01:05<00:00, 108.87it/s]


In [197]:
pd.DataFrame([i.tolist() for i in sentence_embeddings]).to_csv('bert_embeds.csv')

In [184]:
torch.mean(outputs[1][0], dim=1).shape

torch.Size([5, 768])

## Train on multiple features

In [102]:
theta_df_1 = pd.read_csv('./artm/thetas/metcorp_tm_lda40.csv', index_col=None)
theta_df_2 = pd.read_csv('./artm/thetas/metcorp_tm_sparse40.csv', index_col=None)
theta_df_3 = pd.read_csv('./artm/thetas/metcorp_tm_dense_280.csv', index_col=None)

In [30]:
concr = pd.read_csv('pos-lex-abstr.csv')

In [12]:
verb_dict = ['бомбардировать', 'доить', 'греть', 'нападать', 
             'очертить', 'отрубить', 'пилить', 
             'подхватывать', 'причесать', 'распылять', 
             'разбавлять', 'съедать', 'трубить', 'уколоть', 
             'утюжить', 'выкраивать', 'взорвать', 
             'взвесить', 'зажигать', 'жонглировать']

In [27]:
def train_and_predict(train_list, test_list, clf, train_fold_labels):
    train = pd.concat(train_list, axis=1)
    test = pd.concat(test_list, axis=1)
    pipeline = Pipeline([('scaler', MinMaxScaler()), ('clf', clf)])
    pipeline.fit(train, train_fold_labels)
    predictions = pipeline.predict(test)
    return predictions

In [28]:
bert_embs = np.array([i.cpu().tolist() for i in sentence_embeddings]).squeeze(1)

In [29]:
bert_embs.shape

(7077, 768)

In [103]:
y = df['class'].values

X_lex = df['lemmas']
X_pos = df['pos']
X_conc = concr[['concr', 'abstr']].values

X_tm_1 = theta_df_1.iloc[:, 3:].values
X_tm_2 = theta_df_2.iloc[:, 3:].values
X_tm_3 = theta_df_3.iloc[:, 3:].values

In [9]:
len(y), len(X_lex), len(X_pos), len(theta_df)

(7077, 7077, 7077, 7077)

In [36]:
#X_tm_1.head()

## Training

Training script is at the end of this notebook for convenience

### Linear SVC

In [115]:
# LDA
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=None, verb_dict=verb_dict)

Accuracy LEX:  0.8164476473081814
Accuracy POS:  0.675710046629928
Accuracy CONC:  0.7172530733361594
Accuracy EMB:  0.7195139183269748
Accuracy LEX+EMB:  0.8074042673449202
Accuracy LEX+POS:  0.8204041260421082
Accuracy LEX+CONC:  0.8326974706796665
Accuracy LEX+POS+CONC:  0.8352409212943338
Accuracy LEX+POS+EMB:  0.8124911685742546
Accuracy LEX+EMB+CONC:  0.8117846545146249
Accuracy LEX+POS+EMB+CONC:  0.817578069803589


In [59]:
# ARTM sparse
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_2, skip_non_tm=True, verb_dict=verb_dict)

Accuracy LEX+TM:  0.8321322594319627
Accuracy POS+TM:  0.6149498375017662
Accuracy CONC+TM:  0.7508831425745373
Accuracy EMB+TM:  0.7255899392397909
Accuracy EMB+CONC+TM:  0.5270594884838208
Accuracy LEX+EMB+TM:  0.8124911685742546
Accuracy LEX+CONC+TM:  0.8372191606612972
Accuracy LEX+POS+TM:  0.8346757100466299
Accuracy LEX+POS+CONC+TM:  0.8404691253355941
Accuracy LEX+POS+CONC+EMB+TM:  0.8133389854458104


In [104]:
# ARTM dense
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_3, skip_non_tm=True, verb_dict=verb_dict)

Accuracy LEX+TM:  0.834534407234704
Accuracy POS+TM:  0.5971456831990957
Accuracy CONC+TM:  0.76162215628091
Accuracy EMB+TM:  0.7315246573406811
Accuracy EMB+CONC+TM:  0.523103009749894
Accuracy LEX+EMB+TM:  0.8075455701568461
Accuracy LEX+CONC+TM:  0.8400452168998163
Accuracy LEX+POS+TM:  0.8394800056521124
Accuracy LEX+POS+CONC+TM:  0.8452734209410767
Accuracy LEX+POS+CONC+EMB+TM:  0.8117846545146249


### Logistic Regression

In [117]:
# LDA
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_1, skip_non_tm=False, 
              verb_dict=verb_dict, clf='logreg')

Accuracy LEX:  0.8172954641797372
Accuracy POS:  0.6748622297583722
Accuracy CONC:  0.7158400452168998
Accuracy EMB:  0.7309594460929772
Accuracy LEX+EMB:  0.8483820828034477
Accuracy LEX+POS:  0.8201215204182564
Accuracy LEX+CONC:  0.8326974706796665
Accuracy LEX+POS+CONC:  0.8349583156704818
Accuracy LEX+POS+EMB:  0.8530450755970044
Accuracy LEX+EMB+CONC:  0.854316800904338
Accuracy LEX+POS+EMB+CONC:  0.8584145824501908
Accuracy LEX+TM:  0.8332626819273704
Accuracy POS+TM:  0.5407658612406386
Accuracy CONC+TM:  0.7477744807121661
Accuracy EMB+TM:  0.7469266638406105
Accuracy EMB+CONC+TM:  0.5256464603645613
Accuracy LEX+EMB+TM:  0.8540341952804861
Accuracy LEX+CONC+TM:  0.8359474353539635
Accuracy LEX+POS+TM:  0.8331213791154444
Accuracy LEX+POS+CONC+TM:  0.8369365550374452
Accuracy LEX+POS+CONC+EMB+TM:  0.8558711318355235


In [118]:
# ARTM sparse
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_2, skip_non_tm=False, 
              verb_dict=verb_dict, clf='logreg')

Accuracy LEX:  0.8172954641797372
Accuracy POS:  0.6748622297583722
Accuracy CONC:  0.7158400452168998
Accuracy EMB:  0.7309594460929772
Accuracy LEX+EMB:  0.8483820828034477
Accuracy LEX+POS:  0.8201215204182564
Accuracy LEX+CONC:  0.8326974706796665
Accuracy LEX+POS+CONC:  0.8349583156704818
Accuracy LEX+POS+EMB:  0.8530450755970044
Accuracy LEX+EMB+CONC:  0.854316800904338
Accuracy LEX+POS+EMB+CONC:  0.8584145824501908
Accuracy LEX+TM:  0.8319909566200367
Accuracy POS+TM:  0.5517874805708634
Accuracy CONC+TM:  0.7411332485516462
Accuracy EMB+TM:  0.745372332909425
Accuracy EMB+CONC+TM:  0.5259290659884132
Accuracy LEX+EMB+TM:  0.8536102868447082
Accuracy LEX+CONC+TM:  0.8348170128585559
Accuracy LEX+POS+TM:  0.8324148650558146
Accuracy LEX+POS+CONC+TM:  0.8346757100466299
Accuracy LEX+POS+CONC+EMB+TM:  0.854175498092412


In [119]:
# ARTM dense
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_3, skip_non_tm=False, 
              verb_dict=verb_dict, clf='logreg')

Accuracy LEX:  0.8172954641797372
Accuracy POS:  0.6748622297583722
Accuracy CONC:  0.7158400452168998
Accuracy EMB:  0.7309594460929772
Accuracy LEX+EMB:  0.8483820828034477
Accuracy LEX+POS:  0.8201215204182564
Accuracy LEX+CONC:  0.8326974706796665
Accuracy LEX+POS+CONC:  0.8349583156704818
Accuracy LEX+POS+EMB:  0.8530450755970044
Accuracy LEX+EMB+CONC:  0.854316800904338
Accuracy LEX+POS+EMB+CONC:  0.8584145824501908
Accuracy LEX+TM:  0.8185671894870707
Accuracy POS+TM:  0.6263953652677688
Accuracy CONC+TM:  0.7312420517168292
Accuracy EMB+TM:  0.7353398332626819
Accuracy EMB+CONC+TM:  0.5221138900664123
Accuracy LEX+EMB+TM:  0.8490885968630776
Accuracy LEX+CONC+TM:  0.827893175074184
Accuracy LEX+POS+TM:  0.8199802176063303
Accuracy LEX+POS+CONC+TM:  0.8287409919457397
Accuracy LEX+POS+CONC+EMB+TM:  0.8520559559135227


### Neural Network (MLP)

In [113]:
# LDA
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=None, skip_non_tm=False, 
              verb_dict=verb_dict, clf='nn')

Accuracy LEX:  0.8171541613678113
Accuracy POS:  0.6762752578776318
Accuracy CONC:  0.7173943761480853
Accuracy EMB:  0.7407093401158683
Accuracy LEX+EMB:  0.8318496538081108
Accuracy LEX+POS:  0.8199802176063303
Accuracy LEX+CONC:  0.8332626819273704
Accuracy LEX+POS+CONC:  0.834393104422778
Accuracy LEX+POS+EMB:  0.844566906881447
Accuracy LEX+EMB+CONC:  0.8384908859686308
Accuracy LEX+POS+EMB+CONC:  0.8485233856153738


In [65]:
# ARTM sparse
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_2, skip_non_tm=True, 
              verb_dict=verb_dict, clf='nn')

Accuracy LEX+TM:  0.831001836936555
Accuracy POS+TM:  0.5954500494559841
Accuracy CONC+TM:  0.7534265931892045
Accuracy EMB+TM:  0.7346333192030522
Accuracy EMB+CONC+TM:  0.5276246997315247
Accuracy LEX+EMB+TM:  0.8362300409778155
Accuracy LEX+CONC+TM:  0.8331213791154444
Accuracy LEX+POS+TM:  0.8322735622438886
Accuracy LEX+POS+CONC+TM:  0.8365126466016674
Accuracy LEX+POS+CONC+EMB+TM:  0.8492298996750035


In [106]:
# ARTM dense
train_process(X_lex, X_pos, X_conc, y, X_emb=bert_embs, X_tm=X_tm_3, skip_non_tm=True, 
              verb_dict=verb_dict, clf='nn')

Accuracy LEX+TM:  0.8230888794687015
Accuracy POS+TM:  0.6133955065705807
Accuracy CONC+TM:  0.7542744100607602
Accuracy EMB+TM:  0.7103292355517875
Accuracy EMB+CONC+TM:  0.5228204041260421
Accuracy LEX+EMB+TM:  0.8399039140878903
Accuracy LEX+CONC+TM:  0.8321322594319627
Accuracy LEX+POS+TM:  0.8252084216475908
Accuracy LEX+POS+CONC+TM:  0.8338278931750742
Accuracy LEX+POS+CONC+EMB+TM:  0.8332626819273704


In [116]:
def train_process(X_lex, X_pos, X_conc, y, X_emb, 
                  X_tm=False, skip_non_tm=False, verb_dict=None, k_folds=5, clf='svc'):
    
    kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=0)
    all_test_accuracies = {}
    all_results = {}
    all_outcomes = {}
    
    y = y.ravel()
    
    X_lex = X_lex
    X_pos = X_pos
    X_conc = X_conc
    X_emb = X_emb

    alltrue = []
    all_test_indices = []
    
    if not skip_non_tm:
        allpredictions_lex = []
        allpredictions_pos = []
        allpredictions_conc = []
        allpredictions_emb = []
        allpredictions_lex_pos = []
        allpredictions_lex_conc = []
        allpredictions_lex_emb = []
        allpredictions_lex_pos_conc = []
        allpredictions_lex_pos_emb = []
        allpredictions_lex_emb_conc = []
        allpredictions_lex_pos_emb_conc = []
    
    if X_tm is not None:
        X_tm = X_tm
        allpredictions_lex_tm = []
        allpredictions_pos_tm = []
        allpredictions_emb_tm = []
        allpredictions_conc_tm = []
        allpredictions_lex_emb_tm = []
        allpredictions_lex_pos_tm = []
        allpredictions_lex_conc_tm = []
        allpredictions_emb_conc_tm = []
        allpredictions_lex_pos_conc_tm = []
        allpredictions_lex_pos_emb_tm = []
        allpredictions_lex_pos_conc_emb_tm = []

    for train_index, test_index in kf.split(X_lex, y):
        lex_train_fold, lex_test_fold = X_lex[train_index], X_lex[test_index]
        lex_train_fold, lex_test_fold = [ast.literal_eval(x) for x in lex_train_fold], [ast.literal_eval(x) for x in lex_test_fold]

        pos_train_fold, pos_test_fold = X_pos[train_index], X_pos[test_index]
        pos_train_fold, pos_test_fold = [ast.literal_eval(x) for x in pos_train_fold], [ast.literal_eval(x) for x in pos_test_fold]
        
        conc_train, conc_test = X_conc[train_index], X_conc[test_index]
        emb_train, emb_test = X_emb[train_index], X_emb[test_index]
        
        scaler = MinMaxScaler()
        emb_train, emb_test = scaler.fit_transform(emb_train), scaler.transform(emb_test)
        
        if X_tm is not None:
            tm_train_fold, tm_test_fold = X_tm[train_index], X_tm[test_index]

        train_fold_labels, test_fold_labels = y[train_index], y[test_index]
        alltrue += test_fold_labels.tolist()
        
        all_test_indices += list(test_index)
        
        lex_train_pairs = list(zip(train_fold_labels, lex_train_fold))
        lex_test_pairs = list(zip(test_fold_labels, lex_test_fold))

        lex_train_freq_table = freq_table(lex_train_pairs, verb_dict)  
        lex_frequencies = lex_train_freq_table[0]
        lex_met_corpus_size = lex_train_freq_table[1]
        lex_nonmet_corpus_size = lex_train_freq_table[2]

        lex_train_statistics = compute_statistics(lex_frequencies, lex_met_corpus_size, 
                                                   lex_nonmet_corpus_size)

        lex_train = assign_scores(lex_train_fold, lex_train_statistics)
        lex_test = assign_scores(lex_test_fold, lex_train_statistics)

        pos_train_pairs = list(zip(train_fold_labels, pos_train_fold))
        pos_test_pairs = list(zip(test_fold_labels, pos_test_fold))

        pos_train_freq_table = freq_table(pos_train_pairs, verb_dict)        
        pos_frequencies = pos_train_freq_table[0]
        pos_met_corpus_size = pos_train_freq_table[1]
        pos_nonmet_corpus_size = pos_train_freq_table[2]

        pos_train_statistics = compute_statistics(pos_frequencies, pos_met_corpus_size, 
                                                   pos_nonmet_corpus_size)

        pos_train = assign_scores(pos_train_fold, pos_train_statistics)
        pos_test = assign_scores(pos_test_fold, pos_train_statistics)
        
        if clf == 'svc':
            clf = LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, 
                        C = 1000, multi_class = 'ovr', random_state=0)
        elif clf == 'nn':
            clf = MLPClassifier(alpha=0.1, max_iter=5000, learning_rate='adaptive')
        
        elif clf == 'logreg':
            clf = LogisticRegression(class_weight='balanced', solver='liblinear', 
                                     fit_intercept=True, max_iter=10000)
        if not skip_non_tm:
            clf_lex = clf       
            clf_pos = clf
            clf_conc = clf
            clf_emb = clf
            clf_lex_pos = clf
            clf_lex_conc = clf
            clf_lex_emb = clf
            clf_lex_pos_conc = clf
            clf_lex_pos_emb = clf
            clf_lex_emb_conc = clf
            clf_lex_pos_emb_conc = clf
        
            # LEX
            clf_lex.fit(lex_train, train_fold_labels)
            predictions_lex = clf_lex.predict(lex_test)

            # POS
            clf_pos.fit(pos_train, train_fold_labels)
            predictions_pos = clf_pos.predict(pos_test)

            # CONC
            clf_conc.fit(conc_train, train_fold_labels)
            predictions_conc = clf_conc.predict(conc_test)

            # EMB
            clf_emb.fit(emb_train, train_fold_labels)
            predictions_emb = clf_emb.predict(emb_test)

            # LEX + POS
            predictions_lex_pos = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(pos_train)],
                                                           (pd.DataFrame(lex_test), pd.DataFrame(pos_test)),
                                                           clf_lex_pos, train_fold_labels=train_fold_labels)
            # LEX + CONC
            predictions_lex_conc = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(conc_train)],
                                                           (pd.DataFrame(lex_test), pd.DataFrame(conc_test)),
                                                           clf_lex_conc, train_fold_labels=train_fold_labels)

            # LEX + EMB
            predictions_lex_emb = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(emb_train)],
                                                           (pd.DataFrame(lex_test), pd.DataFrame(emb_test)),
                                                           clf_lex_emb, train_fold_labels=train_fold_labels)

            # LEX + POS + CONC
            predictions_lex_pos_conc = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(pos_train), 
                                                     pd.DataFrame(conc_train)], (pd.DataFrame(lex_test), 
                                                     pd.DataFrame(pos_test), pd.DataFrame(conc_test)),
                                                     clf_lex_pos_conc, train_fold_labels=train_fold_labels)
            # LEX + POS + EMB
            predictions_lex_pos_emb = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(pos_train), 
                                                     pd.DataFrame(emb_train)], (pd.DataFrame(lex_test), 
                                                     pd.DataFrame(pos_test), pd.DataFrame(emb_test)),
                                                     clf_lex_pos_emb, train_fold_labels=train_fold_labels)

            # LEX + EMB + CONC
            predictions_lex_emb_conc = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(emb_train), 
                                                     pd.DataFrame(conc_train)], (pd.DataFrame(lex_test), 
                                                     pd.DataFrame(emb_test), pd.DataFrame(conc_test)),
                                                     clf_lex_emb_conc, train_fold_labels=train_fold_labels)
            
            # LEX + EMB + CONC
            predictions_lex_pos_emb_conc = train_and_predict([pd.DataFrame(lex_train), pd.DataFrame(pos_train), 
                                                     pd.DataFrame(emb_train), pd.DataFrame(conc_train)], 
                                                     (pd.DataFrame(lex_test), pd.DataFrame(pos_test),
                                                     pd.DataFrame(emb_test), pd.DataFrame(conc_test)),
                                                     clf_lex_pos_emb_conc, train_fold_labels=train_fold_labels)

            allpredictions_lex += list(predictions_lex)
            allpredictions_pos += list(predictions_pos)
            allpredictions_emb += list(predictions_emb)
            allpredictions_conc += list(predictions_conc)
            allpredictions_lex_pos += list(predictions_lex_pos)
            allpredictions_lex_conc += list(predictions_lex_conc)
            allpredictions_lex_emb += list(predictions_lex_emb)
            allpredictions_lex_pos_conc += list(predictions_lex_pos_conc)
            allpredictions_lex_pos_emb += list(predictions_lex_pos_emb)
            allpredictions_lex_emb_conc += list(predictions_lex_emb_conc)
            allpredictions_lex_pos_emb_conc += list(predictions_lex_pos_emb_conc)
        
        if X_tm is not None:
            clf_lex_tm = clf
            clf_pos_tm = clf
            clf_emb_tm = clf
            clf_conc_tm = clf
            clf_lex_pos_tm = clf
            clf_lex_emb_tm = clf
            clf_lex_conc_tm = clf
            clf_conc_emb_tm = clf
            clf_lex_pos_conc_tm = clf
            clf_lex_pos_conc_emb_tm = clf
            
            # LEX + TM
            clf_lex_tm.fit(lex_train.join(pd.DataFrame(tm_train_fold), rsuffix='tm'), train_fold_labels)
            predictions_lex_tm = clf_lex_tm.predict(lex_test.join(pd.DataFrame(tm_test_fold), rsuffix='tm'))
            
            # POS + TM
            clf_pos_tm.fit(pos_train.join(pd.DataFrame(tm_train_fold), rsuffix='tm'), train_fold_labels)
            predictions_pos_tm = clf_pos_tm.predict(pos_test.join(pd.DataFrame(tm_train_fold), rsuffix='tm'))
            
            # CONC + TM
            df_conc_tm_train = pd.DataFrame(conc_train).join(pd.DataFrame(tm_train_fold), rsuffix='tm')
            df_conc_tm_test = pd.DataFrame(conc_test).join(pd.DataFrame(tm_test_fold), rsuffix='tm')
                                                           
            clf_conc_tm.fit(df_conc_tm_train, train_fold_labels)
            predictions_conc_tm = clf_conc_tm.predict(df_conc_tm_test)
            
            # EMB + TM
            df_emb_tm_train = pd.DataFrame(emb_train).join(pd.DataFrame(tm_train_fold), rsuffix='tm')
            df_emb_tm_test = pd.DataFrame(emb_test).join(pd.DataFrame(tm_test_fold), rsuffix='tm')
            
            clf_emb_tm.fit(df_emb_tm_train, train_fold_labels) 
            predictions_emb_tm = clf_emb_tm.predict(df_emb_tm_test)
            
            # LEX + POS + TM 
            df_lex_pos_train = lex_train.join(pd.DataFrame(pos_train), rsuffix='pos')
            df_lex_pos_test = lex_test.join(pd.DataFrame(pos_test), rsuffix='pos')
            
            df_lex_pos_tm_train = df_lex_pos_train.join(pd.DataFrame(tm_train_fold), rsuffix='tm')
            df_lex_pos_tm_test = df_lex_pos_test.join(pd.DataFrame(tm_test_fold), rsuffix='tm')
            
            clf_lex_pos_tm.fit(df_lex_pos_tm_train, train_fold_labels)
            predictions_lex_pos_tm = clf_lex_pos_tm.predict(df_lex_pos_tm_test)
            
            # LEX + EMB + TM
            df_lex_emb_train = lex_train.join(pd.DataFrame(emb_train), rsuffix='emb')
            df_lex_emb_test = lex_test.join(pd.DataFrame(emb_test), rsuffix='emb')
            
            df_lex_emb_tm_train = df_lex_emb_train.join(pd.DataFrame(tm_train_fold), rsuffix='tm')
            df_lex_emb_tm_test = df_lex_emb_test.join(pd.DataFrame(tm_test_fold), rsuffix='tm')
            
            clf_lex_emb_tm.fit(df_lex_emb_tm_train, train_fold_labels)
            predictions_lex_emb_tm = clf_lex_emb_tm.predict(df_lex_emb_tm_test)
            
            # LEX + CONC + TM
            
            df_lex_conc_tm_train = df_conc_tm_train.join(lex_train, rsuffix='lex')
            df_lex_conc_tm_test = df_conc_tm_test.join(lex_test, rsuffix='lex')
            
            clf_lex_conc_tm.fit(df_lex_conc_tm_train, train_fold_labels)
            predictions_lex_conc_tm = clf_lex_conc_tm.predict(df_lex_conc_tm_test)
            
            # CONC + EMB + TM
            
            df_conc_emb_tm_train = df_emb_tm_train.join(lex_train, rsuffix='lex')
            df_conc_emb_tm_test = df_emb_tm_test.join(lex_train, rsuffix='lex')
            
            clf_conc_emb_tm.fit(df_conc_emb_tm_train, train_fold_labels)
            predictions_conc_emb_tm = clf_conc_emb_tm.predict(df_conc_emb_tm_test)
            
            # LEX + POS + CONC + TM
            
            df_lex_pos_conc_tm_train = df_lex_pos_tm_train.join(pd.DataFrame(conc_train), rsuffix='conc')
            df_lex_pos_conc_tm_test = df_lex_pos_tm_test.join(pd.DataFrame(conc_test), rsuffix='conc')
            
            clf_lex_pos_conc_tm.fit(df_lex_pos_conc_tm_train, train_fold_labels)
            predictions_lex_pos_conc_tm = clf_lex_pos_conc_tm.predict(df_lex_pos_conc_tm_test)
            
            # LEX + POS + CONC + EBM + TM
            df_full_train = df_lex_pos_conc_tm_train.join(pd.DataFrame(emb_train), rsuffix='emb')
            df_full_test = df_lex_pos_conc_tm_test.join(pd.DataFrame(emb_test), rsuffix='emb')
            
            clf_lex_pos_conc_emb_tm.fit(df_full_train, train_fold_labels)
            predictions_lex_pos_conc_emb_tm = clf_lex_pos_conc_emb_tm.predict(df_full_test)
            
            allpredictions_lex_tm += list(predictions_lex_tm)
            allpredictions_pos_tm += list(predictions_pos_tm)
            allpredictions_conc_tm += list(predictions_conc_tm)
            allpredictions_emb_tm += list(predictions_emb_tm)
            allpredictions_emb_conc_tm += list(predictions_conc_emb_tm)   #*
            allpredictions_lex_emb_tm += list(predictions_lex_emb_tm)
            allpredictions_lex_conc_tm += list(predictions_lex_conc_tm)
            allpredictions_lex_pos_tm += list(predictions_lex_pos_tm)
            allpredictions_lex_pos_conc_tm += list(predictions_lex_pos_conc_tm)
            allpredictions_lex_pos_conc_emb_tm += list(predictions_lex_pos_conc_emb_tm)
    
    if not skip_non_tm:
        accuracy_lex = accuracy_score(alltrue, allpredictions_lex)
        accuracy_pos = accuracy_score(alltrue, allpredictions_pos)
        accuracy_conc = accuracy_score(alltrue, allpredictions_conc)
        accuracy_emb = accuracy_score(alltrue, allpredictions_emb)
        accuracy_lex_conc = accuracy_score(alltrue, allpredictions_lex_conc)
        accuracy_lex_emb = accuracy_score(alltrue, allpredictions_lex_emb)
        accuracy_lex_pos = accuracy_score(alltrue, allpredictions_lex_pos)
        accuracy_lex_pos_conc = accuracy_score(alltrue, allpredictions_lex_pos_conc)
        accuracy_lex_pos_emb = accuracy_score(alltrue, allpredictions_lex_pos_emb)
        accuracy_lex_emb_conc = accuracy_score(alltrue, allpredictions_lex_emb_conc)
        accuracy_lex_pos_emb_conc = accuracy_score(alltrue, allpredictions_lex_pos_emb_conc)


        print('Accuracy LEX: ', accuracy_lex)
        print('Accuracy POS: ', accuracy_pos)
        print('Accuracy CONC: ', accuracy_conc)
        print('Accuracy EMB: ', accuracy_emb)
        print('Accuracy LEX+EMB: ', accuracy_lex_emb)
        print('Accuracy LEX+POS: ', accuracy_lex_pos)
        print('Accuracy LEX+CONC: ', accuracy_lex_conc)
        print('Accuracy LEX+POS+CONC: ', accuracy_lex_pos_conc)
        print('Accuracy LEX+POS+EMB: ', accuracy_lex_pos_emb)
        print('Accuracy LEX+EMB+CONC: ', accuracy_lex_emb_conc)
        print('Accuracy LEX+POS+EMB+CONC: ', accuracy_lex_pos_emb_conc)
    
    
    if X_tm is not None:
        
        accuracy_lex_tm = accuracy_score(alltrue, allpredictions_lex_tm)
        accuracy_pos_tm = accuracy_score(alltrue, allpredictions_pos_tm)
        accuracy_conc_tm = accuracy_score(alltrue, allpredictions_conc_tm)
        accuracy_emb_tm = accuracy_score(alltrue, allpredictions_emb_tm)
        accuracy_lex_emb_tm = accuracy_score(alltrue, allpredictions_lex_emb_tm)
        accuracy_emb_conc_tm = accuracy_score(alltrue, allpredictions_emb_conc_tm)
        accuracy_lex_pos_tm = accuracy_score(alltrue, allpredictions_lex_pos_tm)
        accuracy_lex_conc_tm = accuracy_score(alltrue, allpredictions_lex_conc_tm)
        accuracy_lex_pos_conc_tm = accuracy_score(alltrue, allpredictions_lex_pos_conc_tm)
        accuracy_lex_pos_conc_emb_tm = accuracy_score(alltrue, allpredictions_lex_pos_conc_emb_tm)
        
        
        print('Accuracy LEX+TM: ', accuracy_lex_tm)
        print('Accuracy POS+TM: ', accuracy_pos_tm)
        print('Accuracy CONC+TM: ', accuracy_conc_tm)
        print('Accuracy EMB+TM: ', accuracy_emb_tm)
        print('Accuracy EMB+CONC+TM: ', accuracy_emb_conc_tm)
        print('Accuracy LEX+EMB+TM: ', accuracy_lex_emb_tm)
        print('Accuracy LEX+CONC+TM: ', accuracy_lex_conc_tm)      
        print('Accuracy LEX+POS+TM: ', accuracy_lex_pos_tm)
        print('Accuracy LEX+POS+CONC+TM: ', accuracy_lex_pos_conc_tm)
        print('Accuracy LEX+POS+CONC+EMB+TM: ', accuracy_lex_pos_conc_emb_tm)        