# Metaphor Identification using Pre-Trained Topic Models
#### *Adridged Pipepine*

In [21]:
import artm
import pandas as pd

In [9]:
TOPIC_N = 40   # specify number of topics

dictionary = artm.Dictionary()   # initialize Dictionary object
dictionary = dictionary.load('./artm/wiki_with_bi_artm_dictionary.dict')   # load the dictionary

## Loading the Pre-Trained Model

### Option 1: Loading ARTM sparse/dense models

To load pre-trained weights, we need to first create the empty model with the same params as the pre-trained model.

In [10]:
# Creating the model
model = artm.ARTM(num_topics=TOPIC_N, dictionary=dictionary, 
                          cache_theta=False)
# Adding scores
model.scores.add(artm.PerplexityScore(name='PerplexityScore', dictionary=dictionary))
model.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model.scores.add(artm.TopTokensScore(name='top_words', num_tokens=15))

In [11]:
# Sparse ARTM regularizers
artm_regs_sparse = [artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', tau=-1.5),
                    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer', tau=-0.5),
                    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', tau=1e+3)
                   ]

# Dense ARTM regularizers
artm_regs_dense = [artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', tau=-1.5),
                    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer', tau=0.5),
                    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', tau=1e+3)
                   ]

In [12]:
# Add regularizers. Options: artm_regs_sparse|artm_regs_dense
for reg in artm_regs_sparse:
    model.regularizers.add(reg)

Now we are ready to load the pre-trained weights. Specify the model location (both `p_wt` and `n_wt` files).

In [13]:
model.load(filename='./artm/models/artm_p_wt_sparse40', model_name='p_wt')
model.load(filename='./artm/models/artm_n_wt_sparse40', model_name='n_wt')

### Option 2: Loading LDA model

First, initialize LDA object with the same parameters as the pre-trained model.

In [14]:
lda = artm.LDA(num_topics=topic_n, alpha=0.01, beta=0.001, cache_theta=False,
               num_document_passes=10, dictionary=dictionary)

In [None]:
lda.load(filename='saved_p_wt', model_name='p_wt')
lda.load(filename='saved_n_wt', model_name='n_wt')

## Loading Metaphor Corpus and Extracting Topic Features

Plan:
    
    * Load Preprocessed metaphor corpus (in Vowpal Wabbit format)
    * Create batches for BigARTM
    * Extract topic features (Theta matrix)

When we batch dataset for the first time, BigARTM creates a folder with batched documents.

In [18]:
# When creating batch_vectorizer for the corpus for the first time
met_batch_vectorizer = artm.BatchVectorizer(data_path='./artm/metcorp_vw.txt', data_format='vowpal_wabbit',
                                            target_folder='metcorp_batches')

If we already have files with batches, we can load directly from them (faster, recommended). 


In [None]:
# # When you already have batches, it's recommended to load from batches:
# met_batch_vectorizer = artm.BatchVectorizer(data_path='metcorp_batches', data_format='batches')

Extracting features from the metaphor corpus. Use the `model` (or `lda`) loaded before. 

`model.transform()` outputs topic distribution as pandas.DataFrame object

In [19]:
metcorp_theta = model.transform(met_batch_vectorizer)

`NB!` Topic numbering starts from $0$, not $1$

In [26]:
metcorp_theta.head(3)

Unnamed: 0,6000,6001,6002,6003,6004,6005,6006,6007,6008,6009,...,7067,7068,7069,7070,7071,7072,7073,7074,7075,7076
topic_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic_2,0.0,0.0,0.0,0.0,0.0,0.392546,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.084376,0.0,0.0,0.0,0.0,0.0


Load DataFrame with preprocessed metaphor corpus (not in vowpal wabbit format, for convenience).

In [22]:
df = pd.read_csv('metcorp.csv', index_col=False)

Insert topic distribution for each context in the dataframe.

In [25]:
df = df.join(metcorp_theta.T)
df.head(3)

Unnamed: 0,sents,targets,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39
0,нужно_PRED весь_ADJF время_NOUN бомбардировать...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,добрынин_NOUN говорить_VERB шевченко_NOUN цент...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.164717,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01967,0.0
2,принять_INFN внимание_NOUN настойчиво_ADVB гру...,1,0.0,0.0,0.0,0.0,0.0,0.148076,0.205082,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.429145,0.142618,0.0


You can save the Dataframe and reuse it later

In [28]:
# df.to_csv('metcorp_with_topic_features.csv')

## Classification

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

# from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Originally, there were 11 different classifiers in the experiment, but eventually it was down to the top 3: Logistic Regression, Linear SVM and Neural Network. Uncomment lines with classifiers and their names if you wou want to try other classifiers.

In [32]:
names = ["Logistic Regression", 
         #"Logisitic Regression SGD", 
         "Linear SVM", 
         #"RBF SVM", 
         #"Naive Bayes", 
         #"Gaussian Process", 
         #"Decision Tree", 
         #"Random Forest", 
         "Neural Net", 
         #"AdaBoost", 
         #"Nearest Heighbors"
         ]

classifiers = [LogisticRegression(class_weight='balanced', solver='liblinear', fit_intercept=True, max_iter=10000),
               #linear_model.SGDClassifier(max_iter=50000, tol=1e-3, loss='log', class_weight='balanced'),
               SVC(kernel="linear", C=0.025, max_iter=10000),
               #SVC(gamma=2), 
               #GaussianNB(),
               #GaussianProcessClassifier(1.0 * RBF(1.0)),
               #DecisionTreeClassifier(max_depth=10),
               #RandomForestClassifier(max_depth=10, n_estimators=10),
               MLPClassifier(alpha=0.1, max_iter=5000, learning_rate='adaptive'),
               #AdaBoostClassifier(),
               #KNeighborsClassifier(n_neighbors=2)
               ]

Training script

In [33]:
def train(X, y, k_fold=5):
    X = np.array(X)
    y = np.array(y)

    train_results = {}    # {'clf': {'accuracy':[], 'precision':[], 'recall':[], 'f1':[])}

    #kf = KFold(k_fold, shuffle=True, random_state=42)
    kf = StratifiedKFold(k_fold, shuffle=True, random_state=42)

    fold_num = 1

    for train_ind, val_ind in kf.split(X, y):
        # Assign CV IDX
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]
        
        # Scale Data
        # scaler = StandardScaler()
        scaler = MinMaxScaler()
        X_train_scale = scaler.fit_transform(X_train)
        X_val_scale = scaler.transform(X_val)
        
        # print("Fold num: ", fold_num)

        for name, clf in zip(names, classifiers):

            if name not in train_results:
                train_results[name] = {'accuracy':[], 'precision':[], 'recall':[], 'f1':[]} # 'train_time':[]}

            # print("Training: ", name)
            #start_time = time.time()

            model = clf.fit(X_train_scale, y_train)
            y_pred = model.predict(X_val_scale)

            train_results[name]['accuracy'].append(accuracy_score(y_val, y_pred))
            train_results[name]['precision'].append(precision_score(y_val, y_pred))
            train_results[name]['recall'].append(recall_score(y_val, y_pred))
            train_results[name]['f1'].append(f1_score(y_val, y_pred))
            #train_results[name]['train_time'].append(time.time() - start_time)

            #print("Run Time: ", time.time() - start_time)
        
        fold_num += 1
        # print()
    return train_results

In [35]:
def scores(results):   
    '''Print metrics for all tested classifiers
    '''     
    for clf, scores in results.items():
        print("Scores for ", clf)
        print(f"\t Train accuracy: {np.mean(scores['accuracy']):.3f} +- {np.std(scores['accuracy']):.3f}")
        print(f"\t Train precision: {np.mean(scores['precision']):.3f} +- {np.std(scores['precision']):.3f}")
        print(f"\t Train recall: {np.mean(scores['recall']):.3f} +- {np.std(scores['recall']):.3f}")
        print(f"\t Train f1-score: {np.mean(scores['f1']):.3f} +- {np.std(scores['f1']):.3f}")

Training & Evaluating models

In [None]:
# # Load if not loaded yet
# df = pd.read_csv('./artm/metcorp_with_topic_features.csv', index_col=None)

In [36]:
X = df.iloc[:, 3:].values.tolist()   # X is columns with topic distributions
y = df['targets']

In [37]:
train_results = train(X, y)
scores(train_results)

Scores for  Logistic Regression
	 Train accuracy: 0.668 +- 0.008
	 Train precision: 0.653 +- 0.007
	 Train recall: 0.718 +- 0.017
	 Train f1-score: 0.684 +- 0.010
Scores for  Linear SVM
	 Train accuracy: 0.648 +- 0.009
	 Train precision: 0.616 +- 0.007
	 Train recall: 0.788 +- 0.016
	 Train f1-score: 0.691 +- 0.009
Scores for  Neural Net
	 Train accuracy: 0.674 +- 0.012
	 Train precision: 0.659 +- 0.016
	 Train recall: 0.727 +- 0.015
	 Train f1-score: 0.691 +- 0.009
