In [1]:
import json

import fasttext
import numpy as np

from faq50_adapted import FAQ, extract_word_probs
from topic_word_probs import *
from tfidf_classifier import TFIDF_Classifier
from cs_lemmatizer import *


%load_ext autoreload
%autoreload 2

In [4]:
# Load the pretrained fastText model; it is passed as the first argument to the
# FAQ / WEED experiments in the cells below.
# NOTE(review): "cc.cs.300" presumably denotes 300-dimensional Czech vectors —
# confirm against the model's source.
model_path = "models/cc.cs.300.bin"
model = fasttext.load_model(model_path)

# Disabled alternative to reading the cached probabilities JSON in a later cell.
# NOTE(review): presumed purpose, inferred from the helper's name — confirm.
# probs_general = extract_word_probs(model_path=model_path)



### Tests:

In [2]:
# Word probabilities of the FAQ corpus (questions + answers) combined with
# cached general-language word probabilities.
path_to_q = "upv_faq/data/FAQ76_questions.xlsx"
path_to_a = "upv_faq/data/FAQ76_answers.xlsx"

# Returns the per-word probabilities and a second value bound to `n_words`.
probs, n_words = count_word_probs_in_corpuses(
    path_to_questions=path_to_q,
    path_to_answers=path_to_a,
)

# The JSON holds Czech words, so pin the encoding explicitly: without it,
# open() falls back to the platform default, which is not UTF-8 everywhere.
with open("data_preprocessing/cc.cs.300_probs.json", "r", encoding="utf-8") as wp_file:
    probs_general = json.load(wp_file)

# NOTE(review): presumably relates corpus probabilities to the general ones
# (the result name suggests a ratio) — confirm in the helper's module.
divided_probs = general_to_corpus_probs(probs, probs_general)

# Display every word with its value, largest value first.
print(dict(sorted(divided_probs.items(), key=lambda item: -item[1])))

{'pod': 1.0, 'cena': 0.6074890022706535, ':': 0.5659048059915419, 'zdarma': 0.49365751001500446, 'často': 0.39844489790048426, '2': 0.3128659584656283, '1': 0.2893766009226427, '/': 0.2523183880233372, 'přesně': 0.19289961346157225, 'typ': 0.17953311782630857, 'díl': 0.1261104390942313, 'věc': 0.10989899506400493, '4': 0.09520616956528116, 'ani': 0.09042458505842817, 'délka': 0.09020974383639789, 'řešit': 0.08678243111738208, ';': 0.08640326665060592, 'částečně': 0.08484973897870123, 'software': 0.07243528797032969, 'zrušení': 0.06909737924069727, 'Proč': 0.06847962084399283, '18': 0.06817587098980361, '2007': 0.06368449189220955, 'povinnost': 0.05801895460450519, 'krok': 0.057911094563694136, 'několik': 0.057186634622913304, 'stáhnout': 0.05615864853765753, 'pomoci': 0.05408757596143244, 'místo': 0.05291694727690579, 'veřejně': 0.04855443576415243, 'webu': 0.04561527362094945, 'vlastní': 0.044196919168595916, 'starý': 0.04410980311947217, 'prostor': 0.041361089849810885, 'č': 0.040700

In [16]:
# Boost some words by overriding their entry with a near-zero value.
# NOTE(review): presumably a smaller probability gives the word a larger
# weight inside FAQ — confirm in faq50_adapted.
# The two prints show the values *before* the override; on a re-run of this
# cell they therefore already show 1e-15.
print(divided_probs["užitný"])
print(divided_probs["průmyslový"])

divided_probs.update(
    dict.fromkeys(("užitný", "průmyslový", "předpisy", "najít"), 1e-15)
)

1e-15
1e-15


In [24]:
# Build the FAQ matcher on the FAQ76 data using `divided_probs` from the cells
# above, then run the cross-match evaluation.
q_xslx = "upv_faq/data/FAQ76_questions.xlsx"
a_xslx = "upv_faq/data/FAQ76_answers.xlsx"

faq = FAQ(model, q_xslx, a_xslx, 
          probs=divided_probs, 
          alpha=0.1, 
          rm_stop_words=True, 
          lemm=True)

# cross_match_test() returns two accuracies, reported below as "1st" and "2nd".
acc, acc_sec = faq.cross_match_test()
print(f"\nQuestion Cross-Match Accuracy: 1st: {round(acc, 4)}, 2nd: {round(acc_sec, 3)}")

# Disabled alternative evaluation (mean-match instead of cross-match):
# acc, acc_sec = faq.mean_match_test()
# print(f"\nQuestion Mean-Match Accuracy: 1st: {round(acc, 3)}, 2nd: {round(acc_sec, 3)}")

: 

: 

### Mean-match test excluding the tested question:

In [None]:
# mean_match_test_disjunctive() returns three values, reported below as
# "1st right", "2nd right" and "3rd right"; the last line prints their sum.
f, s, t = faq.mean_match_test_disjunctive()
# \033[1m switches the terminal to bold, \033[0m resets it (ANSI escapes).
print(f"\033[1m1st right: {f} \033[0m")
print(f"2nd right: {s}\n3rd right: {t}\n-> {round(f+s+t, 3)}")

[1m1st right: 0.48 [0m
2nd right: 0.138
3rd right: 0.072
-> 0.69


In [None]:
# Only words from questions (the answers corpus is left out via None).
# Note: this rebinds `probs` and `n_words` from the earlier cells.
probs, n_words = count_word_probs_in_corpuses(path_to_questions=path_to_q, path_to_answers=None)
# histogram_of_words(probs, n_words)

In [None]:
# Quick manual check of stop-word removal on an already tokenized sentence;
# the cell's last expression is displayed as its output.
LMTZR.remove_stop_words_from_sentence(["rešerše", "mezi", "jsem", "přihlasit", "jak", "patentem"])

['rešerše', 'přihlasit', 'patentem']

## Weighting with TF-IDF

In [20]:
# Input spreadsheets for the TF-IDF weighted experiments.
path_to_q = "upv_faq/data/FAQ76_questions.xlsx"
path_to_a = "upv_faq/data/FAQ76_answers.xlsx"

# Disabled: frequency-based probabilities saved to JSON, kept for comparison.
# path_to_save = "780_upv_questions/w_probs_in_q_and_a.json"
# probs_old, n_words = count_word_probs_in_corpuses(path_to_save, path_to_questions=path_to_q, path_to_answers=None)

# Derive per-word probabilities from the TF-IDF matrix of the questions.
# structure_data() is called before get_TFIDF_matrix() throughout this
# notebook. NOTE(review): presumably it prepares the data the matrix is built
# from — confirm in tfidf_classifier.
c = TFIDF_Classifier(path_to_q)
test_data = c.structure_data(test_data_percent=1) 
tfidf_matrix, feat_names = c.get_TFIDF_matrix()
# Rebinds `probs`; FAQ consumes it together with tfidf_weighting=True below.
probs = get_TFIDF_threshold_probabilities(tfidf_matrix, feat_names)

In [None]:
# The string below is a bare expression used as a note (it is not displayed
# because it is not the last expression of the cell).
""" previous best results without tf-idf weighting:
0.471, _ : alpha=0.23

1st right: 0.481 
2nd right: 0.135
3rd right: 0.065
-> 0.681 : alpha=0.41
"""
# Rebuild the FAQ matcher with the TF-IDF derived `probs` and TF-IDF weighting.
faq = FAQ(model, path_to_q, path_to_a, 
          probs=probs,
          alpha=0.11,
          rm_stop_words=True, 
          lemm=True,
          tfidf_weighting=True)

acc, acc_sec = faq.cross_match_test()
print(f"\nQuestion Cross-Match Accuracy: 1st: {round(acc, 3)}, 2nd: {round(acc_sec, 3)}")

# Disjunctive test; the tested question is nevertheless still part of the
# TF-IDF matrix here.
f, s, t = faq.mean_match_test_disjunctive()
print(f"\n\033[1m1st right: {f} \033[0m")
print(f"2nd right: {s}\n3rd right: {t}\n-> {round(f+s+t, 3)}")

In [None]:
# Runtime: roughly 4 minutes.
# Cross-match variant named "tfidf_disj"; returns two accuracies.
# NOTE(review): presumably the tested question is also excluded from the
# TF-IDF statistics here — confirm in FAQ.cross_match_test_tfidf_disj.
acc, acc_sec = faq.cross_match_test_tfidf_disj()
print(f"\nQuestion Cross-Match Disj Accuracy: 1st: {round(acc, 3)}, 2nd: {round(acc_sec, 3)}")

In [None]:
# Runtime: roughly 2 minutes.
# Uses q_xslx / a_xslx from the earlier FAQ cell (same FAQ76 files as
# path_to_q / path_to_a) and a different alpha than the cell above.
faq = FAQ(model, q_xslx, a_xslx, probs=probs, alpha=0.3, rm_stop_words=True, lemm=True,
          tfidf_weighting=True)
# leave_one_out_also_tfidf=True: per the flag name, the leave-one-out is
# applied to the TF-IDF part as well.
f, s, t = faq.mean_match_test_disjunctive(leave_one_out_also_tfidf=True)
print(f"\n\033[1m1st right: {f} \033[0m")
print(f"2nd right: {s}\n3rd right: {t}\n-> {round(f+s+t, 3)}")


[1m1st right: 0.5 [0m
2nd right: 0.142
3rd right: 0.071
-> 0.713


## Confused sentences:

In [None]:
# Ambiguous matches. Author's reminder: disable lemmatizing and stop-word
# removal for this step.
# NOTE(review): the `faq` objects built in the cells above use lemm=True and
# rm_stop_words=True — confirm which configuration is intended here.
# NOTE(review): presumably collects identical questions that map to different
# answers and writes them to the JSON path below — confirm in FAQ.
path_to_save = "780_upv_questions/same_question_different_answers.json"
faq.get_same_question_different_answer_pairs(save_path=path_to_save)

In [None]:
# Uses mean-match; cos_sim_threshold is, per its name, a cosine-similarity
# cut-off. The call's return value is displayed as the cell output.
faq.get_most_confused_questions(cos_sim_threshold=0.85) # with mean-match

(788, 79)
(788,)
41     Na jaké stránce jsou uvedeny nejdůležitější pr...
399    Kde mohu najít právní předpisy týkající se pop...
400    Jaké jsou nejdůležitější právní předpisy týkaj...
Name: question, dtype: object


In [None]:
# Report class pairs that are confused with each other.
# NOTE(review): n_of_common_misses=3 presumably is the minimum number of shared
# misses for a pair to be reported (stored output lists counts >= 3) — confirm.
path_to_save = "780_upv_questions/most_missclassified_class_pairs"
faq.get_most_misclassified_class_pairs(n_of_common_misses=3, save_path=path_to_save)

(19, 45) 4
(26, 24) 3
(26, 25) 3
(30, 55) 4
(31, 69) 4
(33, 34) 4
(35, 65) 3
(36, 57) 5
(40, 4) 3
(42, 44) 3
(43, 51) 3
(43, 47) 3
(45, 55) 3
(49, 51) 5
(52, 51) 4
(53, 62) 4
(58, 68) 6
(65, 55) 3
(71, 51) 3
(77, 3) 5
['19:45', '26:24', '26:25', '30:55', '31:69', '33:34', '35:65', '36:57', '40:4', '42:44', '43:51', '43:47', '45:55', '49:51', '52:51', '53:62', '58:68', '65:55', '71:51', '77:3']


In [28]:
# Stand-alone TF-IDF classifier on the FAQ76 questions (rebinds `c`).
questions_xlsx = "upv_faq/data/FAQ76_questions.xlsx"
# Alternative data set:
# questions_xlsx = "upv_faq/Q50_questions.xlsx"
c = TFIDF_Classifier(questions_xlsx)

In [29]:
# NOTE(review): test_data_percent=0.1 presumably holds out 10 % of the
# questions as test data — confirm in TFIDF_Classifier.structure_data.
test_data = c.structure_data(test_data_percent=0.1) 

# Last expression of the cell: the (matrix, feature names) result is displayed.
c.get_TFIDF_matrix()

(<76x611 sparse matrix of type '<class 'numpy.float64'>'
 	with 1950 stored elements in Compressed Sparse Row format>,
 array(['20', 'advokát', 'aktuální', 'anotace', 'autor', 'autorský',
        'bankovní', 'benelux', 'beneluxu', 'bezplatný', 'biotechnologický',
        'být', 'celý', 'cena', 'ceník', 'cesta', 'charakteristika',
        'chránit', 'chránitelné', 'cílový', 'další', 'databáze', 'datový',
        'definice', 'definovat', 'delá', 'dispozice', 'dlouho', 'doba',
        'dohledat', 'dohoda', 'dojít', 'doklad', 'dokument', 'doplnění',
        'doručit', 'dostat', 'dostupnou', 'dostupný', 'dotaz', 'dozvědět',
        'druh', 'duševní', 'dvůr', 'dát', 'dávat', 'délka', 'dílo',
        'dívat', 'dělat', 'důkaz', 'důsledek', 'důvod', 'elektronicky',
        'elektronické', 'elektronického', 'elektronickém', 'estetický',
        'eu', 'euipo', 'evropský', 'existovat', 'expresní', 'expresních',
        'express', 'forma', 'formulovaný', 'formulář', 'formuář',
        'formální', '

In [32]:
# Classify the test sentences from the cell above; the stored output shows
# that a "Got right: <ratio>" line is printed.
c.classify_test_sentences_list(test_data)

Got right: 0.7227722772277227


In [33]:
# Same evaluation with every question used as test data while staying in the
# train data, so the score is measured on questions the matrix has seen.
test_data = c.structure_data(test_data_percent=1) # without removal from train data
c.get_TFIDF_matrix()
c.classify_test_sentences_list(test_data)

Got right: 0.7645607107601184


In [34]:
# Leave-one-out evaluation of the TF-IDF classifier; the stored run took about
# 5.5 minutes for 2026 iterations.
c.leave_one_out_test()

2026it [05:35,  6.04it/s]


0.668

## Word-embedding-based edit distance (WEED)

In [None]:
# NOTE(review): mid-notebook star import; `np` used by the following cells may
# come from here or from another star import — consider importing explicitly.
from weed import *

# Shared pre-processing switches: remove stop words / lemmatize.
rm_sw = True
lm = True

# Other data sets tried previously:
# q_xslx = "new_questions/FAQ76_questions.xlsx"
# a_xslx = "new_questions/FAQ76_answers.xlsx"
# q_xslx = "780_upv_questions/Q78_questions.xlsx"
# a_xslx = "780_upv_questions/Q78_answers_no_tags.xlsx"
q_xslx = "upv_faq/data/FAQ50_questions.xlsx"
a_xslx = "upv_faq/data/FAQ50_answers.xlsx"

# Option 1: word probabilities derived from TF-IDF.
c = TFIDF_Classifier(q_xslx, rm_sw, lm)
test_data = c.structure_data(test_data_percent=1) 
tfidf_matrix, feat_names = c.get_TFIDF_matrix()
probs = get_TFIDF_threshold_probabilities(tfidf_matrix, feat_names)

# Option 2: word probabilities from frequencies; use with tfidf_weighting=False.
# NOTE(review): both options run, and this assignment overwrites the TF-IDF
# `probs` computed just above, so only the frequency-based values reach WEED.
path_to_save = "w_probs_in_questions.json"
probs, n_words = count_word_probs_in_corpuses(path_to_save, path_to_questions=q_xslx, path_to_answers=None)

weed = WEED(model, q_xslx, a_xslx, probs=probs, alpha=1, 
            lemm=lm, rm_stop_words=rm_sw, sigma=0.9,
            tfidf_weighting=False)
# Prints the value returned by the nearest-question test.
print(weed.nearest_question_test_weed())

0.9448398576512456


: 

In [None]:
# Finding best sigma: sweep sigma with alpha fixed at 0.05, print every
# accuracy, and print "S: <sigma>" whenever a new best is found.
# The best value is kept in `best_sigma_so_far`; the former name
# `best_alpha_so_far` was a copy-paste leftover, although it held a sigma.
# np.arange with a float step yields values such as 0.6000000000000001.
best_sigma_so_far = None
best_acc = 0
for sigma in np.arange(0.0, 1.0, 0.1):
    weed = WEED(model, q_xslx, a_xslx, probs=probs, alpha=0.05,
                lemm=lm, rm_stop_words=rm_sw, sigma=sigma)
    acc = weed.nearest_question_test_weed()
    print("\n", acc)
    # Strict comparison: on ties the first (smallest) sigma is kept.
    if acc > best_acc:
        best_acc = acc
        best_sigma_so_far = sigma

        print("S:", best_sigma_so_far)

Average number of words in a tokenized sentence: 8.803

 0.5101522842639594
S: 0.0
Average number of words in a tokenized sentence: 8.803

 0.5101522842639594
Average number of words in a tokenized sentence: 8.803

 0.5177664974619289
S: 0.2
Average number of words in a tokenized sentence: 8.803

 0.5114213197969543
Average number of words in a tokenized sentence: 8.803

 0.5215736040609137
S: 0.4
Average number of words in a tokenized sentence: 8.803

 0.5279187817258884
S: 0.5
Average number of words in a tokenized sentence: 8.803

 0.5342639593908629
S: 0.6000000000000001
Average number of words in a tokenized sentence: 8.803

 0.5317258883248731
Average number of words in a tokenized sentence: 8.803

 0.5215736040609137
Average number of words in a tokenized sentence: 8.803

 0.5177664974619289


In [None]:
# Sweep alpha for WEED and print each new best (alpha, accuracy) pair.
# sigma and tfidf_weighting are not passed here, so WEED's defaults apply.
"Finding best alpha"
best_alpha_so_far = None
best_acc = 0
for alpha in np.arange(0.01, 0.6, 0.022):
    weed = WEED(model, q_xslx, a_xslx, probs=probs, alpha=alpha, 
                lemm=lm, rm_stop_words=rm_sw)
    acc = weed.nearest_question_test_weed()
    # Strict comparison: on ties the first (smallest) alpha is kept.
    if acc > best_acc:
        best_acc = acc
        best_alpha_so_far = alpha

        print(best_alpha_so_far)
        print(best_acc)

0.01
0.7764067127344522
0.032
0.7828232971372162


## BERT-like

In [1]:
import fasttext
import json
from faq50_adapted import FAQ, extract_word_probs
from topic_word_probs import *
from tfidf_classifier import TFIDF_Classifier
from cs_lemmatizer import *
from bert_like_models.slavic_bert_tests import SlavicBERT
from weed import WEED
from bert_like_models.robeczech import Robeczech

%load_ext autoreload
%autoreload 2

### Slavic BERT


In [None]:
# Questions file for the Slavic BERT experiments.
q_xslx = "upv_faq/data/FAQ50_questions.xlsx"

# Rebinds `model`: from here on it is the SlavicBERT wrapper, not fastText.
dir_to_models_bert = "bert_like_models/models"
model = SlavicBERT(dir_to_models=dir_to_models_bert,
                   )

In [None]:
# TF-IDF probabilities computed over the BERT tokenizer's tokens, with
# stop-word removal and lemmatization switched off.
c = TFIDF_Classifier(q_xslx, rm_sw=False, lm=False)
test_data = c.structure_data(test_data_percent=1,
                             tokenizer=model.tokenizer.tokenize) 
tfidf_matrix, feat_names = c.get_TFIDF_matrix()
probs = get_TFIDF_threshold_probabilities(tfidf_matrix, feat_names)

# Disabled alternative: frequency-based probabilities with the same tokenizer.
# probs, n_words = count_word_probs_in_corpuses(path_to_questions=q_xslx,
                                    #  tokenizer=model.tokenizer.tokenize)

# FAQ matcher backed by the BERT-like model; no answers file is passed here.
# NOTE(review): slBert=True presumably switches FAQ to the BERT code path —
# confirm in faq50_adapted.
faq = FAQ(model, q_xslx, 
          slBert=True, 
          rm_stop_words=False,
          probs=probs,
          alpha=0.5,
          lemm=False,
          tfidf_weighting=True,
        #   sigma=0.7,
          )

In [None]:
# Shape of faq.mean_db; the stored output is (50, 768).
print(faq.mean_db.shape)

(50, 768)


In [None]:
# Disabled experiment: run the WEED nearest-question test on `faq`.
# The output stored below this cell comes from a run made before these lines
# were commented out.
# faq.sigma = 0.9
# print(faq.nearest_question_test_weed())

0.7833168805528135


In [None]:
# Evaluation of the FAQ object built above.
acc, acc_sec = faq.cross_match_test()
print(f"Cross-Match Accuracy: 1st: {round(acc, 3)}")

# Plain mean-match test (the disjunctive variant is disabled below). The
# tested question is still included in the TF-IDF matrix.
f, s = faq.mean_match_test()
print("Mean-Match Accuracy:", f)

# Disabled disjunctive variant:
# f, s, t = faq.mean_match_test_disjunctive() # raw: 0.69
# print("Mean-Match Disj Accuracy:", f)

Cross-Match Accuracy: 1st: 0.867
Mean-Match Accuracy: 0.792


### ROBECZECH

In [62]:
# RobeCzech wrapper used for the embedding sanity check in the next cell.
m = Robeczech()

In [63]:
# Sanity check: similarity of two unrelated Czech questions under the
# RobeCzech wrapper's mean sentence embedding.
text = "Co chrání autorské právo?"
text2 = "Co se stane po podání stížnosti?"
# text2 = "Kam se obrátit s dotazy k autorským právům?"

# text = "Auto jede po cestě do Ostravy"
# text2 = "Pomeranče jsou nejlepší ovoce"

embs = m.get_mean_sentence_embedding(text, mean=True)
print(embs.shape)
embs2 = m.get_mean_sentence_embedding(text2, mean=True)
print(embs2.shape)

# A bare dot product is a cosine only for unit-length vectors. Normalise
# explicitly so the printed value matches its label even if the wrapper does
# not return normalised embeddings.
print("Cosine:", (embs @ embs2) / (np.linalg.norm(embs) * np.linalg.norm(embs2)))

(768,)
(768,)
Cosine: 0.9208753


In [52]:
# Questions file for the RobeCzech experiments (FAQ79 here, not FAQ50/FAQ76).
q_xslx = "upv_faq/data/FAQ79_questions.xlsx"

# Rebinds `model` to a RobeCzech wrapper.
model = Robeczech()
# print(model.model)

In [56]:
# Disabled: TF-IDF based probabilities over the model tokenizer's tokens.
# c = TFIDF_Classifier(q_xslx, rm_sw=False, lm=False)
# test_data = c.structure_data(test_data_percent=1,
#                              tokenizer=model.tokenizer.tokenize) 
# tfidf_matrix, feat_names = c.get_TFIDF_matrix()
# probs = get_TFIDF_threshold_probabilities(tfidf_matrix, feat_names)

# Disabled: frequency-based probabilities.
# probs, n_words = count_word_probs_in_corpuses(path_to_questions=q_xslx,
#                                      tokenizer=model.tokenizer.tokenize)

# WEED object on RobeCzech without word probabilities or TF-IDF weighting.
# It is bound to the name `faq` because the following cells call FAQ-style
# tests on it.
faq = WEED(model, q_xslx, 
          slBert=True, 
          rm_stop_words=False,
          lemm=False,
        #   probs=probs,
        #   tfidf_weighting=True,
          sigma=1,
          )

In [54]:
# Nearest-question test of the WEED object built above.
acc = faq.nearest_question_test_weed()
print("WEED acc:", acc)

WEED acc: 0.4073604060913706


In [57]:
# FAQ-style evaluation run on the WEED object bound to `faq`.
acc, acc_sec = faq.cross_match_test()
print(f"Cross-Match Accuracy: 1st: {round(acc, 3)}")

# Plain mean-match test (the disjunctive variant is disabled below).
# The original remark about the TF-IDF matrix was copied from the TF-IDF
# cells; no TF-IDF weighting is configured for this object.
f, s = faq.mean_match_test()
print("Mean-Match Accuracy:", f)

# Disabled disjunctive variant:
# f, s, t = faq.mean_match_test_disjunctive() # raw: 0.69
# print("Mean-Match Disj Accuracy:", f)

Cross-Match Accuracy: 1st: 0.412
Mean-Match Accuracy: 0.471


In [68]:
# Cross-check with sentence-transformers. Per the stored warning, no
# sentence-transformers model exists under this name, so one is created with
# MEAN pooling.
# Rebinds `model` (previously the Robeczech wrapper).
# NOTE(review): mid-notebook import — consider moving it to the import cell.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('ufal/robeczech-base')

No sentence-transformers model found with name /home/micha/.cache/torch/sentence_transformers/ufal_robeczech-base. Creating a new one with MEAN pooling.


In [69]:
# Same sentence pair as in the RobeCzech wrapper check above, encoded with the
# SentenceTransformer model for comparison.
text = "Co chrání autorské právo?"
text2 = "Co se stane po podání stížnosti?"
# text2 = "Kam se obrátit s dotazy k autorským právům?"

# text = "Auto jede po cestě do Ostravy"
# text2 = "Pomeranče jsou nejlepší ovoce"

emb = model.encode(text)
print(emb.shape)
emb2 = model.encode(text2)
print(emb2.shape)

# Cosine similarity: dot product divided by the product of the vector norms.
# NOTE(review): `np` is not imported explicitly in this section's import cell;
# it presumably arrives through a star import — verify.
print("Cosine:", (emb@emb2) / (np.linalg.norm(emb)*np.linalg.norm(emb2)))

(768,)
(768,)
Cosine: 0.9208754
