In [87]:
import fasttext
import json
from faq50_adapted import FAQ, extract_word_probs
from topic_word_probs import *
from tfidf_classifier import TFIDF_Classifier


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the fastText binary model from disk; `model` is later passed to FAQ(...).
# NOTE(review): the file name suggests pretrained 300-dimensional Czech vectors — confirm.
model_path = "models/cc.cs.300.bin"
model = fasttext.load_model(model_path)



### Tests:

In [178]:
# Step 1 - question corpus.
# The helper is given the corpus spreadsheet and an output path; the JSON found at
# that output path is then read back into `probs_q`.
path_to_corpus = "upv_faq/Q78_questions.xlsx"
path_to_save = "780_upv_questions/w_probs_in_questions.json"
count_word_probs_in_questions(path_to_corpus, path_to_save)
with open(path_to_save) as wp_file:
    probs_q = json.load(wp_file)


# Step 2 - answer corpus (the "no_tags" spreadsheet), handled the same way.
path_to_corpus = "upv_faq/Q78_answers_no_tags.xlsx"
path_to_save = "780_upv_questions/w_probs_in_answers.json"
count_word_probs_in_answers(path_to_corpus, path_to_save)
with open(path_to_save) as wp_file:
    probs_a = json.load(wp_file)


# Step 3 - combine both tables into `probs`; combine_dicts also receives a save path.
path_to_save = "780_upv_questions/w_probs_in_q_and_a.json"
probs = combine_dicts(probs_a, probs_q, save_path=path_to_save)


# Print the combined table ordered from the highest value to the lowest.
# A stable descending sort keeps ties in their original dictionary order.
print(dict(sorted(probs.items(), key=lambda pair: pair[1], reverse=True)))

{'?': 0.09639449882294635, 'být': 0.07743478334328291, ',': 0.06389910156861983, '': 0.05613623052024849, 'vzor': 0.04541864250233721, 'průmyslový': 0.03882966359509764, 'na': 0.03806866180417391, 'o': 0.03404073455520274, 'moci': 0.033622852350676996, 'jaký': 0.03194570449655997, 'a': 0.02898307735787039, 'v': 0.02772114938597632, 'přihláška': 0.026272206071937997, 'známka': 0.024804997690947385, 'ochranný': 0.024371179951484294, 'pro': 0.02335871669921185, 'úřada': 0.0218742416221325, 'podat': 0.020820589171925606, 'se': 0.020137972293416587, 'informace': 0.018175827956636914, 'kde': 0.015673564982679546, 'co': 0.013319457684946922, 'právo': 0.013198219623158518, 'jak': 0.01307165691676454, 'nebo': 0.01299427037521337, 'do': 0.012951084435551832, 'ochrana': 0.012579383283278261, 'užitného': 0.012144234382663776, 'podávat': 0.01164963121716256, 'patentu': 0.01129852466385621, 'možný': 0.011050723895673829, 'lze': 0.011030129296706605, 'úřad': 0.01086470692439312, 'vlastnictví': 0.0107

In [221]:
# Paths to the question and answer spreadsheets handed to FAQ(...).
# NOTE(review): "xslx" in both variable names looks like a typo of "xlsx"; the names are
# left unchanged because other cells may reference them.
# NOTE(review): the answer word probabilities were computed from
# "Q78_answers_no_tags.xlsx", while FAQ receives "Q78_answers.xlsx" — confirm this is intended.
q_xslx = "upv_faq/Q78_questions.xlsx"
a_xslx = "upv_faq/Q78_answers.xlsx"

# Build the FAQ object from the loaded fastText model, the two spreadsheets and the
# combined word-probability table `probs`.
# NOTE(review): alpha=0.1 is presumably a weighting constant applied together with
# `probs` — confirm against faq50_adapted.FAQ.
faq = FAQ(model, q_xslx, a_xslx, probs=probs, alpha=0.1)

In [222]:
# Run the cross-match and the mean-match evaluation and print the accuracy of each.
# Each call is unpacked into (acc, cm); only `acc` is printed, and both names are
# overwritten by the second call, so only the mean-match values remain afterwards.
# NOTE(review): `cm` is presumably a confusion matrix (cf. the show_cm flag) — confirm.
acc, cm = faq.cross_match_test()
print(f"\nQuestion Cross-Match Accuracy: {acc}")
acc, cm = faq.mean_match_test()
print(f"\nQuestion Mean-Match Accuracy: {acc}")


Question Cross-Match Accuracy: 0.42385786802030456

Question Mean-Match Accuracy: 0.699238578680203


### Mean-match test excluding the tested question:

In [232]:
# The call is unpacked into three values that are printed as the "1st right",
# "2nd right" and "3rd right" rates; the final line prints their sum rounded to 3 decimals.
# NOTE(review): presumably the share of questions whose correct answer is ranked
# 1st / 2nd / 3rd — confirm against FAQ.mean_match_test_disjunctive.
f, s, t = faq.mean_match_test_disjunctive()
# \033[1m ... \033[0m are ANSI escape sequences that switch bold text on and off.
print(f"\033[1m1st right: {f} \033[0m")
print(f"2nd right: {s}\n3rd right: {t}\n-> {round(f+s+t, 3)}")

[1m1st right: 0.468 [0m
2nd right: 0.127
3rd right: 0.074
-> 0.669


In [162]:
# m_acc, _ = faq.mean_match_test(show_cm=True)

## Confused sentences:

In [154]:
# Ambiguous matches
# A JSON path is passed as save_path.
# NOTE(review): judging by the method name and the recorded output, the result maps a
# question text to the answer-class ids it appears under — confirm.
path_to_save = "780_upv_questions/same_question_different_answers.json"
faq.get_same_question_different_answer_pairs(save_path=path_to_save)

{'Jaké jsou hlavní rozdíly mezi užitným vzorem a patentem?': ['34', '33'],
 'Jaké jsou možnosti podání přihlášky průmyslového vzoru? ': ['51', '49'],
 'Kdo může podat přihlášku průmyslového vzoru?': ['52', '49'],
 'Kdo může podat přihlášku užitného vzoru?': ['61', '59'],
 'Jak mohu objednat rešeršní služby Úřadu?': ['65', '45'],
 'Jak dlouho trvá zpracování rešerše?': ['65', '45', '74'],
 'Co znamená kvalifikované zveřejnění technického řešení?': ['68', '58'],
 'Jakým způsobem lze podat objednávku na rešeršní službu?': ['74', '55']}

In [155]:
# NOTE(review): cos_sim_threshold is presumably a cosine-similarity cut-off — confirm.
# The recorded run with 0.85 produced an empty Series.
faq.get_most_confused_questions(cos_sim_threshold=0.85) # with mean-match

(788, 79)
(788,)
Series([], Name: question, dtype: object)


In [156]:
# Report the class pairs that are most often mistaken for each other.
# NOTE(review): n_of_common_misses=3 presumably keeps only pairs with at least 3 shared
# misses (the recorded output lists counts of 3 and above) — confirm.
# NOTE(review): the save path is spelled "missclassified" and has no file extension;
# it is left unchanged because existing files may already use that name.
path_to_save = "780_upv_questions/most_missclassified_class_pairs"
faq.get_most_misclassified_class_pairs(n_of_common_misses=3, save_path=path_to_save)

(0, 66) 4
(19, 55) 3
(19, 42) 3
(22, 7) 3
(30, 55) 4
(31, 69) 3
(34, 33) 3
(36, 57) 5
(38, 39) 3
(40, 4) 3
(45, 55) 3
(46, 0) 3
(53, 62) 6
(55, 74) 3
(56, 0) 3
(58, 68) 7
(65, 55) 3
(72, 14) 4
(77, 3) 5
['0:66', '19:55', '19:42', '22:7', '30:55', '31:69', '34:33', '36:57', '38:39', '40:4', '45:55', '46:0', '53:62', '55:74', '56:0', '58:68', '65:55', '72:14', '77:3']


In [157]:
# Build the TF-IDF classifier from the expanded question spreadsheet.
# The commented-out line below is an alternative input file.
questions_xlsx = "780_upv_questions/expanded_data_all2.xlsx"
# questions_xlsx = "upv_faq/Q50_questions.xlsx"
c = TFIDF_Classifier(questions_xlsx)

In [158]:
# Split off test sentences with test_data_percent=0.1, then build the TF-IDF matrix.
# NOTE(review): 0.1 presumably means 10 % of the sentences are held out, and
# get_TFIDF_matrix() presumably uses the remaining data — confirm in tfidf_classifier.
test_data = c.structure_data(test_data_percent=0.1)

# Prints the shape of the resulting matrix (see the recorded output).
c.get_TFIDF_matrix()

TFIDF matrix shape: (79, 809)


In [159]:
# Smoke test: classify one free-form sentence; the recorded run returned 73.
# NOTE(review): the returned value is presumably a class id — confirm.
c.classify_sentence("Co by nastalo kdyby patent byl zrušen?")

73

In [160]:
# Classify the test sentences produced by structure_data(); the recorded run
# printed a "Got right" rate for them.
c.classify_test_sentences_list(test_data)

Got right: 0.41025641025641024


In [161]:
# Repeat the evaluation with test_data_percent=1. Per the inline comment, the test
# sentences are not removed from the training data in this mode.
# NOTE(review): if so, the "Got right" figure is not a held-out estimate — confirm.
# This overwrites `test_data` and rebuilds the TF-IDF matrix held by `c`.
test_data = c.structure_data(test_data_percent=1) # without removal from train data
c.get_TFIDF_matrix()
c.classify_test_sentences_list(test_data)

TFIDF matrix shape: (79, 848)
Got right: 0.8439086294416244
