In [101]:
# Input locations for the evaluation. These are absolute Windows paths; adjust them
# before running the notebook on another machine.
# System output to evaluate: answer.xml inside answer_folder.
answer_folder = "C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\build-TemporalSummarization-Desktop_Qt_5_5_1_MinGW_32bit-Release\\"
answer_file_name = answer_folder + "answer.xml"
# Gold summaries (parsed with the same answer_parser as the system output).
gold_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\gold1.xml"
# Mapping file read by parse_mapping (doc_id -> id).
mapping_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\id_to_querry.xml"
# Whitespace-separated stop-word list read by read_stop_words.
stop_words_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\stop_words.txt"

In [102]:
from pymystem3 import Mystem
import pymorphy2
import codecs

# Module-level pymorphy2 analyzer, created once; GetNormalFormPymorphy2 uses it to lemmatise words.
morph = pymorphy2.MorphAnalyzer()
def read_stop_words(file_name):
    """Read a whitespace-separated stop-word list.

    Args:
        file_name: path to a UTF-8 text file (a leading BOM is tolerated).

    Returns:
        list of str: the lower-cased words in file order.
    """
    # The context manager closes the handle even if reading or decoding fails;
    # the previous version left the file open.
    with codecs.open(file_name, "r", "utf_8_sig") as file:
        stop_words = file.read().lower().split()

    return stop_words

# Lower-cased stop-word list, loaded once; clean_text_data filters words against it.
stop_words = read_stop_words(stop_words_file_name)

In [103]:
import os
import codecs
import re
import string

### ANSWER PARSER
# Module-level Mystem instance used by GetNormalForm.
# NOTE(review): entire_input=False presumably restricts the output to analysed tokens — confirm against pymystem3 docs.
mystem = Mystem(entire_input=False)
def GetNormalForm(text) :
    """Lemmatise ``text`` with the module-level ``mystem`` analyzer.

    Args:
        text: the string to analyse; any value whose type is not exactly
            ``str`` yields an empty list.

    Returns:
        list: the ``lex`` field of the first analysis of every token that
        mystem returned with at least one analysis, in token order.
    """
    # Guard clause: only plain ``str`` input is analysed.
    if type(text) is not str :
        return []

    return [
        token_info["analysis"][0]["lex"]
        for token_info in mystem.analyze(text)
        # Skip empty entries and tokens for which no analysis was produced.
        if len(token_info) != 0 and len(token_info["analysis"]) != 0
    ]

def GetNormalFormPymorphy2(text):
    """Lemmatise every whitespace-separated word of ``text`` with pymorphy2.

    Args:
        text: input string; it is split on whitespace.

    Returns:
        list: the ``normal_form`` of the first parse of each word, in order.
        Words for which the module-level ``morph`` analyzer returns no parse
        are left out.
    """
    lemmas = []
    for token in text.split():
        parses = morph.parse(token)
        if len(parses) == 0:
            # No parse available for this token; nothing to append.
            continue
        lemmas.append(parses[0].normal_form)

    return lemmas
            
def clean_text_data(text):
    """Strip punctuation, lower-case the text and drop stop words.

    Args:
        text: the raw sentence text.

    Returns:
        str: the remaining words joined by single spaces. Removed characters
        are ``string.punctuation`` plus the four quotation marks listed
        below; words found in the module-level ``stop_words`` are dropped.
    """
    # ASCII punctuation extended with the typographic quotes handled by the notebook.
    chars_to_remove = string.punctuation + ''.join(('»', '«', '“', '„'))
    removal_table = str.maketrans('', '', chars_to_remove)

    lowered = text.translate(removal_table).lower()
    kept_words = [token for token in lowered.split() if token not in stop_words]

    return ' '.join(kept_words)
def answer_parser(file_name):
    """Parse a summary XML file into cleaned sentences grouped by story.

    The file is processed with regular expressions rather than an XML parser:
    line breaks are removed, ``<metadata ...>`` tags and the
    ``<querries>...</querries>`` section are stripped, and every
    ``<story ...>...</story>`` element is scanned for ``<sentence id=N ...>``
    elements.

    Args:
        file_name: path to the UTF-8 (optionally BOM-prefixed) XML file.

    Returns:
        dict: story id (str) -> list of sentence texts, each passed through
        ``clean_text_data``. The id is taken from ``init_doc_id=`` when the
        story tag has it, otherwise from ``story id=``.

    Raises:
        AttributeError: if a story carries neither id attribute, or a
            sentence does not match the ``<sentence id=...>`` pattern
            (same behavior as before).
    """
    # The context manager closes the handle even if reading fails.
    with codecs.open(file_name, "r", "utf_8_sig") as file:
        text = file.read()

    # Delete line breaks. Both CRLF and bare LF are removed: the patterns below
    # do not use DOTALL, so any remaining newline would stop ".*?" from
    # spanning an element and LF-terminated files would yield no stories.
    text = re.sub(r"\r?\n", "", text)
    # Delete metadata tags.
    text = re.sub(r"<metadata(.*?)>", "", text)
    # Delete the query section.
    text = re.sub(r"<querries>(.*?)</querries>", "", text)

    answer_data = dict()
    for story in re.findall(r"(<story.*?)</story>", text):
        # Prefer init_doc_id; fall back to the story's own id. Each pattern is
        # evaluated at most once.
        id_match = re.search(r"init_doc_id=(\d*)", story) or re.search(r"story id=(\d*)", story)
        story_id = id_match.group(1)

        answer_data[story_id] = []
        for sentence in re.findall(r"(<sentence.*?)</sentence>", story):
            sent_data = re.search(r"<sentence id=(\d*)(.*?)>(.*)", sentence)
            # Group 3 is everything after the opening tag, i.e. the sentence text.
            answer_data[story_id].append(clean_text_data(sent_data.group(3)))

    return answer_data


In [104]:
def parse_mapping(file_name):
    """Read the id mapping file into a dictionary.

    Every ``<pair>...</pair>`` element is expected to hold one ``<id>`` and
    any number of ``<doc_id>`` elements.

    Args:
        file_name: path to the UTF-8 (optionally BOM-prefixed) mapping file.

    Returns:
        dict: ``<doc_id>`` text (str) -> ``<id>`` digits (str) of the same pair.

    Raises:
        AttributeError: if a pair has no ``<id>`` element made of digits
            (same behavior as before).
    """
    # The context manager closes the handle even if reading fails.
    with codecs.open(file_name, "r", "utf_8_sig") as file:
        text = file.read()

    # Remove CRLF and bare LF line breaks: the patterns below do not use
    # DOTALL, so a pair spread over several lines would otherwise not match.
    text = re.sub(r"\r?\n", "", text)

    mapping = dict()
    for pair in re.findall(r"<pair>(.*?)</pair>", text):
        story_id = re.search(r"<id>(\d*)</id>", pair).group(1)
        for query in re.findall(r"<doc_id>(.*?)</doc_id>", pair):
            mapping[query] = story_id

    return mapping

In [105]:
# Load the evaluation inputs.
# mapping: doc_id -> id, used by compute_all_metrics to look up the gold entry for an answer story.
mapping = parse_mapping(mapping_file_name)
# answers: system output; golds: reference summaries. Both are story id -> list of cleaned sentences.
answers = answer_parser(answer_file_name)
golds = answer_parser(gold_file_name)

In [106]:
def create_ngramms(input_strs, N):
    """Collect the distinct word N-grams of a token sequence.

    Args:
        input_strs: sequence of tokens (strings).
        N: number of consecutive tokens per N-gram.

    Returns:
        set: every run of ``N`` consecutive tokens, joined by single spaces.
        Empty when the sequence holds fewer than ``N`` tokens.
    """
    unique_ngramms = set()
    window_count = len(input_strs) - N + 1
    for start in range(window_count):
        # Start with the first token of the window, then append the rest.
        joined = input_strs[start]
        for token in input_strs[start + 1:start + N]:
            joined += " " + token
        unique_ngramms.add(joined)

    return unique_ngramms

In [107]:
from gensim.models import Word2Vec
# Binary word2vec model used for the embedding-based Psent metric (absolute local path).
# The embedding code in this notebook allocates 100-dimensional vectors, so the model is
# presumably 100-dimensional — verify against the file.
path_to_w2v = "C:\\Users\\MishaDEV\\Data\\news_corp_tr_last_w5_s100_c10.bin"
# NOTE(review): newer gensim releases expose this loader as KeyedVectors.load_word2vec_format —
# confirm against the installed gensim version.
w2v_model = Word2Vec.load_word2vec_format(path_to_w2v, binary=True)


In [118]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy

def compute_RPF(retr_ngrams_by_sent, rel_ngrams_by_sent):
    """Compute recall, precision and F-measure over pooled n-grams.

    The per-sentence n-gram collections of each side are merged into one set
    before comparison, so sentence boundaries do not matter here.

    Args:
        retr_ngrams_by_sent: iterable of n-gram collections, one per
            retrieved (system) sentence.
        rel_ngrams_by_sent: iterable of n-gram collections, one per relevant
            (gold) sentence.

    Returns:
        list: ``[R, P, F]``. All three are 0 when either side has no n-grams
        or when the two sides share no n-gram.
    """
    all_retr = set()
    for sentence_data in retr_ngrams_by_sent:
        all_retr.update(sentence_data)

    all_rel = set()
    for sentence_data in rel_ngrams_by_sent:
        all_rel.update(sentence_data)

    if len(all_retr) == 0 or len(all_rel) == 0:
        return [0, 0, 0]

    overlap = len(all_rel.intersection(all_retr))
    if overlap == 0:
        # P and R are both zero; the harmonic mean below would divide by zero.
        return [0, 0, 0]

    P = overlap / len(all_retr)
    R = overlap / len(all_rel)
    F = 2 * P * R / (P + R)

    return [R, P, F]
        
def create_embeding_for_ngramm(ngramm, model):
    """Average the word vectors of the lemmatised words of an n-gram.

    Args:
        ngramm: n-gram as a space-separated string.
        model: word-vector model providing ``model.wv.vocab`` for membership
            tests and ``model[word]`` for vector lookup.

    Returns:
        numpy.ndarray: mean vector of the words found in the vocabulary, or
        a zero vector when none is found. Its length is
        ``model.vector_size`` when the model has that attribute, otherwise
        100 (the previously hard-coded size).
    """
    # Take the dimensionality from the model when it exposes one, so models of
    # other sizes work; 100 remains the fallback.
    vec = numpy.zeros(getattr(model, "vector_size", 100))

    count = 0
    for word in ngramm.split():
        normal_forms = GetNormalFormPymorphy2(word)
        if not normal_forms:
            # No lemma was produced for this word; indexing [0] would raise.
            continue
        n_form = normal_forms[0]
        if n_form in model.wv.vocab:
            vec += model[n_form]
            count += 1

    if count > 0:
        vec /= count

    return vec

def create_embeddings(ngramms, model):
    """Embed every n-gram of a collection.

    Args:
        ngramms: iterable of n-gram strings.
        model: word-vector model forwarded to ``create_embeding_for_ngramm``.

    Returns:
        list: one embedding per n-gram, in iteration order.
    """
    return [create_embeding_for_ngramm(single_ngramm, model) for single_ngramm in ngramms]

def sentences_intersect_by_embedd(sent_retr, sent_rel):
    """Count relevant n-gram embeddings that have a close retrieved match.

    Args:
        sent_retr: embeddings of the n-grams of one retrieved sentence.
        sent_rel: embeddings of the n-grams of one relevant sentence.

    Returns:
        int: number of embeddings in ``sent_rel`` whose cosine similarity to
        at least one embedding in ``sent_retr`` is above 0.5.
    """
    similarity_threshold = 0.5

    def has_close_match(rel_vector):
        # any() stops at the first retrieved vector above the threshold.
        return any(
            cosine_similarity([rel_vector], [retr_vector])[0][0] > similarity_threshold
            for retr_vector in sent_retr
        )

    return sum(1 for rel_vector in sent_rel if has_close_match(rel_vector))
            
def compute_Psent(retr_ngrams_by_sent, rel_ngrams_by_sent, model):
    """Fraction of relevant sentences matched by some retrieved sentence.

    A relevant sentence counts as matched when, for at least one retrieved
    sentence, more than 0.7 of its n-gram embeddings have a close
    counterpart there (see ``sentences_intersect_by_embedd``).

    Args:
        retr_ngrams_by_sent: list of n-gram collections, one per retrieved
            sentence.
        rel_ngrams_by_sent: list of n-gram collections, one per relevant
            sentence.
        model: word-vector model forwarded to ``create_embeddings``.

    Returns:
        Matched relevant sentences divided by the number of relevant
        sentences; 0 when there are no relevant sentences. A relevant
        sentence without n-grams can never be matched but still counts in
        the denominator.
    """
    red_line = 0.7

    if len(rel_ngrams_by_sent) == 0:
        # Nothing to match against; avoids dividing by zero at the end.
        return 0

    vecs_for_set_sentence_rel = [
        create_embeddings(rel_sentence_ngramms, model)
        for rel_sentence_ngramms in rel_ngrams_by_sent
    ]
    vecs_for_set_sentence_retr = [
        create_embeddings(retr_sentence_ngramms, model)
        for retr_sentence_ngramms in retr_ngrams_by_sent
    ]

    success_hits = 0
    for rel_sentence_embedds in vecs_for_set_sentence_rel:
        if len(rel_sentence_embedds) == 0:
            # An empty sentence has no n-grams to cover; computing its score
            # would divide by zero, so it is treated as a miss.
            continue
        for retr_sentence_embedds in vecs_for_set_sentence_retr:
            intersect_count = sentences_intersect_by_embedd(retr_sentence_embedds, rel_sentence_embedds)
            cur_score = intersect_count / len(rel_sentence_embedds)
            if cur_score > red_line:
                success_hits += 1
                break

    return success_hits / len(rel_ngrams_by_sent)
    
    
# Per-story results: metric name -> {story id -> value}; filled in by compute_all_metrics.
metrics = {"R1" : {}, "P1" : {}, "F1" : {}, "Psent1" : {}, "R2" : {}, "P2" : {}, "F2" : {}, "Psent2" : {}}

def _ngram_sets_by_sentence(sentences, n):
    """Return one set of word n-grams per sentence (sentences are split on whitespace)."""
    return [set(create_ngramms(sentence.split(), n)) for sentence in sentences]


def compute_all_metrics(metrics, answers, golds, mapping, model):
    """Fill ``metrics`` with per-story scores for every story in ``answers``.

    For each story the retrieved sentences are compared with the gold
    sentences found under ``golds[mapping[story]]``. Unigrams give R1/P1/F1
    and Psent1; bigrams give R2/P2/F2. Progress is written to stdout as a
    percentage on a single, carriage-return-refreshed line.

    Args:
        metrics: dict with the keys "R1", "P1", "F1", "Psent1", "R2", "P2",
            "F2", each mapping to a dict that is updated in place.
        answers: story id -> list of retrieved sentence strings.
        golds: gold story id -> list of relevant sentence strings.
        mapping: answer story id -> gold story id.
        model: word-vector model forwarded to ``compute_Psent``.

    Returns:
        dict: the same ``metrics`` object.

    Raises:
        KeyError: if a story is missing from ``mapping`` or its mapped id is
            missing from ``golds``.
    """
    all_size = len(answers)
    # Counting from 1 makes the progress reach 100% after the last story
    # (the earlier version printed before incrementing its counter).
    for count, story in enumerate(answers, start=1):
        retrieved_sentences = answers[story]
        relevant_sentences = golds[mapping[story]]

        # Unigram metrics.
        retr_ngrams_by_sent = _ngram_sets_by_sentence(retrieved_sentences, 1)
        rel_ngrams_by_sent = _ngram_sets_by_sentence(relevant_sentences, 1)
        RPF = compute_RPF(retr_ngrams_by_sent, rel_ngrams_by_sent)
        metrics["R1"][story] = RPF[0]
        metrics["P1"][story] = RPF[1]
        metrics["F1"][story] = RPF[2]
        metrics["Psent1"][story] = compute_Psent(retr_ngrams_by_sent, rel_ngrams_by_sent, model)

        # Bigram metrics.
        retr_ngrams_by_sent = _ngram_sets_by_sentence(retrieved_sentences, 2)
        rel_ngrams_by_sent = _ngram_sets_by_sentence(relevant_sentences, 2)
        RPF = compute_RPF(retr_ngrams_by_sent, rel_ngrams_by_sent)
        metrics["R2"][story] = RPF[0]
        metrics["P2"][story] = RPF[1]
        metrics["F2"][story] = RPF[2]
        # Psent2 was commented out in the notebook and is still not computed,
        # so metrics["Psent2"] is left untouched:
        #metrics["Psent2"][story] = compute_Psent(retr_ngrams_by_sent, rel_ngrams_by_sent, model)

        print(str(100 * count / all_size) + "%", end="\r")

    return metrics
        
        
def compute_mean_for_metric(metric):
    """Average the per-story values of one metric.

    Args:
        metric: dict mapping story id -> numeric value.

    Returns:
        The arithmetic mean of the values, or 0 for an empty dict.
    """
    story_count = len(metric)
    total = sum(metric[story] for story in metric)
    if story_count > 0:
        return total / story_count
    return total


In [119]:
# Fill the module-level `metrics` dict in place for every story in `answers`.
compute_all_metrics(metrics, answers, golds, mapping, w2v_model)  

# Print the mean of each metric over all stories.
# "Psent2" is never filled (its computation is commented out in compute_all_metrics),
# so its mean is reported as 0.
metrics_names = ["R1", "P1", "F1", "Psent1", "R2", "P2", "F2", "Psent2"]
for metric in metrics_names:
    mean = compute_mean_for_metric(metrics[metric])
    print(metric + " : " + str(mean))

R1 : 0.35575395973630547
P1 : 0.2047346988232779
F1 : 0.2559178145041231
Psent1 : 0.3162706492853552
R2 : 0.19743499603293263
P2 : 0.09119156268027792
F2 : 0.1227691541962909
Psent2 : 0


In [None]:
# PARSE
# Repeats the path setup, parsing and model loading from the cells above, so the
# inputs can be reloaded in one step. Paths are absolute Windows paths.
answer_folder = "C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\build-TemporalSummarization-Desktop_Qt_5_5_1_MinGW_32bit-Release\\"
answer_file_name = answer_folder + "answer.xml"
gold_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\gold1.xml"
mapping_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\id_to_querry.xml"
# NOTE(review): stop_words_file_name is reassigned here, but this cell does not reload
# `stop_words`; the list loaded earlier stays in effect.
stop_words_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\stop_words.txt"

# Re-parse the mapping, the system answers and the gold summaries.
mapping = parse_mapping(mapping_file_name)
answers = answer_parser(answer_file_name)
golds = answer_parser(gold_file_name)

# Reload the binary word2vec model.
path_to_w2v = "C:\\Users\\MishaDEV\\Data\\news_corp_tr_last_w5_s100_c10.bin"
w2v_model = Word2Vec.load_word2vec_format(path_to_w2v, binary=True)

In [None]:
#COMPUTE
# Start from empty per-story dicts so results from an earlier run are not mixed in.
metrics = {"R1" : {}, "P1" : {}, "F1" : {}, "Psent1" : {}, "R2" : {}, "P2" : {}, "F2" : {}, "Psent2" : {}}
compute_all_metrics(metrics, answers, golds, mapping, w2v_model)  

# Print the mean of each metric over all stories ("Psent2" is never filled, so it prints 0).
metrics_names = ["R1", "P1", "F1", "Psent1", "R2", "P2", "F2", "Psent2"]
for metric in metrics_names:
    mean = compute_mean_for_metric(metrics[metric])
    print(metric + " : " + str(mean))