In [19]:
from pymystem3 import Mystem
import pymorphy2
import codecs

def read_stop_words(file_name):
    """Load a whitespace-separated stop-word list from a UTF-8 text file.

    Args:
        file_name: Path of the stop-word file. It is decoded with the
            ``utf_8_sig`` codec, so a leading byte-order mark is dropped.

    Returns:
        list[str]: The lower-cased words in file order (duplicates are kept).
    """
    # The context manager closes the handle; previously it was left open.
    with codecs.open(file_name, "r", "utf_8_sig") as file:
        return file.read().lower().split()


In [20]:
import os
import codecs
import re
import string

### ANSWER PARSER
def GetNormalForm(text):
    """Return the first Mystem lemma of every analysable token in ``text``.

    The function reads the module-level name ``mystem`` and calls its
    ``analyze`` method.
    NOTE(review): ``mystem`` is not assigned in the visible part of the
    notebook - presumably it is a ``Mystem()`` instance; confirm before use.

    Args:
        text: The string to lemmatise. Any non-string value yields ``[]``.

    Returns:
        list[str]: ``analysis[0]["lex"]`` of each token that has a non-empty
        ``"analysis"`` list, in token order.
    """
    if not isinstance(text, str):
        return []

    normal_forms = []
    for token in mystem.analyze(text):
        # Tokens may be empty, may lack the "analysis" key altogether, or may
        # carry an empty analysis list; none of them has a lemma to report.
        # Indexing token["analysis"] directly raised KeyError for the
        # key-less case.
        analysis = token.get("analysis")
        if not analysis:
            continue

        normal_forms.append(analysis[0]["lex"])

    return normal_forms

def GetNormalFormPymorphy2(text):
    """Lemmatise ``text`` word by word with the module-level ``morph`` analyzer.

    Args:
        text: String that is split on whitespace; each piece is passed to
            ``morph.parse``.

    Returns:
        list[str]: ``normal_form`` of the first parse of every word for which
        ``morph.parse`` returned at least one parse, in word order.
    """
    # Lazily parse one word at a time, in the same order as the words appear.
    parses_per_word = (morph.parse(token) for token in text.split())
    return [parses[0].normal_form for parses in parses_per_word if len(parses) > 0]
            
def clean_text_data(text):
    """Strip punctuation, lower-case the text and drop stop words.

    Args:
        text: The raw sentence string.

    Returns:
        str: The remaining words joined by single spaces. A word is removed
        when it is found in the module-level ``stop_words`` collection.
    """
    # ASCII punctuation plus the quotation marks that string.punctuation lacks.
    removable = string.punctuation + '»' + '«' + '“' + '„'
    lowered = text.translate(str.maketrans('', '', removable)).lower()

    return ' '.join(token for token in lowered.split() if token not in stop_words)
def answer_parser(file_name):
    """Extract the cleaned sentences of every story from an XML-like file.

    The file is decoded as ``utf_8_sig`` and processed with regular
    expressions rather than an XML parser: line breaks, ``<metadata ...>``
    tags and ``<querries>...</querries>`` sections are removed first, then
    each ``<story ...>...</story>`` span is scanned for ``<sentence ...>``
    elements.

    Args:
        file_name: Path of the file to read.

    Returns:
        dict[str, list[str]]: Maps a story id (the digits after
        ``init_doc_id=`` when present, otherwise the digits after
        ``story id=``) to the list of its sentence texts, each passed through
        ``clean_text_data``. A later story with the same id replaces the
        earlier one.

    Raises:
        AttributeError: If a story has neither id attribute, or a sentence
            does not match ``<sentence id=...>``.
    """
    # Close the handle deterministically; it used to be leaked.
    with codecs.open(file_name, "r", "utf_8_sig") as file:
        text = file.read()

    # Remove line breaks so that "." in the patterns below can span the former
    # lines. Both CRLF and bare LF are stripped; with CRLF only, LF-terminated
    # files kept their newlines and no story was ever matched.
    text = re.sub(r"\r?\n", "", text)
    # Remove metadata tags.
    text = re.sub(r"<metadata(.*?)>", "", text)
    # Remove the query section.
    text = re.sub(r"<querries>(.*?)</querries>", "", text)

    answer_data = dict()
    for story in re.findall(r"(<story.*?)</story>", text):
        # Prefer init_doc_id; fall back to the story's own id attribute.
        id_match = re.search(r"init_doc_id=(\d*)", story)
        if id_match is None:
            id_match = re.search(r"story id=(\d*)", story)
        story_id = id_match.group(1)

        answer_data[story_id] = []
        for sentence in re.findall(r"(<sentence.*?)</sentence>", story):
            sent_data = re.search(r"<sentence id=(\d*)(.*?)>(.*)", sentence)
            # Group 3 is everything after the opening tag, i.e. the text.
            answer_data[story_id].append(clean_text_data(sent_data.group(3)))

    return answer_data


In [21]:
def parse_mapping(file_name):
    """Read the document-id to story-id mapping from an XML-like file.

    Every ``<pair>...</pair>`` span is expected to hold one ``<id>`` element
    made of digits and any number of ``<doc_id>`` elements.

    Args:
        file_name: Path of the mapping file, decoded as ``utf_8_sig``.

    Returns:
        dict[str, str]: Maps the text of each ``<doc_id>`` to the digits of
        the ``<id>`` in the same pair. A ``doc_id`` that occurs in several
        pairs keeps the id of the last one.

    Raises:
        AttributeError: If a pair has no ``<id>digits</id>`` element.
    """
    # Close the handle deterministically; it used to be leaked.
    with codecs.open(file_name, "r", "utf_8_sig") as file:
        text = file.read()

    # Strip CRLF and bare LF so that a <pair> spread over several lines is
    # matched by the "." based pattern below.
    text = re.sub(r"\r?\n", "", text)

    mapping = dict()
    for pair in re.findall(r"<pair>(.*?)</pair>", text):
        story_id = re.search(r"<id>(\d*)</id>", pair).group(1)
        for query in re.findall(r"<doc_id>(.*?)</doc_id>", pair):
            mapping[query] = story_id

    return mapping

In [22]:
def create_ngramms(input_strs, N):
    """Build the set of distinct word N-grams of a token sequence.

    Args:
        input_strs: Sequence of word strings, e.g. the result of ``str.split``.
        N: Number of consecutive words per N-gram.

    Returns:
        set[str]: Every run of ``N`` consecutive words joined by single
        spaces. The set is empty when the sequence has fewer than ``N`` words.
    """
    unique_ngramms = set()
    for start in range(len(input_strs) - N + 1):
        first_word = input_strs[start]
        following_words = input_strs[start + 1:start + N]
        unique_ngramms.add(" ".join([first_word, *following_words]))

    return unique_ngramms

In [54]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy
from gensim.models import Word2Vec
import time

# Metric keys that save_evaluation_in_file writes to the report, in output
# order. "Psent2" is listed so that it matches the metrics dictionary built in
# the evaluation cell; metrics whose per-story dict is empty are skipped when
# the report is written.
metrics_names = ["R1", "P1", "F1", "Psent1", "R2", "P2", "F2", "Psent2"]

def compute_RPF(retr_ngrams_by_sent, rel_ngrams_by_sent):
    """Compute recall, precision and F-measure over pooled N-gram sets.

    The per-sentence collections of each side are merged into one set, and
    the scores are derived from the size of the intersection of the two sets.

    Args:
        retr_ngrams_by_sent: Iterable of N-gram collections, one per
            retrieved sentence.
        rel_ngrams_by_sent: Iterable of N-gram collections, one per relevant
            (reference) sentence.

    Returns:
        list: ``[R, P, F]``. All three are ``0`` when either side is empty or
        the two sides share no N-gram.
    """
    all_retr = set().union(*retr_ngrams_by_sent)
    all_rel = set().union(*rel_ngrams_by_sent)

    overlap = len(all_rel & all_retr)
    if overlap == 0:
        # Covers empty inputs and disjoint sets. The disjoint case used to
        # divide by P + R == 0 and raise ZeroDivisionError.
        return [0, 0, 0]

    P = overlap / len(all_retr)
    R = overlap / len(all_rel)
    F = 2 * P * R / (P + R)

    return [R, P, F]
        
def create_embeding_for_ngramm(ngramm, model):
    """Average the word vectors of the in-vocabulary lemmas of an N-gram.

    Args:
        ngramm: Space-separated words.
        model: Object exposing ``wv.vocab`` for membership tests and
            ``model[word]`` for vector lookup.

    Returns:
        numpy.ndarray: Vector of length 100. It stays all zeros when no lemma
        is found in the vocabulary.
        NOTE(review): 100 is hard-coded and presumably equals the model's
        vector size - confirm.
    """
    accumulated = numpy.zeros(100)
    known_words = 0

    for token in ngramm.split():
        # Only the first lemma proposed for the word is looked up.
        lemma = GetNormalFormPymorphy2(token)[0]
        if lemma not in model.wv.vocab:
            continue
        accumulated += model[lemma]
        known_words += 1

    return accumulated / known_words if known_words > 0 else accumulated

def create_embeddings(ngramms, model):
    """Embed every N-gram of a collection.

    Args:
        ngramms: Iterable of N-gram strings.
        model: Word-vector model passed through to
            ``create_embeding_for_ngramm``.

    Returns:
        list: One embedding per N-gram, in iteration order.
    """
    return [create_embeding_for_ngramm(ngramm, model) for ngramm in ngramms]

def sentences_intersect_by_embedd(sent_retr, sent_rel):
    """Count reference N-gram embeddings that have a close retrieved match.

    Args:
        sent_retr: Embeddings of the retrieved sentence's N-grams.
        sent_rel: Embeddings of the reference sentence's N-grams.

    Returns:
        int: Number of vectors in ``sent_rel`` whose cosine similarity with at
        least one vector of ``sent_retr`` is greater than 0.5.
    """
    red_line = 0.5

    def has_close_match(rel_vector):
        # any() stops at the first retrieved vector above the threshold.
        return any(
            cosine_similarity([rel_vector], [retr_vector])[0][0] > red_line
            for retr_vector in sent_retr
        )

    return sum(1 for rel_vector in sent_rel if has_close_match(rel_vector))

def compute_Psent(retr_ngrams_by_sent, rel_ngrams_by_sent, model):
    """Share of reference sentences that are covered by a retrieved sentence.

    A reference sentence counts as covered when, for some retrieved sentence,
    more than 70% of its N-gram embeddings have a close match according to
    ``sentences_intersect_by_embedd``.

    Side effects:
        Adds the embedding-construction time to the module-level
        ``create_embedings_count`` and the total run time to the module-level
        ``all_p_sent_time``. Both are created with value 0 when they do not
        exist yet.

    Args:
        retr_ngrams_by_sent: N-gram collections, one per retrieved sentence.
        rel_ngrams_by_sent: N-gram collections, one per reference sentence.
        model: Word-vector model forwarded to ``create_embeddings``.

    Returns:
        float: ``covered / len(rel_ngrams_by_sent)``, or ``0.0`` when there
        are no reference sentences.
    """
    global create_embedings_count
    global all_p_sent_time
    # The accumulators are not initialised anywhere visible; without these
    # defaults the first "+=" raises NameError.
    module_state = globals()
    module_state.setdefault("create_embedings_count", 0)
    module_state.setdefault("all_p_sent_time", 0)

    # time.clock() was removed in Python 3.8; perf_counter() is its
    # documented replacement for measuring elapsed time.
    start = time.perf_counter()
    red_line = 0.7

    vecs_for_set_sentence_rel = [
        create_embeddings(sentence_ngramms, model) for sentence_ngramms in rel_ngrams_by_sent
    ]
    vecs_for_set_sentence_retr = [
        create_embeddings(sentence_ngramms, model) for sentence_ngramms in retr_ngrams_by_sent
    ]
    create_embedings_count += time.perf_counter() - start

    success_hits = 0
    for rel_sentence_embedds in vecs_for_set_sentence_rel:
        if len(rel_sentence_embedds) == 0:
            # A reference sentence without N-grams cannot be covered and would
            # otherwise cause a division by zero below.
            continue
        for retr_sentence_embedds in vecs_for_set_sentence_retr:
            intersect_count = sentences_intersect_by_embedd(retr_sentence_embedds, rel_sentence_embedds)
            if intersect_count / len(rel_sentence_embedds) > red_line:
                success_hits += 1
                break

    all_p_sent_time += time.perf_counter() - start

    if len(rel_ngrams_by_sent) == 0:
        return 0.0
    return success_hits / len(rel_ngrams_by_sent)
    
def compute_all_metrics(metrics, answers, golds, mapping, model):
    """Fill ``metrics`` with unigram and bigram recall/precision/F per story.

    For every story in ``answers`` the reference sentences are looked up as
    ``golds[mapping[story]]``. N-gram sets are built per sentence for N = 1
    and N = 2 and scored with ``compute_RPF``. Progress is printed as a
    percentage on one line (terminated by ``"\\r"``).

    Args:
        metrics: Dict with the keys ``"R1"``, ``"P1"``, ``"F1"``, ``"R2"``,
            ``"P2"`` and ``"F2"``, each mapping to a dict that receives one
            value per story. It is modified in place.
        answers: Dict mapping a story key to its retrieved sentences.
        golds: Dict mapping a gold story id to its reference sentences.
        mapping: Dict mapping an ``answers`` key to a ``golds`` key.
        model: Word-vector model. Unused while the Psent metrics are disabled.

    Returns:
        dict: The same ``metrics`` object.

    Raises:
        KeyError: If a story is missing from ``mapping`` or its gold id is
            missing from ``golds``.
    """
    all_size = len(answers)
    # Counting from 1 makes the last report read 100%. The counter used to be
    # printed before it was incremented, so the output stopped short of 100%.
    for done, story in enumerate(answers, start=1):
        retrieved_sentences = answers[story]
        relevant_sentences = golds[mapping[story]]

        # The same scoring is applied to unigrams and bigrams.
        for ngramm_size in (1, 2):
            retr_ngrams_by_sent = [
                create_ngramms(sentence.split(), ngramm_size) for sentence in retrieved_sentences
            ]
            rel_ngrams_by_sent = [
                create_ngramms(sentence.split(), ngramm_size) for sentence in relevant_sentences
            ]

            recall, precision, f_measure = compute_RPF(retr_ngrams_by_sent, rel_ngrams_by_sent)
            suffix = str(ngramm_size)
            metrics["R" + suffix][story] = recall
            metrics["P" + suffix][story] = precision
            metrics["F" + suffix][story] = f_measure
            # The "Psent" + suffix metric (compute_Psent) is disabled here.

        print(str(100 * done / all_size) + "%", end="\r")

    return metrics
        
        
def compute_mean_for_metric(metric):
    """Return the arithmetic mean of a metric's per-story values.

    Args:
        metric: Dict mapping a story key to a numeric value.

    Returns:
        The mean of the values, or ``0`` for an empty dict.
    """
    total = sum(metric[story] for story in metric)
    return total / len(metric) if len(metric) > 0 else total

def create_tag(tag_name, params, data):
    """Render ``<tag_name key=value ...>data</tag_name>`` followed by a newline.

    Attribute values are written without quotes and nothing is escaped.

    Args:
        tag_name: Name of the element.
        params: Mapping of attribute names to values; an empty container
            produces no attributes.
        data: Element content, converted with ``str``.

    Returns:
        str: The serialised element terminated by ``"\\n"``.
    """
    attributes = "".join(" " + key + "=" + str(params[key]) for key in params)
    return "<" + tag_name + attributes + ">" + str(data) + "</" + tag_name + ">\n"

def save_evaluation_in_file(configuration, results, file_name):
    """Append one ``<run>`` report with mean and per-story metrics to a file.

    The report contains an empty ``<configuration>`` element and a
    ``<stories>`` element holding a ``<means>`` block followed by one
    ``<story id=...>`` block per story. Only metrics listed in the
    module-level ``metrics_names`` are written, in that order.

    Args:
        configuration: Currently unused; the configuration element is always
            written empty.
        results: Dict mapping a metric name to a dict of per-story values.
            Metrics that are missing or have no values are skipped.
        file_name: Path of the report file, opened in append mode.
    """
    newline = "\n"
    config_tag = create_tag("configuration", {}, "")

    # Regroup results[metric][story] into result_by_story[story][metric].
    # Stories keep first-seen order, so the report no longer depends on the
    # iteration order of a set.
    result_by_story = {}
    for metric, per_story in results.items():
        for story, value in per_story.items():
            result_by_story.setdefault(story, {})[metric] = value

    # <means>: one element per metric that has at least one value.
    mean_metric_data = ""
    for metric in metrics_names:
        per_story = results.get(metric)
        if per_story:
            mean_metric_data += create_tag(metric, {}, compute_mean_for_metric(per_story))
    means_tag = create_tag("means", {}, newline + mean_metric_data)

    # <story id=...>: the metrics available for that story.
    story_tags = []
    for story, story_metrics in result_by_story.items():
        metric_data = newline
        for metric in metrics_names:
            if metric in story_metrics:
                metric_data += create_tag(metric, {}, story_metrics[metric])
        story_tags.append(create_tag("story", {"id": story}, metric_data))

    stories_tag = create_tag("stories", {}, newline + means_tag + "".join(story_tags))
    run_tag = create_tag("run", {}, newline + config_tag + stories_tag)

    # Closing the file flushes the report; the handle used to be left open.
    with open(file_name, "a") as file:
        file.write(run_tag)
    
    

In [24]:
# CONFIGURATION: file locations, the external executable and shared NLP resources.
# NOTE(review): every path below is an absolute Windows path; adjust them
# before running the notebook on another machine.
answer_folder = "C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\saved\\"
answer_file_name = answer_folder + "answer.xml"
gold_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\gold1.xml"
mapping_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\id_to_querry.xml"
stop_words_file_name = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\stop_words.txt"
file_for_saving = "C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\evaluation.txt"
exe_folder = "C:\\Qt\\5.5\\mingw492_32\\bin\\"
exe_file = exe_folder + "TemporalSummarization.exe"
config_file_name = "C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\TemporalSummarization\\start_config.xml"
w2v_file_name = "C:\\Users\\MishaDEV\\Data\\news_corp_tr_last_w5_s100_c10.bin"

# Analyzer read by GetNormalFormPymorphy2 through the global name `morph`.
morph = pymorphy2.MorphAnalyzer()
# Stop-word list read by clean_text_data through the global name `stop_words`.
stop_words = read_stop_words(stop_words_file_name)

# The model is loaded from the same embeddings file that is passed to the
# executable with "-e"; reuse the constant instead of repeating the literal so
# the two paths cannot drift apart.
path_to_w2v = w2v_file_name
w2v_model = Word2Vec.load_word2vec_format(path_to_w2v, binary=True)

In [31]:
# PARSE QUERIES
# `mapping` maps every <doc_id> value of the mapping file to the <id> of its
# <pair> (see parse_mapping); its keys are the query ids that are later put on
# the command line of the executable.
mapping = parse_mapping(mapping_file_name)
queries = mapping.keys()

In [32]:
import subprocess
# Command line of the executable:
#   <exe> <number of queries> <query id>... -a <answer file> -c <config file> -e <embeddings file>
args_tail = ["-a", answer_file_name, "-c", config_file_name, "-e", w2v_file_name]
args = [exe_file, str(len(queries))] + [str(query) for query in queries] + args_tail

run
end


In [None]:
# CALL TSS: run the external executable (exe_file) with the argument list `args`.
# subprocess.call blocks until the process exits and returns its exit code.
# NOTE(review): the exit code is not checked here, so a failed run is only
# visible in the cell output.
subprocess.call(args)

In [33]:
# PARSE ANSWERS
# Both files are read with answer_parser, which returns {story id: [cleaned sentences]}.
# `answers` holds the sentences from answer_file_name, `golds` those from gold_file_name.
answers = answer_parser(answer_file_name)
golds = answer_parser(gold_file_name)

In [55]:
# EVALUATE
# One dict of per-story values for each metric name; compute_all_metrics fills it in place.
metrics = {"R1" : {}, "P1" : {}, "F1" : {}, "Psent1" : {}, "R2" : {}, "P2" : {}, "F2" : {}, "Psent2" : {}}

# Prints a progress percentage while the stories are scored.
compute_all_metrics(metrics, answers, golds, mapping, w2v_model) 

# Appends a <run> report to file_for_saving; the configuration argument is passed as an empty string.
save_evaluation_in_file("", metrics, file_for_saving)

0.0%6.666666666666667%13.333333333333334%20.0%26.666666666666668%33.333333333333336%40.0%46.666666666666664%53.333333333333336%60.0%66.66666666666667%73.33333333333333%80.0%86.66666666666667%93.33333333333333%