In [41]:
import pymorphy2
from sklearn.metrics.pairwise import cosine_similarity
import numpy
from gensim.models import Word2Vec
import time
import subprocess
import codecs
import re
import string
import random

class Config:
    """Named parameter set that can be rendered as an XML-like ``<params>`` block.

    ``m_Params`` maps a parameter name to its current value; every known name
    starts out as ``None`` until it is assigned directly or through
    ``RandomParams``.
    """

    # Path of the file the rendered parameters are written to.
    m_ConfigFileName = "C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\TemporalSummarization\\start_config.xml"
    m_ParamsOrder = ["PDocCount", "QEQuerrySize"]

    def __init__(self):
        """Register every known parameter name with an unset (``None``) value."""
        known_names = (
            "PDocCount",
            "QEQuerrySize",
            "QETopLemms",
            "QEDocCount",
            "PSoftOr",
            "DIMinLinkScore",
            "PKeepL",
            "PKeepT",
            "PLambda",
            "QEMinDocRank",
            "PTemporalMode",
            "DocImportance",
            "DIAlpha",
            "DIPowerMethodDFactor",
            "TempMaxDailyAnswerSize",
            "DIDocBoundary",
            "PQuerryEx",
            "QEDEInitQuerrySize",
            "QEDoubleExtension",
        )
        # dict.fromkeys keeps the listed order, which is the serialization order.
        self.m_Params = dict.fromkeys(known_names)

    def CreateSimpleTag(self, tag, data):
        """Return ``<tag>data</tag>`` followed by a newline; ``data`` is stringified."""
        return "".join(["<", tag, ">", str(data), "</", tag, ">\n"])

    def CreateParamsTag(self):
        """Return all parameters, one tag per line, wrapped in a ``<params>`` tag."""
        inner = "".join(
            self.CreateSimpleTag(name, value) for name, value in self.m_Params.items()
        )
        return self.CreateSimpleTag("params", "\n" + inner)

    def RandomParams(self):
        """Overwrite the parameters with random draws from hard-coded ranges.

        Count-like parameters are truncated to ``int``; the two mode flags are
        decided by a fair coin; ``PQuerryEx`` and ``QEDoubleExtension`` are
        always ``"true"``.
        """
        draw = random.uniform
        values = self.m_Params

        values["PDocCount"] = int(draw(200, 800))
        values["QEQuerrySize"] = int(draw(5, 20))
        values["QETopLemms"] = int(draw(5, 25))
        values["QEDocCount"] = int(draw(20, 300))
        values["PSoftOr"] = draw(0.2, 0.6)
        values["DIMinLinkScore"] = draw(0.4, 1)
        values["PKeepL"] = int(draw(1, 10))
        values["PKeepT"] = int(draw(0, 5))
        values["PLambda"] = draw(0.3, 1)
        values["QEMinDocRank"] = draw(0.2, 0.7)
        # Boolean switches are stored as the strings "true"/"false".
        values["PTemporalMode"] = "true" if draw(0, 1) > 0.5 else "false"
        values["DocImportance"] = "true" if draw(0, 1) > 0.5 else "false"

        values["DIAlpha"] = draw(0.2, 0.7)
        values["DIPowerMethodDFactor"] = draw(0.2, 0.7)
        values["TempMaxDailyAnswerSize"] = int(draw(5, 25))
        values["DIDocBoundary"] = draw(0.4, 0.9)
        values["PQuerryEx"] = "true"
        values["QEDEInitQuerrySize"] = int(draw(2, 10))
        values["QEDoubleExtension"] = "true"

In [51]:
class Evaluator:
    """Runs the external summarizer executable and scores its answers against a gold file.

    Typical pipeline: parse_mapping() -> parse_gold() -> create_args(mode) ->
    save_config() -> call_tss() -> parse_answer() -> compute_all_metrics() ->
    save_evaluation_in_file().

    File locations are class-level constants declared at the bottom of the
    class.  Mutable state (configuration, metrics, query lists) is created per
    instance in __init__ so that two evaluators never share it.
    """

    def _read_text(self, file_name):
        """Return the content of ``file_name`` decoded as UTF-8 (BOM tolerated).

        Falls back to windows-1251 when the file is not valid UTF-8.
        """
        try:
            with codecs.open(file_name, "r", "utf-8-sig") as file:
                return file.read()
        except UnicodeDecodeError:
            with codecs.open(file_name, "r", "windows-1251") as file:
                return file.read()

    def read_stop_words(self, file_name):
        """Return the lower-cased, whitespace-separated words of ``file_name`` as a list."""
        with codecs.open(file_name, "r", "utf_8_sig") as file:
            return file.read().lower().split()

    def clean_text_data(self, text, stop_words):
        """Strip punctuation, lower-case ``text`` and drop every word found in ``stop_words``.

        Returns the surviving words joined by single spaces.
        """
        # ASCII punctuation plus the typographic quotes handled by this notebook.
        punct_set = string.punctuation + '»' + '«' + '“' + '„'
        translator = str.maketrans('', '', punct_set)

        # Membership tests on a list are linear; hash the words once instead.
        if isinstance(stop_words, (list, tuple)):
            stop_words = set(stop_words)

        words = text.translate(translator).lower().split()
        return ' '.join(word for word in words if word not in stop_words)

    def answer_parser(self, file_name, stop_words_file_name):
        """Parse a stories file into ``{story_id: [cleaned sentence text, ...]}``.

        The story id is the ``init_doc_id`` attribute when present, otherwise
        the ``story id`` attribute (both kept as strings).  Sentences whose
        opening tag carries no ``id`` attribute are skipped.

        Raises:
            ValueError: if a story carries neither kind of id.
        """
        stop_words = frozenset(self.read_stop_words(stop_words_file_name))
        text = self._read_text(file_name)

        # The patterns below are single-line, so drop line breaks first
        # (both CRLF and bare LF).
        text = re.sub(r"\r?\n", "", text)
        # del metadata
        text = re.sub(r"<metadata(.*?)>", "", text)
        #del querry data
        text = re.sub(r"<querries>(.*?)</querries>", "", text)

        answer_data = dict()
        for story in re.findall(r"(<story.*?)</story>", text):
            id_match = re.search(r"init_doc_id=(\d*)", story)
            if id_match is None:
                id_match = re.search(r"story id=(\d*)", story)
            if id_match is None:
                raise ValueError("story without an id in " + str(file_name))
            story_id = id_match.group(1)

            answer_data[story_id] = []
            for sentence in re.findall(r"(<sentence.*?)</sentence>", story):
                sent_data = re.search(r"<sentence id=(\d*)(.*?)>(.*)", sentence)
                if sent_data is None:
                    continue
                answer_data[story_id].append(self.clean_text_data(sent_data.group(3), stop_words))

        return answer_data

    def __init__(self):
        """Create the morphological analyzer and a private default configuration."""
        #self.m_W2Vmodel = Word2Vec.load_word2vec_format(self.m_W2VFileName, binary=True)
        self.m_Morph = pymorphy2.MorphAnalyzer()
        # Per-instance state: previously these objects lived on the class and
        # were silently shared (and mutated) by every Evaluator.
        self.m_Config = Config()
        self.m_Metrics = {name: {} for name in Evaluator.m_Metrics}
        self.m_FitQueries = []
        self.m_TestQueries = []
        self.m_Config.m_Params.update({
            "PDocCount": 500,
            "QEQuerrySize": 10,
            "QETopLemms": 15,
            "QETopTermins": 3,
            "QEDocCount": 50,
            "QESoftOr": 0.3,
            "PSoftOr": 0.3,
            "DIMinLinkScore": 0.65,
            "PKeepL": 5,
            "PKeepT": 2,
            "PLambda": 0.84,
            "QEMinDocRank": 0.3,
            "PTemporalMode": "false",
            "DocImportance": "false",
            "DIAlpha": 0.35,
            "DIPowerMethodDFactor": 0.35,
            "TempMaxDailyAnswerSize": 15,
            "DIDocBoundary": 0.7,
            "PQuerryEx": "true",
            "QEDEInitQuerrySize": 5,
            "QEDoubleExtension": "true",
        })

    def parse_mapping(self):
        """Load the story -> queries mapping and split the queries into fit/test sets.

        Fills ``m_StoryMapping`` (query id -> story id, both strings),
        ``m_TestQueries`` and ``m_FitQueries``.  For story ``i`` the query at
        position ``m_RandomPositions[i]`` is held out for testing; stories
        whose id has no entry in ``m_RandomPositions`` contribute to the fit
        set only.
        """
        self.m_StoryMapping = dict()
        with codecs.open(self.m_MappingFileName, "r", "utf_8_sig") as file:
            text = file.read()
        text = re.sub(r"\r?\n", "", text)

        story_to_queries = {}
        for pair in re.findall(r"<pair>(.*?)</pair>", text):
            story_id = re.search(r"<id>(\d*)</id>", pair).group(1)
            queries = re.findall(r"<doc_id>(.*?)</doc_id>", pair)
            story_to_queries[int(story_id)] = list(queries)
            for query in queries:
                self.m_StoryMapping[query] = story_id

        self.m_TestQueries = []
        self.m_FitQueries = []
        # Iterate over the ids actually present rather than assuming 0..n-1.
        for story_id in sorted(story_to_queries):
            if 0 <= story_id < len(self.m_RandomPositions):
                held_out = self.m_RandomPositions[story_id]
            else:
                held_out = None
            for position, query in enumerate(story_to_queries[story_id]):
                if position == held_out:
                    self.m_TestQueries.append(query)
                else:
                    self.m_FitQueries.append(query)

    def create_args(self, mode):
        """Build ``m_ArgsForRun``: executable, query count, query ids, then file options.

        ``mode == "Test"`` selects the held-out queries; any other value
        selects the fit queries.
        """
        queries = self.m_TestQueries if mode == "Test" else self.m_FitQueries

        args = [self.m_AppFileName, str(len(queries))]
        args.extend(str(query) for query in queries)

        args_tail = ["-a", self.m_AnswerFileName, "-c", self.m_Config.m_ConfigFileName, "-e", self.m_W2VFileName]
        self.m_ArgsForRun = args + args_tail

#----------Evaluate methods----------
    def create_ngramms(self, input_strs, N):
        """Return the set of space-joined N-grams of the token list ``input_strs``."""
        return set(" ".join(input_strs[i:i + N]) for i in range(0, len(input_strs) - N + 1))

    def compute_RPF(self, retr_ngrams_by_sent, rel_ngrams_by_sent):
        """Return ``[recall, precision, F-measure]`` of retrieved vs. relevant n-grams.

        Both arguments are iterables of per-sentence n-gram sets; each side is
        pooled into one set before comparing.  All three values are 0 when
        either pooled set is empty.
        """
        all_retr = set()
        for sentence_data in retr_ngrams_by_sent:
            all_retr.update(sentence_data)

        all_rel = set()
        for sentence_data in rel_ngrams_by_sent:
            all_rel.update(sentence_data)

        if not all_retr or not all_rel:
            return [0, 0, 0]

        common = len(all_rel.intersection(all_retr))
        P = common / len(all_retr)
        R = common / len(all_rel)
        F = 2 * P * R / (P + R) if P + R > 0 else 0
        return [R, P, F]

    def compute_all_metrics(self):
        """Score every story of ``m_AnswerData`` with unigram and bigram R/P/F.

        Results go to ``m_Metrics["R1"|"P1"|"F1"|"R2"|"P2"|"F2"][story]``.
        Metrics from earlier runs are discarded first, so a story missing from
        the current answer no longer contributes a stale value to the means.
        """
        self.m_Metrics = {name: {} for name in self.m_Metrics}

        all_size = len(self.m_AnswerData)
        for count, story in enumerate(self.m_AnswerData):
            retrieved_sentences = self.m_AnswerData[story]
            relevant_sentences = self.m_GoldData[self.m_StoryMapping[story]]

            for n in (1, 2):
                retr_ngrams_by_sent = [self.create_ngramms(sentence.split(), n) for sentence in retrieved_sentences]
                rel_ngrams_by_sent = [self.create_ngramms(sentence.split(), n) for sentence in relevant_sentences]

                RPF = self.compute_RPF(retr_ngrams_by_sent, rel_ngrams_by_sent)
                suffix = str(n)
                self.m_Metrics["R" + suffix][story] = RPF[0]
                self.m_Metrics["P" + suffix][story] = RPF[1]
                self.m_Metrics["F" + suffix][story] = RPF[2]

            # Progress indicator, rewritten in place thanks to the carriage return.
            print(str(100 * count / all_size) + "%", end="\r")

    def compute_mean_for_metric(self, metric):
        """Return the mean of the values of the ``{story: value}`` dict, or 0 if it is empty."""
        if len(metric) == 0:
            return 0
        return sum(metric.values()) / len(metric)

    def create_tag(self, tag_name, params, data):
        """Return ``<tag_name k=v ...>data</tag_name>`` followed by a newline.

        ``params`` supplies the attributes (values are written unquoted); an
        empty dict or empty list produces a tag without attributes.
        """
        attributes = "".join(" " + param + "=" + str(params[param]) for param in params)
        return "<" + tag_name + attributes + ">" + str(data) + "</" + tag_name + ">\n"

    def save_evaluation_in_file(self):
        """Append one ``<run>`` record (config, metric means, per-story metrics) to the log."""
        config_tag = self.m_Config.CreateParamsTag()

        # Regroup {metric: {story: value}} into {story: {metric: value}}.
        result_by_story = {}
        for metric, per_story in self.m_Metrics.items():
            for story_name, value in per_story.items():
                result_by_story.setdefault(story_name, {})[metric] = value

        newline = "\n"
        #create means_tag
        mean_metric_data = ""
        for metric in self.m_MetricsNames:
            if len(self.m_Metrics[metric]) > 0:
                mean_metric_data += self.create_tag(metric, {}, round(self.compute_mean_for_metric(self.m_Metrics[metric]), 4))
        means_tag = self.create_tag("means", {}, newline + mean_metric_data)

        #create story_tags
        story_tags = []
        for story, story_metrics in result_by_story.items():
            metric_data = newline
            for metric in self.m_MetricsNames:
                if metric in story_metrics:
                    metric_data += self.create_tag(metric, {}, round(story_metrics[metric], 4))
            story_tags.append(self.create_tag("story", {"id" : story}, metric_data))

        stories_tag = self.create_tag("stories", {}, newline + means_tag + "".join(story_tags))

        #create main tag
        run_tag = self.create_tag("run", {}, newline + config_tag + stories_tag)

        with open(self.m_EvaluationFileName, "a") as file:
            file.write(run_tag)

    def save_config(self):
        """Write the current parameters to the config file read by the executable."""
        # Close (and therefore flush) the file before the executable is launched.
        with open(self.m_Config.m_ConfigFileName, "w") as file:
            file.write(self.m_Config.CreateParamsTag())

    def call_tss(self):
        """Run the executable with ``m_ArgsForRun`` and wait for it to finish."""
        # NOTE(review): the exit code is ignored, so a failed run leaves the
        # previous answer file in place for parse_answer().
        subprocess.call(self.m_ArgsForRun)

    def parse_answer(self):
        """Load the executable's answer file into ``m_AnswerData``."""
        self.m_AnswerData = self.answer_parser(self.m_AnswerFileName, self.m_StopWordsFileName)

    def parse_gold(self):
        """Load the gold summaries into ``m_GoldData``."""
        self.m_GoldData = self.answer_parser(self.m_GoldFileName, self.m_StopWordsFileName)

    #----------shared members----------
    #files
    m_AnswerFileName = "C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\Data\\answer.xml"
    m_GoldFileName = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\gold1.xml"
    m_MappingFileName = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\id_to_querry.xml"
    m_StopWordsFileName = "C:\\!DEV\\C++\\Diplom\\GoldSummary\\stop_words.txt"
    m_W2VFileName = "C:\\Users\\MishaDEV\\Data\\news_corp_tr_last_w5_s100_c10.bin"
    m_EvaluationFileName = "C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\Data\\evaluation.txt"
    m_AppFileName = "C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\x64\\Release\\TemporalSummarization.exe"
    #useful objects
    m_Morph = None
    m_W2Vmodel = None
    #other
    m_ArgsForRun = None
    m_FitQueries = []
    m_TestQueries = []
    m_StoryMapping = None
    m_AnswerData = None
    m_GoldData = None
    # Index i holds the position of the query held out for testing in story i.
    m_RandomPositions = [2, 0, 1, 2, 2, 0, 1, 0, 1, 1, 0, 0, 1, 1, 2]
    # Template of metric names only; each instance works on its own copy.
    m_Metrics = {"R1" : {}, "P1" : {}, "F1" : {}, "Psent1" : {}, "R2" : {}, "P2" : {}, "F2" : {}, "Psent2" : {}}
    m_MetricsNames = ["R1", "P1", "F1", "Psent1", "R2", "P2", "F2"]

    # Set per instance in __init__ (used to be one Config shared by all instances).
    m_Config = None
    
    

In [52]:
# Single Evaluator instance used as a notebook-wide global by the cells below.
evaluator = Evaluator()

In [88]:
def func_to_max_no_temp_no_imp(keep_l, keep_t, top_l, top_t, q_size, init_q_size, doc_count, soft_or, min_doc_rank) :
    """Objective function: run the summarizer once and return mean R1 + mean R2.

    The arguments tune query expansion and term keeping; count-like ones are
    truncated to int before use.  Every other parameter is pinned to a fixed
    value, with the temporal mode and document importance switched off.
    Uses the notebook-global ``evaluator`` and appends a record to the
    evaluation log as a side effect.
    """
    print("Start func...")

    settings = evaluator.m_Config.m_Params
    settings.update({
        # tuned by the caller
        "QEQuerrySize": int(q_size),
        "QETopLemms": int(top_l),
        "QETopTermins": int(top_t),
        "QEDocCount": int(doc_count),
        "QESoftOr": soft_or,
        "QEDEInitQuerrySize": int(init_q_size),
        "QEMinDocRank": min_doc_rank,
        "PKeepL": int(keep_l),
        "PKeepT": int(keep_t),
        # held constant for this objective
        "PDocCount": 500,
        "PSoftOr": 0.3,
        "DIMinLinkScore": 0.65,
        "PLambda": 0.84,
        "PTemporalMode": "false",
        "DocImportance": "false",
        "DIAlpha": 0.35,
        "DIPowerMethodDFactor": 0.35,
        "TempMaxDailyAnswerSize": 15,
        "DIDocBoundary": 0.7,
        "PQuerryEx": "true",
        "QEDoubleExtension": "true",
    })

    # Full evaluation pipeline on the fit queries.
    evaluator.parse_mapping()
    evaluator.parse_gold()
    evaluator.create_args("Fit")
    evaluator.save_config()
    evaluator.call_tss()
    evaluator.parse_answer()
    evaluator.compute_all_metrics()
    evaluator.save_evaluation_in_file()

    mean_r1 = evaluator.compute_mean_for_metric(evaluator.m_Metrics["R1"])
    mean_r2 = evaluator.compute_mean_for_metric(evaluator.m_Metrics["R2"])
    score = mean_r1 + mean_r2
    print("End func. Score = " + str(score))
    return score
    
def _read_tag_as_float(tag, text):
    """Return the content of the first ``<tag>...</tag>`` in ``text`` as a float, or None.

    None is returned when the tag is absent or its content is not a number
    (for example the literal ``None`` written for an unset parameter).
    """
    match = re.search(r"<" + re.escape(tag) + r">(.*?)</" + re.escape(tag) + r">", text)
    if match is None:
        return None
    try:
        return float(match.group(1))
    except ValueError:
        return None


def read_already_existing_data(filename, dict_tag_to_param_name) :
    """Load previously evaluated points from an evaluation log.

    Args:
        filename: path of the log; read as UTF-8 (BOM tolerated), falling back
            to windows-1251.
        dict_tag_to_param_name: maps a tag name inside ``<params>`` to the key
            under which its values are collected.

    Returns:
        A dict with one list per mapped name plus ``"target"`` (mean R1 + mean
        R2 of the run).  All lists have the same length and index ``i`` always
        refers to the same run.  Runs lacking a ``<params>`` or ``<means>``
        section, or a numeric value for any requested tag, R1 or R2, are
        skipped as a whole so the lists cannot get out of step.
    """
    try:
        with codecs.open(filename, "r", "utf-8-sig") as file:
            text = file.read()
    except UnicodeDecodeError:
        with codecs.open(filename, "r", "windows-1251") as file:
            text = file.read()

    # The patterns are single-line: drop CRLF as well as bare LF line breaks.
    text = re.sub(r"\r?\n", "", text)
    answer = {"target" : []}
    for val in dict_tag_to_param_name.values() :
        answer[val] = []

    print(answer)
    for run in re.findall(r"<run>(.*?)</run>", text):
        params = re.search(r"<params>(.*?)</params>", run)
        means = re.search(r"<means>(.*?)</means>", run)
        if params is None or means is None:
            continue

        # Collect the whole point first; append only if it is complete.
        point = [(dict_tag_to_param_name[tag], _read_tag_as_float(tag, params.group(1)))
                 for tag in dict_tag_to_param_name]
        R1 = _read_tag_as_float("R1", means.group(1))
        R2 = _read_tag_as_float("R2", means.group(1))
        if R1 is None or R2 is None or any(value is None for _, value in point):
            continue

        for name, value in point:
            answer[name].append(value)
        answer["target"].append(R1 + R2)

    print(answer)

    return answer

In [107]:
# Maps each parameter tag stored in the evaluation log to the matching keyword
# argument of func_to_max_no_temp_no_imp.
tag_to_param = {"QEQuerrySize" : "q_size", "QETopLemms" : "top_l", "QETopTermins" : "top_t", "QEDocCount" : "doc_count",
                "QESoftOr" : "soft_or", "QEDEInitQuerrySize" : "init_q_size", "QEMinDocRank" : "min_doc_rank", 
                "PKeepL" : "keep_l", "PKeepT" : "keep_t" }

# Points evaluated in earlier sessions: one list per argument plus "target".
ready_points = read_already_existing_data("C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\Data\\evaluation.txt", tag_to_param)

{'init_q_size': [], 'keep_l': [], 'keep_t': [], 'q_size': [], 'top_l': [], 'target': [], 'soft_or': [], 'min_doc_rank': [], 'doc_count': [], 'top_t': []}
{'init_q_size': [5.0, 6.0, 7.0, 7.0, 9.0, 3.0, 3.0, 6.0, 5.0, 3.0, 7.0, 9.0, 10.0, 3.0, 3.0, 3.0, 4.0, 3.0, 4.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, 5.0, 4.0, 3.0, 3.0, 3.0, 4.0, 3.0, 4.0, 4.0, 5.0, 4.0, 5.0, 3.0, 4.0, 3.0, 5.0, 4.0, 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 4.0, 4.0, 5.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 5.0, 3.0, 4.0, 4.0, 4.0, 5.0, 3.0, 4.0, 5.0, 4.0, 5.0, 5.0, 4.0, 4.0, 5.0, 6.0, 5.0, 5.0, 3.0, 3.0, 4.0, 5.0, 4.0, 5.0, 5.0, 6.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, 5.0, 5.0, 3.0, 3.0, 6.0, 4.0, 4.0, 4.0, 6.0, 4.0, 6.0, 4.0, 3.0, 5.0, 4.0, 6.0, 3.0, 5.0, 3.0, 4.0, 6.0, 4.0, 4.0, 4.0, 3.0], 'keep_l': [5.0, 4.0, 6.0, 5.0, 5.0, 3.0, 3.0, 6.0, 6.0, 4.0, 5.0, 3.0, 3.0, 7.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 4.0, 5.0, 4.0, 3.0, 4.0, 3.0, 3.0, 4.0, 3.0, 3.0, 4.0, 3.0, 4.0, 3.0, 5.0, 3.0, 4

In [46]:
# Load the story/query mapping and split the queries into fit and test sets.
evaluator.parse_mapping()

In [47]:
# Single run of the objective; positional order is
# keep_l, keep_t, top_l, top_t, q_size, init_q_size, doc_count, soft_or, min_doc_rank.
func_to_max_no_temp_no_imp(5,3,15,3,10,5,50,0.5,0.3)


Start func...
End func. Score = 0.3219398491909084


0.3219398491909084

In [110]:
from bayes_opt import BayesianOptimization

# Search bounds (low, high) for every keyword argument of the objective:
# keep_l, keep_t, top_l, top_t, q_size, init_q_size, doc_count, soft_or, min_doc_rank.
# The objective truncates the count-like arguments with int(), so fractional
# proposals for them are rounded down before the run.
bo = BayesianOptimization(func_to_max_no_temp_no_imp,
                          {'keep_l': (3, 7), 'keep_t': (0, 5), 'top_l': (5, 20),
                           'top_t': (0, 5), 'q_size': (5, 15), 'init_q_size': (3, 10),
                           'doc_count': (10, 100), 'soft_or': (0, 0.99), 'min_doc_rank': (0, 0.5)})

In [111]:
# Seed the optimizer with the points loaded from the evaluation log.
bo.initialize(ready_points)

In [93]:
# One optimization step with the 'ucb' acquisition and no extra random initial points.
bo.maximize(init_points=0, n_iter=1, acq='ucb', kappa=5)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   doc_count |   init_q_size |    keep_l |    keep_t |   min_doc_rank |    q_size |   soft_or |     top_l |     top_t | 
    1 | 00m00s | [35m   0.32200[0m | [32m    50.0000[0m | [32m       5.0000[0m | [32m   5.0000[0m | [32m   3.0000[0m | [32m        0.3000[0m | [32m  10.0000[0m | [32m   0.5000[0m | [32m  15.0000[0m | [32m   3.0000[0m | 
    2 | 00m00s | [35m   0.35230[0m | [32m    97.0000[0m | [32m       6.0000[0m | [32m   4.0000[0m | [32m   4.0000[0m | [32m        0.4990[0m | [32m  10.0000[0m | [32m   0.9062[0m | [32m  16.0000[0m | [32m   0.0000[0m | 
    3 | 00m00s |    0.32510 |     32.0000 |        7.0000 |    6.0000 |    2.0000 |         0.1308 |    6.0000 |    0.3924 |   18.0000 |    2.0000 | 
    4 | 00m00s |    0.28920 |     24.0

In [None]:

# Extra keyword arguments passed through bo.maximize.
# NOTE(review): presumably these are forwarded to the underlying Gaussian-process
# regressor — confirm against the installed bayes_opt version.
gp_params = {'kernel': None,
             'alpha': 0.4}

# Alternative run kept for reference: 10 iterations with the 'ucb' acquisition.
#bo.maximize(n_iter=10, acq='ucb', kappa=4, **gp_params)
# 200 iterations with the 'ei' acquisition.
bo.maximize(n_iter=200, acq="ei", xi=0.01, **gp_params)

In [95]:
# 5 extra initial points followed by 20 'ucb' iterations; relies on gp_params from an earlier cell.
bo.maximize(init_points=5, n_iter=20, acq='ucb', kappa=4, **gp_params)

[31mBayesian Optimization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   doc_count |   init_q_size |    keep_l |    keep_t |   min_doc_rank |    q_size |   soft_or |     top_l |     top_t | 
Start func...
End func. Score = 0.3295479410617712
   25 | 00m27s |    0.32955 |     70.4886 |        4.5710 |    4.3121 |    1.9384 |         0.5000 |   10.4932 |    0.9900 |   17.6998 |    2.3497 | 
Start func...
End func. Score = 0.30981525293765855
   26 | 00m49s |    0.30982 |     72.3333 |        4.0733 |    3.7225 |    3.0342 |         0.5000 |    9.4918 |    0.0000 |   18.3251 |    3.7234 | 
Start func...
End func. Score = 0.3341211936646207
   27 | 00m30s |    0.33412 |     70.7592 |        3.5090 |    4.4465 |    1.1794 |         0.5000 |    8.5692 |    0.9900 |   17.9984 |    2.3932 | 
Start func...
End func. Score = 0.4035233428431743
   28 

In [106]:

# NOTE(review): the recorded output of this cell is
# "TypeError: get_params() missing 1 required positional argument: 'self'".
# The cause is not visible in this notebook — TODO investigate before re-running.
params = {'alpha': 0.5 }
bo.maximize(init_points=0, n_iter=50, acq='ucb', kappa=4, **params)

TypeError: get_params() missing 1 required positional argument: 'self'

In [104]:
# NOTE(review): same call as the cell using `params`, and the recorded output
# shows the same TypeError from get_params(). This assignment also replaces the
# earlier gp_params (which carried 'kernel': None).
gp_params = {'alpha': 0.5 }
bo.maximize(init_points=0, n_iter=50, acq='ucb', kappa=4, **gp_params)

TypeError: get_params() missing 1 required positional argument: 'self'

In [29]:
# Two hand-written configurations.
# configs[0]: temporal mode and document importance off, no top terms.
# configs[1]: temporal mode and document importance on, 3 top terms,
#             a lower link-score threshold and no minimum document rank.
configs = [Config(), Config()]

configs[0].m_Params.update({
    "PDocCount": 500,
    "QEQuerrySize": 10,
    "QETopLemms": 15,
    "QETopTermins": 0,
    "QEDocCount": 50,
    "QESoftOr": 0.3,
    "PSoftOr": 0.3,
    "DIMinLinkScore": 0.65,
    "PKeepL": 5,
    "PKeepT": 2,
    "PLambda": 0.84,
    "QEMinDocRank": 0.3,
    "PTemporalMode": "false",
    "DocImportance": "false",
    "DIAlpha": 0.35,
    "DIPowerMethodDFactor": 0.35,
    "TempMaxDailyAnswerSize": 15,
    "DIDocBoundary": 0.7,
    "PQuerryEx": "true",
    "QEDEInitQuerrySize": 5,
    "QEDoubleExtension": "true",
})

configs[1].m_Params.update({
    "PDocCount": 500,
    "QEQuerrySize": 10,
    "QETopLemms": 15,
    "QETopTermins": 3,
    "QEDocCount": 50,
    "QESoftOr": 0.3,
    "PSoftOr": 0.3,
    "DIMinLinkScore": 0.6,
    "PKeepL": 5,
    "PKeepT": 2,
    "PLambda": 0.84,
    "QEMinDocRank": 0.0,
    "PTemporalMode": "true",
    "DocImportance": "true",
    "DIAlpha": 0.35,
    "DIPowerMethodDFactor": 0.35,
    "TempMaxDailyAnswerSize": 15,
    "DIDocBoundary": 0.7,
    "PQuerryEx": "true",
    "QEDEInitQuerrySize": 5,
    "QEDoubleExtension": "true",
})

In [34]:
# Evaluate configs[1] five times in a row; every pass appends one record to
# the evaluation log.
evaluator.m_Config = configs[1]

for i in range (0, 5) :
    evaluator.parse_mapping()
    evaluator.parse_gold()
    evaluator.create_args("Fit")
    # The config file must be written before the executable is launched.
    evaluator.save_config()
    evaluator.call_tss()
    evaluator.parse_answer()
    evaluator.compute_all_metrics()
    evaluator.save_evaluation_in_file()


['C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\x64\\Release\\TemporalSummarization.exe', '45', '11086272', '11870621', '12171411', '11772232', '10105996', '12782702', '13377372', '13322307', '10365689', '13055100', '11770254', '12458844', '12209942', '12154084', '10466159', '13495829', '10482258', '12082485', '12156694', '10822682', '13394061', '10138558', '13142685', '10824477', '12263137', '11136413', '10372602', '11768974', '12521721', '12483331', '11134760', '11155739', '13324368', '11155970', '11872175', '12367599', '11092876', '10366234', '10839989', '10167143', '10825393', '9926682', '10781778', '10482437', '13197524', '-a', 'C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\Data\\answer.xml', '-c', 'C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\TemporalSummarization\\start_config.xml', '-e', 'C:\\Users\\MishaDEV\\Data\\news_corp_tr_last_w5_s100_c10.bin']
['C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\x64\\Release\\TemporalSummari

In [16]:
import numpy as np
# Baseline query-expansion settings for configs[0]. Every one of them is
# re-drawn inside the loop below; they only document the starting point.
configs[0].m_Params["QEQuerrySize"] = 10
configs[0].m_Params["PKeepL"] = 5
configs[0].m_Params["PKeepT"] = 2
configs[0].m_Params["QEDocCount"] = 50
configs[0].m_Params["QESoftOr"] = 0.3
configs[0].m_Params["QEMinDocRank"] = 0.0

configs[0].m_Params["QETopLemms"] = 15
configs[0].m_Params["QETopTermins"] = 0
configs[0].m_Params["QEDEInitQuerrySize"] = 5

evaluator.m_Config = configs[0]

# The mapping and the gold summaries do not depend on the parameters: load once.
evaluator.parse_mapping()
evaluator.parse_gold()


# Random search: 100 independent draws, one summarizer run and one record in
# the evaluation log per draw. Count-like parameters are truncated to int.
for i in range(0, 100) :
    evaluator.m_Config.m_Params["QEQuerrySize"] = int(np.random.uniform(3, 10))
    evaluator.m_Config.m_Params["PKeepL"] = int(np.random.uniform(2, 7))
    evaluator.m_Config.m_Params["PKeepT"] = int(np.random.uniform(0, 5))
    evaluator.m_Config.m_Params["QEDocCount"] = int(np.random.uniform(10, 100))
    evaluator.m_Config.m_Params["QESoftOr"] = np.random.uniform(0.0, 0.8)
    evaluator.m_Config.m_Params["QEMinDocRank"] = np.random.uniform(0.0, 0.5)
    evaluator.m_Config.m_Params["QETopLemms"] = int(np.random.uniform(8, 20))
    evaluator.m_Config.m_Params["QETopTermins"] = int(np.random.uniform(0, 5))
    evaluator.m_Config.m_Params["QEDEInitQuerrySize"] = int(np.random.uniform(4, 8))
    evaluator.create_args("Fit")
    evaluator.save_config()
    evaluator.call_tss()
    evaluator.parse_answer()
    evaluator.compute_all_metrics()
    evaluator.save_evaluation_in_file()
                                    

['C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\x64\\Release\\TemporalSummarization.exe', '45', '12458844', '12209942', '13324368', '11155739', '12483331', '13055100', '13394061', '10466159', '10839989', '13197524', '12521721', '11092876', '11872175', '10366234', '11155970', '13322307', '10824477', '11768974', '10822682', '12884293', '13142685', '12171411', '10105996', '11870621', '12367599', '11136413', '10482258', '11134760', '11772232', '10365689', '11086272', '12263137', '10138558', '12156694', '12782702', '10372602', '13377372', '10781778', '12082485', '11770254', '9926682', '12154084', '10482437', '10825393', '10167143', '-a', 'C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\Data\\answer.xml', '-c', 'C:\\!DEV\\C++\\Diplom\\TemporalSummarization\\TemporalSummarization\\start_config.xml', '-e', 'C:\\Users\\MishaDEV\\Data\\news_corp_tr_last_w5_s100_c10.bin']
['C:\\!DEV\\C++\\TemporalSummarization\\TemporalSummarizationVS\\x64\\Release\\TemporalSummari

KeyboardInterrupt: 

In [None]:
# Random tests: one run with the evaluator's current configuration, then
# repeated runs with fully random configurations.
evaluator.parse_mapping()
evaluator.parse_gold()
evaluator.create_args("Fit")
evaluator.save_config()
evaluator.call_tss()
evaluator.parse_answer()
evaluator.compute_all_metrics()
evaluator.save_evaluation_in_file()

# The command line built above is reused unchanged; only the config file
# content differs between iterations. range(1, 100) gives 99 iterations.
random_conf = Config() 
for i in range(1, 100):
    random_conf.RandomParams()
    evaluator.m_Config = random_conf
    evaluator.save_config()
    evaluator.call_tss()
    evaluator.parse_answer()
    evaluator.compute_all_metrics()
    evaluator.save_evaluation_in_file()


In [None]:
# Show the command line prepared by create_args as one space-separated string.
print(" ".join(evaluator.m_ArgsForRun))