1. 定義技能與技能關鍵字
2. 定義instance
3. 定義技能slot與slot對應instance
4. 定義技能語法模型

In [358]:
## 定義技能語法模型
class UserDefineData(object):
    """One user-defined sentence of a skill together with its slots.

    Every field starts empty and is filled in by the caller after construction.
    """

    def __init__(self):
        # Identifier and display name of the skill the sentence belongs to.
        self.skill_id = self.skill_name = ''
        # Sentence template; the notebook's examples embed placeholders such as
        # '@{slot_id}', '+{...}', '${synonyms_id}' and '?{burden_id}'.
        self.sentence = ''
        # Slot objects describing the placeholders of the sentence.
        self.slot_list = list()
    
class Slot(object):
    """A placeholder of a sentence and the data needed to turn it into a regex.

    Every field starts empty and is filled in by the caller after construction.
    """

    def __init__(self):
        # Kind of slot; the values used in this file are 'single', 'multiple',
        # 'synonyms' and 'burden'.
        self.slot_type = ''
        # Ids read by the synonyms / burden parsers; left '' for other slot types.
        self.synonyms_id = self.burden_id = ''
        # SlotData objects belonging to this slot.
        self.slot_data_list = list()

class SlotData(object):
    """One entry of a slot: its position, identity and instance binding.

    Every field starts at a neutral default and is filled in by the caller.
    """

    def __init__(self):
        # Serial numbers stored with the slot row; presumably the position of
        # the slot in the sentence and of its multiple-slot group (TODO confirm).
        self.slot_serial_in_sentence = self.slot_multiple_serial_in_sentence = 0
        # Id used inside the '@{...}' placeholder, and a display name.
        self.slot_id = ''
        self.slot_name = ''
        # Id of the instance the slot is bound to, e.g. 'camp.employee'.
        self.instance_id = ''
        # Required flag; the notebook cells assign the string 'True'.
        self.regex_required = ''


In [359]:
class SlotParser:
    """Common interface of the slot parsers.

    The methods here do nothing and return None; the concrete parsers in this
    file override all three.
    """

    def get_rule(self):
        """Return the regex fragment that replaces the slot placeholder."""
        return None

    def get_replace_str(self):
        """Return the placeholder text to look for in the expression."""
        return None

    def get_expression(self):
        """Return the expression with the placeholder replaced by the rule."""
        return None

class MultipleSlotParser(SlotParser):
    """Parser for a 'multiple' slot, written '+{@{id1}@{id2}...}' in a sentence."""

    def __init__(self, skill_id, slot, expression):
        """Remember the skill id, the Slot and the expression to rewrite."""
        self.__skill_id = skill_id
        self.__slot = slot
        self.__expression = expression

    def get_rule(self):
        """Return the catch-all capture group '(.*)', UTF-8 encoded."""
        return r'(.*)'.encode('utf8')

    def get_replace_str(self):
        """Return the '+{...}' placeholder built from every slot_id of the slot."""
        inner = ''.join('@{' + item.slot_id + '}'
                        for item in self.__slot.slot_data_list)
        return ('+{' + inner + '}').encode('utf8')

    def get_expression(self):
        """Return the expression with the whole '+{...}' group replaced by the rule."""
        placeholder = self.get_replace_str()
        pattern = self.get_rule()
        return self.__expression.replace(placeholder, pattern)

class SingleSlotParser(SlotParser):
    """Parser for a single slot, written '@{slot_id}' in a sentence.

    SlotParserFactory also falls back to this parser for any slot type it does
    not recognise.
    """

    def __init__(self, skill_id, slot, expression):
        """Remember the skill id, the Slot and the expression to rewrite."""
        self.__skill_id = skill_id
        self.__slot = slot
        self.__expression = expression

    def _last_slot_data(self):
        """Return the last SlotData of the slot, or None when it has none.

        The original loops overwrote their result on every iteration, so only
        the last entry ever mattered; this makes that explicit.
        """
        slot_data_list = self.__slot.slot_data_list
        return slot_data_list[-1] if slot_data_list else None

    @staticmethod
    def _is_optional(regex_required):
        """Tell whether a regex_required value marks the slot as optional.

        Anything equal to False (the original check) is optional, and so is
        the text 'False' in any letter case, because the notebook stores the
        flag as a string. Every other value keeps the slot required.
        """
        if regex_required == False:  # noqa: E712 - keeps 0 / numpy bools matching
            return True
        try:
            return regex_required.strip().lower() == 'false'
        except AttributeError:
            # Not a string (None, numbers, ...): treated as required.
            return False

    def get_rule(self):
        """Return '(.*)' ('(.*)?' for an optional slot), UTF-8 encoded.

        Returns an empty string when the slot has no SlotData.
        """
        data = self._last_slot_data()
        if data is None:
            return ''.encode('utf8')

        rule = r'(.*)'
        if self._is_optional(data.regex_required):
            rule += '?'
        return rule.encode('utf8')

    def get_replace_str(self):
        """Return the '@{slot_id}' placeholder, UTF-8 encoded.

        Returns an empty string when the slot has no SlotData (the original
        raised UnboundLocalError in that case).
        """
        data = self._last_slot_data()
        if data is None:
            return ''.encode('utf8')
        return ('@{' + data.slot_id + '}').encode('utf8')

    def get_expression(self):
        """Return the expression with the '@{slot_id}' placeholder replaced.

        The expression is returned unchanged when the slot has no SlotData.
        """
        if self._last_slot_data() is None:
            return self.__expression
        return self.__expression.replace(self.get_replace_str(), self.get_rule())

class SynonymsSlotParser(SlotParser):
    """Parser for a synonyms slot, written '${synonyms_id}' in a sentence."""

    def __init__(self, skill_id, slot, expression):
        """Remember the skill id, the Slot and the expression to rewrite."""
        self.__skill_id = skill_id
        self.__slot = slot
        self.__expression = expression

    def get_rule(self):
        """Look the synonyms up in the database and return them as a regex.

        Reads the 'data' column of the first row returned for this skill and
        synonyms id, then hands it to StringPreHandler.regex_format_convertor
        with mode 'or' and separator ','. Presumably this yields an alternation
        such as '(a|b|c)' -- TODO confirm against StringPreHandler.
        """
        cql = ''.join([
            "select * from ", DB_TABLE_SKILL_SENTENCE_SYNONYMS,
            " where skill_id = '", self.__skill_id, "' and ",
            " synonyms_id = ", self.__slot.synonyms_id, ";",
        ])
        synonyms_df = dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)
        first_row_words = synonyms_df['data'][0].encode('utf8')
        return StringPreHandler.regex_format_convertor('or', first_row_words, ',')

    def get_replace_str(self):
        """Return the '${synonyms_id}' placeholder, UTF-8 encoded."""
        return ('${' + self.__slot.synonyms_id + '}').encode('utf8')

    def get_expression(self):
        """Return the expression with the synonyms placeholder replaced by the rule."""
        placeholder = self.get_replace_str()
        pattern = self.get_rule()
        return self.__expression.replace(placeholder, pattern)

class BurdenSlotParser(SlotParser):
    """Parser for a burden (filler word) slot, written '?{burden_id}' in a sentence."""

    def __init__(self, skill_id, slot, expression):
        """Remember the skill id, the Slot and the expression to rewrite."""
        self.__skill_id = skill_id
        self.__slot = slot
        self.__expression = expression

    def get_rule(self):
        """Look the burden word up in the database and return it as an optional group.

        Reads the 'burden' column of the first row returned for this skill and
        burden id and wraps it as '(<word>)?'.
        """
        cql = ''.join([
            "select * from ", DB_TABLE_SKILL_SENTENCE_BURDEN,
            " where skill_id = '", self.__skill_id, "' and ",
            " burden_id = ", self.__slot.burden_id, ";",
        ])
        burden_df = dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)
        burden_word = burden_df['burden'][0].encode('utf8')
        return r'(' + burden_word + ')?'

    def get_replace_str(self):
        """Return the '?{burden_id}' placeholder, UTF-8 encoded."""
        return ('?{' + self.__slot.burden_id + '}').encode('utf8')

    def get_expression(self):
        """Return the expression with the burden placeholder replaced by the rule."""
        placeholder = self.get_replace_str()
        pattern = self.get_rule()
        return self.__expression.replace(placeholder, pattern)


class SlotParserFactory(object):
    """Pick the SlotParser implementation that matches a slot's type.

    Derives from object so that the slot_type property, including its setter,
    also works on Python 2, where properties need a new-style class.
    """

    def __init__(self, skill_id, slot, expression):
        """Remember the skill id, the Slot and the expression to rewrite.

        The slot type is copied from slot.slot_type and can be overridden
        afterwards through the slot_type property.
        """
        self.__skill_id = skill_id
        self.__slot = slot
        self.__slot_type = slot.slot_type
        self.__expression = expression

    @property
    def slot_type(self):
        """Slot type used by create_parser to choose the parser class."""
        # Fix: the getter used to return self.slot_type, i.e. it called itself
        # and recursed until the interpreter gave up.
        return self.__slot_type

    @slot_type.setter
    def slot_type(self, value):
        self.__slot_type = value

    def create_parser(self):
        """Return a parser for the slot.

        'multiple', 'synonyms' and 'burden' map to their dedicated parsers;
        every other slot type gets a SingleSlotParser.
        """
        parser_classes = {
            'multiple': MultipleSlotParser,
            'synonyms': SynonymsSlotParser,
            'burden': BurdenSlotParser,
        }
        parser_class = parser_classes.get(self.__slot_type, SingleSlotParser)
        return parser_class(self.__skill_id, self.__slot, self.__expression)


In [360]:
# cql = ("select * from " + DB_TABLE_SKILL_SENTENCE_SYNONYMS +
#        " where skill_id = 'qcall_test' and " +
#        " synonyms_id = 5c2ce733-65a2-4f51-bda0-080cbbe7b892	;" )
# word_pd = dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)   
# rule = StringPreHandler.regex_format_convertor('or', word_pd['data'][0], ',')
# rule

In [361]:
def get_parser(skill_id, slot, expression):
    """Return the slot parser that SlotParserFactory creates for these arguments."""
    return SlotParserFactory(skill_id, slot, expression).create_parser()

In [362]:
def save_to_db(user_define_data_list):
    """Save a batch of user-defined sentences together with their slot rows.

    For every UserDefineData in the list this obtains a sentence id from
    get_skill_sentence_id, passes each SlotData to save_sentence_data, turns
    the sentence into a regular expression by running one parser per slot,
    and finally passes the sentence and its expression to save_sentence.

    Args:
        user_define_data_list: iterable of UserDefineData objects.
    """
    # Process the batch one sentence at a time.
    for udd in user_define_data_list:

        print('skill_id = ' + udd.skill_id)
        print('skill_name = ' + udd.skill_name)

        sentence_id = get_skill_sentence_id(udd.skill_id)

        expression = udd.sentence

        # Handle every slot of this sentence.
        for slot in udd.slot_list:

            # A single slot carries one SlotData, a multiple slot several.
            for data in slot.slot_data_list:
                # Fix: use the sentence being processed (udd); the original
                # read the unrelated module-level variable `ud` here.
                save_sentence_data(udd.skill_id,
                                   sentence_id,
                                   slot.slot_type,
                                   data.slot_serial_in_sentence,
                                   data.slot_multiple_serial_in_sentence,
                                   data.slot_name,
                                   data.instance_id,
                                   data.slot_id,
                                   data.regex_required)

            # Convert the sentence into a regular expression. Each parser
            # rewrites one placeholder, so the expression is only complete
            # once every slot has been processed.
            parser = get_parser(udd.skill_id, slot, expression)
            expression = parser.get_expression()

        # save to DB
        # TODO: delete the existing sentence_data rows first
        save_sentence(udd.skill_id, sentence_id, udd.sentence, expression)

           
# insert into sentence_slots(skill_id, sentence_id, slot_serial_in_sentence, slot_name, slot_type , instance_id, slot_id, slot_multiple_serial_in_sentence) 
# values('qcall2',1, 1, '受話人','single','sys.any','person',1);


        
#             c_dao.execCQL(DB_KEYSPACE, insert_log_cql)


In [363]:
def get_skill_sentence(skill_id):
    """Select every sentence row of a skill.

    Returns whatever dao.execCQLSelectToPandasDF returns for the query
    (a pandas DataFrame, judging by the helper's name).
    """
    query = ''.join([
        "select * from ",
        DB_TABLE_SKILL_SENTENCE,
        " where skill_id = '",
        skill_id,
        "';",
    ])
    return dao.execCQLSelectToPandasDF(DB_KEYSPACE, query)

def get_skill_sentence_id(skill_id):
    """Return the next free sentence id of a skill, as a string.

    The id is the largest stored sentence_id plus one. A skill without any
    stored sentence gets '1'; previously max() of the empty column was NaN
    and the function returned the string 'nan'.

    NOTE(review): assumes get_skill_sentence returns a pandas DataFrame and
    that ids start at 1 -- confirm. The id is computed by read-then-increment,
    so concurrent writers could receive the same value.
    """
    skill_sentence_pd = get_skill_sentence(skill_id)
    if skill_sentence_pd.empty:
        return '1'
    return str(skill_sentence_pd['sentence_id'].max() + 1)
    
def save_sentence(skill_id, sentence_id, sentence, expression):
    """Build the insert statement for one sentence row and print it.

    The statement is only printed here; nothing is sent to the database.
    Values are embedded as given, without any escaping.
    """
    statement = ''.join([
        "insert into ", DB_TABLE_SKILL_SENTENCE,
        "(skill_id, sentence_id, sentence, expression) ",
        "values('", skill_id, "',", str(sentence_id),
        ", '", sentence, "', '", expression, "');",
    ])

    print('=== save_sentence ===')
    print(statement)
    print('')

def save_sentence_data(skill_id,
                       sentence_id,
                       slot_type,
                       slot_serial_in_sentence,
                       slot_multiple_serial_in_sentence,
                       slot_name,
                       instance_id,
                       slot_id,
                       regex_required):
    """Build the insert statement for one slot row of a sentence and print it.

    The statement is only printed here; nothing is sent to the database.
    Values are embedded as given, without any escaping.

    regex_required may be a string ('True' / 'False') or a boolean; it is
    converted with str() so that a boolean no longer raises TypeError while
    the statement is assembled. String values produce the same text as before.
    """
    cql = ''.join([
        "insert into ", DB_TABLE_SENTENCE_SLOT,
        "(skill_id, sentence_id, slot_serial_in_sentence, ",
        "slot_multiple_serial_in_sentence, slot_name, slot_type, instance_id, slot_id, regex_required) ",
        "values('", skill_id,
        "',", str(sentence_id),
        ",", str(slot_serial_in_sentence),
        ",", str(slot_multiple_serial_in_sentence),
        ",'", slot_name,
        "','", slot_type,
        "','", instance_id,
        "','", slot_id,
        "','", str(regex_required), "');",
    ])
    print('--- save_sentence_data ---')
    print(cql)
    print('')

def delete_sentence(skill_id):
    """Print the delete statement for a skill's sentences, then handle its slot rows.

    The statement is only printed here; the slot rows are delegated to
    delete_sentence_data.
    """
    statement = ''.join([
        "delete from ", DB_TABLE_SKILL_SENTENCE,
        " where skill_id = '", skill_id, "';",
    ])
    print(statement)
    delete_sentence_data(skill_id)
    
def delete_sentence_data(skill_id):
    """Print the delete statement for a skill's slot rows (nothing is executed)."""
    statement = ''.join([
        "delete from ", DB_TABLE_SENTENCE_SLOT,
        " where skill_id = '", skill_id, "';",
    ])
    print(statement)
    

In [364]:
# Demo: build one sentence of skill 'qcall_test' with a synonyms slot, a
# single slot and a burden slot, then pass it to save_to_db.
user_define_data_list = []

ud = UserDefineData()
ud.skill_id = 'qcall_test'
ud.skill_name = 'QCALL_TEST'
# Placeholders in order: ${synonyms_id}, @{slot_id}, ?{burden_id}.
ud.sentence  = '${5c2ce733-65a2-4f51-bda0-080cbbe7b892}@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}的分機'

# Slot 1: synonyms; carries only the synonyms id, no SlotData.
st1 = Slot()
st1.slot_type = 'synonyms'
st1.synonyms_id='5c2ce733-65a2-4f51-bda0-080cbbe7b892'
st1.burden_id=''
st1.slot_data_list = []
ud.slot_list.append(st1)

# Slot 2: single slot with one SlotData.
st2 = Slot()
st2.slot_type = 'single'
st2.synonyms_id=''
st2.burden_id=''
sd2 = SlotData()
sd2.slot_serial_in_sentence = 1
sd2.slot_multiple_serial_in_sentence = 1
# NOTE(review): the sentence above uses the placeholder '@{ext}', but the
# single-slot parser searches for '@{' + slot_id + '}' = '@{person_1}', so
# this slot is never substituted (the recorded output still shows '@{ext}').
# Presumably slot_id and the placeholder should agree -- confirm which one.
sd2.slot_id = 'person_1'
sd2.slot_name = '人員'
sd2.instance_id = 'camp.employee'
sd2.regex_required = 'True'
st2.slot_data_list = [sd2]
ud.slot_list.append(st2)

# Slot 3: burden word; carries only the burden id, no SlotData.
st3 = Slot()
st3.slot_type = 'burden'
st3.synonyms_id=''
st3.burden_id='bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24'
st3.slot_data_list = []
ud.slot_list.append(st3)

user_define_data_list.append(ud)

save_to_db(user_define_data_list)

skill_id = qcall_test
skill_name = QCALL_TEST
--- save_sentence_data ---
insert into sentence_slots(skill_id, sentence_id, slot_serial_in_sentence, slot_multiple_serial_in_sentence, slot_name, slot_type, instance_id, slot_id, regex_required) values('qcall_test',5,1,1,'人員','single','camp.employee','person_1','True');

(skill_id, sentence_id, sentence, expression) 
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
=== save_sentence ===
insert into skill_sentence(skill_id, sentence_id, sentence, expression) values('qcall_test',5, '${5c2ce733-65a2-4f51-bda0-080cbbe7b892}@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}的分機', '(呼叫|幫我呼|幫我叫)@{ext}(的)?的分機');



In [365]:
#批次save句子
user_define_data_list = []

#每一句的資料, parser json格式後寫入一個暫存資料結構
ud = UserDefineData()
ud.skill_id = 'drink'
ud.skill_name = '賣飲料'
# ud.sentence  = '${help_me}${qcall}給+{@{person_1}@{person_2}}${to}@{location}'
# ud.sentence  = '${help_me}買拿鐵+{@{sugar}@{iced}}'
# ud.sentence  = '買拿鐵+{@{sugar}@{iced}}'
ud.sentence  = '幫忙@{person_1}買拿鐵+{@{sugar}@{iced}}'
# ud.sentence  = '+{@{sugar}@{iced}}'

# st1 = Slot()
# st1.slot_type = 'single'
# sd1 = SlotData()
# sd1.slot_name = 'help_me'
# sd1.entity_type = 'synonyms'
# sd1.entity = 'synonyms.help.me'
# sd1.regex_required = False
# # st1.slot_data_list.append(sd1)
# st1.slot_data_list = [sd1]
# ud.slot_list.append(st1)


st1 = Slot()
st1.slot_type = 'single'
sd1 = SlotData()
sd1.slot_serial_in_sentence = 1
sd1.slot_multiple_serial_in_sentence = 1
sd1.slot_id = 'person_1'
sd1.slot_name = '人員'
sd1.instance_id = 'camp.employee'
sd1.regex_required = 'True'
# st1.slot_data_list.append(sd1)
st1.slot_data_list = [sd1]
ud.slot_list.append(st1)

st2 = Slot()
st2.slot_type = 'multiple'
sd2 = SlotData()
sd2.slot_serial_in_sentence = 2
sd2.slot_multiple_serial_in_sentence = 2
sd2.slot_id = 'sugar'
sd2.slot_name = '甜度'
sd2.instance_id = 'udf.drink.sugar'
sd2.regex_required = 'True'
# st2.slot_data_list.append(sd2)
sd3 = SlotData()
sd3.slot_serial_in_sentence = 3
sd3.slot_multiple_serial_in_sentence = 2
sd3.slot_id = 'iced'
sd3.slot_name = '冰塊'
sd3.instance_id = 'udf.drink.iced'
sd3.regex_required = 'True'
# st2.slot_data_list.append(sd3)
st2.slot_data_list = [sd2,sd3]
ud.slot_list.append(st2)

user_define_data_list.append(ud)

save_to_db(user_define_data_list)

skill_id = drink
skill_name = 賣飲料
--- save_sentence_data ---
insert into sentence_slots(skill_id, sentence_id, slot_serial_in_sentence, slot_multiple_serial_in_sentence, slot_name, slot_type, instance_id, slot_id, regex_required) values('drink',4,1,1,'人員','single','camp.employee','person_1','True');

--- save_sentence_data ---
insert into sentence_slots(skill_id, sentence_id, slot_serial_in_sentence, slot_multiple_serial_in_sentence, slot_name, slot_type, instance_id, slot_id, regex_required) values('drink',4,2,2,'甜度','multiple','udf.drink.sugar','sugar','True');

--- save_sentence_data ---
insert into sentence_slots(skill_id, sentence_id, slot_serial_in_sentence, slot_multiple_serial_in_sentence, slot_name, slot_type, instance_id, slot_id, regex_required) values('drink',4,3,2,'冰塊','multiple','udf.drink.iced','iced','True');

(skill_id, sentence_id, sentence, expression) 
<type 'str'>
<type 'str'>
<type 'str'>
<type 'str'>
=== save_sentence ===
insert into skill_sentence(skill_id, sent

In [366]:
def get_skill_sentence(skill_id, sentence_id=None):
    """Select the sentence rows of a skill, optionally a single sentence.

    Args:
        skill_id: id of the skill, embedded as a quoted string.
        sentence_id: when not None, only the row with this sentence id is
            selected. The check is `is not None`, so an id of 0 is honoured
            (a plain truth test used to drop the filter for 0).

    Returns:
        Whatever dao.execCQLSelectToPandasDF returns for the query
        (a pandas DataFrame, judging by the helper's name).
    """
    cql = ("select * from " +
           DB_TABLE_SKILL_SENTENCE +
           " where skill_id = '" + skill_id + "'")
    if sentence_id is not None:
        cql += " and sentence_id = " + str(sentence_id)
    cql += ";"

    return dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)

def get_skill_sentence_data(skill_id, sentence_id=None):
    """Select the slot rows of a skill, optionally of a single sentence.

    Args:
        skill_id: id of the skill, embedded as a quoted string.
        sentence_id: when not None, only the slot rows of this sentence are
            selected. The check is `is not None`, so an id of 0 is honoured
            (a plain truth test used to drop the filter for 0 and return the
            slot rows of every sentence of the skill).

    Returns:
        Whatever dao.execCQLSelectToPandasDF returns for the query
        (a pandas DataFrame, judging by the helper's name).
    """
    cql = ("select * from " +
           DB_TABLE_SENTENCE_SLOT +
           " where skill_id = '" + skill_id + "'")
    if sentence_id is not None:
        cql += " and sentence_id = " + str(sentence_id)
    cql += ";"

    return dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)

# 修改同義詞&贅詞要回去修改句子

In [367]:
moodify_skill_id = 'qcall_test'
moodify_synonyms_id = '5c2ce733-65a2-4f51-bda0-080cbbe7b892'
moodify_burden_id = 'bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24'

# 找到同義詞句子
skill_sentence_pd = get_skill_sentence('qcall_test')   
for index, row in skill_sentence_pd.reset_index().iterrows():
    
    if moodify_sentence_id in row['sentence']:
        print '================================='
        print row['sentence_id'], row['sentence']
        
        sentence = row['sentence'].encode('utf-8')
        expression = sentence
        s_data_pd = get_skill_sentence_data(moodify_skill_id, row['sentence_id'])
        
        # by sentence_data來重組expression
        for s_index, s_row in s_data_pd.reset_index().iterrows():
            # 句子轉成正規表示法
            # 迴圈所有slot做完才完成正規表示法
            print '------------------------------'
            print s_index, expression, s_row['slot_type']

            slot = Slot()
            slot.slot_type = s_row['slot_type']
            
            if s_row['slot_type'] == 'synonyms':
                slot.synonyms_id = moodify_synonyms_id
            elif s_row['slot_type'] == 'burden':
                slot.burden_id = moodify_burden_id
            else:
                sd = SlotData()
                sd.slot_serial_in_sentence = s_row['slot_serial_in_sentence']
                sd.slot_multiple_serial_in_sentence = s_row['slot_multiple_serial_in_sentence']
                sd.slot_id = s_row['slot_id']
                sd.slot_name = s_row['slot_name']
                sd.instance_id = s_row['instance_id']
                sd.regex_required = s_row['regex_required']
                slot.slot_data_list = [sd]
            
            parser = get_parser(moodify_skill_id, slot, expression)
            expression = parser.get_expression()
            print 'expression = ' + expression
            
        save_sentence(moodify_skill_id, row['sentence_id'], sentence, expression)   
        

3 ${5c2ce733-65a2-4f51-bda0-080cbbe7b892}@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機
------------------------------
0 ${5c2ce733-65a2-4f51-bda0-080cbbe7b892}@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機 synonyms
expression = (呼叫|幫我呼|幫我叫)@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機
------------------------------
1 (呼叫|幫我呼|幫我叫)@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機 single
expression = (呼叫|幫我呼|幫我叫)(.*)?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機
------------------------------
2 (呼叫|幫我呼|幫我叫)(.*)?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機 burden
expression = (呼叫|幫我呼|幫我叫)(.*)(的)?分機
(skill_id, sentence_id, sentence, expression) 
<type 'str'>
<type 'int'>
<type 'str'>
<type 'str'>
=== save_sentence ===
insert into skill_sentence(skill_id, sentence_id, sentence, expression) values('qcall_test',3, '${5c2ce733-65a2-4f51-bda0-080cbbe7b892}@{ext}?{bd31bfbc-39b4-41f0-8d4b-a456fd5f1d24}分機', '(呼叫|幫我呼|幫我叫)(.*)(的)?分機');

4 ${5c2ce733-65a2-4f51-bda0-080cbbe7b892}大雄?{bd31bfbc-39b4-41f0-8d4b-a