In [2]:
from cassandra.cluster import Cluster
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import BatchStatement
import pandas as pd

class CassandraType(object):
    """Integer codes naming the Cassandra environment a DAO should connect to."""

    # Sequential codes: PRODUCTION == 0, TEST == 1, TEST_DOCKER == 2.
    PRODUCTION, TEST, TEST_DOCKER = range(3)


class CassandraDAO(object):
    """Small data-access wrapper around a cassandra-driver Cluster/Session.

    The session is opened eagerly by ``__init__`` and configured so that query
    rows are materialised as pandas DataFrames (see ``pandas_factory``).

    You have to install the following items:
      a. python Cassandra driver
      b. pyspark Cassandra connector
    """

    def __init__(self, type):
        """Pick the contact points for an environment and connect.

        :param type: one of the ``CassandraType`` constants. Any value other
            than PRODUCTION or TEST selects the third host list.
        """
        # NOTE: the parameter shadows the ``type`` builtin; the name is kept
        # because it is part of the public signature.
        print('init CassandraDAO')
        if type == CassandraType.PRODUCTION:
            self.contact_points = ['192.168.95.127', '192.168.95.122']
        elif type == CassandraType.TEST:
            self.contact_points = ['192.168.0.41', '192.168.0.42']
        else:
            self.contact_points = ['192.168.0.121', '192.168.0.122', '192.168.0.52']
        # Derived from the list so the two representations cannot drift apart.
        self.contact_points_str = ",".join(self.contact_points)

        self.formatString = "org.apache.spark.sql.cassandra"
        # Placeholder credentials; they are not passed to the Cluster below.
        # NOTE(review): read real credentials from the environment instead of
        # hard-coding them if authentication is ever enabled.
        self.username = "username"
        self.password = "password"
        self.cluster = None
        self.session = None
        self.createSession()

    def __del__(self):
        # The cluster may be missing or None when construction failed before
        # (or while) connecting; shutting down must not raise in that case.
        cluster = getattr(self, 'cluster', None)
        if cluster is not None:
            cluster.shutdown()

    def pandas_factory(self, colnames, rows):
        """Row factory that turns a page of rows into a pandas DataFrame."""
        return pd.DataFrame(rows, columns=colnames)

    def createSession(self):
        """Create the Cluster, connect, and install the pandas row factory."""
        print("contact_points = " + self.contact_points_str)
        self.cluster = Cluster(
            contact_points=self.contact_points,  # random select a node
            #             load_balancing_policy = DCAwareRoundRobinPolicy(local_dc='datacenter1'),
            #         auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
        )
        self.session = self.cluster.connect()
        self.session.row_factory = self.pandas_factory
        # Very large page size so big result sets are not paginated by the
        # driver (execCQLSelectToPandasDF only reads the current page).
        # The driver's own default fetch size is 5000 rows.
        self.session.default_fetch_size = 10000000

    def getSession(self):
        """Return the underlying driver session."""
        return self.session

    def _execute_async(self, keyspace, cql, parameters=None):
        """Switch the session to ``keyspace`` and submit ``cql`` asynchronously.

        :param parameters: optional values bound by the driver; prefer this
            over concatenating values into the CQL text.
        :return: the future returned by ``session.execute_async``.
        """
        # NOTE(review): set_keyspace changes state on the shared session, so
        # concurrent callers using different keyspaces could interfere.
        self.session.set_keyspace(keyspace)
        return self.session.execute_async(cql, parameters)

    def execCQL(self, keyspace, cql, parameters=None):
        """
        execute CQL

        The statement is submitted asynchronously. The future is returned
        (previously it was discarded) so callers can wait for completion or
        inspect errors; ignoring it keeps the old fire-and-forget behaviour.
        """
        return self._execute_async(keyspace, cql, parameters)

    def execCQLSelect(self, keyspace, cql, parameters=None):
        """
        execute CQL, select only

        :return: the asynchronous future; call ``.result()`` on it to block.
        """
        return self._execute_async(keyspace, cql, parameters)

    def execCQLCallBackAnysc(self, keyspace, cql, handle_success, handle_error, parameters=None):
        """
        execute CQL, if success => handle_success function, else handle_error

        :return: the asynchronous future the callbacks were attached to.
        """
        async_results = self._execute_async(keyspace, cql, parameters)
        async_results.add_callbacks(handle_success, handle_error)
        return async_results

    def execCQLSelectToPandasDF(self, keyspace, cql, parameters=None):
        """
        execute CQL, select only, return Pandas DataFrame
        """
        async_results = self._execute_async(keyspace, cql, parameters)
        # result() blocks until the query finishes. With pandas_factory
        # installed, the current page of rows is a DataFrame.
        # NOTE(review): the private attribute is presumably used because the
        # public accessor truth-tests the rows, which a DataFrame does not
        # support - confirm against the installed driver version.
        return async_results.result()._current_rows

    def execCQLSelectToDF(self, sqlContext, keyspace, cql, parameters=None):
        """
        execute CQL, select only, return Spark DataFrame
        """
        # pandas dataframe to spark dataframe
        pandas_dataframe = self.execCQLSelectToPandasDF(keyspace, cql, parameters)
        if pandas_dataframe.empty:
            # Imported here because the file never imports StructType; the
            # original code raised NameError on an empty result.
            from pyspark.sql.types import StructType
            schema = StructType([])
            return sqlContext.createDataFrame([], schema)
        return sqlContext.createDataFrame(pandas_dataframe)

    def execCQLSelectToRDD(self, sqlContext, keyspace, cql, parameters=None):
        """
        execute CQL, select only, return Spark RDD
        """
        # dataFrame to RDD of plain tuples
        return self.execCQLSelectToDF(sqlContext, keyspace, cql, parameters).rdd.map(tuple)

    @property
    def contactPoints(self):
        """The list of contact points."""
        return self.contact_points

    @contactPoints.setter
    def contactPoints(self, contact_points):
        # Keep the printable form in sync with the list. This does not
        # reconnect an already created session.
        self.contact_points = contact_points
        self.contact_points_str = ",".join(str(point) for point in contact_points)

    @contactPoints.deleter
    def contactPoints(self):
        del self.contact_points

In [3]:
# Keyspace name constant and target environment for this notebook.
HELPER_TEST_KEYSPACE = 'helper_test_keyspace'
CASSANDRA_ENV = CassandraType.PRODUCTION
# Constructing the DAO connects immediately and prints the contact points.
c_dao = CassandraDAO(CASSANDRA_ENV)

init CassandraDAO
contact_points = 192.168.95.127,192.168.95.122


In [4]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model

from datetime import datetime
import pickle
import numpy as np
import json
import requests
import pandas as pd
import decimal
# Set pandas' 'max_colwidth' display option to 100.
pd.set_option('max_colwidth', 100)

# import urllib.request
import os
import tarfile
# Seed NumPy's global random generator with a fixed value.
np.random.seed(10)


Using TensorFlow backend.


In [5]:
from collections import namedtuple
import pandas as pd
import jieba
import jieba.posseg as pseg


class JiebaSegmentor:
    """Convenience wrapper around jieba segmentation with optional stop-word filtering."""

    # Chinese numeral characters (everyday and formal forms) -> ASCII digit.
    _DIGIT_MAP = {
        u'零': u'0',
        u'一': u'1', u'壹': u'1',
        u'二': u'2', u'兩': u'2', u'貳': u'2',
        u'三': u'3', u'參': u'3',
        u'四': u'4', u'肆': u'4',
        u'五': u'5', u'伍': u'5',
        u'六': u'6', u'陸': u'6',
        u'七': u'7', u'柒': u'7',
        u'八': u'8', u'捌': u'8',
        u'九': u'9', u'玖': u'9',
    }

    # Record of a number ('m') token waiting to be merged with a following
    # English token. Fields are given as a list so their order is fixed.
    _NumToken = namedtuple('CombieTuple', ['index', 'word', 'sp'])

    def __init__(self, dict_path, userdict=None, stopwords=False, stopwords_path=None):
        """Load the dictionaries and, if a path is given, the stop-word list.

        :param dict_path: main dictionary passed to ``jieba.set_dictionary``.
        :param userdict: optional iterable of user dictionary paths.
        :param stopwords: when truthy, tokens found in the stop-word set are dropped.
        :param stopwords_path: optional UTF-8 text file, one stop word per line.
        """
        self.dict_path = dict_path
        # None instead of a shared mutable default list.
        self.userdict = userdict if userdict is not None else []
        self.dictionary_init()
        self.stopwords_path = stopwords_path
        self.stopwords = stopwords
        self.stopwords_set = set()
        self.stopwords_init()

    def dictionary_init(self):
        """Register the main dictionary and every user dictionary with jieba."""
        jieba.set_dictionary(self.dict_path)
        for path in self.userdict:
            print(path)
            jieba.load_userdict(path)

    def stopwords_init(self):
        """Fill ``stopwords_set`` from ``stopwords_path`` (no-op without a path)."""
        if not self.stopwords_path:
            return
        # io.open decodes the file as UTF-8 on both Python 2 and Python 3.
        import io
        with io.open(self.stopwords_path, 'r', encoding='utf-8') as handle:
            for line in handle:
                # Strip only line terminators (also '\r' from CRLF files).
                word = line.rstrip('\r\n')
                if word:
                    self.stopwords_set.add(word)

    def taiwan_country(self):
        """Return the place names that ``pseg_lcut`` always tags as 'ns'."""
        return [u'臺北', u'台北', u'基隆', u'臺中', u'台中', u'臺南', u'台南', u'高雄',
                u'宜蘭', u'桃園', u'新竹', u'苗栗', u'彰化', u'南投', u'嘉義', u'雲林',
                u'屏東', u'臺東', u'台東', u'花蓮', u'澎湖']

    def wordToNumber(self, input_text):
        """Replace every Chinese numeral character with its ASCII digit.

        All other characters are kept unchanged. The original implementation
        appended outside the loop, so it returned only the last character and
        failed on empty input.

        :param input_text: text to convert (iterated character by character).
        :return: the converted text; empty input yields an empty string.
        """
        return u''.join(self._DIGIT_MAP.get(ch, ch) for ch in input_text)

    def input_text_preprocessing(self, input_text):
        """Decode UTF-8 byte strings to text; other values are returned as-is."""
        # ``bytes`` is ``str`` on Python 2, so this covers the original
        # "not unicode" case without relying on the Python-2-only name.
        if isinstance(input_text, bytes):
            input_text = input_text.decode('utf-8')

        #         input_text = self.wordToNumber(input_text)
        return input_text

    def _is_stopword(self, token):
        """True when stop-word filtering is enabled and ``token`` is a stop word."""
        return bool(self.stopwords) and token in self.stopwords_set

    def get_names(self, input_text):
        """
        Get person names: tokens whose part-of-speech flag is 'nr'
        (compared case-insensitively).
        :param input_text:
        :return: list of matching tokens
        """

        input_text = self.input_text_preprocessing(input_text)
        words = pseg.cut(input_text)
        # Debug output kept from the original implementation.
        print(words)
        names = [w for w, f in words if f.lower() == 'nr']
        for name in names:
            print(name.encode('utf-8'))
        return names

    def lcut(self, input_text, return_type='pandas'):
        """
        Word segmentation.
        :param input_text:
        :param return_type: 'dict' returns a list of ``{"word": ...}`` dicts;
            'pandas' (and any other value) returns a DataFrame with a
            ``word`` column.
        :return: pandas DataFrame or list of dicts
        """

        input_text = self.input_text_preprocessing(input_text)
        key = [k for k in jieba.lcut(input_text) if not self._is_stopword(k)]

        result = pd.DataFrame({"word": key})
        if return_type == 'dict':
            return list(result.to_dict('index').values())
        return result

    def pseg_lcut(self, input_text, return_type='pandas'):
        """
        Word segmentation plus part-of-speech tags.

        Tags are adjusted in two cases: known place names become 'ns', and
        tokens longer than one character tagged 'x' become 'n'.
        :param input_text:
        :param return_type: 'dict' returns a list of dicts; 'pandas' (and any
            other value) returns a DataFrame with ``word`` and ``tag`` columns.
        :return: pandas DataFrame or list of dicts
        """

        input_text = self.input_text_preprocessing(input_text)
        # Built once per call instead of once per token.
        place_names = set(self.taiwan_country())
        key = []
        value = []

        for k, v in pseg.lcut(input_text):
            if self._is_stopword(k):
                continue

            tag = v
            if k in place_names:
                tag = u'ns'
            if len(k) > 1 and tag == u'x':
                tag = u'n'
            key.append(k)
            value.append(tag)

        result = pd.DataFrame({"word": key, "tag": value})
        if return_type == 'dict':
            return list(result.to_dict('index').values())
        return result

    def pseg_lcut_combie_num_eng(self, input_text, return_type='pandas'):
        """
        Merge a number token with the English token that directly follows it
        into a single entry (tag ``'m+eng'``).
        :param input_text:
        :param return_type: 'dict' returns the list of ``{"word", "sp"}``
            dicts; 'pandas' (and any other value) returns it as a DataFrame.
        :return: pandas DataFrame or list of dicts
        """

        input_text = self.input_text_preprocessing(input_text)
        seg_pd = self.pseg_lcut(input_text)
        seg_dict_list = []
        # Most recent number token that has not been merged yet.
        pending_num = None

        for index, seg in seg_pd.iterrows():
            word, tag = seg['word'], seg['tag']
            seg_dict = {"word": word, "sp": tag}

            if tag == 'm':
                pending_num = self._NumToken(index=index, word=word, sp=tag)
            elif tag == 'eng' and pending_num is not None and pending_num.index + 1 == index:
                seg_dict = {
                    "word": pending_num.word + word,
                    "sp": pending_num.sp + '+' + tag
                }
                # The number was the previous row, hence the last entry
                # appended. Deleting by frame index was wrong once an earlier
                # merge had already shortened the list.
                seg_dict_list.pop()
                pending_num = None

            seg_dict_list.append(seg_dict)

        if return_type == 'dict':
            return seg_dict_list
        return pd.DataFrame(seg_dict_list)


In [6]:
# Locations of the jieba dictionaries and stop-word list for the chosen data set.
# NOTE(review): absolute, user-specific paths; consider making the base directory configurable.
jieba_data_set = 'dataset_01'
jieba_data_dir = "/home/charles/dataset/jieba/" + jieba_data_set
jieba_dict_path1 = jieba_data_dir + "/dict_taiwan.txt"
jieba_dict_path2 = jieba_data_dir + "/userdict.txt"
jieba_dict_path3 = jieba_data_dir + "/dict.txt.big"
jieba_dict_path4 = jieba_data_dir + "/dict.txt.small"
jieba_stopwords_path = jieba_data_dir + "/stopwords.txt"

# Segmentor with stop-word filtering enabled and no user dictionaries.
js = JiebaSegmentor(
    dict_path=jieba_dict_path1,
    userdict=[],
    stopwords=True,
    stopwords_path=jieba_stopwords_path,
)
# js = JiebaSegmentor(jieba_dict_path1, [jieba_dict_path2, jieba_dict_path3, jieba_dict_path4])

In [7]:
# Segment a sample sentence. 'pandas' is the DataFrame return type that lcut
# names explicitly; the previous value 'df' only worked by falling through to
# lcut's default branch.
test_cut_raw_0 = js.lcut('北京在哪呢', return_type='pandas')
test_cut_raw_0

Building prefix dict from /home/charles/dataset/jieba/dataset_01/dict_taiwan.txt ...
DEBUG:jieba:Building prefix dict from /home/charles/dataset/jieba/dataset_01/dict_taiwan.txt ...
Loading model from cache /tmp/jieba.uabe385690ac6efabcf7cebe5190ee7b2.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.uabe385690ac6efabcf7cebe5190ee7b2.cache
Loading model cost 0.311 seconds.
DEBUG:jieba:Loading model cost 0.311 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


Unnamed: 0,word
0,北京


In [8]:
# Class index -> intent name lookup table read by to_cat_name.
mapping_list = ['air', 'coffee', 'leave', 'po', 'point', 'qcall', 'traffic', 'weather']


def to_cat_name(x):
    """Translate class indices into intent names using the global ``mapping_list``.

    :param x: iterable of values convertible with ``int()``.
    :return: list with the ``mapping_list`` entry for each index, in order.
    """
    return [mapping_list[int(class_index)] for class_index in x]

def cut_to_word(s, segmentor=None):
    """Segment ``s`` and return its words joined by commas.

    The original loop concatenated the accumulator with the whole row object
    yielded by ``iterrows()`` instead of the word itself, so the result was a
    pandas Series rather than a string.

    :param s: sentence to segment.
    :param segmentor: object with an ``lcut(text, return_type='pandas')``
        method returning a DataFrame with a ``word`` column. Defaults to the
        notebook-level ``js`` segmentor.
    :return: comma-separated words; an empty string when nothing is left.
    """
    active_segmentor = js if segmentor is None else segmentor
    w_df = active_segmentor.lcut(s, return_type='pandas')
    return u','.join(w_df['word'])

def predict_class(model, data):
    """Run ``model.predict`` and pick the highest-scoring class for every row.

    :param model: object exposing ``predict(data, batch_size=..., verbose=...)``.
    :param data: input forwarded unchanged to ``model.predict``.
    :return: tuple ``(class names via to_cat_name, class indices, raw predictions)``.
    """
    y_predict_probability = model.predict(data, batch_size=64, verbose=1)
    predictClass = []
    for scores in y_predict_probability:
        best_index, best_score = 0, 0
        for position, score in enumerate(scores):
            value = float(score)
            # "<=" rather than "<": among equal scores the last index wins.
            if best_score <= value:
                best_index, best_score = position, value
        predictClass.append(best_index)

    return to_cat_name(predictClass), predictClass, y_predict_probability

def float_to_str(f, float_display):
    """Round ``f`` to ``float_display`` decimal places.

    Despite the name, the value returned is whatever ``round`` produces
    (a number), not a string.
    """
    rounded = round(f, float_display)
    return rounded

# # create a new context for this task
# ctx = decimal.Context()
# # 20 digits should be enough for everyone :D
# float_display = 4
# ctx.prec = float_display
# def float_to_str(f):
#     """
#      Convert the given float to a string,
#      without resorting to scientific notation
#     """

#     return round(f, float_display)
#     d1 = ctx.create_decimal(repr(f))
#     return format(d1, 'f')

# predict

In [23]:
# Load the saved Keras model from its .h5 file.
# NOTE(review): absolute, user-specific path; the relative alternative is kept commented out below.
model_name = 'model_intent'
# model_path = 'model/{}.h5'.format(model_name)
model_path = '/home/charles/dataset/model/intent/{}.h5'.format(model_name)
model = load_model(model_path)

# loading model > tokenizer
# tokenizer_path = 'model/tokenizer_intent.pickle'
tokenizer_path = '/home/charles/dataset/model/intent/tokenizer_intent.pickle'
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open(tokenizer_path, 'rb') as handle:
    tokenizer_intent = pickle.load(handle)

In [24]:
# Sample sentences that the following cells segment and classify.
input_text = ['明天班機有正常起飛嗎','現在國道北上塞車嗎']

In [25]:
# One row per input sentence; 'cut_words' is filled with the result of
# cut_to_word applied to the stripped sentence.
# NOTE(review): the shape of this column depends on what cut_to_word returns
# for each sentence (it should be a plain string) - confirm after any change
# to cut_to_word.
input_text_df = pd.DataFrame({'sentence':input_text})
input_text_df['cut_words'] = input_text_df['sentence'].apply(lambda s: cut_to_word(s.strip()))
input_text_df

Unnamed: 0,sentence,cut_words
0,明天班機有正常起飛嗎,"明天,班機,正常,起飛"
1,現在國道北上塞車嗎,"現在,國道,北上,塞車"


In [26]:
def preprocessing(cut_words, max_len=10):
    """Turn segmented sentences into equal-length integer sequences.

    :param cut_words: texts passed to ``tokenizer_intent.texts_to_sequences``.
    :param max_len: length handed to ``sequence.pad_sequences`` as ``maxlen``.
    :return: the array produced by ``sequence.pad_sequences``.
    """
    # Convert the text into sequences of integers.
    token_ids = tokenizer_intent.texts_to_sequences(cut_words)
    # Truncate long sequences and pad short ones so that all have the same length.
    return sequence.pad_sequences(token_ids, maxlen=max_len)

# Vectorise the segmented sentences into padded integer sequences.
# NOTE(review): the recorded output of this cell is all zeros for both
# sentences, i.e. no token received a non-zero index - verify that the
# tokenizer vocabulary and the text format/encoding passed here match.
cut_words = input_text_df.cut_words
seq = preprocessing(cut_words, max_len=10)
seq

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [27]:
# Classify the padded sequences.
y_predict_name, y_predict, y_predict_probability = predict_class(model, seq)

# Round every class probability to 4 decimal places for display.
predict_arr = [
    [float_to_str(item, 4) for item in row]
    for row in y_predict_probability
]

# NOTE(review): the recorded output shows 3 probabilities per row while the
# mapping_list defined earlier in this notebook has 8 entries, so the class
# names shown may not correspond to this model's classes - confirm the mapping.
predict_df = pd.DataFrame({
    '1_sentence': input_text_df.sentence,
    '2_y_predict': y_predict,
    '3_y_predict_name': y_predict_name,
    '4_y_predict_probability': predict_arr,
})

predict_df



Unnamed: 0,1_sentence,2_y_predict,3_y_predict_name,4_y_predict_probability
0,明天班機有正常起飛嗎,2,leave,"[0.1522, 0.3515, 0.4963]"
1,現在國道北上塞車嗎,2,leave,"[0.1522, 0.3515, 0.4963]"


In [28]:
# probability_list = list(predict_df['4_y_predict_probability'].values)
# predict_list = [range(0,len(mapping_list))]
# predict_name =[mapping_list]
# probability_mapping_df = pd.DataFrame({'predict':predict_list[0],
#                                        'predict_name':predict_name[0],
#                                        'probability':probability_list[0]})
# probability_mapping_df.sort_values(by=['probability'], ascending=False).reset_index(drop=True)

In [29]:
# robot_id='robot_a'
# data = []
# for index, row in predict_df.iterrows():
    
#     probability_list = row['4_y_predict_probability']
# #     print(probability_list) 
#     predict_list = range(0,len(mapping_list))
# #     print(predict_list) 
#     predict_name =mapping_list
# #     print(predict_name) 
#     probability_mapping_df = pd.DataFrame({'predict':predict_list,
#                                            'predict_name':predict_name,
#                                            'probability':probability_list})
# #     print(probability_mapping_df)
#     probability_mapping_df = probability_mapping_df.\
#                             sort_values(by=['probability'], ascending=False).\
#                             reset_index(drop=True)
# #     print(probability_mapping_df)
    
#     predict_result = dict(sentence=row['1_sentence'],
#                           predict=list(probability_mapping_df['predict']),
#                           predict_skill=list(probability_mapping_df['predict_name']),
#                           confidence=list(probability_mapping_df['probability']))
    
#     result = dict(robot_id=robot_id,
#                   skill=mapping_list,
#                   predict=predict_result)
#     data.append(result)

# data

In [184]:
# Robot whose training sentences and model mapping are queried below.
robot_id = 'hr.00001318'
# Value reported as 'min_confidence' in the prediction payload built in a later cell.
min_confidence = 0.9

# Keyspace and table names used by the queries in this notebook.
HELPER_KEYSPACE = 'helper_keyspace'
HELPER_INTENT_MODEL_DATA_TABLE = 'intent_model_data'
HELPER_INTENT_TRAINING_DATA_TABLE = 'intent_training_data'

# The bare string below is a leftover note meaning "compare with the training sentences".
"""
與訓練句子比對
"""
# NOTE(review): the CQL text is assembled by string concatenation; this is
# only safe while robot_id is a trusted constant.
cql = ("select skill_id,sentence from " + HELPER_INTENT_TRAINING_DATA_TABLE +
               " where robot_id = '" + robot_id + "';")
# Training sentences (skill_id, sentence) for this robot as a pandas DataFrame.
pd_df = c_dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
pd_df

Unnamed: 0,skill_id,sentence
0,leave,不來公司
1,leave,不能來公司
2,leave,不進公司了
3,leave,事假
4,leave,休假
5,leave,公出
6,leave,出勤
7,leave,出差
8,leave,填個特休假單
9,leave,填假單


In [185]:
# Fetch the stored model data row(s) for this robot.
cql = "select * from {} where robot_id = '{}';".format(HELPER_INTENT_MODEL_DATA_TABLE, robot_id)
mapping_df = c_dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
# Replace the notebook-level mapping_list only when the query returned a row;
# otherwise the previously defined list stays in effect.
if len(mapping_df) > 0:
    mapping_list = list(mapping_df['mapping'][0])

mapping_list

[u'leave', u'phone', u'weather']

In [186]:
# Keyword-based intent matching: a skill is a candidate for a sentence when one
# of its training sentences occurs inside the new sentence.
sentence_list = []
skill_list = []
mapping_skill_list = []
probability_list = []
for ii, sentence in enumerate(['打個MVPN給陳俊宏天氣']):

    # Compare the new sentence with the training sentences to find related skills.
    skill_list_temp = []
    for i, row in pd_df.iterrows():
        if row['sentence'].lower().encode('utf8') in sentence.lower():
            skill_list_temp.append(row['skill_id'])
    # Remove duplicates. (The original note also mentions limiting the maximum
    # number of intents; no such limit is implemented here.)
    skill_list_temp = list(set(skill_list_temp))
    skill_list.append(skill_list_temp)
    # Pre-fill with None so that unmatched skills are easy to identify.
    mapping_skill_list.append([None] * len(mapping_list))
    # Always recorded: the previous guard `len(skill_list) > 0` could never be
    # false after the append above, and every column must have one entry per
    # sentence for the DataFrame below.
    sentence_list.append(sentence)

    # NOTE(review): mapping_list.index raises ValueError when a matched skill
    # is not part of mapping_list - confirm that cannot happen for this data.
    for value in skill_list_temp:
        mapping_skill_list[ii][mapping_list.index(value)] = value

    # Split the probability evenly across the matched skills.
    if len(skill_list[ii]) > 0:
        probability = float(1)/len(skill_list[ii])
    else:
        probability = 0
    # Skills that were not matched (None) get probability 0.
    probability_list_temp = []
    for x in mapping_skill_list[ii]:
        if x:
            probability_list_temp.append(probability)
        else:
            probability_list_temp.append(0)
    probability_list.append(probability_list_temp)

keyword_predict_df = pd.DataFrame({'skill_list': skill_list,
                                  'mapping_skill_list': mapping_skill_list,
                                  'sentence': sentence_list,
                                  'probability':probability_list})
keyword_predict_df

Unnamed: 0,mapping_skill_list,probability,sentence,skill_list
0,"[None, phone, weather]","[0, 0.5, 0.5]",打個MVPN給陳俊宏天氣,"[phone, weather]"


In [187]:
def count_probability(skill_list, mapping_skill_list):
    """Give every matched skill an equal share of probability and 0 to the rest.

    :param skill_list: the matched skills (must be non-empty).
    :param mapping_skill_list: one entry per known skill; falsy entries mark
        skills that were not matched.
    :return: list of probabilities aligned with ``mapping_skill_list``.
    """
    probability = float(1) / len(skill_list)
    return [probability if x else 0 for x in mapping_skill_list]


# Initialised once, before the loop, so payloads of all rows accumulate and
# `data` exists even when keyword_predict_df is empty (it used to be reset on
# every iteration).
data = []
for index, row in keyword_predict_df.iterrows():

    if len(row['skill_list']) == 0:
        # No keyword match for this sentence; placeholder for the model-based fallback.
        print('model predict ~~~~~~~~~')
        continue

    # predict_df['probability'] = predict_df.apply(count_probability, axis=1)

    probability_list = count_probability(row['skill_list'], row['mapping_skill_list'])
    predict_list = list(range(0, len(mapping_list)))
    predict_name = mapping_list
    probability_mapping_df = pd.DataFrame({'predict': predict_list,
                                           'predict_name': predict_name,
                                           'probability': probability_list})
    # Highest probability first.
    probability_mapping_df = probability_mapping_df. \
        sort_values(by=['probability'], ascending=False). \
        reset_index(drop=True)

    predict_result = dict(sentence=row['sentence'],
                          predict=list(probability_mapping_df['predict']),
                          predict_skill=list(probability_mapping_df['predict_name']),
                          confidence=list(probability_mapping_df['probability']),
                          min_confidence=min_confidence)

    result = dict(robot_id=robot_id,
                  predict=predict_result)
    # result = dict(robot_id=robot_id,
    #               skill=mapping_list,
    #               predict=predict_result)
    data.append(result)

data

[{'predict': {'confidence': [0.5, 0.5, 0.0],
   'min_confidence': 0.9,
   'predict': [1, 2, 0],
   'predict_skill': [u'phone', u'weather', u'leave'],
   'sentence': '\xe6\x89\x93\xe5\x80\x8bMVPN\xe7\xb5\xa6\xe9\x99\xb3\xe4\xbf\x8a\xe5\xae\x8f\xe5\xa4\xa9\xe6\xb0\xa3'},
  'robot_id': 'hr.00001318'}]

In [188]:
# data = []
# if len(predict_df) > 0:
    
#     def count_probability(row):
#         probability = float(1) / len(row['skill_list'])
#         probability_list = []
#         for x in row['mapping_skill_list']:
#             if x:
#                 probability_list.append(probability)
#             else:
#                 probability_list.append(0)

#         return probability_list

#     predict_df['probability'] = predict_df.apply(count_probability, axis=1)

#     for index, row in predict_df.iterrows():

#         # print(predict_df)

#         probability_list = row['probability']
#         #     print(probability_list)
#         predict_list = range(0, len(mapping_list))
#         #     print(predict_list)
#         predict_name = mapping_list
#         #     print(predict_name)
#         probability_mapping_df = pd.DataFrame({'predict': predict_list,
#                                                'predict_name': predict_name,
#                                                'probability': probability_list})
#         #     print(probability_mapping_df)
#         probability_mapping_df = probability_mapping_df. \
#             sort_values(by=['probability'], ascending=False). \
#             reset_index(drop=True)
#         #     print(probability_mapping_df)

#         predict_result = dict(sentence=row['sentence'],
#                               predict=list(probability_mapping_df['predict']),
#                               predict_skill=list(probability_mapping_df['predict_name']),
#                               confidence=list(probability_mapping_df['probability']),
#                               min_confidence=min_confidence)

#         result = dict(robot_id=robot_id,
#                       predict=predict_result)
#         # result = dict(robot_id=robot_id,
#         #               skill=mapping_list,
#         #               predict=predict_result)
#         data.append(result)

# data

[{'predict': {'confidence': [0.3333333333333333,
    0.3333333333333333,
    0.3333333333333333],
   'min_confidence': 0.9,
   'predict': [0, 1, 2],
   'predict_skill': [u'leave', u'phone', u'weather'],
   'sentence': '\xe6\x89\x93\xe5\x80\x8bMVPN\xe7\xb5\xa6\xe9\x99\xb3\xe4\xbf\x8a\xe5\xae\x8f\xe5\xa4\xa9\xe6\xb0\xa3\xe6\xba\xab\xe5\xba\xa6'},
  'robot_id': 'hr.00001318'}]