In [1]:
%matplotlib inline

In [2]:
# encoding=UTF-8
# !flask/bin/python

from cassandra.cluster import Cluster
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import BatchStatement
import pandas as pd
# from pyspark.sql.types import StringType, StructType, StructField


class CassandraType(object):
    """Integer identifiers for the Cassandra environments.

    The values are plain ints (0, 1, 2) so they can be compared with ``==``.
    """
    PRODUCTION, TEST, TEST_DOCKER = range(3)


class CassandraDAO(object):
    """Data-access helper built on a Cassandra driver ``Cluster``/session pair.

    Constructing an instance connects immediately to one of three hard-coded
    node lists (chosen with a ``CassandraType`` constant) and installs
    ``pandas_factory`` as the session's row factory, so result rows are
    delivered as pandas DataFrames.

    You have to install the following items:
      a. the Python Cassandra driver
      b. the pyspark Cassandra connector
    """

    def __init__(self, type):
        """Pick the contact points for ``type`` and open a session.

        :param type: a ``CassandraType`` constant. PRODUCTION and TEST select
            their own node lists; any other value falls through to the third
            list. The parameter name shadows the ``type`` builtin but is kept
            so existing keyword callers keep working.
        """
        if type == CassandraType.PRODUCTION:
            self.contact_points = ['192.168.95.127', '192.168.95.122']
        elif type == CassandraType.TEST:
            self.contact_points = ['192.168.0.90', '192.168.0.91', '192.168.0.92']
        else:
            self.contact_points = ['192.168.0.121', '192.168.0.122', '192.168.0.52']
        # Derived from the list so the two representations cannot drift apart.
        self.contact_points_str = ",".join(self.contact_points)

        self.formatString = "org.apache.spark.sql.cassandra"
        # NOTE(review): placeholder credentials. Nothing in this class reads
        # them (no auth provider is passed to Cluster). If authentication is
        # enabled later, load the values from the environment instead of
        # hard-coding them here.
        self.username = "username"
        self.password = "password"
        self.cluster = None
        self.session = None
        self.createSession()

    def __del__(self):
        """Shut the cluster connection down when the DAO is finalized."""
        # ``cluster`` is None (or not set at all) when construction failed
        # before Cluster() returned; guard so the finalizer does not raise.
        cluster = getattr(self, 'cluster', None)
        if cluster is not None:
            cluster.shutdown()

    def pandas_factory(self, colnames, rows):
        """Row factory: build a DataFrame from column names and row tuples."""
        return pd.DataFrame(rows, columns=colnames)

    def createSession(self):
        """Create the ``Cluster``, connect, and configure the session.

        The load-balancing policy and auth provider are left at the driver
        defaults (the explicit settings were commented out in the original).
        """
        # Single-argument print(...) parses the same on Python 2 and 3.
        print("contact_points = " + self.contact_points_str)
        self.cluster = Cluster(contact_points=self.contact_points)
        self.session = self.cluster.connect()
        self.session.row_factory = self.pandas_factory
        # Needed for large queries: a fetch size of None turns off the
        # driver's automatic paging, so a query is not split into pages.
        self.session.default_fetch_size = None

    def getSession(self):
        """Return the underlying driver session."""
        return self.session

    def execCQL(self, keyspace, cql):
        """Execute ``cql`` asynchronously in ``keyspace``.

        :returns: the future from ``session.execute_async``. Earlier versions
            returned nothing, which made failed statements impossible to
            detect; callers that ignore the return value are unaffected.
        """
        self.session.set_keyspace(keyspace)
        return self.session.execute_async(cql)

    def execCQLSelect(self, keyspace, cql):
        """Execute a select asynchronously and return the driver future."""
        self.session.set_keyspace(keyspace)
        return self.session.execute_async(cql)

    def execCQLCallBackAnysc(self, keyspace, cql, handle_success, handle_error):
        """Execute ``cql`` asynchronously and register callbacks.

        ``handle_success`` is attached as the success callback and
        ``handle_error`` as the error callback of the returned future.
        """
        self.session.set_keyspace(keyspace)
        future = self.session.execute_async(cql)
        future.add_callbacks(handle_success, handle_error)

    def execCQLSelectToPandasDF(self, keyspace, cql):
        """Execute a select synchronously and return a pandas DataFrame.

        The query runs with ``timeout=None``.
        """
        self.session.set_keyspace(keyspace)
        results = self.session.execute(cql, timeout=None)
        # With pandas_factory installed as row_factory, the object stored in
        # ``_current_rows`` is the DataFrame it built.
        # NOTE(review): ``_current_rows`` is a private driver attribute;
        # re-check it when upgrading the driver.
        return results._current_rows

    def execCQLSelectToDF(self, sqlContext, keyspace, cql):
        """Execute a select and return the result as a Spark DataFrame.

        An empty result is returned as an empty Spark DataFrame with an empty
        schema.
        """
        pandas_dataframe = self.execCQLSelectToPandasDF(keyspace, cql)
        if pandas_dataframe.empty:
            # Imported locally: the module-level pyspark import is commented
            # out, which left StructType undefined on this path.
            from pyspark.sql.types import StructType
            return sqlContext.createDataFrame([], StructType([]))
        return sqlContext.createDataFrame(pandas_dataframe)

    def execCQLSelectToRDD(self, sqlContext, keyspace, cql):
        """Execute a select and return the rows as a Spark RDD of tuples."""
        return self.execCQLSelectToDF(sqlContext, keyspace, cql).rdd.map(tuple)

    @property
    def contactPoints(self):
        """The list of contact-point addresses."""
        return self.contact_points

    @contactPoints.setter
    def contactPoints(self, contact_points):
        # Only the list is replaced; the already-open session and
        # ``contact_points_str`` are left as they are.
        self.contact_points = contact_points

    @contactPoints.deleter
    def contactPoints(self):
        del self.contact_points

#     # pyspark cassandra connector
#     def readFromCassandraDF(self, sqlContext, keyspace, table):
#         """
#         read data from Cassandra, return Dataframe
#         """

#         return sqlContext.read \
#             .format(self.formatString) \
#             .options(table=table, keyspace=keyspace) \
#             .option("spark.cassandra.connection.host", self.contact_points_str) \
#             .load()

#     def readFromCassandraRDD(self, sqlContext, keyspace, table):
#         """
#         read data from Cassandra, return RDD
#         """

#         df = sqlContext.read \
#             .format(self.formatString) \
#             .options(table=table, keyspace=keyspace) \
#             .option("spark.cassandra.connection.host", self.contact_points_str) \
#             .load()
#         return df.rdd.map(tuple)  # dataFrame to RDD

#     def saveToCassandraDF(self, dataFrame, keyspace, table, mode="error"):
#         """
#         Save data to Cassandra using DataFrame, select one mode to save
        
#         SaveMode.ErrorIfExists (default) | "error"      When saving a DataFrame to a data source,
#                                                         if data already exists, an exception is expected to be thrown.
#         SaveMode.Append                  | "append"     When saving a DataFrame to a data source,
#                                                         if data/table already exists, contents of the DataFrame are
#                                                         expected to be appended to existing data.
#         SaveMode.Overwrite               | "overwrite"  Overwrite mode means that when saving a DataFrame to a data source,
#                                                         if data/table already exists, existing data is expected to be
#                                                         overwritten by the contents of the DataFrame.
#         SaveMode.Ignore                  | "ignore"     Ignore mode means that when saving a DataFrame to a data source,
#                                                         if data already exists, the save operation is expected to not
#                                                         save the contents of the DataFrame and to not change the
#                                                         existing data. This is similar to a CREATE TABLE IF NOT EXISTS

#                                                         in SQL.
#         """

#         dataFrame.write \
#             .format(self.formatString) \
#             .mode(mode) \
#             .options(table=table, keyspace=keyspace) \
#             .option("spark.cassandra.connection.host", self.contact_points_str) \
#             .save()


In [3]:
# Open one session per cluster. Each constructor connects immediately and
# prints the contact points it uses.
# NOTE(review): c_dao is not referenced by any later cell visible here.
c_dao = CassandraDAO(CassandraType.PRODUCTION)
t_dao = CassandraDAO(CassandraType.TEST)

contact_points = 192.168.95.127,192.168.95.122
contact_points = 192.168.0.90,192.168.0.91,192.168.0.92


In [4]:
import pandas as pd
import jieba
import jieba.posseg as pseg

class JiebaSegmentor:
    """Wrapper around jieba word segmentation and part-of-speech tagging.

    On construction it sets jieba's main dictionary, loads any extra user
    dictionaries and, when a path is given, reads a stopword list. ``lcut``
    and ``pseg_lcut`` return their tokens as pandas DataFrames.
    """

    # Place names that pseg_lcut always tags as 'ns'.
    _TAIWAN_COUNTRY = (
        u'臺北', u'台北', u'基隆', u'臺中', u'台中', u'臺南', u'台南', u'高雄',
        u'宜蘭', u'桃園', u'新竹', u'苗栗', u'彰化', u'南投', u'嘉義', u'雲林',
        u'屏東', u'臺東', u'台東', u'花蓮', u'澎湖',
    )

    # Chinese numeral characters -> ASCII digit. ASCII digits are not listed
    # because wordToNumber passes unknown characters through unchanged.
    _DIGIT_MAP = {
        u'零': u'0',
        u'一': u'1', u'壹': u'1',
        u'二': u'2', u'兩': u'2', u'貳': u'2',
        u'三': u'3', u'參': u'3',
        u'四': u'4', u'肆': u'4',
        u'五': u'5', u'伍': u'5',
        u'六': u'6', u'陸': u'6',
        u'七': u'7', u'柒': u'7',
        u'八': u'8', u'捌': u'8',
        u'九': u'9', u'玖': u'9',
    }

    def __init__(self, dict_path, userdict=[], stopwords = False, stopwords_path=None):
        """Configure jieba and load the optional stopword list.

        :param dict_path: path passed to ``jieba.set_dictionary``.
        :param userdict: iterable of paths passed to ``jieba.load_userdict``.
        :param stopwords: when truthy, ``lcut``/``pseg_lcut`` drop tokens that
            are in the stopword set.
        :param stopwords_path: UTF-8 text file with one stopword per line, or
            None to leave the stopword set empty.
        """
        self.dict_path = dict_path
        # Copy so the shared default list (and the caller's list) is never
        # aliased by this instance.
        self.userdict = list(userdict)
        self.dictionary_init()
        self.stopwords_path = stopwords_path
        self.stopwords = stopwords
        self.stopwords_set = set()
        self.stopwords_init()

    def dictionary_init(self):
        """Set the main dictionary, then load each user dictionary in order."""
        jieba.set_dictionary(self.dict_path)
        for path in self.userdict:
            print(path)
            jieba.load_userdict(path)

    def stopwords_init(self):
        """Read ``stopwords_path`` (if set) into ``stopwords_set``."""
        if not self.stopwords_path:
            return
        # Read bytes and decode explicitly: this behaves the same on Python 2
        # and 3 and does not depend on the platform default encoding.
        with open(self.stopwords_path, 'rb') as stopwords_file:
            for raw_line in stopwords_file:
                # Strip '\r' as well so CRLF files do not leave it on words.
                word = raw_line.decode('utf-8').strip('\r\n')
                if word:
                    self.stopwords_set.add(word)

    def taiwan_country(self):
        """Return the list of place names that are forced to the 'ns' tag."""
        return list(self._TAIWAN_COUNTRY)

    def wordToNumber(self, input_text):
        """Replace Chinese numeral characters with ASCII digits.

        Every character of ``input_text`` is mapped through ``_DIGIT_MAP``;
        characters not in the map are kept unchanged. Returns a unicode
        string, which is empty for empty input.

        The earlier version appended to the result outside the loop, so it
        returned only the last character and failed on empty input.
        """
        return u''.join(self._DIGIT_MAP.get(ch, ch) for ch in input_text)

    def input_text_preprocessing(self, input_text):
        """Return ``input_text`` as unicode, decoding UTF-8 bytes if needed."""
        # ``bytes`` is ``str`` on Python 2, so byte strings are decoded there
        # just as before; on Python 3 text is already unicode.
        if isinstance(input_text, bytes):
            input_text = input_text.decode('utf-8')

#         input_text = self.wordToNumber(input_text)
        return input_text

    def get_names(self, input_text):
        """Return the tokens tagged 'nr' in ``input_text``, printing each one."""
        input_text = self.input_text_preprocessing(input_text)
        names = [w for w, f in pseg.cut(input_text) if f.lower() == 'nr']
        for name in names:
            print(name.encode('utf-8'))
        return names

    def lcut(self, input_text):
        """Segment ``input_text`` and return a DataFrame with a ``word`` column.

        Stopwords are removed when ``self.stopwords`` is truthy.
        """
        input_text = self.input_text_preprocessing(input_text)
        key = []
        for k in jieba.lcut(input_text):
            if self.stopwords and k in self.stopwords_set:
                continue
            key.append(k)
        return pd.DataFrame({"word": key})

    def pseg_lcut(self, input_text):
        """Segment and tag ``input_text``; return ``word``/``tag`` columns.

        Stopwords are removed when ``self.stopwords`` is truthy. Two tag
        overrides are applied: words listed by ``taiwan_country()`` become
        'ns', and words longer than one character tagged 'x' become 'n'.
        """
        input_text = self.input_text_preprocessing(input_text)
        # Built once per call instead of once per token.
        place_names = set(self.taiwan_country())
        key = []
        value = []

        for k, v in pseg.lcut(input_text):
            if self.stopwords and k in self.stopwords_set:
                continue

            tag = v
            if k in place_names:
                tag = u'ns'
            if len(k) > 1 and tag == u'x':
                tag = u'n'
            key.append(k)
            value.append(tag)
        return pd.DataFrame({"word": key, "tag": value})

In [5]:
# Locations of the jieba dictionaries and the stopword list.
# NOTE(review): absolute path; the notebook only runs where this mount exists.
jieba_path = "/nfs/aq_test/jieba/"
jieba_dict_path1 = jieba_path + "dict_taiwan.txt"
jieba_dict_path2 = jieba_path +"userdict.txt"
jieba_dict_path3 = jieba_path +"dict.txt.big"
jieba_dict_path4 = jieba_path +"dict.txt.small"
jieba_stopwords_path = jieba_path +"stopwords.txt"
# dict_taiwan.txt is the main dictionary; the other three files are loaded as
# user dictionaries. Stopword filtering is switched on.
segmentor = JiebaSegmentor(jieba_dict_path1, 
                           [jieba_dict_path2,jieba_dict_path3,jieba_dict_path4], 
                           stopwords = True, 
                           stopwords_path=jieba_stopwords_path)

Building prefix dict from /nfs/aq_test/jieba/dict_taiwan.txt ...
DEBUG:jieba:Building prefix dict from /nfs/aq_test/jieba/dict_taiwan.txt ...
Loading model from cache /tmp/jieba.u34e30d1c15f49735c28e2158fa64cd7a.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u34e30d1c15f49735c28e2158fa64cd7a.cache
Loading model cost 0.396 seconds.
DEBUG:jieba:Loading model cost 0.396 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


/nfs/aq_test/jieba/userdict.txt
/nfs/aq_test/jieba/dict.txt.big
/nfs/aq_test/jieba/dict.txt.small


In [6]:
# Stopword filtering was already enabled by the constructor call; it is set
# again here so the demo below does not depend on earlier toggling.
segmentor.stopwords=True
# Segment and tag a sample sentence; the result is a word/tag DataFrame.
segmentor.pseg_lcut('或許今天阿伯的媽媽給我好大一包的紅包喔')

Unnamed: 0,tag,word
0,t,今天
1,ns,阿伯
2,n,媽媽
3,a,好
4,m,一包
5,n,紅包
6,e,喔


In [7]:
# Keyspace and table names referenced by the CQL statements in later cells.
DB_KEYSPACE = "nlp_keyspace"
dcard_article = "dcard_article"
dcard_response = "dcard_response"
dcard_article_relationship = "dcard_article_relationship"
dcard_response_relationship = "dcard_response_relationship"
word_article_mapping_relationship = "word_article_mapping_relationship"
dcard_article_food = "dcard_article_food"
dcard_response_food = "dcard_response_food"
dcard_article_response_food = "dcard_article_response_food"
word_article_mapping_food = "word_article_mapping_food"
board_keyword = "board_keyword"

In [8]:
# NOTE(review): `ppp` is not defined in any visible cell, so this cell raises
# NameError. Presumably a deliberate stop marker for "Run All"; confirm and
# remove it so the notebook can run top to bottom.
ppp

NameError: name 'ppp' is not defined

In [None]:
# target = 100000000
# step = 100000
# start = 0
# HELPER_KEYSPACE = 'nlp_keyspace'

In [14]:
# Fetch board, article_id, title and title_clean (select distinct) from the
# food article/response table on the test cluster.
cql = ("select distinct board, article_id, title, title_clean from " + dcard_article_response_food +";")
food_df = t_dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)
# Non-null count per column.
food_df.count()

board          9230
article_id     9230
title          9230
title_clean    9230
dtype: int64

In [25]:
# Small working sample: the first 20 article_id / title_clean pairs.
test = food_df[["article_id", "title_clean"]].head(20)
test

Unnamed: 0,article_id,title_clean
0,232350138,你們都喝哪牌的汽水
1,231036157,台南東區平價甜點店
2,231547034,海產有毒，去廈門跟金門玩的人看一下!!
3,231805424,超不負責鮮蝦義大利麵食譜
4,231755783,抹醬吐司冷凍
5,233243669,最近去過12間台中咖啡廳
6,231708382,台中西區 | 牛逼館子
7,230985201,表單已關閉）大學生咖啡市場需求調查
8,231609439,就是有業味 真的太推：）
9,233232635,茶六套餐


In [27]:
# The sample's article ids as a plain Python list.
list(test['article_id'])

[u'232350138',
 u'231036157',
 u'231547034',
 u'231805424',
 u'231755783',
 u'233243669',
 u'231708382',
 u'230985201',
 u'231609439',
 u'233232635',
 u'233087104',
 u'231728775',
 u'232581594',
 u'233190318',
 u'232963178',
 u'232051733',
 u'232529509',
 u'231688231',
 u'231432196',
 u'231012485']

In [30]:
# Keep only sample rows whose cleaned title is longer than 5 characters.
test[test['title_clean'].str.len() > 5]

Unnamed: 0,article_id,title_clean
0,232350138,你們都喝哪牌的汽水
1,231036157,台南東區平價甜點店
2,231547034,海產有毒，去廈門跟金門玩的人看一下!!
3,231805424,超不負責鮮蝦義大利麵食譜
4,231755783,抹醬吐司冷凍
5,233243669,最近去過12間台中咖啡廳
6,231708382,台中西區 | 牛逼館子
7,230985201,表單已關閉）大學生咖啡市場需求調查
8,231609439,就是有業味 真的太推：）
10,233087104,我也要吃布丁


In [None]:
# cql = ("select * from " + dcard_article_relationship +";")
# relationship_df = t_dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)
# print len(relationship_df)
# relationship_df.head(50)

In [None]:
# cql = ("select * from " + dcard_article_food +";")
# food_df = t_dao.execCQLSelectToPandasDF(DB_KEYSPACE, cql)
# print len(food_df)
# food_df.head(50)

In [12]:
# NOTE(review): not referenced by any other cell visible in this notebook.
article_raw_pd_list = []

In [None]:
# print len(pd_df)
# relationship_df[["article_id", "title_clean"]].head(20)

In [None]:
# from pyspark.sql.functions import udf
# def update_column(x):
# #     check = QuestionTypeCheck(segmentor)
# #     check.source = 'ptt'
# #     label = check.check_question_type(x)
#     input_text = input_text.replace(u'Re: ', u'')
#     input_text = re.sub(u"\\[.*?]", u"", input_text)
#     segmentor = JiebaSegmentor(jieba_dict_path1, [jieba_dict_path2,jieba_dict_path3,jieba_dict_path4])
#     segmentor.pseg_lcut('test')
#     return 'label'

# update_column_udf = udf(update_column)
# new_df = df.withColumn('question_type',
#                     update_column_udf(df['question_type']))

In [None]:
# new_df = df.limit(100)

In [None]:
# dao.saveToCassandraDF(new_df, "nlp_keyspace", "dcard_article_test", "append")

In [16]:
def filter_by_tag(tag):
    """Return True when ``tag`` is one of the accepted part-of-speech tags.

    The accepted tags are the verb, noun and adjective families listed
    below; every other tag yields False.
    """
    accepted = (
        'v', 'vd', 'vg', 'vi', 'vn', 'vq', 'vt',           # verb tags
        'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz',  # noun tags
        'a', 'ad', 'ag', 'an',                             # adjective tags
    )
    return tag in accepted

def filter_by_tag_udf(row):
    """Row-based variant of the tag filter: verbs and nouns only.

    ``row`` must support ``row['tag']``. Adjective tags are not accepted
    here.
    """
    verb_tags = ('v', 'vd', 'vg', 'vi', 'vn', 'vq', 'vt')
    noun_tags = ('n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 'nz')
    return row['tag'] in verb_tags + noun_tags

In [None]:
# NOTE(review): relationship_df is only assigned in a commented-out cell, so
# this raises NameError on a fresh kernel.
word_article_mapping_relationship, len(relationship_df)

In [15]:
# Target mapping table and number of food rows about to be processed.
word_article_mapping_food, len(food_df)

('word_article_mapping_food', 9230)

In [17]:
def get_word_count(pd_df, table):
    """Segment every title and record each kept word in a mapping table.

    For each row of ``pd_df`` with a non-empty ``title_clean``, the title is
    segmented with ``segmentor.pseg_lcut``. For every word whose tag passes
    ``filter_by_tag``, an ``update`` statement appending the row's
    ``article_id`` to that word's list is sent to ``table`` through
    ``t_dao.execCQL``.

    :param pd_df: DataFrame with ``article_id`` and ``title_clean`` columns.
    :param table: name of the word -> article_id mapping table.
    :returns: list of every kept word, in processing order, with repeats.
    """
    def _cql_quote(value):
        # Words come from scraped titles, so double any single quote to keep
        # it inside the CQL string literal.
        return value.replace("'", "''")

    word_count = []
    for article_id, title in zip(pd_df['article_id'], pd_df['title_clean']):
        # Skip rows without a cleaned title.
        if not title:
            continue

        # Segment the title.
        words = segmentor.pseg_lcut(title)
        article_literal = _cql_quote(str(article_id))

        for word, tag in zip(words['word'], words['tag']):
            # Keep only words whose tag is accepted by filter_by_tag.
            if not filter_by_tag(tag):
                continue
            # Append this article to the word's entry in the mapping table.
            cql = ("update " + table + " " +
                   "set article_id = article_id + ['" + article_literal + "'] " +
                   "where word = '" + _cql_quote(word) + "';")
            t_dao.execCQL(DB_KEYSPACE, cql)
            word_count.append(word)
    return word_count


In [None]:
# NOTE(review): relationship_df is only assigned in a commented-out cell, so
# this cell fails on a fresh kernel until that load is restored.
word_count_relationship = get_word_count(relationship_df, word_article_mapping_relationship)
word_count_relationship

In [18]:
# Segment all food titles; this also sends one update per kept word to the
# mapping table on the test cluster.
word_count_food = get_word_count(food_df,  word_article_mapping_food)
# NOTE(review): this displays the entire list; consider showing a slice.
word_count_food

[u'\u559d',
 u'\u724c',
 u'\u6c7d\u6c34',
 u'\u53f0\u5357',
 u'\u6771\u5340',
 u'\u5e73\u50f9',
 u'\u751c\u9ede\u5e97',
 u'\u6d77\u7522',
 u'\u6709\u6bd2',
 u'\u53bb',
 u'\u5ec8\u9580',
 u'\u91d1\u9580',
 u'\u73a9',
 u'\u4eba',
 u'\u8d85\u4e0d',
 u'\u8ca0\u8cac',
 u'\u9bae\u8766',
 u'\u7fa9\u5927\u5229\u9eb5',
 u'\u98df\u8b5c',
 u'\u62b9\u91ac',
 u'\u5410\u53f8',
 u'\u51b7\u51cd',
 u'\u53bb\u904e',
 u'\u9593\u53f0',
 u'\u5496\u5561\u5ef3',
 u'\u4e2d\u897f\u5340',
 u'\u725b',
 u'\u903c',
 u'\u9928\u5b50',
 u'\u8868\u55ae',
 u'\u95dc\u9589',
 u'\u5927\u5b78\u751f',
 u'\u5496\u5561',
 u'\u5e02\u5834\u9700\u6c42',
 u'\u8abf\u67e5',
 u'\u696d\u5473',
 u'\u63a8',
 u'\u8336',
 u'\u5957\u9910',
 u'\u5403',
 u'\u5e03\u4e01',
 u'\u98df\u8a18',
 u'\u53f0\u4e2d',
 u'\u9e7d\u57d5',
 u'\u98df\u5708',
 u'\u4f6c',
 u'\u6389\u7259',
 u'\u9152\u5427',
 u'\u692a\u7cd6',
 u'\u9019\u662f',
 u'\u798f\u888b',
 u'\u5bbf\u820d',
 u'\u7c21\u55ae',
 u'\u6599\u7406',
 u'\u86e4\u870a',
 u'\u8766\u4ec1',
 u'\u9eb5\

In [19]:
# Dcard mood-board experiment
# word cloud from titles
# word cloud from responses
# word cloud from titles + responses
from collections import Counter
def counter(board, word_count, top):
    """Store the most frequent multi-character words of a board.

    Words in ``word_count`` are counted, words of length 1 are dropped, and
    the first ``top`` remaining entries (most frequent first) are inserted
    into the ``board_keyword`` table through ``t_dao.execCQL``. Each entry
    and its insert statement are printed as they are sent.

    :param board: board name stored in the ``board`` column.
    :param word_count: iterable of words, with repeats.
    :param top: maximum number of keywords to store.
    """
    def _cql_quote(value):
        # Double single quotes so the value stays inside its CQL literal.
        return value.replace("'", "''")

    ranked = [entry for entry in Counter(word_count).most_common()
              if len(entry[0]) > 1]

    for keyword, occurrences in ranked[:top]:
        print("----------------------")
        print(keyword + " : " + str(occurrences))
        cql = ("insert into " + board_keyword +
               "(board, keyword, count) values('" + _cql_quote(board) + "','" +
               _cql_quote(keyword) + "'," + str(occurrences) + ");")
        print(cql)
        t_dao.execCQL(DB_KEYSPACE, cql)

In [None]:
# NOTE(review): depends on word_count_relationship, whose producing cell
# needs relationship_df (only loaded in a commented-out cell).
counter(board="relationship", 
        word_count=word_count_relationship,
        top=50)

In [20]:
# Store the 300 most frequent multi-character food words as board keywords.
counter(board="food", 
        word_count=word_count_food,
        top=300)

----------------------
美食 : 554
insert into board_keyword(board, keyword, count) values('food','美食',554);
----------------------
推薦 : 438
insert into board_keyword(board, keyword, count) values('food','推薦',438);
----------------------
好吃 : 376
insert into board_keyword(board, keyword, count) values('food','好吃',376);
----------------------
台南 : 274
insert into board_keyword(board, keyword, count) values('food','台南',274);
----------------------
高雄 : 264
insert into board_keyword(board, keyword, count) values('food','高雄',264);
----------------------
台北 : 259
insert into board_keyword(board, keyword, count) values('food','台北',259);
----------------------
蛋糕 : 241
insert into board_keyword(board, keyword, count) values('food','蛋糕',241);
----------------------
餐廳 : 234
insert into board_keyword(board, keyword, count) values('food','餐廳',234);
----------------------
甜點 : 209
insert into board_keyword(board, keyword, count) values('food','甜點',209);
----------------------
全家 : 190
insert into bo

----------------------
咖哩 : 38
insert into board_keyword(board, keyword, count) values('food','咖哩',38);
----------------------
蜂蜜 : 38
insert into board_keyword(board, keyword, count) values('food','蜂蜜',38);
----------------------
食圈 : 38
insert into board_keyword(board, keyword, count) values('food','食圈',38);
----------------------
聚餐 : 38
insert into board_keyword(board, keyword, count) values('food','聚餐',38);
----------------------
燒烤 : 37
insert into board_keyword(board, keyword, count) values('food','燒烤',37);
----------------------
請益 : 37
insert into board_keyword(board, keyword, count) values('food','請益',37);
----------------------
健康 : 37
insert into board_keyword(board, keyword, count) values('food','健康',37);
----------------------
麻糬 : 36
insert into board_keyword(board, keyword, count) values('food','麻糬',36);
----------------------
鬆餅 : 36
insert into board_keyword(board, keyword, count) values('food','鬆餅',36);
----------------------
泡芙 : 36
insert into board_keyword(board, 

----------------------
雞蛋糕 : 22
insert into board_keyword(board, keyword, count) values('food','雞蛋糕',22);
----------------------
幸福 : 22
insert into board_keyword(board, keyword, count) values('food','幸福',22);
----------------------
板橋 : 22
insert into board_keyword(board, keyword, count) values('food','板橋',22);
----------------------
市場 : 22
insert into board_keyword(board, keyword, count) values('food','市場',22);
----------------------
試吃 : 22
insert into board_keyword(board, keyword, count) values('food','試吃',22);
----------------------
炒飯 : 22
insert into board_keyword(board, keyword, count) values('food','炒飯',22);
----------------------
甜點店 : 22
insert into board_keyword(board, keyword, count) values('food','甜點店',22);
----------------------
評價 : 22
insert into board_keyword(board, keyword, count) values('food','評價',22);
----------------------
珍珠奶茶 : 22
insert into board_keyword(board, keyword, count) values('food','珍珠奶茶',22);
----------------------
芒果 : 21
insert into board_keyword

----------------------
平台 : 14
insert into board_keyword(board, keyword, count) values('food','平台',14);
----------------------
麵線 : 14
insert into board_keyword(board, keyword, count) values('food','麵線',14);
----------------------
楠梓 : 14
insert into board_keyword(board, keyword, count) values('food','楠梓',14);
----------------------
超高 : 14
insert into board_keyword(board, keyword, count) values('food','超高',14);
----------------------
公館 : 14
insert into board_keyword(board, keyword, count) values('food','公館',14);
----------------------
自助餐 : 14
insert into board_keyword(board, keyword, count) values('food','自助餐',14);
----------------------
傳統 : 14
insert into board_keyword(board, keyword, count) values('food','傳統',14);
----------------------
豬排 : 14
insert into board_keyword(board, keyword, count) values('food','豬排',14);
----------------------
回歸 : 14
insert into board_keyword(board, keyword, count) values('food','回歸',14);
----------------------
幫忙 : 14
insert into board_keyword(board

In [None]:
# Wrap each label in single quotes and comma-join them
# (presumably for use inside a CQL list; confirm the intended use).
count_list = ["a","b","c","d","e"]
response_cql = ",".join("'" + count + "'" for count in count_list)
response_cql

In [None]:
# # Libraries
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# font_path = "/home/charles/YaHeiConsolas.ttf"
# # Create the wordcloud object
# wordcloud = WordCloud(width=640, height=640, margin=0, 
#                       font_path=font_path, colormap="Blues").generate(word_count_data[1:])
 
# # Display the generated image:
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.margins(x=0, y=0)
# plt.show()

In [None]:
# Toy frame: one (name, word) pair per row.
mapping_pd = pd.DataFrame({'name':['a','b','c','a','c','c'],
                           'word':['aa','bb','bb','aa','cc','cc']})

# One row per name, with that name's words comma-joined in row order.
df = mapping_pd.groupby('name')['word'].agg(','.join).reset_index()
df

In [None]:
# Lower-case a few sample tokens, then show them comma-joined.
aa = [token.lower() for token in ('*', 'B', 'C')]
','.join(aa)

In [None]:
# Toy (name, word) frame for the filter experiment in the next cell.
# NOTE(review): the same literal is built in an earlier cell.
mapping_pd = pd.DataFrame({'name':['a','b','c','a','c','c'],
                           'word':['aa','bb','bb','aa','cc','cc']})
mapping_pd
# word_match_pd = board_keyword_df[board_keyword_df["keyword"].str.match('|'.join(word_list))]

In [None]:
# Keep the rows whose word is one of the wanted words.
# `Series.str in word_list` tests the accessor object itself and yields a
# plain False, so the indexing failed; `isin` builds the per-row mask.
word_list = ["aa"]
mapping_pd[mapping_pd["word"].isin(word_list)]