In [1]:
# encoding=UTF-8
# !flask/bin/python

from cassandra.cluster import Cluster
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import BatchStatement
import pandas as pd
# from pyspark.sql.types import StringType, StructType, StructField


class CassandraType(object):
    PRODUCTION = 0
    TEST = 1
    TEST_DOCKER = 2


class CassandraDAO(object):

    # you have to install following items :
    # a. python-Cassandra driver
    # b. pyspark cassandra connector

    def __init__(self, type):
        #         print('runing father.__init__')
        if type == CassandraType.PRODUCTION:
            self.contact_points = ['192.168.95.127', '192.168.95.122']
            self.contact_points_str = "192.168.95.127,192.168.95.122"
        elif type == CassandraType.TEST:
            self.contact_points = ['192.168.0.41', '192.168.0.42']
            self.contact_points_str = "192.168.0.41,192.168.0.42"
        else:
            self.contact_points = ['192.168.0.121', '192.168.0.122', '192.168.0.52']
            self.contact_points_str = "192.168.0.121,192.168.0.122,192.168.0.52"

        self.formatString = "org.apache.spark.sql.cassandra"
        self.username = "username"
        self.password = "password"
        self.cluster = None
        self.session = None
        self.createSession()

    def __del__(self):
        self.cluster.shutdown()

    def pandas_factory(self, colnames, rows):
        return pd.DataFrame(rows, columns=colnames)

    def createSession(self):
        print "contact_points = " + self.contact_points_str
        self.cluster = Cluster(
            contact_points=self.contact_points,  # random select a node
            #             load_balancing_policy = DCAwareRoundRobinPolicy(local_dc='datacenter1'),
            #         auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
        )
        self.session = self.cluster.connect()
        self.session.row_factory = self.pandas_factory
        self.session.default_fetch_size = None
        # needed for large queries, otherwise driver will do pagination. Default is 50000.

    def getSession(self):
        return self.session

    def execCQL(self, keyspace, cql):
        """
        execute CQL
        """
        self.session.set_keyspace(keyspace)
        self.session.execute_async(cql)

    def execCQLSelect(self, keyspace, cql):
        """
        execute CQL, select only
        """

        self.session.set_keyspace(keyspace)

        #       cassandra ResultSet
        async_results = self.session.execute_async(cql)
        return async_results

    def execCQLCallBackAnysc(self, keyspace, cql, handle_success, handle_error):
        """
        execute CQL, if success => handle_success function, else handle_error
        """
        self.session.set_keyspace(keyspace)
        async_results = self.session.execute_async(cql)
        async_results.add_callbacks(handle_success, handle_error)

    def execCQLSelectToPandasDF(self, keyspace, cql):
        """
        execute CQL, select only, return Pandas DataFrame
        """

        self.session.set_keyspace(keyspace)

        #       cassandra ResultSet
#         async_results = self.session.execute_async(cql, timeout=None)
        results = self.session.execute(cql, timeout=None)
        #         async_results = self.session.execute_async(cql)
        #       to Pandas DataFrame
#         return async_results.result()._current_rows
        return results._current_rows

    def execCQLSelectToDF(self, sqlContext, keyspace, cql):
        """
        execute CQL, select only, return Spark DataFrame
        """

        #       pandas dataframe to spark dataframe
        pandas_dataframe = self.execCQLSelectToPandasDF(keyspace, cql)
        if pandas_dataframe.empty:
            schema = StructType([])
            return sqlContext.createDataFrame([], schema)
        else:
            return sqlContext.createDataFrame(pandas_dataframe)

    def execCQLSelectToRDD(self, sqlContext, keyspace, cql):
        """
        execute CQL, select only, return Spark RDD
        """

        return self.execCQLSelectToDF(sqlContext, keyspace, cql).rdd.map(tuple)  # dataFrame to RDD

    @property
    def contactPoints(self):
        return self.contact_points

    @contactPoints.setter
    def contactPoints(self, contact_points):
        self.contact_points = contact_points

    @contactPoints.deleter
    def contactPoints(self):
        del self.contact_points

#     # pyspark cassandra connector
#     def readFromCassandraDF(self, sqlContext, keyspace, table):
#         """
#         read data from Cassandra, return Dataframe
#         """

#         return sqlContext.read \
#             .format(self.formatString) \
#             .options(table=table, keyspace=keyspace) \
#             .option("spark.cassandra.connection.host", self.contact_points_str) \
#             .load()

#     def readFromCassandraRDD(self, sqlContext, keyspace, table):
#         """
#         read data from Cassandra, return RDD
#         """

#         df = sqlContext.read \
#             .format(self.formatString) \
#             .options(table=table, keyspace=keyspace) \
#             .option("spark.cassandra.connection.host", self.contact_points_str) \
#             .load()
#         return df.rdd.map(tuple)  # dataFrame to RDD

#     def saveToCassandraDF(self, dataFrame, keyspace, table, mode="error"):
#         """
#         Save data to Cassandra using DataFrame, select one mode to save
        
#         SaveMode.ErrorIfExists (default) | "error"      When saving a DataFrame to a data source,
#                                                         if data already exists, an exception is expected to be thrown.
#         SaveMode.Append                  | "append"     When saving a DataFrame to a data source,
#                                                         if data/table already exists, contents of the DataFrame are
#                                                         expected to be appended to existing data.
#         SaveMode.Overwrite               | "overwrite"  Overwrite mode means that when saving a DataFrame to a data source,
#                                                         if data/table already exists, existing data is expected to be
#                                                         overwritten by the contents of the DataFrame.
#         SaveMode.Ignore                  | "ignore"     Ignore mode means that when saving a DataFrame to a data source,
#                                                         if data already exists, the save operation is expected to not
#                                                         save the contents of the DataFrame and to not change the
#                                                         existing data. This is similar to a CREATE TABLE IF NOT EXISTS

#                                                         in SQL.
#         """

#         dataFrame.write \
#             .format(self.formatString) \
#             .mode(mode) \
#             .options(table=table, keyspace=keyspace) \
#             .option("spark.cassandra.connection.host", self.contact_points_str) \
#             .save()


In [2]:
dao = CassandraDAO('BACKUP')

contact_points = 192.168.0.121,192.168.0.122,192.168.0.52


In [3]:
import pandas as pd
import jieba
import jieba.posseg as pseg

class JiebaSegmentor:

    def __init__(self, dict_path, userdict=[], stopwords = False, stopwords_path=None):
        self.dict_path = dict_path
        self.userdict = userdict
        self.dictionary_init()
        self.stopwords_path = stopwords_path
        self.stopwords = stopwords
        self.stopwords_set = set()
        self.stopwords_init()

    def dictionary_init(self):
        jieba.set_dictionary(self.dict_path)
        for path in self.userdict:
            print path
            jieba.load_userdict(path)
    
    def stopwords_init(self):
        if self.stopwords_path:
            with open(self.stopwords_path ,'r') as stopwords:
                for stopword in stopwords:
                    self.stopwords_set.add(stopword.strip('\n').decode('utf-8'))

    def taiwan_country(self):
        return [u'臺北', u'台北', u'基隆', u'臺中', u'台中', u'臺南', u'台南', u'高雄',
                u'宜蘭', u'桃園', u'新竹', u'苗栗', u'彰化', u'南投', u'嘉義', u'雲林',
                u'屏東', u'臺東', u'台東', u'花蓮', u'澎湖']

    def wordToNumber(self, input_text):

        target = u''
        for s in input_text:

            if (s == u'零') or (s == '0'):
                to_word = u'0'
            elif (s == u'一') or (s == u'壹') or (s == '1'):
                to_word = u'1'
            elif (s == u'二') or (s == u'兩') or (s == u'貳') or (s == '2'):
                to_word = u'2'
            elif (s == u'三') or (s == u'參') or (s == '3'):
                to_word = u'3'
            elif (s == u'四') or (s == u'肆') or (s == '4'):
                to_word = u'4'
            elif (s == u'五') or (s == u'伍') or (s == '5'):
                to_word = u'5'
            elif (s == u'六') or (s == u'陸') or (s == '6'):
                to_word = u'6'
            elif (s == u'七') or (s == u'柒') or (s == '7'):
                to_word = u'7'
            elif (s == u'八') or (s == u'捌') or (s == '8'):
                to_word = u'8'
            elif (s == u'九') or (s == u'玖') or (s == '9'):
                to_word = u'9'
            else:
                to_word = s

        target = target + to_word
        return target

    def input_text_preprocessing(self, input_text):

        if type(input_text) is not unicode:
            input_text = input_text.decode('utf-8')

#         input_text = self.wordToNumber(input_text)
        return input_text

    def get_names(self, input_text):

        input_text = self.input_text_preprocessing(input_text)
        names = []
        words = pseg.cut(input_text)
        print words
        for w, f in words:
            if f.lower() == 'nr':
                names.append(w)
        for name in names:
            print name.encode('utf-8')
        return names

    def lcut(self, input_text):

        input_text = self.input_text_preprocessing(input_text)
        cut_raw = jieba.lcut(input_text)
        key = []

        for k in cut_raw:
            if self.stopwords:
                if k in self.stopwords_set:
                    continue
            
            key.append(k)
        df = pd.DataFrame({"word": key})
        return df

    def pseg_lcut(self, input_text):

        input_text = self.input_text_preprocessing(input_text)
        cut_raw = pseg.lcut(input_text)
        key = []
        value = []

        for k, v in cut_raw:
            tag = v
            if self.stopwords:
                if k in self.stopwords_set:
                    continue
            
            if k in self.taiwan_country():
                tag = u'ns'
            if len(k) > 1 and tag == u'x':
                tag = u'n'
            key.append(k)
            value.append(tag)
        df = pd.DataFrame({"word": key, "tag": value})
        return df

In [4]:
jieba_dict_path1 = "/home/charles/dataset/jieba/dict_taiwan.txt"
jieba_dict_path2 = "/home/charles/dataset/jieba/userdict.txt"
jieba_dict_path3 = "/home/charles/dataset/jieba/dict.txt.big"
jieba_dict_path4 = "/home/charles/dataset/jieba/dict.txt.small"
jieba_stopwords_path = "/home/charles/dataset/jieba/stopwords.txt"
segmentor = JiebaSegmentor(jieba_dict_path1, 
                           [jieba_dict_path2,jieba_dict_path3,jieba_dict_path4], 
                           stopwords = False, 
                           stopwords_path=jieba_stopwords_path)

Building prefix dict from /home/charles/dataset/jieba/dict_taiwan.txt ...
DEBUG:jieba:Building prefix dict from /home/charles/dataset/jieba/dict_taiwan.txt ...
Loading model from cache /tmp/jieba.u48306fa201322dcccc3d0c62898fbadc.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.u48306fa201322dcccc3d0c62898fbadc.cache
Loading model cost 0.389 seconds.
DEBUG:jieba:Loading model cost 0.389 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


/home/charles/dataset/jieba/userdict.txt
/home/charles/dataset/jieba/dict.txt.big
/home/charles/dataset/jieba/dict.txt.small


In [5]:
segmentor.stopwords=False
segmentor.pseg_lcut('或許今天阿伯的媽媽給我好大一包的紅包喔')

Unnamed: 0,tag,word
0,d,或許
1,t,今天
2,ns,阿伯
3,uj,的
4,n,媽媽
5,p,給
6,r,我
7,a,好
8,a,大
9,m,一包


In [6]:
segmentor.stopwords=True
segmentor.pseg_lcut('或許今天阿伯的媽媽給我好大一包的紅包喔')

Unnamed: 0,tag,word
0,t,今天
1,ns,阿伯
2,n,媽媽
3,a,好
4,m,一包
5,n,紅包
6,e,喔


In [152]:
HELPER_KEYSPACE = 'nlp_keyspace'
#select range
cql = ("select * from nlp_keyspace.dcard_article " +
        "where article_id > 0 and article_id < 5000 allow filtering;")
cql = ("select * from nlp_keyspace.dcard_article where article_id > 205000 and " +
       "article_id < 210000 allow filtering;")
pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
print len(pd_df)
pd_df

341


Unnamed: 0,article_id,board,category,content,crawl_date,create_date,question_type,title
0,207159,mood,,前情提要(文超長)小時候的體罰影響真的很深最近因為考試所以壓力有點兒大rrrr\n想到了一些...,2019-02-23 03:20:57.730,2015-05-18T16:25:35.494Z,,Re: (文超長)小時候的體罰影響真的很深
1,206571,talk,,下個月似乎就要送舊了某收錢人員密我某:你要去送舊嗎?我:沒有耶，我有事某:那你要繳200喔!...,2019-02-18 06:07:54.808,2015-05-18T13:26:04.090Z,,林北就是不爽繳拉!
2,205492,mood,,期待好久的中央中文轉學考竟然不開大二的缺，覺得無敵傷心。要是大三才轉，學業一定會很嚴重地落後...,2019-02-23 03:20:40.203,2015-05-18T05:21:23.087Z,,被想轉的學系放鴿子
3,209409,talk,,很多肥宅都滿喜歡AKB48的，可是有正妹喜歡AKB48嗎？有沒有八卦？,2019-02-18 06:09:39.203,2015-05-19T11:18:08.141Z,,有正妹喜歡AKB48嗎？
4,207108,mood,,最近因為考試所以壓力有點兒大rrrr想到了一些以前的事情原PO家庭蠻嚴格的我一直記得我剛上小...,2019-02-23 03:19:54.895,2015-05-18T16:12:45.280Z,,(文超長)小時候的體罰影響真的很深
5,205870,mood,,2015/05/17之前，我們沒有見過半次面，或許你連我的聲音都沒有聽過，我們只在RC上交流...,2019-02-23 03:20:40.181,2015-05-18T08:06:22.356Z,,給101故事男孩，如果花知道
6,208405,mood,,班上有一個我不喜歡的人因為他跟我們這一群是朋友所以我跟他也只好表面上是朋友，其實心裡一直不喜...,2019-02-23 03:20:03.106,2015-05-19T04:23:02.951Z,,討厭的人的請求？
7,206930,mood,,今天真是彼母之不順早上七早八早起來載我弟去上課之後準備載我每天通勤的同學回來臺北上課突然發現...,2019-02-23 03:19:45.330,2015-05-18T15:29:28.457Z,,肥宅受難日
8,207149,talk,,如果有崩潰板我應該會果斷發在那.........本PO的字扭來扭去 擠在一起 像是小學生是...,2019-02-18 06:09:38.882,2015-05-18T16:23:32.544Z,,[求救] 我的字龍飛鳳舞 只是飛歪舞殘了
9,206547,talk,,"想問大家對於做一份報告願意付出每人不超過500元的金錢嗎？例如主題是蘋果, 願意買不同品種的...",2019-02-18 06:07:54.657,2015-05-18T13:17:10.899Z,,願意為了報告額外付出金錢？


In [31]:
article_raw_pd_list = []

In [32]:
target = 100000000
step = 100000
start = 0
HELPER_KEYSPACE = 'nlp_keyspace'

In [33]:
for x in range(start,target,step):
#     print str(x) + " ~ " + str(x+step)

    try:
        #select range
        cql = ("select article_id, question from dcard_query_table " +
                "where article_id > " + str(x) + " and article_id <= " + str(x+step) +
                " group by article_id allow filtering;")

#         cql = ("select * from nlp_keyspace.dcard_article " +
#                 "where article_id > " + str(x) + " and article_id <= " + str(x+step) + " allow filtering;")
        print cql
    #     cql = ("select * from nlp_keyspace.dcard_article_test;")
        pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
        article_raw_pd_list.append(pd_df)
    except:
        print 'break'
        cql = ("select article_id, question from dcard_query_table " +
                "where article_id > " + str(x) + " and article_id <= " + str(x+step) +
                " group by article_id allow filtering;")
        print cql
    #     cql = ("select * from nlp_keyspace.dcard_article_test;")
        pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
        article_raw_pd_list.append(pd_df)
        continue
    

select article_id, question from dcard_query_table where article_id > 0 and article_id <= 100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 100000 and article_id <= 200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 200000 and article_id <= 300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 300000 and article_id <= 400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 400000 and article_id <= 500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 500000 and article_id <= 600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 600000 and article_id <= 700000 group by article_id allow filtering;
select article_id, question from

select article_id, question from dcard_query_table where article_id > 5700000 and article_id <= 5800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 5800000 and article_id <= 5900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 5900000 and article_id <= 6000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 6000000 and article_id <= 6100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 6100000 and article_id <= 6200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 6200000 and article_id <= 6300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 6300000 and article_id <= 6400000 group by article_id allow filtering;
select articl

select article_id, question from dcard_query_table where article_id > 11400000 and article_id <= 11500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 11500000 and article_id <= 11600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 11600000 and article_id <= 11700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 11700000 and article_id <= 11800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 11800000 and article_id <= 11900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 11900000 and article_id <= 12000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 12000000 and article_id <= 12100000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 17100000 and article_id <= 17200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 17200000 and article_id <= 17300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 17300000 and article_id <= 17400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 17400000 and article_id <= 17500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 17500000 and article_id <= 17600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 17600000 and article_id <= 17700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 17700000 and article_id <= 17800000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 22800000 and article_id <= 22900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 22900000 and article_id <= 23000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 23000000 and article_id <= 23100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 23100000 and article_id <= 23200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 23200000 and article_id <= 23300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 23300000 and article_id <= 23400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 23400000 and article_id <= 23500000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 28500000 and article_id <= 28600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 28600000 and article_id <= 28700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 28700000 and article_id <= 28800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 28800000 and article_id <= 28900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 28900000 and article_id <= 29000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 29000000 and article_id <= 29100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 29100000 and article_id <= 29200000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 34200000 and article_id <= 34300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 34300000 and article_id <= 34400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 34400000 and article_id <= 34500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 34500000 and article_id <= 34600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 34600000 and article_id <= 34700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 34700000 and article_id <= 34800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 34800000 and article_id <= 34900000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 39900000 and article_id <= 40000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 40000000 and article_id <= 40100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 40100000 and article_id <= 40200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 40200000 and article_id <= 40300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 40300000 and article_id <= 40400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 40400000 and article_id <= 40500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 40500000 and article_id <= 40600000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 45600000 and article_id <= 45700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 45700000 and article_id <= 45800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 45800000 and article_id <= 45900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 45900000 and article_id <= 46000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 46000000 and article_id <= 46100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 46100000 and article_id <= 46200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 46200000 and article_id <= 46300000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 51300000 and article_id <= 51400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 51400000 and article_id <= 51500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 51500000 and article_id <= 51600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 51600000 and article_id <= 51700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 51700000 and article_id <= 51800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 51800000 and article_id <= 51900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 51900000 and article_id <= 52000000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 57000000 and article_id <= 57100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 57100000 and article_id <= 57200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 57200000 and article_id <= 57300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 57300000 and article_id <= 57400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 57400000 and article_id <= 57500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 57500000 and article_id <= 57600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 57600000 and article_id <= 57700000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 62700000 and article_id <= 62800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 62800000 and article_id <= 62900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 62900000 and article_id <= 63000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 63000000 and article_id <= 63100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 63100000 and article_id <= 63200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 63200000 and article_id <= 63300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 63300000 and article_id <= 63400000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 68400000 and article_id <= 68500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 68500000 and article_id <= 68600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 68600000 and article_id <= 68700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 68700000 and article_id <= 68800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 68800000 and article_id <= 68900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 68900000 and article_id <= 69000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 69000000 and article_id <= 69100000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 74100000 and article_id <= 74200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 74200000 and article_id <= 74300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 74300000 and article_id <= 74400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 74400000 and article_id <= 74500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 74500000 and article_id <= 74600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 74600000 and article_id <= 74700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 74700000 and article_id <= 74800000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 79800000 and article_id <= 79900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 79900000 and article_id <= 80000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 80000000 and article_id <= 80100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 80100000 and article_id <= 80200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 80200000 and article_id <= 80300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 80300000 and article_id <= 80400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 80400000 and article_id <= 80500000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 85400000 and article_id <= 85500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 85500000 and article_id <= 85600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 85600000 and article_id <= 85700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 85700000 and article_id <= 85800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 85800000 and article_id <= 85900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 85900000 and article_id <= 86000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 86000000 and article_id <= 86100000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 90900000 and article_id <= 91000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 91000000 and article_id <= 91100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 91100000 and article_id <= 91200000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 91200000 and article_id <= 91300000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 91300000 and article_id <= 91400000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 91400000 and article_id <= 91500000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 91500000 and article_id <= 91600000 group by article_id allow filtering;

select article_id, question from dcard_query_table where article_id > 96500000 and article_id <= 96600000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 96600000 and article_id <= 96700000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 96700000 and article_id <= 96800000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 96800000 and article_id <= 96900000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 96900000 and article_id <= 97000000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 97000000 and article_id <= 97100000 group by article_id allow filtering;
select article_id, question from dcard_query_table where article_id > 97100000 and article_id <= 97200000 group by article_id allow filtering;

In [22]:
# cql=("select * from nlp_keyspace.dcard_query_table where article_id >200000000 and article_id < 200200000 group by article_id allow filtering;")
# pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
# article_raw_pd_list.append(pd_df)

In [34]:
# del pd_df
print len(article_raw_pd_list)
query_df=pd.DataFrame(columns=['article_id','question'])

query_df=pd.concat(article_raw_pd_list).reset_index(drop=True)

1000


In [35]:
print len(query_df)
query_df

111038


Unnamed: 0,article_id,question
0,76367,自以為的以和為貴
1,13946,魯蛇成功記
2,83078,中華電信的iPhone6....
3,10305,剛剛考資格考第一科
4,46779,我想，這是一個里程碑
5,42565,最無奈的錯過
6,56309,有沒有感情過很坎坷的同路人
7,21766,想去旅行的城市
8,23962,既然遇見為何還是擦身而過
9,29720,愛情總是悄悄地出現 -5


In [8]:
# from pyspark.sql.functions import udf
# def update_column(x):
# #     check = QuestionTypeCheck(segmentor)
# #     check.source = 'ptt'
# #     label = check.check_question_type(x)
#     input_text = input_text.replace(u'Re: ', u'')
#     input_text = re.sub(u"\\[.*?]", u"", input_text)
#     segmentor = JiebaSegmentor(jieba_dict_path1, [jieba_dict_path2,jieba_dict_path3,jieba_dict_path4])
#     segmentor.pseg_lcut('test')
#     return 'label'

# update_column_udf = udf(update_column)
# new_df = df.withColumn('question_type',
#                     update_column_udf(df['question_type']))

In [9]:
# new_df = df.limit(100)

In [10]:
# dao.saveToCassandraDF(new_df, "nlp_keyspace", "dcard_article_test", "append")

In [36]:
def filter_by_tag(tag):
    v_tag = ['v', 'vd', 'vg', 'vi', 'vn', 'vq', 'vt']
    n_tag = ['n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt','nz']
    a_tag = ['a', 'ad', 'ag', 'an']
#         print tag['word'][0] + '(' + tag['tag'][0] + ')'
    if tag in v_tag or tag in n_tag or tag in a_tag:
        return True
    return False

def filter_by_tag_udf(row):
    v_tag = ['v', 'vd', 'vg', 'vi', 'vn', 'vq', 'vt']
    n_tag = ['n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt','nz']
    a_tag = ['a', 'ad', 'ag', 'an']
#         print tag['word'][0] + '(' + tag['tag'][0] + ')'
    if row['tag'] in v_tag or row['tag'] in n_tag or row['tag'] in a_tag:
        return True
    return False

In [38]:
WORD_ARTICLE_MAPPING_TABLE = 'word_article_mapping'

for index, row in query_df.iterrows():
    
    # question斷詞
    if not row['question']:
        continue
    
#     print row['question']
    words = segmentor.pseg_lcut(row['question'])
#     print words
    article = row['article_id']
    for index, row in words.iterrows():
#         print row['word']
        
        # 只存名詞 and 動詞
        if filter_by_tag(row['tag']):
        # 每個詞更新到 mapping table
            cql = ("update " + WORD_ARTICLE_MAPPING_TABLE + " " +
                   "set article = article + [" + str(article) + "] " +
                   "where word = '" + row['word'] + "';")
#             print cql
#             dao.execCQL(HELPER_KEYSPACE, cql)


In [12]:
ppp

NameError: name 'ppp' is not defined

# query test

In [39]:
segmentor.stopwords=True
segmentor.pseg_lcut('肯德基新產品完勝麥當勞')

Unnamed: 0,tag,word
0,nr,肯德基
1,a,新
2,n,產品
3,v,完勝
4,nrt,麥當勞


In [40]:
from gensim.corpora import *
from gensim.models import TfidfModel
from datetime import datetime
from gensim import models
tfidf_model_path = "/home/charles/dataset/tfidf_model/v2"
tfidf_model = TfidfModel.load(tfidf_model_path+"/tfidf.model")
dictionary = Dictionary.load(tfidf_model_path+"/tfidf_corpus_dict")

In [41]:
stopword_set = set()
with open(jieba_stopwords_path,'r') as stopwords:
    for stopword in stopwords:
        stopword_set.add(stopword.strip('\n').decode('utf-8'))

In [42]:
def find_keyword(text):
# #     test_cut_raw_0 = jieba.lcut(text)
#     test_cut_raw_0 =pseg.lcut(text)
#     text_subject_dict={}
#     for w,n in test_cut_raw_0:
#         print w + ' (' + n + ')'
#         text_subject_dict.update({w:n})
    
    if type(text) is not unicode:
        text = text.decode('utf-8')
        
    segmentor.stopwords=True

    text_subject_dict={}
    words_pd = segmentor.pseg_lcut(text)
    for index, row in words_pd.iterrows():
        text_subject_dict.update({row['word']:row['tag']})
        
    
    # jieba分词语料集 去除停用词
#     corpus0 = []
#     # load stopwords set
#     stopword_set = set()
#     with open(jieba_stopwords_path,'r') as stopwords:
#         for stopword in stopwords:
#             stopword_set.add(stopword.strip('\n').decode('utf8'))
#     for index, row in words_pd.iterrows():
#         if row['word'] not in stopword_set:
#             corpus0.append(row['word'])
    
    corpus0 = list(words_pd['word'])
    
    test_corpus_0 = dictionary.doc2bow(corpus0)
    test_corpus_tfidf_0 = tfidf_model[test_corpus_0]
    test_corpus_tfidf_0 = sorted(test_corpus_tfidf_0, key=lambda item: item[1], reverse=True)
    id2token = dict(zip(dictionary.token2id.values(), dictionary.token2id.keys()))
    result = []
    
    keyword_dict={}
    keyword_frames_list=[]
    word_list=[]
    tag_list=[]
    tfidf_list=[]
    for i in range(len(test_corpus_tfidf_0)):
        result.append({id2token[test_corpus_tfidf_0[i][0]]: test_corpus_tfidf_0[i][1]})
        print "--------------------------------------"
        word = id2token[test_corpus_tfidf_0[i][0]]
        tfidf = test_corpus_tfidf_0[i][1]
        print 'keyword: ' + word + '(' + text_subject_dict[word] + ') , tfidf: ' + str(tfidf)
        word_list.append(word)
        tag_list.append(text_subject_dict[word])
        tfidf_list.append(tfidf)
    
    keyword_pd=pd.DataFrame({"word":word_list,
                             "tag":tag_list,
                             "tfidf":tfidf_list})
    return keyword_pd
#         try:
#             w2v_predict(w2v_model, word)
#         except KeyError as er:
#             print 'no synonyms'
    

In [50]:
# TFIDF 字不會重複出現
keyword_pd = find_keyword('柯文哲2020選總統')
keyword_pd

--------------------------------------
keyword: 柯文哲(n) , tfidf: 0.7892749298187913
--------------------------------------
keyword: 選(v) , tfidf: 0.48881048728300514
--------------------------------------
keyword: 總統(n) , tfidf: 0.37163072085296306


Unnamed: 0,tag,tfidf,word
0,n,0.789275,柯文哲
1,v,0.48881,選
2,n,0.371631,總統


In [51]:
# 查對應表
mapping_dict={}
mapping_frames_list=[]

# 只存名詞 and 動詞
keyword_pd['filter_by_tag'] = keyword_pd.apply(filter_by_tag_udf, axis=1)
keyword_pd = keyword_pd[keyword_pd['filter_by_tag'] == True]
print keyword_pd
top_n = 3
for index, row in keyword_pd.head(top_n).iterrows():
    
    cql = ("select * from nlp_keyspace.word_article_mapping where word='" + row['word'] + "';")
    print cql
    pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
    mapping_frames_list.append(pd_df)

if len(mapping_frames_list) > 0:
    mapping_pd=pd.DataFrame(columns=['row','article'])
    mapping_pd=pd.concat(mapping_frames_list).reset_index(drop=True)
mapping_pd

  tag     tfidf word filter_by_tag
0   n  0.789275  柯文哲          True
1   v  0.488810    選          True
2   n  0.371631   總統          True
select * from nlp_keyspace.word_article_mapping where word='柯文哲';
select * from nlp_keyspace.word_article_mapping where word='選';
select * from nlp_keyspace.word_article_mapping where word='總統';


Unnamed: 0,word,article
0,柯文哲,"[62729, 41495, 44917, 22088, 40245, 40309, 696..."
1,選,"[17042, 33765, 15662, 15808, 37238, 49332, 441..."
2,總統,"[96854, 16633, 6196, 34614, 69325, 11161, 6426..."


In [52]:
# 統計哪個字出現最多次
article_list=[]
for index, row in mapping_pd.iterrows():
    #單個字去除重複文章id
    for item in list(set(row['article'])):
        article_list.append(item)
#         article_list.append(list(set(item)))
#         print type(item)
        
print str(len(article_list))
# article_list

466


In [53]:
from collections import Counter
result = Counter(article_list)
count_pd = pd.DataFrame(dict(result).items(), columns=['article_id', 'count'])
count_pd = count_pd.sort_values(by=['count'], ascending=False)
print len(count_pd)
count_pd

459


Unnamed: 0,article_id,count
163,1377629,2
396,882547,2
90,717009,2
427,814411,2
137,854316,2
183,706415,2
243,39434,2
0,462848,1
306,938650,1
307,889499,1


In [54]:
# 至少要出現兩個字
top_pd = count_pd[count_pd['count'] > 1]
top_pd
# if len(top_pd) == 0:
#     top_pd = count_pd[count_pd['count'] == 1]

Unnamed: 0,article_id,count
163,1377629,2
396,882547,2
90,717009,2
427,814411,2
137,854316,2
183,706415,2
243,39434,2


In [55]:
# 依照次數分類查詢

count_list = sorted(list(set(top_pd['count'])), reverse=True)
count_list

[2]

In [56]:
response_frames_list = []
for count in count_list:
    print count
    ans_pd = top_pd[top_pd['count'] == count]
    for index, row in ans_pd.iterrows():
    
#         cql = ("select * from nlp_keyspace.dcard_response where article_id=" +
#                str(row['article_id']) + ";")
        
        cql = ("select * from nlp_keyspace.dcard_query_table where article_id=" +
               str(row['article_id']) + ";")
        print cql
        pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
        response_frames_list.append(pd_df)
    
    if len(response_frames_list) > 0:
        answer_pd=pd.concat(response_frames_list).reset_index(drop=True)
        if len(answer_pd) > 10:
            break
        
answer_pd

2
select * from nlp_keyspace.dcard_query_table where article_id=1377629;
select * from nlp_keyspace.dcard_query_table where article_id=882547;
select * from nlp_keyspace.dcard_query_table where article_id=717009;
select * from nlp_keyspace.dcard_query_table where article_id=814411;
select * from nlp_keyspace.dcard_query_table where article_id=854316;
select * from nlp_keyspace.dcard_query_table where article_id=706415;
select * from nlp_keyspace.dcard_query_table where article_id=39434;


Unnamed: 0,article_id,floor,board,like_count,question,response
0,1377629,1,trending,4,柯文哲開微博圖總統大位？ 市府澄清：增加善意,名嘴先去醒醒腦…
1,1377629,2,trending,3,柯文哲開微博圖總統大位？ 市府澄清：增加善意,建議多方攝取資訊\n一體兩面的事情很多\n每個人解讀方式也不同
2,1377629,3,trending,3,柯文哲開微博圖總統大位？ 市府澄清：增加善意,無期限支持柯文哲選中華民國總統
3,1377629,4,trending,0,柯文哲開微博圖總統大位？ 市府澄清：增加善意,臺灣名嘴的話原po敢信？？
4,1377629,5,trending,0,柯文哲開微博圖總統大位？ 市府澄清：增加善意,現在覺得他有時候的反應太矯情
5,1377629,6,trending,0,柯文哲開微博圖總統大位？ 市府澄清：增加善意,開微博反攻大陸
6,1377629,7,trending,3,柯文哲開微博圖總統大位？ 市府澄清：增加善意,如果是馬英九去開設微博帳號不知道會被罵成多慘\n\n說不定還有人要用叛亂罪起訴他
7,1377629,8,trending,0,柯文哲開微博圖總統大位？ 市府澄清：增加善意,如果柯p用推特是不是就要選美國總統了ㄎㄎ
8,882547,1,trending,0,測測自己的政見意向，你會選哪一位總統呢？,希望也有政黨票的xd
9,882547,2,trending,0,測測自己的政見意向，你會選哪一位總統呢？,還蠻有趣的


In [57]:
# response_dict={}
# response_frames_list=[]
# top_count = 0
# count_list = list(top_pd['count'])
# for index, row in top_pd.iterrows():
    
#     cql = ("select * from nlp_keyspace.dcard_response where article_id=" + str(row['article_id']) + ";")
#     print cql
#     pd_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
    
    
#     response_frames_list.append(pd_df)
# #     if len(pd_df) > 4:
# #         break

# # response_pd=pd.DataFrame(columns=['row','article'])
# answer_pd = pd.DataFrame({})
# if len(response_frames_list) > 0:
#     answer_pd=pd.concat(response_frames_list).reset_index(drop=True)

# 透過question_type查wiki,google search, google map

In [58]:
# for index, row in top_pd.iterrows():
#     cql = ("select * from nlp_keyspace.dcard_article_test where article_id=" + str(row['article_id']) + ";")
#     print cql
#     article_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
#     print article_df['question'][0]

In [59]:
if len(answer_pd) == 0:
    print 'AQ還在學習中......'

# 過濾答案

In [60]:
# import re
# def filter_url(row):
    
#     pattern = re.compile('http://\S+|https://\S+')
#     match = pattern.search(row['content'])
#     if match:
#         return True
#     return False

# # 過濾url的答案
# mask = (answer_pd['has_url'] == False)
# answer_pd = answer_pd.loc[mask]

In [61]:
if len(answer_pd):
    answer_pd = answer_pd.sort_values(by=['like_count'], ascending=False)
#     print answer_pd
answer_pd

Unnamed: 0,article_id,floor,board,like_count,question,response
36,814411,4,trending,52,柯文哲:台灣醫療沒人敢改 除非我當總統,我覺得柯p是個很誠實的人\n他在想什麼都很明確的跟大家說\n總比很多人心照不宣好
35,814411,1,trending,43,柯文哲:台灣醫療沒人敢改 除非我當總統,卡等2020柯P選總統
38,814411,6,trending,10,柯文哲:台灣醫療沒人敢改 除非我當總統,感覺他至少比小英還有老朱強
22,717009,5,trending,8,總統該選誰？,你讓我突然覺得自己只會看人就投票很膚淺\n我有空也來研究政策好了\n\n星期一天氣晴
37,814411,5,trending,8,柯文哲:台灣醫療沒人敢改 除非我當總統,拜託別再出來亂了\n先把你跳票的政見搞好好嗎
50,854316,1,trending,8,總統到底可以選誰啊,你之前有去幫其他四組獨立候選人連署嗎？
69,706415,4,trending,7,「世界台灣皇帝」藍信祺選總統　政見：每人發100億,第一個說要公開建國的候選人耶\n台獨的不支持一下嗎
80,39434,4,trending,6,連勝文 vs 柯文哲 - 你要選誰? 為什麼? 最後30天民調,柯P的當選絕對是台灣政治轉型的第一步，只是希望他未來在市議會還能扛得住XD
78,39434,1,trending,6,連勝文 vs 柯文哲 - 你要選誰? 為什麼? 最後30天民調,2:45有點奇葩
41,814411,9,trending,6,柯文哲:台灣醫療沒人敢改 除非我當總統,柯批選總統 \n我絕對從美國飛回來投票


In [62]:
for i,r in answer_pd.reset_index().iterrows():
    print str(i) + '. ' + r['content']

KeyError: 'content'


# 沒找到就回答罐頭訊息

In [None]:
ppp

# 測試

In [None]:
cql = ("select * from nlp_keyspace.word_article_mapping;")
map_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)

In [None]:
len(map_df)

In [None]:
word_df = map_df[map_df['word'].str.len() == 1]
print len(word_df)
word_df