In [1]:
# encoding=UTF-8
# !flask/bin/python

import re

import pandas as pd
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.query import BatchStatement


class CassandraType(object):
    """Integer identifiers for the Cassandra environments a DAO can target."""

    # Consecutive values starting at zero: PRODUCTION=0, TEST=1, TEST_DOCKER=2.
    PRODUCTION, TEST, TEST_DOCKER = range(3)


class CassandraDAO(object):
    """Small data-access helper around a Cassandra driver session.

    On construction it selects a list of contact points for the requested
    environment, connects to the cluster, and configures the session so that
    query results are materialised as pandas DataFrames.

    Requirements:
      a. the Python Cassandra driver
      b. the pyspark Cassandra connector
    """

    def __init__(self, type):
        """Choose contact points for ``type`` and open a session.

        :param type: one of the ``CassandraType`` constants. Any value other
            than ``PRODUCTION`` or ``TEST`` (including ``TEST_DOCKER``) selects
            the third host list below.
        """
        # NOTE: the parameter name shadows the ``type`` builtin; it is kept
        # because callers may pass it by keyword.
        if type == CassandraType.PRODUCTION:
            self.contact_points = ['192.168.95.127', '192.168.95.122']
        elif type == CassandraType.TEST:
            self.contact_points = ['192.168.0.41', '192.168.0.42']
        else:
            self.contact_points = ['192.168.0.121', '192.168.0.122', '192.168.0.52']
        # Comma-separated form of the same hosts (used for logging below).
        self.contact_points_str = ",".join(self.contact_points)

        self.formatString = "org.apache.spark.sql.cassandra"
        # Placeholder credentials; createSession() below does not pass them
        # to the driver.
        self.username = "username"
        self.password = "password"
        self.cluster = None
        self.session = None
        self.createSession()

    def __del__(self):
        """Shut the cluster connection down, if one was ever created."""
        # ``cluster`` is None (or missing) when connecting failed part-way
        # through construction; calling shutdown() on it would raise.
        cluster = getattr(self, 'cluster', None)
        if cluster is not None:
            cluster.shutdown()

    def pandas_factory(self, colnames, rows):
        """Row factory that turns a page of rows into a pandas DataFrame.

        :param colnames: column names of the result.
        :param rows: row values of the result.
        :return: ``pd.DataFrame`` with ``colnames`` as its columns.
        """
        return pd.DataFrame(rows, columns=colnames)

    def createSession(self):
        """Connect to the cluster and configure the session for DataFrame output."""
        # Single-argument call form prints identically on Python 2 and 3.
        print("contact_points = " + self.contact_points_str)
        # Only the contact points are passed; no load-balancing policy or
        # auth provider is configured here.
        self.cluster = Cluster(
            contact_points=self.contact_points,
        )
        self.session = self.cluster.connect()
        self.session.row_factory = self.pandas_factory
        # Needed for large queries, otherwise the driver paginates the result.
        # The driver's default fetch size is 5000.
        self.session.default_fetch_size = 10000000

    def getSession(self):
        """Return the underlying driver session."""
        return self.session

    def execCQL(self, keyspace, cql):
        """Execute a CQL statement asynchronously in ``keyspace``.

        :param keyspace: keyspace to switch the session to before executing.
        :param cql: CQL statement to run.
        :return: the future returned by ``session.execute_async`` so callers
            can wait on it or inspect errors. Callers that ignore the return
            value behave as before.
        """
        self.session.set_keyspace(keyspace)
        return self.session.execute_async(cql)

    def execCQLSelect(self, keyspace, cql):
        """Execute a CQL select asynchronously in ``keyspace``.

        :param keyspace: keyspace to switch the session to before executing.
        :param cql: CQL select statement.
        :return: the future returned by ``session.execute_async``.
        """
        self.session.set_keyspace(keyspace)
        return self.session.execute_async(cql)

    def execCQLCallBackAnysc(self, keyspace, cql, handle_success, handle_error):
        """Execute CQL asynchronously and register result callbacks.

        :param keyspace: keyspace to switch the session to before executing.
        :param cql: CQL statement to run.
        :param handle_success: callback registered for a successful result.
        :param handle_error: callback registered for a failure.
        """
        self.session.set_keyspace(keyspace)
        future = self.session.execute_async(cql)
        future.add_callbacks(handle_success, handle_error)

    def execCQLSelectToPandasDF(self, keyspace, cql):
        """Execute a CQL select and return its rows as a pandas DataFrame.

        :param keyspace: keyspace to switch the session to before executing.
        :param cql: CQL select statement.
        :return: the current page of the result; with ``pandas_factory``
            installed as the row factory this is expected to be a DataFrame.
        """
        self.session.set_keyspace(keyspace)
        future = self.session.execute_async(cql)
        # NOTE(review): ``_current_rows`` is a private attribute of the
        # driver's result object and only holds the current page; this relies
        # on the very large default_fetch_size set in createSession().
        return future.result()._current_rows

    def execCQLSelectToRDD(self, sqlContext, keyspace, cql):
        """Execute a CQL select and return the rows as a Spark RDD of tuples.

        :param sqlContext: Spark SQL context forwarded to ``execCQLSelectToDF``.
        :param keyspace: keyspace forwarded to ``execCQLSelectToDF``.
        :param cql: CQL select statement forwarded to ``execCQLSelectToDF``.
        """
        # NOTE(review): ``execCQLSelectToDF`` is not defined in this class;
        # unless a subclass provides it this call raises AttributeError.
        spark_df = self.execCQLSelectToDF(sqlContext, keyspace, cql)
        return spark_df.rdd.map(tuple)  # DataFrame to RDD

    @property
    def contactPoints(self):
        """List of contact point hosts."""
        return self.contact_points

    @contactPoints.setter
    def contactPoints(self, contact_points):
        self.contact_points = contact_points

    @contactPoints.deleter
    def contactPoints(self):
        del self.contact_points


# Dcard articles: cleaning the question titles

In [3]:
# Connect to the cluster used for the Dcard data. The original argument,
# the string 'BACKUP', is not a CassandraType member and only worked because
# CassandraDAO falls through to its last host list for unknown values;
# TEST_DOCKER selects that same host list explicitly.
dao = CassandraDAO(CassandraType.TEST_DOCKER)
HELPER_KEYSPACE = 'nlp_keyspace'
DCARD_ARTICLE_TABLE = 'dcard_article'
DCARD_ARTICLE_TEST_TABLE = 'dcard_article_test'

contact_points = 192.168.0.121,192.168.0.122,192.168.0.52


In [4]:
# Load every row of the article test table into a pandas DataFrame.
# To restrict the load to an id range instead, a query of this form was used:
#   select * from nlp_keyspace.dcard_article_test
#   where article_id > 225705809 and article_id < 225800010 allow filtering;
cql = 'select * from ' + DCARD_ARTICLE_TEST_TABLE + ';'
pd_df = dao.execCQLSelectToPandasDF(keyspace=HELPER_KEYSPACE, cql=cql)
pd_df

Unnamed: 0,article_id,board,category,content,crawl_date,create_date,question,question_target,question_type,title
0,224297810,job,,想請問在健身房的工作內容有什麼呢需要做什麼 還是要會什麼技能櫃檯/教練,2019-02-15 07:49:24.035,2016-07-03T16:51:05.200Z,健身房工作經驗,,other,健身房工作經驗（問）
1,225961291,talk,,首次發文，手機排版，請見諒（跟朋友借的帳號）首先，刺青是一門藝術並不是每個刺青的人都是屁孩 ...,2019-02-19 08:25:10.219,2017-03-11T09:36:15.788Z,刺青的意義,,other,#圖 刺青的意義
2,225196435,food,,,2019-02-13 01:39:36.743,2016-11-15T03:16:45.628Z,蜂蜜真假,,other,蜂蜜真假
3,224746504,talk,,各種淡紫控啊~覺得淡紫超美的～～😘還在繼續搜集中～(燈光比較不好，淡紫色沒有很明顯…😭)－愛...,2019-02-19 06:09:49.412,2016-09-12T14:33:05.434Z,愛各種淡紫色,,other,愛各種淡紫色
4,230071785,trending,,剛剛看到謝和炫出的這首歌，聽完簡直感動的說不出話來....台北年輕人這次一定要出來投票衝高投...,2019-02-15 02:46:30.961,2018-11-15T19:22:07.846Z,台灣的未來交給柯文哲！,,other,台灣的未來交給柯文哲！
5,226029225,food,,不知道有沒有人記得前幾天的單身狗廚房肉篇XD這次小廚跟朋友一行人去南投的某茶園露營(不透漏確...,2019-02-13 03:28:19.805,2017-03-22T04:04:36.302Z,單身狗廚房x野外露營篇,,other,#圖 單身狗廚房x野外露營篇
6,226491861,food,,想跟大家分享這間超好吃的日本料理店！他位於現在很多人要去的小琉球的乘船處----東港！我個人...,2019-02-13 03:54:05.395,2017-05-28T16:40:26.874Z,龍允日式手作壽司,,other,#食記 #東港 龍允日式手作壽司
7,229830476,language,,請問第三句 那個「となり 」是翻成登山「時」的意思嗎？,2019-02-15 07:05:51.092,2018-10-11T12:06:50.282Z,日文翻譯問題,,other,日文翻譯問題
8,465696,talk,,ㄜ 因為突然想到很想知道，雖然感覺這樣問有點奇怪= =前幾個星期有看到新聞在介紹，有個女生之...,2019-02-18 06:33:17.498,2015-08-17T13:33:21.449Z,尋~一個卡片遊戲 給背包客專用 (電視上看到,,other,尋~一個卡片遊戲(?) 給背包客專用 (電視上看到
9,230081344,trending,,本人台中人從小有嚴重過敏體質，鼻子容易塞住，也很容易流鼻水，更容易狂打噴嚏，成年之後狀況已經...,2019-02-15 02:47:31.216,2018-11-17T11:02:23.407Z,我不想吸髒空氣,,other,我不想吸髒空氣


In [None]:
def remove_question_tag(row):
    """Return the row's title with bracketed notes and hashtags stripped.

    The following spans are removed (each matched non-greedily):
      * ``(...)``, ``{...}``, ``[...]``
      * full-width ``（...）`` and ``【...】``
      * ``#tag `` -- a hash up to and including the next space
      * ``#...#`` -- text enclosed between two hashes

    :param row: mapping-like row (e.g. a DataFrame row) with a ``'title'`` key.
    :return: the cleaned title, or an empty string when the title is null.
    """
    title = row['title']
    # A null title (None / NaN) cannot be passed to re.sub; return an empty
    # string so the row is treated like any other title that cleans to nothing.
    if pd.isnull(title):
        return u''
    return re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\（.*?）|\\【.*?】|\\#.*? |\\#.*?#", "", title)

# Derive the cleaned question text from each title, then keep only the rows
# whose cleaned question is non-empty.
pd_df['question'] = pd_df.apply(remove_question_tag, axis=1)
has_question = pd_df['question'].ne('')
pd_df = pd_df[has_question]
pd_df

# Dcard responses: cleaning the data

In [6]:
# Connect for the response-cleaning section. The original argument, the
# string 'BACKUP', is not a CassandraType member and only worked because
# CassandraDAO falls through to its last host list for unknown values;
# TEST_DOCKER selects that same host list explicitly.
dao = CassandraDAO(CassandraType.TEST_DOCKER)
HELPER_KEYSPACE = 'nlp_keyspace'
DCARD_RESPONSE_TABLE = 'dcard_response'

contact_points = 192.168.0.121,192.168.0.122,192.168.0.52


In [32]:
# Fetch every response to one article and order them by like count,
# most-liked first.
article_id = 632972
cql = 'select * from ' + DCARD_RESPONSE_TABLE + " where article_id =" + str(article_id) + ";"
r_df = (
    dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
    .sort_values(by=['like_count'], ascending=False)
)
r_df

Unnamed: 0,article_id,floor,content,crawl_date,create_date,like_count
8,632972,11,台科大不知道甩掉中興幾條街了\n還在那邊我讀普大我最屌zzzzz,2019-02-14 03:06:05.963,2015-10-08T05:23:09.632Z,98
0,632972,1,安靜一點，別說出真相\n\n有些人比較喜歡活在謊言，妳這樣太殘忍了,2019-02-14 03:08:44.246,2015-10-08T05:12:02.803Z,85
10,632972,13,原PO未來一定不是當老闆的料\n完完全全不需要各領域幫助,2019-02-14 03:07:09.303,2015-10-08T05:23:52.770Z,46
16,632972,21,比這個真的滿無聊的 ＝＝\n普通大學也是一堆混吃等畢業的雜魚大學雜魚科系啊...,2019-02-14 03:07:09.321,2015-10-08T05:35:49.304Z,38
5,632972,7,誰跟你高職考國英數自社呀= =\n不懂就閉嘴,2019-02-14 03:06:05.923,2015-10-08T05:15:09.466Z,34
25,632972,37,"你所謂的""事實""實在扭曲到讓人不忍直視。\n\n這已經是明擺著的歧視了，而且還伴隨著落後無用...",2019-02-14 03:07:09.351,2015-10-08T08:17:12.011Z,27
4,632972,6,我親愛的弟弟說對餐飲有興趣\n\n不見在家裡有煮過飯\n對啦 會自理宵夜了\n對啦 偶爾...,2019-02-14 03:07:09.272,2015-10-08T05:15:06.284Z,22
27,632972,39,國家沒把技職體系當一回事，從這裡價值觀就被扭曲了。\n工廠技術人員、髮型設計師、修理汽機車、...,2019-02-14 03:08:44.358,2015-10-08T08:30:30.209Z,21
19,632972,26,你可別小看技職的喔 很多技職修機台能力 \n\n比一堆眼睛長在頭頂上 只會啃書本拿來吹噓的強多了喔,2019-02-14 03:07:09.328,2015-10-08T06:00:47.860Z,21
9,632972,12,國英數+專業一專業二\n(專一專二不是單科喔且各自佔了兩百分)\n各科專一專二考試項目htt...,2019-02-14 03:08:44.293,2015-10-08T05:23:42.009Z,21


In [33]:
# Keep only the responses whose content is at most 50 characters long.
mask = r_df['content'].str.len().le(50)
r_df = r_df[mask]
r_df

Unnamed: 0,article_id,floor,content,crawl_date,create_date,like_count
8,632972,11,台科大不知道甩掉中興幾條街了\n還在那邊我讀普大我最屌zzzzz,2019-02-14 03:06:05.963,2015-10-08T05:23:09.632Z,98
0,632972,1,安靜一點，別說出真相\n\n有些人比較喜歡活在謊言，妳這樣太殘忍了,2019-02-14 03:08:44.246,2015-10-08T05:12:02.803Z,85
10,632972,13,原PO未來一定不是當老闆的料\n完完全全不需要各領域幫助,2019-02-14 03:07:09.303,2015-10-08T05:23:52.770Z,46
16,632972,21,比這個真的滿無聊的 ＝＝\n普通大學也是一堆混吃等畢業的雜魚大學雜魚科系啊...,2019-02-14 03:07:09.321,2015-10-08T05:35:49.304Z,38
5,632972,7,誰跟你高職考國英數自社呀= =\n不懂就閉嘴,2019-02-14 03:06:05.923,2015-10-08T05:15:09.466Z,34
19,632972,26,你可別小看技職的喔 很多技職修機台能力 \n\n比一堆眼睛長在頭頂上 只會啃書本拿來吹噓的強多了喔,2019-02-14 03:07:09.328,2015-10-08T06:00:47.860Z,21
2,632972,4,高職主要不也一樣在念國英數自社?\n可是會教一些其他的科目啊\n例如會計，經濟，計概等等\n,2019-02-14 03:06:05.914,2015-10-08T05:13:58.402Z,17
21,632972,31,拿成績來當智商，\n\n然後認為自己高人一等的時候，\n\n其實你就已經輸大了。,2019-02-14 03:08:44.334,2015-10-08T06:49:51.802Z,15
1,632972,3,坐等高職生玻璃心碎滿地,2019-02-14 03:07:09.245,2015-10-08T05:13:46.699Z,14
22,632972,32,若真要說也不是中興的來發文吧Zzz\n你也考不上台科啦,2019-02-14 03:07:09.333,2015-10-08T07:09:14.627Z,14


In [34]:
# Print the number of remaining responses, then each response's content
# numbered from zero in the current row order, separated by a dashed rule.
print(len(r_df))
for i, content in enumerate(r_df['content']):
    print('--------------------------')
    print(str(i) + '. ' + content)
    

36
--------------------------
0. 台科大不知道甩掉中興幾條街了
還在那邊我讀普大我最屌zzzzz
--------------------------
1. 安靜一點，別說出真相

有些人比較喜歡活在謊言，妳這樣太殘忍了
--------------------------
2. 原PO未來一定不是當老闆的料
完完全全不需要各領域幫助
--------------------------
3. 比這個真的滿無聊的 ＝＝
普通大學也是一堆混吃等畢業的雜魚大學雜魚科系啊...
--------------------------
4. 誰跟你高職考國英數自社呀= =
不懂就閉嘴
--------------------------
5. 你可別小看技職的喔 很多技職修機台能力 

比一堆眼睛長在頭頂上 只會啃書本拿來吹噓的強多了喔
--------------------------
6. 高職主要不也一樣在念國英數自社?
可是會教一些其他的科目啊
例如會計，經濟，計概等等

--------------------------
7. 拿成績來當智商，

然後認為自己高人一等的時候，

其實你就已經輸大了。
--------------------------
8. 坐等高職生玻璃心碎滿地
--------------------------
9. 若真要說也不是中興的來發文吧Zzz
你也考不上台科啦
--------------------------
10. 先不談這個了，請先承認你不尊重技職。
--------------------------
11. 聞道有先後，術業有專攻。
某成大跟某興大的以後你們就知道這句話的意思了
--------------------------
12. 再講的話就要被玻璃碎片插爆了ㄛ
--------------------------
13. 那總統還是建中哈佛呢 考不上建中哈佛  怎麼還敢批評總統~
--------------------------
14. 原po點出事實就讓一堆人玻璃心碎滿地了
早點接受，ok
--------------------------
15. 對啊我們是很忙
不像116整天只會發廢文
還安慰自己是四大其中之一
--------------------------