In [2]:
# encoding=UTF-8
# !flask/bin/python

from cassandra.cluster import Cluster
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import BatchStatement
import pandas as pd


class CassandraType(object):
    """Integer identifiers for the Cassandra deployments CassandraDAO can target."""

    # Values are 0, 1 and 2 respectively.
    PRODUCTION, TEST, TEST_DOCKER = range(3)


class CassandraDAO(object):
    """Small data-access wrapper around a cassandra-driver ``Cluster``/``Session``.

    Constructing an instance immediately connects to the cluster selected by
    ``type`` and installs ``pandas_factory`` as the session's row factory, so
    result pages are materialised as pandas DataFrames.

    Requirements noted by the original author:
      a. python-Cassandra driver
      b. pyspark cassandra connector
    """

    def __init__(self, type):
        """Select the contact points for ``type`` and open a session.

        :param type: one of the ``CassandraType`` constants. Any value other
            than ``CassandraType.PRODUCTION`` or ``CassandraType.TEST`` falls
            through to the third set of contact points.
        """
        if type == CassandraType.PRODUCTION:
            self.contact_points = ['192.168.95.127', '192.168.95.122']
        elif type == CassandraType.TEST:
            self.contact_points = ['192.168.0.41', '192.168.0.42']
        else:
            self.contact_points = ['192.168.0.121', '192.168.0.122', '192.168.0.52']
        # Comma-separated form of the same hosts, derived so the two never drift apart.
        self.contact_points_str = ",".join(self.contact_points)

        self.formatString = "org.apache.spark.sql.cassandra"
        # NOTE(review): placeholders only; createSession() does not pass them to
        # the driver (the auth_provider line there is commented out).
        self.username = "username"
        self.password = "password"
        self.cluster = None
        self.session = None
        self.createSession()

    def __del__(self):
        """Shut the cluster down when the DAO is garbage-collected.

        Safe to call when no cluster was ever created (for example when
        ``Cluster(...)`` raised inside ``__init__``) and safe to call twice.
        """
        cluster = getattr(self, 'cluster', None)
        if cluster is not None:
            cluster.shutdown()
            # Drop the references so a second call does not shut down again.
            self.cluster = None
            self.session = None

    def pandas_factory(self, colnames, rows):
        """Row factory handed to the driver: build a DataFrame from one result page.

        :param colnames: column names of the result.
        :param rows: row values of the result.
        :return: ``pandas.DataFrame`` with ``colnames`` as its columns.
        """
        return pd.DataFrame(rows, columns=colnames)

    def createSession(self):
        """Create the ``Cluster`` and connect, storing both on the instance."""
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print("contact_points = " + self.contact_points_str)
        self.cluster = Cluster(
            contact_points=self.contact_points,  # random select a node
            #             load_balancing_policy = DCAwareRoundRobinPolicy(local_dc='datacenter1'),
            #         auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
        )
        self.session = self.cluster.connect()
        self.session.row_factory = self.pandas_factory
        # Very large page size so large queries are not paginated by the driver
        # (pandas_factory is applied to what the driver hands back).
        self.session.default_fetch_size = 10000000

    def getSession(self):
        """Return the underlying driver session (``None`` before connecting)."""
        return self.session

    def _execute_async(self, keyspace, cql):
        """Switch the session to ``keyspace`` and submit ``cql`` asynchronously.

        :return: whatever ``session.execute_async`` returns (the driver's future).
        """
        self.session.set_keyspace(keyspace)
        return self.session.execute_async(cql)

    def execCQL(self, keyspace, cql):
        """
        Execute CQL without waiting for it.

        The future returned by the driver is discarded, so neither the result
        nor any error is reported back to the caller.
        """
        self._execute_async(keyspace, cql)

    def execCQLSelect(self, keyspace, cql):
        """
        Execute a select statement asynchronously.

        :return: the future returned by ``session.execute_async``.
        """
        return self._execute_async(keyspace, cql)

    def execCQLCallBackAnysc(self, keyspace, cql, handle_success, handle_error):
        """
        Execute CQL asynchronously; ``handle_success`` is registered as the
        success callback and ``handle_error`` as the error callback.
        """
        future = self._execute_async(keyspace, cql)
        future.add_callbacks(handle_success, handle_error)

    def execCQLSelectToPandasDF(self, keyspace, cql):
        """
        Execute a select statement, block until it finishes and return the rows.

        :return: the ``_current_rows`` attribute of the result; with
            ``pandas_factory`` installed as row factory this is presumably the
            DataFrame built for the fetched page.
        """
        future = self._execute_async(keyspace, cql)
        # .result() blocks until the query has completed.
        return future.result()._current_rows


    def execCQLSelectToRDD(self, sqlContext, keyspace, cql):
        """
        Execute a select statement and return the rows as a Spark RDD of tuples.

        NOTE(review): this delegates to ``execCQLSelectToDF``, which is not
        defined in this class; unless a subclass provides it, calling this
        method raises ``AttributeError``.
        """

        return self.execCQLSelectToDF(sqlContext, keyspace, cql).rdd.map(tuple)  # dataFrame to RDD

    @property
    def contactPoints(self):
        """List of contact point hosts currently stored on the instance."""
        return self.contact_points

    @contactPoints.setter
    def contactPoints(self, contact_points):
        self.contact_points = contact_points

    @contactPoints.deleter
    def contactPoints(self):
        del self.contact_points


In [23]:
import regex as re
import pandas as pd

In [242]:
# Keyspace and table names used by the queries in this notebook.
HELPER_KEYSPACE = 'nlp_keyspace'
DCARD_ARTICLE_TABLE = 'dcard_article'
DCARD_RESPONSE_TABLE = 'dcard_response'
DCARD_ARTICLE_TEST_TABLE = 'dcard_article_test'
WORD_ARTICLE_MAPPING = 'word_article_mapping'
DCARD_QUERY_TABLE = 'dcard_query_table'

# 'BACKUP' was not a CassandraType value and only worked by falling into the
# constructor's final else-branch; TEST_DOCKER selects those same contact
# points explicitly.
dao = CassandraDAO(CassandraType.TEST_DOCKER)

contact_points = 192.168.0.121,192.168.0.122,192.168.0.52


In [311]:
# Article whose row and responses are selected by the queries in the cells below.
article_id = 228078922

# question cleaning

In [312]:
# Load the article row(s) matching article_id into a DataFrame.
cql = 'select * from {table} where article_id ={key};'.format(
    table=DCARD_ARTICLE_TABLE,
    key=str(article_id),
)
## select range
# cql = ("select * from nlp_keyspace.dcard_article_test " +
#         "where article_id > 225705809 and article_id < 225800010 allow filtering;")
a_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql)
a_df

Unnamed: 0,article_id,board,category,content,crawl_date,create_date,question_type,title
0,228078922,talk,,不知道各位有沒有這樣的感覺，很多碩博班學生受到某些失職的教授“奴役”，這些學生超時工作、不合...,2019-02-20 03:14:04.855,2018-01-07T06:43:29.108Z,,碩博生心聲 — 老闆決定一切


In [313]:
def remove_question_tag(row):
    """Strip tags and reply prefixes from an article title.

    Removes text wrapped in ``()``, ``{}``, ``[]``, full-width ``（）`` and
    ``【】``, hashtags (``#tag `` up to the next space, or ``#tag#``) and the
    literal prefixes ``Re:`` / ``re:``.

    :param row: mapping-like object (e.g. a DataFrame row) with a ``title`` key.
    :return: the cleaned title with surrounding whitespace removed. An empty
        string is returned when the title is missing (``None`` or NaN) or when
        nothing but tags/whitespace was present, so callers can filter on ``''``.
    """
    title = row['title']
    # None, or NaN (the only value that differs from itself), means no title.
    if title is None or title != title:
        return u''
    question = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\（.*?）|\\【.*?】|\\#.*? |\\#.*?#", u"", title)
    question = re.sub(u'(Re:|re:)', u'', question)
    # Removing a tag usually leaves the separating blank behind ("[tag] title").
    return question.strip()

# Build the cleaned question column row by row. A list comprehension is used
# instead of DataFrame.apply(axis=1) because apply returns a DataFrame (not a
# Series) for an empty frame, which cannot be assigned to a single column when
# the article query returned no rows.
a_df['question'] = [remove_question_tag(row) for _, row in a_df.iterrows()]
# Drop the rows whose question is empty.
a_df = a_df[a_df['question'] != '']
a_df

Unnamed: 0,article_id,board,category,content,crawl_date,create_date,question_type,title,question
0,228078922,talk,,不知道各位有沒有這樣的感覺，很多碩博班學生受到某些失職的教授“奴役”，這些學生超時工作、不合...,2019-02-20 03:14:04.855,2018-01-07T06:43:29.108Z,,碩博生心聲 — 老闆決定一切,碩博生心聲 — 老闆決定一切


In [314]:
# Columns of the article frame that are not used past this point.
article_columns_to_drop = [
    'category',
    'content',
    'crawl_date',
    'create_date',
    'question_type',
    'title',
]
a_df = a_df.drop(article_columns_to_drop, axis=1)
a_df

Unnamed: 0,article_id,board,question
0,228078922,talk,碩博生心聲 — 老闆決定一切


# response cleaning

In [317]:
# Load the responses matching article_id and rename their text column.
cql = 'select * from {table} where article_id ={key};'.format(
    table=DCARD_RESPONSE_TABLE,
    key=str(article_id),
)
## select range
# cql = ("select * from nlp_keyspace.dcard_article_test " +
#         "where article_id > 225705809 and article_id < 225800010 allow filtering;")
response_columns = {"content": "response"}
# NOTE(review): index=str also maps every index label through str(); confirm
# that converting the index to strings is intended.
r_df = dao.execCQLSelectToPandasDF(HELPER_KEYSPACE, cql).rename(index=str, columns=response_columns)
r_df

Unnamed: 0,article_id,floor,response,crawl_date,create_date,like_count


In [318]:
# Compiled once instead of on every row; raw string so that \S is a regex
# escape rather than an (invalid) Python string escape.
_URL_PATTERN = re.compile(r'http://\S+|https://\S+')


def has_url(row):
    """Tell whether the row's ``response`` text contains an http(s) URL.

    :param row: mapping-like object (e.g. a DataFrame row) with a ``response`` key.
    :return: ``True`` when an ``http://`` or ``https://`` link is found,
        otherwise ``False``. A missing response (``None`` or NaN) yields ``False``.
    """
    text = row['response']
    # None, or NaN (the only value that differs from itself), means no text.
    if text is None or text != text:
        return False
    # A file-extension check (.png/.jpg/.jpeg/.gif/.svg/.txt) was sketched here
    # by the original author but was disabled.
    return _URL_PATTERN.search(text) is not None

# r_df['has_url'] = r_df.apply(has_url, axis=1)
# # 去除has_url的row
# mask = (~(r_df['has_url']))
# r_df = r_df[mask]
# r_df

In [319]:
# Detect responses that reply to another floor, e.g. B1, b3, 2F, 5樓.
# The original pattern used the character class [\d+] (one digit OR a literal
# '+'), so text such as "B+" was wrongly treated as a floor reference; \d+ is
# what was meant. A unicode literal is used so that 樓 is matched as one
# character regardless of Python version.
_FLOOR_TAG_PATTERN = re.compile(u'(B\\d+|b\\d+|\\d+F|\\d+樓)')


def has_floor_tag(row):
    """Tell whether the row's ``response`` text refers to another floor.

    :param row: mapping-like object (e.g. a DataFrame row) with a ``response`` key.
    :return: ``True`` when the text contains ``B<digits>``, ``b<digits>``,
        ``<digits>F`` or ``<digits>樓``, otherwise ``False``. A missing response
        (``None`` or NaN) yields ``False``.
    """
    text = row['response']
    # None, or NaN (the only value that differs from itself), means no text.
    if text is None or text != text:
        return False
    return _FLOOR_TAG_PATTERN.search(text) is not None

# r_df['has_floor'] = r_df.apply(has_floor_tag, axis=1)
# # 去除has_floor的row
# mask = (~(r_df['has_floor']))
# r_df = r_df[mask]
# r_df

In [322]:
# Keep only the responses that contain neither a URL nor a floor reference.
# Both checks are folded into one mask built with a list comprehension, which:
#   * also works when r_df is empty (DataFrame.apply(axis=1) returns a DataFrame
#     for an empty frame, so it could not be assigned to a column - this also
#     broke the floor check whenever the URL filter removed every row), and
#   * avoids writing helper columns into a filtered slice of r_df.
keep_mask = pd.Series(
    [not (has_url(row) or has_floor_tag(row)) for _, row in r_df.iterrows()],
    index=r_df.index,
    dtype=bool,
)
r_df = r_df[keep_mask]

# The timestamps are not needed for the query table.
r_df = r_df.drop(['crawl_date', 'create_date'], axis=1)

In [None]:
# # test no response
# import numpy as np
# columns = ['article_id', 'floor', 'response', 'like_count']
# empty_df = pd.DataFrame({}, columns =columns)
# empty_df

# create query table

In [323]:
from math import isnan

# Responses longer than this many characters are discarded.
MAX_RESPONSE_LENGTH = 40

# Attach every remaining response to its article; the left join leaves NaN in
# the response columns of articles without responses, and dropna() removes
# those rows (together with any other row containing a missing value).
ans_df = pd.merge(a_df, r_df, how='left', on='article_id')
ans_df = ans_df.dropna()
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(len(ans_df))
ans_df = ans_df[ans_df['response'].str.len() <= MAX_RESPONSE_LENGTH]
print(len(ans_df))
# Most-liked responses first.
ans_df = ans_df.sort_values(by=['like_count'], ascending=False)

print('len(ans_df) = ' + str(len(ans_df)))
ans_df

0
0
len(ans_df) = 0


Unnamed: 0,article_id,board,question,floor,response,like_count


In [None]:
# Split by like_count: ans_df is sorted by like_count (descending), so its
# first quarter holds the most-liked responses.
top = len(ans_df) // 4
ans_df.head(top)

In [None]:
def _cql_text(value):
    """Render ``value`` as a CQL string literal, doubling embedded single quotes."""
    return "'" + value.replace("'", "''") + "'"


# Build one insert statement per row of ans_df. The text columns come from
# crawled posts, so they are escaped before being placed inside the statement;
# an unescaped single quote would otherwise terminate the literal early.
for _, row in ans_df.iterrows():

    values = ",".join([
        str(row['article_id']),
        str(row['floor']),
        _cql_text(row['board']),
        _cql_text(row['question']),
        _cql_text(row['response']),
        str(row['like_count']),
    ])
    cql = ("insert into " + DCARD_QUERY_TABLE +
           "(article_id, floor, board, question, response, like_count) " +
           "values(" + values + ");")
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(cql)
#     dao.execCQL(HELPER_KEYSPACE, cql)