In [1]:
import gc
import os
import time
import pandas as pd
from datetime import datetime

# 1. set file path & political dict
ref: https://github.com/crownpku/Awesome-Chinese-NLP

In [2]:
# Directory holding the crawler dumps. Later cells build file names with
# `path + f_nm`, so the trailing separator has to stay on this string.
path = r'/Users/zoe/Desktop/ptt_crawler//'

# Timestamp of this run (YYYYmmdd_HHMMSS); appended to output file names so
# repeated runs do not overwrite each other.
exec_dt = datetime.now().strftime('%Y%m%d_%H%M%S')

# Last expression of the cell: show what is available in the directory.
os.listdir(path)

['Gossiping-1-5.json',
 'Gossiping-39361-39363_reply_20191228_191753.csv',
 'ntusd-negative.txt',
 'ptt_id_check.ipynb',
 'Gossiping-28800-29600_reply_20191229_192300.csv',
 '.DS_Store',
 'Gossiping-28800-29600.json',
 'ptt_excel.xlsx',
 'Gossiping-28800-29600_article_20191229_192300.csv',
 'Gossiping-1-5_article_20191228_201931.csv',
 'Gossiping-1-5_article_20191228_191753.csv',
 'local_script',
 'neo4j_community_pagerank_1204-20191211T114233Z-001.zip',
 'python -m PttWebCrawler -b PublicServan -i 100 200',
 'Gossiping-39361-39363.json',
 'test.json',
 'Gossiping-37500-39075.json',
 'ntusd-positive.txt',
 'Gossiping-37700-39294_1207_article_20191229_192300.csv',
 'Gossiping-37700-39294_1207_reply_20191229_192300.csv',
 'neo4j_community_pagerank_1204',
 'Gossiping-39361-39363_article_20191228_191753.csv',
 '.ipynb_checkpoints',
 'Gossiping-37700-39294_1207.json',
 'test2.json',
 'Gossiping-1-5_reply_20191228_201931.csv',
 'Gossiping-1-5_reply_20191228_191753.csv',
 'HatePolitics-2600-4

In [3]:
# Category label -> keywords looked up literally in article text.
# article_pol_tendency_check writes the label into the `party` column when an
# article mentions keywords from exactly one category.
party_dict = {
    'deep_dpp_kw': ['賴清德', '台獨', '基進側翼', '阿扁', '陳水扁'],
    'light_dpp_kw': ['民進黨', '林佳龍'],
    'tsai_kw': ['蔡英文', '蔡總統', '小英'],
    'han_kw': ['韓國瑜', '韓導', '韓國魚', '國瑜'],
    'light_kmt_kw': ['國民黨', '吳敦義', '馬英九'],
    'deep_kmt_kw': ['兩岸統一', '和平協議'],
    'third_party_kw': [
        '柯文哲', '郭董', '郭台銘', '柯粉', '民眾黨',
        '時代力量', '黃國昌', '吳崢', '柯P', '李登輝',
    ],
    'unrelated_kw': ['親民黨', '宋楚瑜'],
    'hk_protest': ['反送中', '逃犯條例', '連儂牆', '真普選'],
    'china_spy': ['共諜案', '王立強', '向心'],
}

# NOTE(review): not referenced by the cells visible here; presumably tokens to
# drop when building word frequencies -- confirm against the consuming code.
filter_lst = ['com', 'imgur', 'https', 'www', '新聞', '討論']

# 2. politics check
## notes:
#### 1. move the keyword dict into a config file
#### 2. build political dict from frequent words of article title & filter stop words
#### 3. reference https://github.com/sweslo17/chinese_sentiment/blob/master/dict/user_dic.dic
#### 4. others: open source political dict, wikipedia
#### 5. 中文詞庫 https://scidm.nchc.org.tw/dataset/nchc_2019_te_04
#### 6.  政治網路口碑的情感分析：語意關連性之觀點 http://csw.shu.edu.tw/File/Download/edmitems/%E5%82%B3%E6%92%AD%E7%A0%94%E7%A9%B6%E8%88%87%E5%AF%A6%E8%B8%908(2)-03%20%E9%99%B6%E6%8C%AF%E8%B6%85.pdf

In [4]:
def article_pol_tendency_check(input_df, party_dict):
    """Count political keywords per article and tag single-category articles.

    For every row, the title and content are joined into one text. Two columns
    are derived from that text:

    * ``pol_word_cnt`` -- number of (non-overlapping) occurrences of any
      keyword found in ``party_dict``; keywords are matched literally.
    * ``party`` -- the category label when the text mentions at least one
      keyword of that category, no keyword of any other category, and the row
      has ``re_flag == 'N'``; otherwise ``'undef'``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide every column of the returned frame except
        ``pol_word_cnt`` and ``party``. Missing ``article_title`` / ``content``
        values are replaced with ``'NoData'``. The frame is not modified.
    party_dict : dict[str, list[str]]
        Category label -> list of keywords.

    Returns
    -------
    pandas.DataFrame
        A copy with a fresh 0..n-1 index, restricted to the fixed column list
        below.

    Raises
    ------
    KeyError
        If ``input_df`` lacks one of the required columns.

    Notes
    -----
    The category label and its keywords are printed once per category, as a
    simple progress trace.
    """
    import re  # local import so the cell does not depend on the import cell

    cols = ['article_id', 'article_title', 'author', 'board',
            'content', 'date', 'ip', 'all', 'boo', 'count', 'neutral', 'push',
            're_flag', 'expl_flag', 'pol_word_cnt', 'party']

    def _mentions_any(text, keywords):
        # Literal substring test, independent of regex syntax.
        return any(kw in text for kw in keywords)

    art_df = input_df.copy().reset_index(drop=True)

    # Make both text columns safe to concatenate.
    art_df.loc[art_df['article_title'].isna(), 'article_title'] = 'NoData'
    art_df.loc[art_df['content'].isna(), 'content'] = 'NoData'
    art_df['corpus'] = art_df['article_title'] + ' ' + art_df['content']

    pol_list = [kw for kw_lst in party_dict.values() for kw in kw_lst]
    if pol_list:
        # Escape each keyword: Series.str.count treats its argument as a
        # regex, so characters such as '.', '+' or '(' would otherwise change
        # the match or raise.
        pattern = '|'.join(re.escape(kw) for kw in pol_list)
        art_df['pol_word_cnt'] = art_df['corpus'].str.count(pattern)
    else:
        # An empty pattern would match at every position; no keywords means
        # no hits.
        art_df['pol_word_cnt'] = 0

    art_df['party'] = 'undef'

    # "Re:" posts quote the original article, so they are excluded from the
    # tagging step to keep the analysis accurate.
    is_original_post = art_df['re_flag'] == 'N'

    for slc_kw, in_lst in party_dict.items():
        print(slc_kw)
        print(in_lst)

        # Keywords of every other category: seeing any of them disqualifies
        # the article from being tagged with the current category.
        ex_lst = [kw
                  for other_kw, kw_lst in party_dict.items()
                  if other_kw != slc_kw
                  for kw in kw_lst]

        has_in = art_df['corpus'].map(
            lambda text: _mentions_any(text, in_lst)).astype(bool)
        has_ex = art_df['corpus'].map(
            lambda text: _mentions_any(text, ex_lst)).astype(bool)

        art_df.loc[has_in & ~has_ex & is_original_post, 'party'] = slc_kw

    return art_df[cols]

In [5]:
# Print the article-level CSV exports found in the data directory
# (file names containing 'article').
for x in os.listdir(path):
    if 'article' not in x:
        continue
    print(x)

Gossiping-28800-29600_article_20191229_192300.csv
Gossiping-1-5_article_20191228_201931.csv
Gossiping-1-5_article_20191228_191753.csv
Gossiping-37700-39294_1207_article_20191229_192300.csv
Gossiping-39361-39363_article_20191228_191753.csv


In [6]:
# Article export to classify.
f_nm = 'Gossiping-37700-39294_1207_article_20191229_192300.csv'

# Load it, then count keywords and tag each article.
df = pd.read_csv(path + f_nm)
temp_df = article_pol_tendency_check(df, party_dict)

# Save next to the input as <input stem>_pol_check_<run timestamp>.csv;
# the stem is everything before the first '.' in the file name.
temp_df.to_csv(
    path
    + f_nm.partition('.')[0]
    + '_pol_check_{}.csv'.format(exec_dt)
)

deep_dpp_kw
['賴清德', '台獨', '基進側翼', '阿扁', '陳水扁']
light_dpp_kw
['民進黨', '林佳龍']
tsai_kw
['蔡英文', '蔡總統', '小英']
han_kw
['韓國瑜', '韓導', '韓國魚', '國瑜']
light_kmt_kw
['國民黨', '吳敦義', '馬英九']
deep_kmt_kw
['兩岸統一', '和平協議']
third_party_kw
['柯文哲', '郭董', '郭台銘', '柯粉', '民眾黨', '時代力量', '黃國昌', '吳崢', '柯P', '李登輝']
unrelated_kw
['親民黨', '宋楚瑜']
hk_protest
['反送中', '逃犯條例', '連儂牆', '真普選']
china_spy
['共諜案', '王立強', '向心']


### notes:
#### 1. check senti 
#### 2. personal political spectrum check
#### 3. for accuracy we run the sentiment analysis in Colab with CKIP & GPU acceleration, but for testing you can just use string counts or jieba with a Traditional Chinese (繁體) dict + sentiment dict + political dict 
#### 4. standardize the time format, taking care not to discard records whose format is incorrect

In [7]:
#repl_dict = {'\)':'', '\(':''}
#reply_df.target.replace(repl_dict, regex=True, inplace=True)
#ex: hugh509)

In [8]:
# try the api of gensim plugin?
# try multiprocessing
#https://speakerdeck.com/fukuball/head-first-chinese-text-segmentation?slide=64
#https://radimrehurek.com/gensim/models/ldamulticore.html

# https://zhuanlan.zhihu.com/p/30925299
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
# https://github.com/bmabey/pyLDAvis/tree/master/pyLDAvis
# https://medium.com/pyladies-taiwan/%E4%BB%A5-jieba-%E8%88%87-gensim-%E6%8E%A2%E7%B4%A2%E6%96%87%E6%9C%AC%E4%B8%BB%E9%A1%8C-%E4%BA%94%E6%9C%88%E5%A4%A9%E4%BA%BA%E7%94%9F%E7%84%A1%E9%99%90%E5%85%AC%E5%8F%B8%E6%AD%8C%E8%A9%9E%E5%88%86%E6%9E%90-ii-fdf5d3708662