In [1]:
import xml.etree.ElementTree as ET
from collections import Counter

# Parse the XML data
def get_text(doc_path):
    """Parse one XML post and summarise its body text and comment counts.

    Only the pieces that end up in the returned record are read from the
    document: the words of every ``<s>`` under ``<text>/<body>`` and the
    ``c_type`` attribute of every ``<comment>`` directly under ``<text>``.
    Header metadata and author attributes are deliberately not touched, so a
    document that lacks those optional elements is still usable.

    Args:
        doc_path: Path to the XML file, using ``/`` as the separator. The first
            six characters of the file name become the ``date`` field
            (presumably ``YYYYMM`` -- confirm against the corpus naming scheme).

    Returns:
        dict with keys:
            ``date`` -- first six characters of the file name;
            ``text`` -- body text, words of a sentence concatenated without a
            separator and sentences joined by ``'\\n'``;
            ``pos`` / ``neu`` / ``neg`` -- number of comments whose ``c_type``
            equals that label (0 when none).

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        ValueError: if the document has no ``<text>`` element.
    """
    root = ET.parse(doc_path).getroot()

    text_node = root.find('./text')
    if text_node is None:
        raise ValueError(f'{doc_path}: no <text> element found')

    # An empty <w/> has text None; treat it as an empty string instead of
    # letting str.join raise TypeError.
    sentences = [
        ''.join(word.text or '' for word in sentence.findall('w'))
        for sentence in text_node.findall('body/s')
    ]
    text = '\n'.join(sentences)

    # Counter returns 0 for labels that never occur, so the lookups below are safe.
    c = Counter(comment.get('c_type') for comment in text_node.findall('comment'))

    file_name = doc_path.split('/')[-1]
    return {'date': file_name[:6], 'text': text, 'pos': c['pos'], 'neu': c['neu'], 'neg': c['neg']}


In [2]:
import os
from tqdm.auto import tqdm

directory = '/nfs/nas-6.1/wclu/cllt/ptt_data/HatePolitics'
data = []
# (path, error repr) for every file get_text could not handle, so that skipped
# documents are visible instead of disappearing silently.
failed_docs = []

for root, dirs, files in os.walk(directory):
    for filename in tqdm(files):
        if filename == '.DS_Store':
            continue
        doc_path = os.path.join(root, filename)
        try:
            data.append(get_text(doc_path))
        except Exception as err:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the (long) walk.
            failed_docs.append((doc_path, repr(err)))

if failed_docs:
    print(f'Skipped {len(failed_docs)} of {len(failed_docs) + len(data)} files that could not be parsed; see failed_docs')

import pandas as pd
# One row per parsed post; num_com is the total number of comments on the
# post, i.e. the sum of the three per-label comment counts.
data_df = pd.DataFrame(data).assign(
    num_com=lambda posts: posts['pos'] + posts['neu'] + posts['neg']
)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3815 [00:00<?, ?it/s]

  0%|          | 0/3780 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/1178 [00:00<?, ?it/s]

## BM25

In [3]:
from ckip_transformers.nlp import CkipWordSegmenter
from rank_bm25 import BM25Okapi

# Shared CKIP word segmenter, used below to tokenize both the corpus and the
# query before BM25 scoring.
# NOTE(review): device=0 presumably selects the first GPU -- confirm against the
# ckip-transformers documentation before running on a CPU-only machine.
ws_driver  = CkipWordSegmenter(model="bert-base", device=0)

2023-05-30 13:24:05.552928: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import numpy as np
def retrieve_bm25(year: str, month: str, query: str, n_candidates: int = 20, top_k: int = 5):
    """Return the most-commented posts among the BM25 matches for one month.

    The posts whose ``date`` equals ``year + month`` are word-segmented and
    ranked against ``query`` with BM25. The ``n_candidates`` best-scoring posts
    are then re-sorted by ``num_com`` (descending) and the first ``top_k`` are
    returned.

    Args:
        year: Year part of the date key, e.g. ``"2023"``.
        month: Month part of the date key. A one-digit value is left-padded
            with ``0`` so that ``"2"`` and ``"02"`` select the same month.
        query: Free-text query; segmented with the same segmenter as the corpus.
        n_candidates: Number of top BM25 hits kept before re-sorting.
        top_k: Number of rows returned.

    Returns:
        DataFrame with the columns of ``data_df`` and a fresh 0..n-1 index.
        It is empty when no post exists for the requested month.
    """
    # str() guards against integer arguments, where `year + month` would add
    # numerically instead of concatenating.
    date = str(year) + str(month).zfill(2)
    df = data_df[data_df['date']==date].reset_index(drop=True)
    if df.empty:
        # BM25Okapi divides by the corpus size, so an empty month must not reach it.
        return df

    corpus = df['text'].tolist()
    # NOTE(review): the whole month is re-segmented on every call, which
    # dominates the runtime; consider caching the index per date if
    # data_df is treated as immutable.
    corpus_tokenized = ws_driver(corpus, batch_size=64, max_length=509)
    bm25 = BM25Okapi(corpus_tokenized)
    tokenized_query = ws_driver([query], batch_size=1, max_length=509)[0]
    scores = bm25.get_scores(tokenized_query)
    # argsort is ascending: reverse it, then keep the best-scoring candidates.
    top_n = np.argsort(scores)[::-1][:n_candidates]
    rel_doc = df.iloc[top_n].sort_values(by=['num_com'], ascending=False).head(top_k)
    return rel_doc.reset_index(drop=True)


In [5]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
import torch

# The tokenizer and the classifier must come from the same checkpoint, so the
# checkpoint name and the cache location are each spelled out only once.
SENTIMENT_CHECKPOINT = 'IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment'
HF_CACHE_DIR = '/nfs/nas-6.1/wclu/cache'

tokenizer = BertTokenizer.from_pretrained(SENTIMENT_CHECKPOINT, cache_dir=HF_CACHE_DIR)
model = BertForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT, cache_dir=HF_CACHE_DIR)


In [6]:
# Maps the classifier's output index to a label.
# NOTE(review): the 0=negative / 1=positive order is taken as given here --
# confirm against the checkpoint's model card.
id2label = {
    0: 'negative',
    1: 'positive'
}
def sentiment_analysis(year, month, query):
    """Classify the sentiment of the posts retrieved for a query in one month.

    Args:
        year: Year part of the date key, forwarded to ``retrieve_bm25``.
        month: Month part of the date key, forwarded to ``retrieve_bm25``.
        query: Retrieval query, forwarded to ``retrieve_bm25``.

    Returns:
        list[str]: one ``'negative'`` / ``'positive'`` label per retrieved
        post, in retrieval order; an empty list when nothing was retrieved.
    """
    rel_doc = retrieve_bm25(year, month, query)
    doc_text = rel_doc['text'].tolist()
    if not doc_text:
        return []
    x = tokenizer(doc_text, padding='longest', truncation=True, max_length=512, return_tensors="pt")
    # Pass the full encoding, not just input_ids: without attention_mask the
    # model attends to the padding tokens of every post shorter than the
    # longest one in the batch, which skews their logits.
    # no_grad: inference only, so skip building the autograd graph.
    with torch.no_grad():
        output = model(**x)
    return [id2label[int(logit.argmax())] for logit in output.logits]

In [33]:
import numpy as np
def retrieve_tool(year, month, query):
    """Agent tool: text of the posts retrieved for ``query`` in one month.

    The ``text`` of every row returned by ``retrieve_bm25`` is joined with
    newlines, and only the first 1000 characters of the result are returned.
    """
    top_posts = retrieve_bm25(year, month, query)
    joined = '\n'.join(top_posts['text'].tolist())
    # Keep the observation handed back to the agent short.
    return joined[:1000]

In [14]:
from ckip_ner import ner_person

def ner_tool(year, month):
    """Agent tool: first person entity reported by ``ner_person`` for a month.

    Selects the posts whose ``date`` equals ``year + month``, passes their
    texts to ``ner_person`` and returns element ``[0][0]`` of its result.
    NOTE(review): presumably ``ner_person`` returns (name, count) pairs sorted
    by frequency, making this the most-mentioned person -- confirm in ckip_ner.
    """
    wanted = year + month
    month_posts = data_df[data_df['date'] == wanted].reset_index(drop=True)
    entities = ner_person(month_posts['text'].tolist())
    return entities[0][0]

In [34]:
# Import things that are needed generically
from langchain import LLMMathChain, SerpAPIWrapper
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool, StructuredTool, Tool, tool

# SECURITY: the API key must never be stored in the notebook. It is read from
# the environment instead (`os` is imported in an earlier cell). The key that
# was previously committed here should be considered leaked and be revoked.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this cell.')

# temperature=0 keeps the agent's tool selection as deterministic as possible.
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

# Each tool wraps one of the notebook functions defined above; the description
# is what the LLM sees when deciding which tool to call and how to fill its arguments.
tools = [
    StructuredTool.from_function(
        func=retrieve_tool,
        name = "retrieval",
        description="useful for when you need to retrieve information on specific date. The input of month should be number of two digit. For example, February: 02."
    ),
    StructuredTool.from_function(
        func=sentiment_analysis,
        name = "sentiment analysis",
        description="Useful for when you need to know the sentiment on specific date. The input of month should be number of two digit. For example, February: 02."
    ),
    StructuredTool.from_function(
        func=ner_tool,
        name = "named entity recognition",
        description="Useful for when you need to know the most popular people on specific date. The input of month should be number of two digit. For example, February: 02."
    ),
]
agent = initialize_agent(tools, llm, agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True)

In [35]:
# Task given to the agent. It asks for three things -- a person, background
# information on that person, and sentiment -- which correspond to the three
# tools registered above.
prompt = '''
I want to know the most popular people in February, 2023. 
Also, please tell me some information about the person at that time, and the sentiment toward him. 
Reply in traditional Chinese.
'''

In [36]:
# Run the agent on the prompt; with verbose=True each tool call and its
# observation is printed, and the final answer is the cell's output.
agent.run(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "named entity recognition",
  "action_input": {
    "year": "2023",
    "month": "02"
  }
}
```
[0m

Tokenization: 100%|██████████| 1020/1020 [00:00<00:00, 1685.73it/s]
Inference: 100%|██████████| 23/23 [00:10<00:00,  2.18it/s]



Observation: [38;5;200m[1;3m林智堅[0m
Thought:[32;1m[1;3mAction:
```
{
  "action": "retrieval",
  "action_input": {
    "year": "2023",
    "month": "02",
    "query": "林智堅"
  }
}
```

[0m

Tokenization: 100%|██████████| 1020/1020 [00:00<00:00, 1725.38it/s]
Inference: 100%|██████████| 23/23 [00:10<00:00,  2.17it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 13530.01it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 150.28it/s]
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-ZUVi9bYEx0Di5RrvC2VXibwz on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..



Observation: [36;1m[1;3m1.新聞網址︰
https://newtalk.tw/news/view/2023-02-13/857325
2.新聞來源︰
Newtalk
3.完整新聞標題
不大聲說了嗎？民進黨闢謠專區悄悄下架「林智堅沒有抄襲」頁面
4.完整新聞內容︰
前新竹市長林智堅論文抄襲案，目前雙雙進入司法程序，台大國發所論文1月中旬台北地
院首度召開準備程序庭，傳喚被告林智堅到庭，中華大學論文也被國科會證實，竹科管理
局已依法處理，且已進入司法程序。去年還是民進黨主席的總統蔡英文，曾要全體黨公職
大聲地跟大家說「林智堅沒有抄襲」，並在民進黨網站闢謠專區中放上為林智堅辯護的頁
面，不過，近日被發現已將林智堅論文相關貼文下架。
針對林智堅論文抄襲爭議，總統蔡英文去年8月曾要求全體黨公職表態，「只要是完整看
過兩本論文，而且完整了解事情來龍去脈的人，都願意選擇相信智堅沒有抄襲」，還呼籲
黨公職一起「大聲地跟大家說」，讓社會可以理解整件事情的始末和真相。
民進黨也在黨的網站「闢謠專區」頁面中，放上了「林智堅論文為原創」、「林智堅中華
大學碩論享有著作權 且口試早於著作權轉移 沒有抄襲、欽權疑慮」等貼文，試圖為林智
堅辯護。
不過，隨著林智堅論文案先後被台灣大學與中華大學學倫會判定抄襲，案件也在調查局新
竹市調查站調查官余正煌正式開告後進入司法程序，資深媒體人黃揚明9日曾在臉書發文
表示，民進黨官網的「闢謠專區」仍留有堅稱前新竹市長林智堅論文沒抄襲的2篇文章。
黃揚明呼籲，檢討學倫機制之外，也該把這些顛倒是非的內容下架。
經過了四天，民進黨原先的「闢謠專區」頁面中，已悄悄將林智堅論文沒有抄襲，且為原
創等貼文頁面下架。
5.附註、心得、想法︰
闢謠闢到全黨火葬場了吼
是不是收到風聲台大的結果要出來了
看過小智兩本論文的人現在還有呼吸嗎
還好我下架前看過這原文了
綠共吃屎
1.新聞網址︰
※ 網址超過一行 請縮網址 ※
https://newtalk.tw/news/view/2023-02-15/857533
2.新聞來源︰  新頭殼
3.完整新聞標題
※ 請完整轉載標題 請勿修改與編排 ※
翁達瑞：若連林智堅都無法保護 平民百姓受冤屈豈更不能寄望民進黨
4.完整新聞內容︰
※ 請完整轉載原文 請勿修改內文與編排 ※
前新竹市長林智堅論文抄襲風

Tokenization: 100%|██████████| 1020/1020 [00:00<00:00, 2387.96it/s]
Inference: 100%|██████████| 23/23 [00:10<00:00,  2.16it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 19972.88it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 143.20it/s]



Observation: [36;1m[1;3m1.新聞網址︰
https://newtalk.tw/news/view/2023-02-13/857325
2.新聞來源︰
Newtalk
3.完整新聞標題
不大聲說了嗎？民進黨闢謠專區悄悄下架「林智堅沒有抄襲」頁面
4.完整新聞內容︰
前新竹市長林智堅論文抄襲案，目前雙雙進入司法程序，台大國發所論文1月中旬台北地
院首度召開準備程序庭，傳喚被告林智堅到庭，中華大學論文也被國科會證實，竹科管理
局已依法處理，且已進入司法程序。去年還是民進黨主席的總統蔡英文，曾要全體黨公職
大聲地跟大家說「林智堅沒有抄襲」，並在民進黨網站闢謠專區中放上為林智堅辯護的頁
面，不過，近日被發現已將林智堅論文相關貼文下架。
針對林智堅論文抄襲爭議，總統蔡英文去年8月曾要求全體黨公職表態，「只要是完整看
過兩本論文，而且完整了解事情來龍去脈的人，都願意選擇相信智堅沒有抄襲」，還呼籲
黨公職一起「大聲地跟大家說」，讓社會可以理解整件事情的始末和真相。
民進黨也在黨的網站「闢謠專區」頁面中，放上了「林智堅論文為原創」、「林智堅中華
大學碩論享有著作權 且口試早於著作權轉移 沒有抄襲、欽權疑慮」等貼文，試圖為林智
堅辯護。
不過，隨著林智堅論文案先後被台灣大學與中華大學學倫會判定抄襲，案件也在調查局新
竹市調查站調查官余正煌正式開告後進入司法程序，資深媒體人黃揚明9日曾在臉書發文
表示，民進黨官網的「闢謠專區」仍留有堅稱前新竹市長林智堅論文沒抄襲的2篇文章。
黃揚明呼籲，檢討學倫機制之外，也該把這些顛倒是非的內容下架。
經過了四天，民進黨原先的「闢謠專區」頁面中，已悄悄將林智堅論文沒有抄襲，且為原
創等貼文頁面下架。
5.附註、心得、想法︰
闢謠闢到全黨火葬場了吼
是不是收到風聲台大的結果要出來了
看過小智兩本論文的人現在還有呼吸嗎
還好我下架前看過這原文了
綠共吃屎
1.新聞網址︰
※ 網址超過一行 請縮網址 ※
https://newtalk.tw/news/view/2023-02-15/857533
2.新聞來源︰  新頭殼
3.完整新聞標題
※ 請完整轉載標題 請勿修改與編排 ※
翁達瑞：若連林智堅都無法保護 平民百姓受冤屈豈更不能寄望民進黨
4.完整新聞內容︰
※ 請完整轉載原文 請勿修改內文與編排 ※
前新竹市長林智堅論文抄襲風

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-ZUVi9bYEx0Di5RrvC2VXibwz on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-ZUVi9bYEx0Di5RrvC2VXibwz on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit ht

[32;1m[1;3mAction:
```
{
  "action": "Final Answer",
  "action_input": "2023年2月最受歡迎的人是林智堅。根據檢索到的新聞，林智堅在當時因論文抄襲案受到爭議。其中一篇新聞指出，民進黨闢謠專區悄悄下架了「林智堅沒有抄襲」頁面。另一篇新聞則報導了翁達瑞的言論，認為若連林智堅都無法保護，平民百姓受冤屈豈更不能寄望民進黨。"
}
```

[0m

[1m> Finished chain.[0m


'2023年2月最受歡迎的人是林智堅。根據檢索到的新聞，林智堅在當時因論文抄襲案受到爭議。其中一篇新聞指出，民進黨闢謠專區悄悄下架了「林智堅沒有抄襲」頁面。另一篇新聞則報導了翁達瑞的言論，認為若連林智堅都無法保護，平民百姓受冤屈豈更不能寄望民進黨。'

## Semantic Search

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('GanymedeNil/text2vec-large-chinese', device=2 ,cache_folder='/nfs/nas-6.1/wclu/cache')

# `corpus` was previously undefined at notebook scope (it only exists as a
# local variable inside the retrieval functions), so this cell raised
# NameError on a fresh kernel.
# NOTE(review): embedding every post in data_df is an assumption about the
# intended corpus -- confirm.
corpus = data_df['text'].tolist()

# Embeddings are L2-normalised so that the dot product used at query time
# equals cosine similarity.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True, normalize_embeddings=True, device=2, show_progress_bar=True)

In [None]:
def retrieve_semantic(query):
    """Return the corpus entries most similar to ``query`` by embedding score.

    The query is embedded with ``embedder`` (normalised), scored against
    ``corpus_embeddings`` with a dot product, and the ``corpus`` items of the
    ten best hits are returned in the order ``semantic_search`` reports them.
    """
    query_vec = embedder.encode(
        [query],
        convert_to_tensor=True,
        normalize_embeddings=True,
        device=2,
        show_progress_bar=True,
    )
    ranked = util.semantic_search(
        query_vec,
        corpus_embeddings,
        score_function=util.dot_score,
        top_k=10,
    )
    # One query was submitted, so its hits are the first (only) result list.
    matches = []
    for hit in ranked[0]:
        matches.append(corpus[hit['corpus_id']])
    return matches