In [1]:
!pip3 install slack_sdk==3.21.0

[0m

In [2]:
!pip3 install arxiv==1.4.4

[0m

In [3]:
!pip3 install accelerate>=0.12.0
!pip3 install transformers[torch]==4.25.1

[0mCollecting transformers[torch]==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.28.1
    Uninstalling transformers-4.28.1:
      Successfully uninstalled transformers-4.28.1
Successfully installed transformers-4.25.1
[0m

# SlackとDolly 2.0のGPTで論文の要約をする

## Reference
- [DataBricks Dolly 2.0, Commercial Use, TRUE Open Source LLM](https://www.youtube.com/watch?v=GpWqjNf0SCM)
- [最新の論文をChatGPTで要約して毎朝Slackに共有してくれるbotを作る！](https://zenn.dev/ozushi/articles/ebe3f47bf50a86)
- [Slack API を使用してメッセージを投稿する](https://zenn.dev/kou_pg_0131/articles/slack-api-post-message)
- [【Slack】インストールするボットユーザーがありませんと出たときの対処方法](https://the-simple.jp/slack-nobotuser#Step1Bot)

In [4]:
import os
import random
import time

from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
import arxiv

In [5]:
from transformers import MarianMTModel, MarianTokenizer
from transformers import pipeline

2023-04-16 03:02:22.457424: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [6]:
import torch
from transformers import pipeline

In [7]:
# Bot token used to call the Slack API. It is read from the SLACK_API_TOKEN
# environment variable so the real secret never has to be written into (and
# committed with) the notebook; the original literal placeholder is kept as the
# fallback so the cell still runs when the variable is not set.
SLACK_API_TOKEN = os.environ.get('SLACK_API_TOKEN', 'SLACK_API_TOKEN')
# Name of the Slack channel the summaries are posted to.
SLACK_CHANNEL = "要約"

In [8]:
# Build the Dolly 2.0 text-generation pipeline once; it is reused for every paper.
# trust_remote_code=True is required because the model repository ships its own
# pipeline implementation (instruct_pipeline.py, see the download log below).
generate_text = pipeline(
    model="databricks/dolly-v2-2-8b",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map='auto',
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/820 [00:00<?, ?B/s]

Downloading (…)instruct_pipeline.py:   0%|          | 0.00/7.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

In [13]:
# Lazily-created English->Japanese translation pipeline, shared by all calls.
_fugu_translator = None


def _get_fugu_translator():
    """Return the FuguMT en->ja translation pipeline, building it on first use only."""
    global _fugu_translator
    if _fugu_translator is None:
        _fugu_translator = pipeline('translation', model='staka/fugumt-en-ja')
    return _fugu_translator


def _dolly_output_to_text(generated):
    """Normalise the Dolly pipeline output (str, dict, or list of those) to one string."""
    if isinstance(generated, str):
        return generated
    if isinstance(generated, dict):
        return str(generated.get('generated_text', ''))
    return '\n'.join(_dolly_output_to_text(item) for item in generated)


def get_summary(result):
    """Summarise one paper with Dolly 2.0 and translate the summary into Japanese.

    Args:
        result: A search result exposing ``title`` and ``summary`` strings and a
            ``published`` datetime (the attributes read below).

    Returns:
        A string made of a ``title: ...`` line, a ``date: ...`` line and the
        Japanese translation of the generated text. The translation part is
        empty when generation/translation raised ``IndexError`` or Dolly
        produced no text.

    Side effects:
        Prints the publication date, the raw title/abstract and the translated
        text to stdout.
    """
    # NOTE(review): the two prompt templates below are currently NOT sent to
    # Dolly -- only the title/abstract text is. They are kept verbatim so the
    # intended output format is not lost; wire them into the generate_text()
    # call if that format is actually wanted.
    system = """与えられた論文の全体を128文字以内にまとめた後、タイトルと原文とその日本語訳、概要、日付、新規性や差分、独特の手法、実験結果（評価結果の数値がある場合は、この実験結果にその内容を書いてください）を以下のフォーマットで日本語で出力してください。```
    # タイトルの原文
    # タイトルの日本語訳
    # リンク
    # 日付(yyyy/MM/dd)
    ## 一言でいうと
    ### 概要
    ### 新規性・差分
    ### 手法
    ### 結果
    ### コメント
    ```"""

    system_eng = """After summarizing the entire given paper in 128 characters or less, please output the title, original text and its Japanese translation, summary, date, novelty or difference, unique methods, and experimental results (if there are numerical evaluation results, please describe them in this experimental results) in Japanese in the following format. ````
    # Original title
    # Japanese translation of title
    # Link
    # Date (yyyy/MM/dd)
    ## In a nutshell.
    ### Summary
    ### Novelty/Difference
    ### Methodology
    ### Results
    ### Comments
    ```"""

    # Abstract and title exactly as returned by the search (still in English).
    summary = result.summary
    title = result.title

    text = f"title: {title}\nbody: {summary}"
    date_str = result.published.strftime("%Y-%m-%d %H:%M:%S")
    print(f'# 日付(yyyy/MM/dd)\n{date_str}')
    print(text)

    print()

    # Reuse one translator instead of re-loading the model for every paper.
    fugu_translator = _get_fugu_translator()

    try:
        # Generate with Dolly 2.0, then translate the generated English text.
        # Depending on the pipeline revision Dolly returns either a plain string
        # or a list of {"generated_text": ...} dicts, so normalise it first.
        english_text = _dolly_output_to_text(generate_text(text))
        j_text = fugu_translator(english_text) if english_text.strip() else []
    except IndexError as e:
        # Best effort: keep going with an empty translation, but say so.
        print(f"Summary generation/translation failed: {e}")
        j_text = []

    japanese_text = ''.join(item['translation_text'] for item in j_text)
    print(japanese_text)

    header = f"title: {title}\ndate: {date_str}\n"
    japan_text = header + japanese_text
    print()
    return japan_text

In [10]:
# Search queries sent to arXiv, and the matching labels shown in the Slack message.
# Title-restricted alternative:
# query_list = ['ti:%22 Anomaly Detection %22', 'ti:%22 AIOps %22']
query_list = [
    'AIOps',
    'Anomaly Detection',
    'Ops',
]
# The labels are currently the same words as the queries, kept as a separate list.
message_list = [
    'AIOps',
    'Anomaly Detection',
    'Ops',
]

# Single-topic alternative:
# query_list = ['AIOps']
# message_list = ['AIOps']

In [11]:
# Initialise the Slack API client with the bot token from the config cell.
client = WebClient(token=SLACK_API_TOKEN)

In [14]:
# Maximum number of papers to post per query.
num_papers = 5
# Date stamp shown in every message of this run.
today = time.strftime('%Y-%m-%d', time.localtime())

for j, query in enumerate(query_list):
    # Fetch the most recently submitted papers for this query from the arXiv API.
    search = arxiv.Search(
        query=query,  # search query
        max_results=5,  # number of papers to fetch
        sort_by=arxiv.SortCriterion.SubmittedDate,  # sort by submission date
        sort_order=arxiv.SortOrder.Descending,  # newest first
    )

    # Materialise the search results so they can be sampled.
    result_list = list(search.results())
    if not result_list:
        print(f"No results for query: {query}")
        continue

    # Pick papers at random. The sample size is capped at the number of results
    # actually returned, because random.sample raises ValueError when asked for
    # more items than the population holds.
    results = random.sample(result_list, k=min(num_papers, len(result_list)))

    for i, result in enumerate(results):
        print(result)
        message_base = f"本日 {today}{message_list[j]} の論文 {i+1}本目です\nリンク: {result}\n"

        text = get_summary(result)
        message = message_base + text
        try:
            # Post the message to Slack.
            response = client.chat_postMessage(
                channel=SLACK_CHANNEL,
                text=message
            )
            print(f"Message posted: {response['ts']}")
        except SlackApiError as e:
            # A failed post must not stop the remaining papers from being sent.
            print(f"Error posting message: {e}")

http://arxiv.org/abs/2212.13245v1
# 日付(yyyy/MM/dd)
2022-12-26 18:24:45
title: Studying the Characteristics of AIOps Projects on GitHub
body: Artificial Intelligence for IT Operations (AIOps) leverages AI approaches to
handle the massive data generated during the operations of software systems.
Prior works have proposed various AIOps solutions to support different tasks in
system operations and maintenance (e.g., anomaly detection). In this work, we
investigate open-source AIOps projects in-depth to understand the
characteristics of AIOps in practice. We first carefully identify a set of
AIOps projects from GitHub and analyze their repository metrics (e.g., the used
programming languages). Then, we qualitatively study the projects to understand
their input data, analysis techniques, and goals. Finally, we analyze the
quality of these projects using different quality metrics, such as the number
of bugs. We also sample two sets of baseline projects from GitHub: a random
sample of machine 