In [1]:
!python3 -m pip install --upgrade pip

[0m

In [2]:
# !pip install openai==1.2.3
!pip install openai==1.3.4
!pip3 install arxiv==2.1.0
!pip install -U duckduckgo-search==4.4

!pip install python-dotenv tiktoken
!pip install pdfplumber

[0m

In [3]:
from time import time

class Timer:
    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

In [4]:
from contextlib import contextmanager
from time import time

class Timer:
    """処理時間を表示するクラス
    with Timer(prefix=f'pred cv={i}'):
        y_pred_i = predict(model, loader=test_loader)
    
    with Timer(prefix='fit fold={} '.format(i)):
        clf.fit(x_train, y_train, 
                eval_set=[(x_valid, y_valid)],  
                early_stopping_rounds=100,
                verbose=verbose)

    with Timer(prefix='fit fold={} '.format(i), verbose=500):
        clf.fit(x_train, y_train, 
                eval_set=[(x_valid, y_valid)],  
                early_stopping_rounds=100,
                verbose=verbose)
    """
    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' ', verbose=0):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None
        self.verbose = verbose

    @property
    def duration(self):
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

In [5]:
import openai
import pdfplumber
from openai import OpenAI
import tiktoken
from dotenv import load_dotenv
import os
import json
import arxiv
import datetime as dt

load_dotenv()

True

In [6]:
# MODEL_NAME = "gpt-3.5-turbo-0125"
# MODEL_NAME = "gpt-3.5-turbo-instruct"
MODEL_NAME = "gpt-4-0125-preview"
TEMPERATURE = 0.7
# OpenAIクライアントの初期化
client = OpenAI()

In [7]:

def generate_research_questions_and_purpose_with_gpt(objective, num_questions, client):
    # プランナーエージェント: 研究目的から研究質問と検索文字列を生成します
    # Construct the prompt dynamically
    prompt_content = f"You are a helpful assistant capable of generating research questions along with their purposes for a systematic literature review.\n"
    prompt_content = f"Given the research objective: '{objective}', generate {num_questions} distinct research questions, each followed by its specific purpose. 'To examine', or 'To investigate'."
    
    response = client.chat.completions.create(
        # model="gpt-3.5-turbo",
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": "You are a helpful assistant capable of generating research questions along with their purposes for a systematic literature review."},
            {"role": "user", "content": prompt_content}
        ],
        # response_format={ "type": "json_object" },
        temperature=TEMPERATURE,
    )
    result = response.choices[0].message.content
    return {"research_questions": result}


In [8]:
objective = "RAG Evaluation Methods"
# objective = "RAG検証方法"

num_questions = 5

In [9]:
questions_and_purposes = generate_research_questions_and_purpose_with_gpt(objective, num_questions, client)
print(questions_and_purposes)

{'research_questions': 'Research Question 1: What are the existing methods for evaluating RAG (Red, Amber, Green) status in project management?\nPurpose: To examine the different evaluation techniques and approaches used to assess RAG status in project management and identify their strengths and limitations.\n\nResearch Question 2: How do different industries utilize RAG evaluation methods in project management?\nPurpose: To investigate the application of RAG evaluation methods across various industries and determine any sector-specific adaptations or best practices.\n\nResearch Question 3: What are the key factors influencing the accuracy and reliability of RAG evaluations in project management?\nPurpose: To examine the factors that impact the precision and dependability of RAG assessments in project management and suggest potential strategies for improvement.\n\nResearch Question 4: How do stakeholders perceive the effectiveness of RAG evaluation methods in project management?\nPurpo

In [10]:
print(questions_and_purposes['research_questions'])

Research Question 1: What are the existing methods for evaluating RAG (Red, Amber, Green) status in project management?
Purpose: To examine the different evaluation techniques and approaches used to assess RAG status in project management and identify their strengths and limitations.

Research Question 2: How do different industries utilize RAG evaluation methods in project management?
Purpose: To investigate the application of RAG evaluation methods across various industries and determine any sector-specific adaptations or best practices.

Research Question 3: What are the key factors influencing the accuracy and reliability of RAG evaluations in project management?
Purpose: To examine the factors that impact the precision and dependability of RAG assessments in project management and suggest potential strategies for improvement.

Research Question 4: How do stakeholders perceive the effectiveness of RAG evaluation methods in project management?
Purpose: To investigate the perspective

In [11]:
def extract_search_strings(content):
    possible_operators = ['AND', 'OR', 'NOT', '"']
    search_strings = []
    for line in content.split('\n'):
        if any(op in line for op in possible_operators):
            search_strings.append(line.strip())  # strip()を追加して余分な空白を削除
    return search_strings if search_strings else [content]

def generate_search_string_with_gpt(objective, research_questions, client):
    # 生成された検索文字列を使用して学術データベースをクエリし、関連論文の初期セットを取得します。
    # Removed the explicit instruction for logical operators
    combined_prompt = f"Given the research objective: '{objective}', and the following research questions: {research_questions['research_questions']}, generate two concise search string for identifying relevant literature for literature review.Do not include OR. Use AND if needed."

    response = client.chat.completions.create(
        # model="gpt-3.5-turbo",
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": combined_prompt}
        ],
        # response_format={ "type": "json_object" },
        temperature=TEMPERATURE,
    )
    
    content = response.choices[0].message.content
    search_string = extract_search_strings(content)
    return search_string

In [12]:
generate_search_string = generate_search_string_with_gpt(objective, questions_and_purposes, client)
print(generate_search_string)

['Search String 1: ("evaluating RAG status" OR "RAG evaluation methods") AND ("project management" OR "project monitoring")', 'Search String 2: ("RAG evaluation methods" OR "Red Amber Green assessment techniques") AND ("industry applications" OR "sector-specific adaptations")']


In [13]:
# # キーワードの入力
# search_strings = [
#     '"Risk Assessment and Governance evaluation methods"',
#     '"Organizations measure success outcomes Risk Assessment Governance initiatives"'
# ]

In [14]:
SYSTEM = """
### 指示 ###
論文の内容を理解した上で，重要なポイントを箇条書きで3点書いてください。

### 箇条書きの制約 ###
- 最大3個
- 日本語
- 箇条書き1個を50文字以内

### 対象とする論文の内容 ###
{text}

### 出力形式 ###
タイトル(和名)

- 箇条書き1
- 箇条書き2
- 箇条書き3
"""

In [15]:
# arXivの更新頻度を加味して、365日前の論文を検索
N_DAYS =365

MAX_RESULT = 10  # 取得する論文数の上限
# MODEL_NAME = "gpt-3.5-turbo-0613"
# MODEL_NAME = "gpt-3.5-turbo-1106"
MODEL_NAME = "gpt-3.5-turbo-0125"

# MODEL_NAME = "gpt-3.5-turbo-instruct"
TEMPERATURE = 0.7
# OpenAIクライアントの初期化
client = OpenAI()

# テンプレートを用意
QUERY_TEMPLATE = '%28 ti:%22{}%22 OR abs:%22{}%22 %29 AND submittedDate: [{} TO {}]'

# 検索を行い、結果を取得する関数
def search_arxiv(keyword):
    # Construct the default API client.
    client = arxiv.Client()
    # 2日前からN_DAYS前までの論文を検索
    today = dt.datetime.today() - dt.timedelta(days=2)
    # today = dt.datetime.today()
    
    base_date = today - dt.timedelta(days=N_DAYS)
    query = QUERY_TEMPLATE.format(keyword, keyword, base_date.strftime("%Y%m%d%H%M%S"), today.strftime("%Y%m%d%H%M%S"))

    search = arxiv.Search(
        query=query,
        max_results=MAX_RESULT,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )

    results = client.results(search)
    return results

# 論文の要約を取得する関数
def get_summary(result):
    text = f"title: {result.title}\nbody: {result.summary}"

    messages = [
        {"role" : "system", "content" : SYSTEM},
        {"role": "user", "content": text}
    ]
    
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=TEMPERATURE,
    )
    return response.choices[0].message.content


In [16]:
import re
def simplify_search_queries(complex_queries):
    simplified_queries = []

    for query in complex_queries:
        # 数字とピリオドを除去して、クエリの本体だけを抽出
        clean_query = re.sub(r'^\d+\.\s*', '', query)
        
        # 括弧を除去
        clean_query = re.sub(r'[()"]', '', clean_query)
        
        # 'AND' と 'OR' で分割
        split_queries = re.split(r'\sAND\s|\sOR\s', clean_query)
        
        # 分割したクエリをリストに追加
        for sub_query in split_queries:
            sub_query = sub_query.strip()
            if sub_query and sub_query not in simplified_queries:
                simplified_queries.append(sub_query)
                
    return simplified_queries

In [17]:
simplified_queries = simplify_search_queries(generate_search_string)
# 複数の単語の間のスペースをハイフンに置換
# modified_queries = [query.replace(" ", "") for query in simplified_queries]
modified_queries = [query.replace(" ", "_") for query in simplified_queries]
# modified_queries = [query.split(" ") for query in simplified_queries]

for query in modified_queries:
    print(query)

Search_String_1:_evaluating_RAG_status
RAG_evaluation_methods
project_management
project_monitoring
Search_String_2:_RAG_evaluation_methods
Red_Amber_Green_assessment_techniques
industry_applications
sector-specific_adaptations


In [18]:
simplified_queries

['Search String 1: evaluating RAG status',
 'RAG evaluation methods',
 'project management',
 'project monitoring',
 'Search String 2: RAG evaluation methods',
 'Red Amber Green assessment techniques',
 'industry applications',
 'sector-specific adaptations']

In [20]:
import arxiv
# デフォルトのAPIクライアントを構築する。
arxivclient = arxiv.Client()

# 検索条件を指定する。
# query: 検索キーワードを指定する。ここでは "GPT-4" を指定。
# max_results: 取得する論文の最大件数を指定する。ここでは 10 件。
# sort_by: 論文の並び替え条件を指定する。ここでは投稿日時の降順（最新順）。

# for query in modified_queries:
for query in simplified_queries:
    print(query)
    search = arxiv.Search(
        query = query,
        max_results = 10,
        sort_by = arxiv.SortCriterion.SubmittedDate
    )

    # 検索を実行し、結果を取得する。
    results = arxivclient.results(search)
    # 取得した論文のタイトルを1件ずつ表示する。
    for r in results:
        print(f"\n{str(r.title)}\n{get_summary(r)}\n{r}")
    print("-" * 50)

Search String 1: evaluating RAG status

Holo-Relighting: Controllable Volumetric Portrait Relighting from a Single Image
ホロ・リライティング: 単一画像から制御可能なボリューメトリックポートレートのリライティング

- 単一画像から新しい視点と照明を合成可能
- ヘッドポーズに依存した照明効果の生成
- 物理的な照明の事前知識なしで非ランバート照明効果を生成
http://arxiv.org/abs/2403.09632v1

Generating functional of correlators of twist-$2$ operators in $\mathcal{N} = 1$ SUSY Yang-Mills theory, I
$\mathcal{N} = 1$ SUSY Yang-Mills理論におけるtwist-$2$演算子の相関関数の生成汎関数

- twist-$2$演算子の相関関数の生成汎関数を計算
- 大$N$展開のleadingとnext-to-leading orderで計算
- 非摂動解への強いUV漸近制約を設定
http://arxiv.org/abs/2403.09617v1

pARam: Leveraging Parametric Design in Extended Reality to Support the Personalization of Artifacts for Personal Fabrication
pARam: パラメトリックデザインを拡張現実に活用して、個人製作用のアーティファクトのパーソナライゼーションをサポートする

- 拡張現実とパラメトリックデザインを組み合わせたpARamは、複雑な3Dモデリングの必要性を排除し、ジェスチャーや照明推定などの実践的な入力を通じて、個人製作のためのアーティファクトのインタラクティブな構成を可能にする。
- pARamを使用したユーザーは、コンテキストに関連するパラメータを選択し、環境を考慮して設定を行うことに成功し、その効果を示した。
- パラメトリックデザインの拡張現実における活用は、個人製作のための複雑なデザイン手法を合理化する一方、適切な表現

In [21]:
generate_search_string


# デフォルトのAPIクライアントを構築する。
arxivclient = arxiv.Client()

# 検索条件を指定する。
# query: 検索キーワードを指定する。ここでは "GPT-4" を指定。
# max_results: 取得する論文の最大件数を指定する。ここでは 10 件。
# sort_by: 論文の並び替え条件を指定する。ここでは投稿日時の降順（最新順）。
for query in generate_search_string:
    print(query)
    search = arxiv.Search(
        query = query,
        max_results = 10,
        sort_by = arxiv.SortCriterion.SubmittedDate
    )

    # 検索を実行し、結果を取得する。
    results = arxivclient.results(search)
    # 取得した論文のタイトルを1件ずつ表示する。
    for r in results:
        print(f"\n{str(r.title)}\n{get_summary(r)}\n{r}")
    print("-" * 50)

Search String 1: ("evaluating RAG status" OR "RAG evaluation methods") AND ("project management" OR "project monitoring")
--------------------------------------------------
Search String 2: ("RAG evaluation methods" OR "Red Amber Green assessment techniques") AND ("industry applications" OR "sector-specific adaptations")
--------------------------------------------------


In [None]:
# client = arxiv.Client()
# for query in modified_queries:
#     search = arxiv.Search(
#         query = query,
#         max_results = 10,
#         sort_by = arxiv.SortCriterion.SubmittedDate
#     )

#     # 検索を実行し、結果を取得する。
#     results = client.results(search)
#     # 取得した論文のタイトルを1件ずつ表示する。
#     for r in results:
#         print(r.title)
#         print(r)
        
#         print(r.summary)
#     print("-" * 50)

In [None]:
# results = arxiv.Search(
#         query = "AI",
#         max_results = 10,
#         sort_by = arxiv.SortCriterion.SubmittedDate
#     )
# print(results)

In [None]:
# for k in modified_queries:
#     for keyword in k:
#         print(f"Searching for: {keyword}\n")
#         try:
#             results = search_arxiv(str(keyword))
#         except Exception as e:
#             print(f"Error searching for keyword '{keyword}': {e}")
#             continue  # 検索中にエラーが発生した場合、次のキーワードの検索に進む

#         for result in results:
#             try:
#                 # summary = get_summary(result)
#                 print()
#             except KeyboardInterrupt:
#                 print("KeyboardInterrupt detected, skipping to the next result.")
#                 continue  # KeyboardInterruptが発生した場合、次の論文の処理に進む
#             except Exception as e:
#                 print(f"Error getting summary for result '{result.title}': {e}")
#                 continue  # その他のエラーが発生した場合、次の論文の処理に進む

#             # エラーが発生しなかった場合の処理をここに記述
#             print(f"title: {result.title}")
#             print(f"published: {result.published}")
#             # print(f"abstract: {result.summary}")
#             print(f"PDF link: {result.pdf_url}")
#             # print(f"summary: {summary}")
#         print("-" * 50)


```
from bs4 import BeautifulSoup
import requests
import re

def get_search_results(keyword, number=5):
    # Google Scholarの検索URLを構築
    html_doc = requests.get(f"https://scholar.google.co.jp/scholar?hl=ja&num={number}&q=" + keyword).text
    soup = BeautifulSoup(html_doc, "html.parser")  # BeautifulSoupの初期化
    
    # 必要な情報を抽出
    tags_title_url = soup.find_all("h3", {"class": "gs_rt"})  # タイトル&URL
    tags_author_year = soup.find_all("div", {"class": "gs_a"})  # 著者&年
    tags_citations = soup.find_all("div", {"class": "gs_fl"})  # 引用元リンクが含まれるセクション

    for tag_title, tag_author_year, tag_citation in zip(tags_title_url, tags_author_year, tags_citations):
        title = tag_title.text.replace("[HTML]", "").replace("[PDF]", "")
        url = tag_title.find('a')['href']
        
        citation_link = None
        for a in tag_citation.find_all('a'):
            if "引用" in a.text:
                citation_link = a['href']
                break
                
        citations = re.search(r'\d+', a.text) if citation_link else '0'
        citations = citations.group(0) if citations else '0'

        print(f"Title: {title}\nURL: {url}\nCitations: {citations}")
        
        # 引用元リンクがあれば、そのURLを表示（引用元ページからさらに情報を取得する場合に使用）
        if citation_link:
            print(f"Citation URL: https://scholar.google.co.jp{citation_link}")

        # 概要の取得はGoogle ScholarのHTML構造に依存するため、概要を直接取得することは推奨されていません。
        # 代わりに、論文のURLを訪れて内容を確認してください。
        
        print("--------------------------------------------------\n")

search_strings = [
    '"Risk Assessment and Governance evaluation methods"',
    '"Organizations measure success outcomes Risk Assessment Governance initiatives"'
]

for keyword in search_strings:
    print(f"Searching for: {keyword}\n")
    get_search_results(keyword, number=2)

# search_strings = [
#     '1. "Risk Assessment and Governance evaluation methods"',
#     '2. "Organizations measure success outcomes Risk Assessment Governance initiatives"'
# ]


for keyword in generate_search_string:
    print(f"Searching for: {keyword}\n")
    get_search_results(keyword, number=2)



def get_title_and_url(soup):
    """obtain title and url from soup
    :param soup: parsed html by BeautifulSoup
    :return: title_list, url_list
    """
    tags1 = soup.find_all("h3", {"class": "gs_rt"})
    title_list = []
    url_list = []
    for tag1 in tags1:
        # タイトル取得
        # PDF, 書籍, B, HTML, 引用, Cのタグを除去
        title = re.sub(r"\[(PDF|書籍|B|HTML|引用|C)\]", "", tag1.text)
        # 空白区切りを廃止
        title = "_".join(title.split(" "))
        if title[0] == "_":
            title = title[1:]
        title_list.append(title)

        # url取得
        try:
            url = tag1.select("a")[0].get("href")
            url_list.append(url)
        except IndexError:
            url_list.append(None)
    return title_list, url_list


def get_writer_and_year(soup):
    """obtain writer(author) and year from soup
    :param soup: parsed html by BeautifulSoup
    :return: writer_list, year_list
    """
    tags2 = soup.find_all("div", {"class": "gs_a"})
    writer_list = []
    year_list = []
    for tag2 in tags2:
        # 著者取得
        """
        writer = tag2.text
        writer = re.sub(r"\d", "", writer)
        for char in range(0, len(writer)):
            if writer[char] == "-":
                writer = writer[2 : char - 1]
                break
        """
        writer = tag2.text.split("\xa0- ")[0]
        writer_list.append(writer)

        # 論文発行年取得
        year = tag2.text
        year = re.sub(r"\D", "", year)
        # yearが5桁以上だった場合の例外処理
        if len(year) > 4:
            year_list.append(year[len(year) - 4 : len(year)])
        else:
            year_list.append(year)
    return writer_list, year_list


def get_citations(soup):
    """obtain number of citations from soup
    :param soup: parsed html by BeautifulSoup
    :return: ci_num_list
    """
    tags3 = soup.find_all(text=re.compile("引用元"))
    ci_num_list = []
    for tag3 in tags3:
        # 被引用数取得
        citation = tag3.replace("引用元", "")
        ci_num_list.append(int(citation))
    return ci_num_list


def get_id(soup):
    """obtain paper id from soup
    :param soup: parsed html by BeautifulSoup
    :return: ci_num_list
    """
    tags4 = soup.find_all("div", {"class": "gs_fl"})
    p_id_list = []
    for tag4 in tags4:
        # 論文ID取得
        try:
            elem = tag4.find_all("a")[2]["href"]
            a = 15
            while True:
                if elem[a] == "&":
                    break
                a += 1
            p_id_list.append(elem[15:a])
        except:
            print("")
    return p_id_list

def year_list_to_cite_years(year_list,p_year):
    """convert year_list into cite_years
    :param year_list,p_year:
    :return: cite_years
    """
    year_list_int = []
    for s in year_list:
        try:
            year_list_int.append(int(s))
        except:
            pass
    y = [p_year+i for i in range(2021 - p_year + 1)]
    cite_years = [0 for _ in range(2021 - p_year + 1)]
    for year in year_list_int:
        if year >= p_year and year <= 2021:
            cite_years[year - p_year] += 1
    list_return = [y, cite_years]
#    cite_years = pd.DataFrame(cite_years,
#                       index=y,
#                       columns=['total'])
#    cite_years  = cite_years.T
    return list_return



def make_url(keyword, conf, author, year, paper_id=None):
    """make url for search papers
    normal search (keyword, conf, author, year) or target search (paper_id)
    :param keyword: str or None
    :param conf: str or None, conference information
    :param author: str or None, author information
    :param year: int or None, published year
    :param paper_id: None or int, paper information
    :return: url
    """
    assert (
        keyword is not None
        or conf is not None
        or author is not None
        or year is not None
        or paper_id is not None
    ), "KeywordNotFoundError"
    url = "https://scholar.google.co.jp/scholar?"
    if paper_id is not None:
        url += f"&cites={paper_id}"
    else:
        url += "&as_sdt=0%2C5"
        if keyword is not None:
            url += f"&as_q={'%20'.join(keyword.split())}"
        else:
            url += "&as_q="
        if conf is not None:
            url += f"&as_publication={'%20'.join(conf.split())}"
        if author is not None:
            author = "+".join(author.split())
            url += f"&as_sauthors={'%20'.join(author.split())}"
        if year is not None:
            url += f"&as_ylo={year}"
    return url



def get_snippet(soup):
    """obtain snippet from soup
    :param soup: parsed html by BeautifulSoup
    :return: snippet_list
    """
    tags = soup.find_all("div", {"class": "gs_rs"})
    snippet_list = [tags[i].text for i in range(len(tags))]
    return snippet_list


def grep_candidate_papers(url):
    html_doc = requests.get(url).text
    soup = BeautifulSoup(html_doc, "html.parser")

    title_list, url_list = get_title_and_url(soup)
    writer_list, year_list = get_writer_and_year(soup)
    ci_num_list = get_citations(soup)
    p_id_list = get_id(soup)
    snippet_list = get_snippet(soup)

    for i in range(len(title_list)):
        print("-" * 20)
        print(f"paper number: {i}")
        print(f"paper title: {title_list[i]}")
        print(f"published year: {year_list[i]}")
        print(f"citations: {ci_num_list[i]}")

    print(f"\nSelect a paper number between 0 and {len(title_list)-1}")
    while True:
        try:
            target_paper_num = int(input("Select paper number: "))
            if 0 <= target_paper_num < len(title_list):
                break
            else:
                print(f"Please enter a number between 0 and {len(title_list)-1}.")
        except ValueError:
            print("Invalid input. Please enter a valid number.")
    
    target_paper = {
        "title": title_list[target_paper_num],
        "writer": writer_list[target_paper_num],
        "year": year_list[target_paper_num],
        "citations": ci_num_list[target_paper_num],
        "url": url_list[target_paper_num],
        "paper_id": p_id_list[target_paper_num],
        "snippet": snippet_list[target_paper_num],
    }
    return target_paper



def scraping_papers(url):
    """scrape 100 papers
    :param url: target url
    :return: title_list, url_list, writer_list, year_list, ci_num_list, p_id_list, snippet_list
    """
    url_each = url.split("&")
    url_each[0] = url_each[0] + "start={}"
    url_base = "&".join(url_each)

    title_list = []
    url_list = []
    writer_list = []
    year_list = []
    ci_num_list = []
    p_id_list = []
    snippet_list = []

    for page in range(0, 100, 10):
        print("Loading next {} results".format(page + 10))
        url_tmp = url_base.format(page)
        html_doc = requests.get(url_tmp).text
        soup = BeautifulSoup(html_doc, "html.parser")

        title_list_tmp, url_list_tmp = get_title_and_url(soup)
        writer_list_tmp, year_list_tmp = get_writer_and_year(soup)
        ci_num_list_tmp = get_citations(soup)
        p_id_list_tmp = get_id(soup)
        snippet_list_tmp = get_snippet(soup)

        title_list.extend(title_list_tmp)
        url_list.extend(url_list_tmp)
        writer_list.extend(writer_list_tmp)
        year_list.extend(year_list_tmp)
        ci_num_list.extend(ci_num_list_tmp)
        p_id_list.extend(p_id_list_tmp)
        snippet_list.extend(snippet_list_tmp)

        sleep(np.random.randint(5, 10))
    return (
        title_list,
        url_list,
        writer_list,
        year_list,
        ci_num_list,
        p_id_list,
        snippet_list,
    )
```

In [None]:

def check_paper_relevance_and_keywords(title, search_string, client):
    # Adjust the prompt to ask for relevance and keywords
    prompt = (f"Determine if the paper titled '{title}' is relevant to the topic '{search_string}'. "
              "and in return just informed paper is relevant or paper is not relevant, to the point.")

    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a knowledgeable assistant."},
            {"role": "user", "content": prompt}
        ]
    }
    response = client.chat.completions.create(
        # model="gpt-3.5-turbo",
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": "You are a knowledgeable assistant."},
            {"role": "user", "content": prompt}
        ],
        # response_format={ "type": "json_object" },
        temperature=TEMPERATURE,
    )
    content = response.choices[0].message.content.strip().lower()
    
    return content

```
# キーワードの入力
search_strings = [
    '"Risk Assessment and Governance evaluation methods"',
    '"Organizations measure success outcomes Risk Assessment Governance initiatives"'
]

for keyword in search_strings:
    print(f"Searching for: {keyword}\n")
    
    # 検索用URLの作成
    url = make_url(keyword=keyword, conf=None, author=None, year=None)
    
    # 候補となる論文の選択
    print("Please select a paper")
    selected_paper = grep_candidate_papers(url)
    
    # 選択された論文の情報を表示
    print(f"Selected Paper: {selected_paper['title']}")
    print(f"URL: {selected_paper['url']}")
    print(f"Citations: {selected_paper['citations']}")
    print(f"Snippet: {selected_paper['snippet']}\n")
    
    # 選択された論文の引用論文情報の取得
    url_cite = make_url(paper_id=selected_paper["paper_id"])
    cited_papers_info = scraping_papers(url_cite)
    
    # 引用論文情報の表示 (例: タイトルとURL)
    for title, url in zip(cited_papers_info[0], cited_papers_info[1]):
        print(f"Cited Paper Title: {title}")
        print(f"Cited Paper URL: {url}\n")
```

```
# 検索パラメータを設定
# N_DAYS = 365  # 過去30日間の論文を検索
N_DAYS = 30  # 過去30日間の論文を検索

MAX_RESULT = 5  # 最大結果数
QUERY_TEMPLATE = 'all:{} AND submittedDate:[{} TO {}]'  # 検索クエリテンプレート

# 検索を行い、結果を取得する関数
def search_arxiv_with_keywords(keywords):
    client = arxiv.Client()
    results_list = []

    today = dt.datetime.today() - dt.timedelta(days=2)
    base_date = today - dt.timedelta(days=N_DAYS)

    for keyword in keywords:
        query = QUERY_TEMPLATE.format(keyword, base_date.strftime("%Y-%m-%d"), today.strftime("%Y-%m-%d"))
        
        search = arxiv.Search(
            query=query,
            max_results=MAX_RESULT,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending,
        )

        results = client.results(search)
        
        for result in results:
            # 特定の条件に基づいてフィルタリングを行う場合はここに追加
            results_list.append({
                'title': result.title,
                'summary': result.summary,
                'url': result.entry_id  # arXivへのリンク
            })

    return results_list

search_results = search_arxiv_with_keywords(generate_search_string)
for result in search_results:
    print(f"Title: {result['title']}\nSummary: {result['summary']}\nURL: {result['url']}\n")

# うまくいかないのでクエリを小さくする(本ちゃんではこれ使いたくねーな)

import re

def simplify_search_queries(complex_queries):
    simplified_queries = []

    for query in complex_queries:
        # 数字とピリオドを除去して、クエリの本体だけを抽出
        clean_query = re.sub(r'^\d+\.\s*', '', query)
        
        # 括弧を除去
        clean_query = re.sub(r'[()"]', '', clean_query)
        
        # 'AND' と 'OR' で分割
        split_queries = re.split(r'\sAND\s|\sOR\s', clean_query)
        
        # 分割したクエリをリストに追加
        for sub_query in split_queries:
            sub_query = sub_query.strip()
            if sub_query and sub_query not in simplified_queries:
                simplified_queries.append(sub_query)
                
    return simplified_queries


simplified_queries = simplify_search_queries(generate_search_string)
for query in simplified_queries:
    print(query)

from tqdm import tqdm
import time

# for query in tqdm(simplified_queries, desc="Searching"):
#     print(f"Query: {query}")
#     search_results = search_arxiv_with_keywords(query)
#     for result in search_results:
#         print(f"Title: {result['title']}\nSummary: {result['summary']}\nURL: {result['url']}\n")
#     time.sleep(2)  # 検索ごとに2秒間待機します。



from duckduckgo_search import DDGS
import requests
from bs4 import BeautifulSoup


# テキスト検索用の関数
def search_text(keywords, region='wt-wt', safesearch='moderate', timelimit=None, max_results=3):
    with DDGS() as ddgs:
        results = [r for r in ddgs.text(keywords, region=region, safesearch=safesearch, timelimit=timelimit, max_results=max_results)]
        time.sleep(2)
    return results

# BeautifulSoupを使ってウェブページから情報を抽出する関数
def extract_info_from_url(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # 必要な情報を抽出するためのコードをここに追加します。
        # 例えば、ページの全ての段落テキストを取得する場合:
        paragraphs = soup.find_all('p')
        text = ' '.join([p.text for p in paragraphs])
        return text
    except Exception as e:
        print(f"Error extracting information from {url}: {e}")
        return None




for query in tqdm(simplified_queries, desc="Searching"):
    print(query)
    text_results = search_text(query)
    print(text_results)
    for result in text_results:
        print(f"Title: {result['title']}\nURL: {result['href']}\n")
        # # URLから情報を抽出
        # extracted_info = extract_info_from_url(result['href'])
        # print(f"Extracted Info: {extracted_info}\n")

import time
from tqdm import tqdm
from duckduckgo_search import DDGS  # あるいは適切なモジュール名

# テキスト検索用の関数
def search_text(keywords, max_results=3):
    with DDGS() as ddgs:
        results = [r for r in ddgs.text(keywords, max_results=max_results)]
        time.sleep(2)  # 検索ごとに2秒間待機
    return results

# 検索クエリを簡略化する関数
def simplify_search_queries(complex_queries):
    simplified_queries = []
    # (クエリを簡略化するロジック)
    return simplified_queries

# 複雑なクエリのリスト
complex_queries = [
    '1. ("RAG evaluation methods" OR "risk assurance governance evaluation methods") AND ("accuracy" AND "efficiency")',
    '2. ("RAG evaluation methods" OR "risk assurance governance evaluation methods") AND ("factors influencing" AND "development" AND "implementation")'
]

# 簡略化されたクエリのリストを取得
simplified_queries = simplify_search_queries(complex_queries)

# tqdmを使用して進捗状況を表示しながら検索を実行
for query in tqdm(simplified_queries, desc="Searching"):
    print(f"Query: {query}")
    search_results = search_text(query)
    for result in search_results:
        print(f"Title: {result['title']}\nSummary: {result['summary']}\nURL: {result['url']}\n")
```