https://aclanthology.org/events/eacl-2024/#2024eacl-long

Open the page above in a browser, then right-click --> Save As... --> save it as HTML. Point `file_path` in the cell below at the saved file.

In [3]:
import pandas as pd
from bs4 import BeautifulSoup

# Load the locally saved event page (adjust the path to where the HTML is stored)
file_path = '/content/The 18th Conference of the European Chapter of the Association for Computational Linguistics - ACL Anthology.html'
with open(file_path, 'r', encoding='utf-8') as f:
    content = f.read()

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')

# Locate the section for long papers; fail with a readable message if the
# saved page does not contain it (wrong file, or the page layout changed)
long_papers_section = soup.find(id="2024eacl-long")
if long_papers_section is None:
    raise ValueError(f'No element with id "2024eacl-long" found in {file_path!r}')

# Find all the individual paper entries.
# NOTE: a multi-class string passed to class_ only matches when the element's
# class attribute is exactly this string (same classes, same order).
papers = long_papers_section.find_all('p', class_='d-sm-flex align-items-stretch')

# Extract the title and URL for each paper and store in a list
long_papers = []
for paper in papers:
    title_tag = paper.find('strong')
    title = title_tag.get_text(strip=True) if title_tag is not None else "No title"

    # Only look for the link when a title element exists; previously a missing
    # <strong> raised AttributeError here despite the "No title" fallback above
    url_tag = title_tag.find('a') if title_tag is not None else None
    href = url_tag.get('href') if url_tag is not None else None
    url = f"https://aclanthology.org{href}" if href else "No URL"

    long_papers.append({'title': title, 'url': url})

# Create a DataFrame from the extracted data; explicit columns keep the schema
# stable even when no paper entries were found
df_long_papers = pd.DataFrame(long_papers, columns=['title', 'url'])

# Display the DataFrame
df_long_papers

Unnamed: 0,title,url
0,Proceedings of the 18th Conference of the Euro...,https://aclanthology.org/2024.eacl-long.0/
1,Enhancing Ethical Explanations of Large Langua...,https://aclanthology.org/2024.eacl-long.1/
2,Multi-Relational Hyperbolic Word Embeddings fr...,https://aclanthology.org/2024.eacl-long.2/
3,Anisotropy Is Inherent to Self-Attention in Tr...,https://aclanthology.org/2024.eacl-long.3/
4,Generating Benchmarks for Factuality Evaluatio...,https://aclanthology.org/2024.eacl-long.4/
...,...,...
177,Analyzing the Evaluation of Cross-Lingual Know...,https://aclanthology.org/2024.eacl-long.177/
178,Large-Scale Label Interpretation Learning for ...,https://aclanthology.org/2024.eacl-long.178/
179,MLCopilot: Unleashing the Power of Large Langu...,https://aclanthology.org/2024.eacl-long.179/
180,Text-Guided Image Clustering,https://aclanthology.org/2024.eacl-long.180/


In [4]:
!pip -q install openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/383.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m378.9/383.0 kB[0m [31m14.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.0/383.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
import pandas as pd
from openai import OpenAI
from google.colab import userdata
import ast  # Safe alternative to eval for parsing strings to Python dictionaries

# Build the OpenAI API client; the key is read from google.colab's userdata
# under the name 'OPENAI_TOKEN', so no secret is written into the notebook.
client = OpenAI(api_key=userdata.get('OPENAI_TOKEN'))

# Instruction text sent to the model for every paper. It asks for one score
# (0.0-1.0) per topic, returned as a Python dictionary literal.
# check_paper_topics appends the paper title to this text to build the final prompt.
base_prompt = """
Given the following title of a research paper, determine if it relates to any of the following topics: Instruction Tuning, Knowledge Graph, and Hallucination.

For each topic, provide a confidence score from 0.0 to 1.0. These scores should represent how closely the paper title relates to the topic, with 0 being completely unrelated and 1 being strongly related. Return the results in valid Python dictionary format as follows:

{
    "Instruction Tuning": <score>,
    "Knowledge Graph": <score>,
    "Hallucination": <score>
}

Ensure the output is strictly formatted as a Python dictionary.
"""

def _parse_relevance_scores(response_text):
    """Convert the model's reply text into a ``{topic: score}`` dict of floats.

    Args:
        response_text: Raw text of the model's reply.

    Returns:
        A dict mapping each key to ``float(value)``, or None when the reply
        parses to something other than a dict.

    Raises:
        ValueError, SyntaxError: the text is not a valid Python literal.
        TypeError, ValueError: a score cannot be converted to float.
    """
    # The reply may arrive wrapped in a Markdown code fence or with extra prose
    # around it; keep only the outermost {...} span before parsing.
    start = response_text.find('{')
    end = response_text.rfind('}')
    if start != -1 and end > start:
        response_text = response_text[start:end + 1]

    # ast.literal_eval only evaluates literals, so arbitrary code in the reply is never executed
    parsed = ast.literal_eval(response_text)
    if not isinstance(parsed, dict):
        return None

    # Coerce every score to float so callers can compare scores numerically
    return {topic: float(score) for topic, score in parsed.items()}


# Function to interact with the gpt-4o-mini model and get the relevance scores for each topic
def check_paper_topics(title, retries=5):
    """Ask the model how strongly a paper title relates to each topic.

    Args:
        title: Paper title appended to ``base_prompt``.
        retries: Maximum number of request/parse attempts.

    Returns:
        A dict mapping topic name to a float score, or None when no attempt
        produced a parsable dictionary.
    """
    prompt = base_prompt + f"\n\nPaper Title: {title}\n"

    for attempt in range(retries):
        try:
            # Send the prompt to the model and get the response
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                model="gpt-4o-mini",
            )

            # Extract the generated text and parse it into a score dictionary
            response_text = chat_completion.choices[0].message.content.strip()
            relevance_scores = _parse_relevance_scores(response_text)

            if relevance_scores is not None:
                return relevance_scores
            print(f"Attempt {attempt + 1}: Response is not a dict. Regenerating...")
        except Exception as e:
            # Best effort: request errors and parse errors are both retried
            print(f"Attempt {attempt + 1} failed with error: {e}. Retrying...")

    # If all retries fail, return None
    print(f"Failed to generate a valid response after {retries} attempts. Skipping this title.")
    return None

# Score the best-matching topic must exceed (strictly) for a paper to be kept
RELEVANCE_THRESHOLD = 0.7

# List to store papers with relevant topics
related_papers = []

# Iterate through each paper's title and URL in the DataFrame
for title, url in zip(df_long_papers['title'], df_long_papers['url']):
    # Ask the model for per-topic relevance scores
    response = check_paper_topics(title)
    print(f"model response: {response}\t{type(response)}\t{url}")

    # None (all retries failed) or an empty dict means there is nothing to rank
    if not response:
        continue

    # Keep the paper only when its highest-scoring topic clears the threshold
    best_topic = max(response, key=response.get)
    if response[best_topic] > RELEVANCE_THRESHOLD:
        related_papers.append({'title': title, 'topic': best_topic, 'url': url})

# Create a new DataFrame with the relevant papers and their topics; explicit
# columns keep 'topic' available downstream even when no paper qualified
df_related_papers = pd.DataFrame(related_papers, columns=['title', 'topic', 'url'])

# Output the result
print(len(df_related_papers))
print(df_related_papers)


model response: {'Instruction Tuning': 0.1, 'Knowledge Graph': 0.2, 'Hallucination': 0.1}	<class 'dict'>	https://aclanthology.org/2024.eacl-long.0/
model response: {'Instruction Tuning': 0.3, 'Knowledge Graph': 0.4, 'Hallucination': 0.5}	<class 'dict'>	https://aclanthology.org/2024.eacl-long.1/
model response: {'Instruction Tuning': 0.1, 'Knowledge Graph': 0.8, 'Hallucination': 0.2}	<class 'dict'>	https://aclanthology.org/2024.eacl-long.2/
model response: {'Instruction Tuning': 0.0, 'Knowledge Graph': 0.0, 'Hallucination': 0.1}	<class 'dict'>	https://aclanthology.org/2024.eacl-long.3/
model response: {'Instruction Tuning': 0.2, 'Knowledge Graph': 0.4, 'Hallucination': 0.9}	<class 'dict'>	https://aclanthology.org/2024.eacl-long.4/
model response: {'Instruction Tuning': 0.0, 'Knowledge Graph': 0.0, 'Hallucination': 0.7}	<class 'dict'>	https://aclanthology.org/2024.eacl-long.5/
model response: {'Instruction Tuning': 0.2, 'Knowledge Graph': 0.3, 'Hallucination': 0.1}	<class 'dict'>	https:/

In [23]:
# Keep only the papers whose assigned topic is something other than "Knowledge Graph"
is_not_knowledge_graph = df_related_papers['topic'].ne('Knowledge Graph')
df_filtered = df_related_papers.loc[is_not_knowledge_graph]

# Show the filtered result
print(df_filtered)

                                                title               topic  \
1   Generating Benchmarks for Factuality Evaluatio...       Hallucination   
3   PEARL: Prompting Large Language Models to Plan...  Instruction Tuning   
4   Generation-driven Contrastive Self-training fo...  Instruction Tuning   
6   LaMini-LM: A Diverse Herd of Distilled Models ...  Instruction Tuning   
9   Style-News: Incorporating Stylized News Genera...       Hallucination   
15  “According to . . . ”: Prompting Language Mode...  Instruction Tuning   
17  Few-Shot Dialogue Summarization via Skeleton-A...  Instruction Tuning   
18  Ask, Assess, and Refine: Rectifying Factual Co...       Hallucination   
21  Contrastive Decoding Reduces Hallucinations in...       Hallucination   
22  ShouldItry multiple optimizers when fine-tunin...  Instruction Tuning   

                                             url  
1     https://aclanthology.org/2024.eacl-long.4/  
3    https://aclanthology.org/2024.eacl-long.29/  

In [None]:
# TODO: Also include each paper's abstract in the prompt