In [None]:
import json
import os
import re
import subprocess
import time

import requests

# Get citation

In [None]:
article_ids = ["1005.1176"]

In [None]:
def get_cite_id(article_ids):
    """Fetch citation data for a list of arXiv IDs from Semantic Scholar.

    Sends one POST request to the Graph API ``paper/batch`` endpoint, asking
    for each paper's external IDs and, for every citation, its external IDs,
    year and URL.

    :param article_ids: iterable of arXiv identifiers (e.g. ``"1005.1176"``).
    :return: the decoded JSON response on HTTP 200, otherwise ``None``
        (the failure is printed, not raised).
    """
    try:
        paper_ids = [f"ARXIV:{article_id}" for article_id in article_ids]

        params = {
            'fields': 'externalIds,citations.externalIds,citations.year,citations.url',
            'offset': 0,
            'limit': 50
        }

        response = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params=params,
            json={"ids": paper_ids}
        )

        if response.status_code == 200:
            return response.json()

        # Report the ids that were requested; the previous message referenced
        # an undefined name and hid the real HTTP error behind a NameError.
        print(f"Request failed with status code {response.status_code} for article ids {article_ids}")
        print(response.text)

    except Exception as e:
        print(f"An error occurred: {e}")

    return None

In [None]:
cite_id_output = get_cite_id(article_ids)

## Get article_id and citation

In [None]:
def get_article_id_and_citation(data):
    """Extract the first usable article and its citations from a batch response.

    Items that are not dictionaries, or that lack a ``paperId`` or a non-blank
    ArXiv external id, are skipped with a printed message.

    :param data: list of paper records as returned by ``get_cite_id``
        (``None`` is treated as an empty list).
    :return: ``{"articleID": {"paperId", "ArXiv"}, "citation": [...]}`` for the
        first valid item, or ``None`` when no item qualifies. Each citation
        carries ``paperId``, ``ArXiv``, ``year`` and ``url``; keys missing from
        the response are filled with the string ``"null"``.
    """
    for item in data or []:
        if not isinstance(item, dict):
            print(f"Skipping non-dictionary item: {item}")
            continue

        # "externalIds" can be present with a null value, so `.get(key, {})`
        # alone is not enough to guarantee a dictionary.
        external_ids = item.get("externalIds") or {}
        articleID = {
            "paperId": item.get("paperId"),
            "ArXiv": external_ids.get("ArXiv")
        }

        if not (articleID["paperId"] and articleID["ArXiv"] and articleID["ArXiv"].strip()):
            print(f"Skipping article due to missing paperId or ArXiv info: {articleID}")
            continue

        citations = []
        for citation in item.get("citations") or []:
            if not isinstance(citation, dict):
                # Null placeholders carry no information; skip them instead of
                # crashing on `.get`.
                continue

            citation_ids = citation.get("externalIds") or {}
            citations.append({
                "paperId": citation.get("paperId"),
                "ArXiv": citation_ids.get("ArXiv", "null"),
                "year": citation.get("year", "null"),
                "url": citation.get("url", "null")
            })

        return {
            "articleID": articleID,
            "citation": citations
        }

    return None

In [None]:
article_id_and_citation = get_article_id_and_citation(cite_id_output)

## Get citeID with Year

In [None]:
def get_cite_id_with_year(data, max_citations=15):
    """Pick the most recent citing papers that have an arXiv id.

    :param data: dictionary with ``articleID`` and ``citation`` keys, as
        produced by ``get_article_id_and_citation``.
    :param max_citations: maximum number of citing arXiv ids to keep
        (defaults to 15, the previously hard-coded value).
    :return: ``{"article_id": <arXiv id>, "cite_id": [<arXiv id>, ...]}`` with
        the citing ids ordered from newest to oldest year.
    """
    article_id = data['articleID']['ArXiv']

    # Keep only citations with a real arXiv id (neither missing/None nor the
    # "null" placeholder string).
    with_arxiv = [
        citation for citation in data['citation']
        if citation.get('ArXiv') and citation['ArXiv'] != "null"
    ]

    def _year_key(citation):
        # Years may be None or the "null" placeholder; comparing those with
        # integers raises TypeError, so rank them below every real year.
        year = citation.get('year')
        return year if isinstance(year, int) else -1

    newest_first = sorted(with_arxiv, key=_year_key, reverse=True)

    return {
        'article_id': article_id,
        'cite_id': [citation['ArXiv'] for citation in newest_first[:max_citations]]
    }

In [None]:
cite_id_with_year = get_cite_id_with_year(article_id_and_citation)

## Get citation to list

In [None]:
citation_list = cite_id_with_year['cite_id']

# Get metadata for article_id

In [None]:
def get_in4_article_id(article_id):
    """Request title, authors, year and external ids for one arXiv paper.

    Posts a single-element batch to the Semantic Scholar ``paper/batch``
    endpoint.

    :param article_id: arXiv identifier of the paper.
    :return: the decoded JSON response on HTTP 200; otherwise ``None`` after
        printing the failure.
    """
    endpoint = 'https://api.semanticscholar.org/graph/v1/paper/batch'
    query = {
        'fields': 'externalIds,title,authors,year',
        'offset': 0,
        'limit': 1
    }

    try:
        payload = {"ids": [f"ARXIV:{article_id}"]}
        response = requests.post(endpoint, params=query, json=payload)

        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code} for article_id {article_id}")
            print(response.text)
            return None

        return response.json()

    except Exception as e:
        print(f"An error occurred: {e}")

In [None]:
article_id = article_ids[0] 
in4_article_id = get_in4_article_id(article_id)  

# Merge JSON FILE

In [None]:
def process_article_id(article_id):
    """Fetch one arXiv paper's metadata and flatten it into a small record.

    :param article_id: arXiv identifier of the paper.
    :return: ``{"Arxiv", "paperId", "title", "year", "authors"}`` built from
        the last non-empty entry of the batch response, or ``None`` when the
        request fails or the response holds no usable entry (a message is
        printed in both cases).
    """
    try:
        paper_id = f"ARXIV:{article_id}"

        params = {
            'fields': 'externalIds,title,authors,year',
            'offset': 0,
            'limit': 1
        }

        response = requests.post(
            'https://api.semanticscholar.org/graph/v1/paper/batch',
            params=params,
            json={"ids": [paper_id]}
        )

        if response.status_code != 200:
            print(f"Request failed with status code {response.status_code}")
            print(response.text)
            return None

        # Initialised up front: when every entry in the response is empty the
        # loop assigns nothing, which previously left the name unbound.
        new_item = None
        for item in response.json() or []:
            if not item:
                continue

            # "externalIds" may be present but null.
            external_ids = item.get("externalIds") or {}
            new_item = {
                "Arxiv": external_ids.get("ArXiv", None),
                "paperId": item.get("paperId"),
                "title": item.get("title"),
                "year": item.get("year"),
                "authors": item.get("authors", [])
            }

        if new_item is None:
            print(f"No data found for article_id: {article_id}")
        return new_item

    except Exception as e:
        print(f"An error occurred: {e}")

    return None

In [None]:
title_article_id = process_article_id(article_id)

In [None]:
dict_arxiv_cite_id = cite_id_with_year

## merge title with dict

In [None]:
def merge_title_dict(title_data, dict_data):
    """Combine an article's metadata record with its list of citing ids.

    :param title_data: metadata record stored under ``"article_id"``.
    :param dict_data: mapping whose ``"cite_id"`` entry is copied over.
    :return: ``{"article_id": title_data, "cite_id": dict_data["cite_id"]}``.
    """
    return {
        "article_id": title_data,
        "cite_id": dict_data["cite_id"],
    }

In [None]:
merged_article_cite_id = merge_title_dict(title_article_id, dict_arxiv_cite_id)

## Normalize title and dict

In [None]:
def normalize_json(data):
    """Flatten the article's author records into a plain list of names.

    The record is updated in place and also returned. The function is safe to
    call repeatedly: entries that are already plain values are kept as they
    are, so re-running the notebook cell does not fail.

    :param data: mapping with an ``"article_id"`` record that may contain an
        ``"authors"`` list of ``{"name": ...}`` dictionaries.
    :return: the same ``data`` object.
    """
    article = data.get('article_id')
    # The metadata lookup can fail and leave a non-dict (e.g. None) here.
    if isinstance(article, dict) and 'authors' in article:
        article['authors'] = [
            author['name'] if isinstance(author, dict) else author
            for author in article['authors'] or []
        ]
    return data

In [None]:
norm_merged_article_cite_id = normalize_json(merged_article_cite_id)

In [None]:
with open("normalized_merged_article_cite_id.json", 'w', encoding='utf-8') as output_f:
        json.dump(norm_merged_article_cite_id, output_f, ensure_ascii=False, indent=4)

# Download PDF

In [None]:
!python -m pip install -q --upgrade arxiv-dl
!pip -q install paper-cli

In [None]:
def download_papers(paper_ids, download_dir):
    """Invoke the ``paper`` command-line tool once per paper id.

    :param paper_ids: iterable of paper identifiers passed to the tool.
    :param download_dir: directory handed to the tool's ``-d`` option.
    """
    for arxiv_id in paper_ids:
        command = ['paper', arxiv_id, '-d', download_dir]
        subprocess.run(command)

In [None]:
paper_ids = norm_merged_article_cite_id['cite_id']

In [None]:
download_dir = "PDF"
os.makedirs(download_dir, exist_ok=True)

download_papers(paper_ids, download_dir)

# Convert PDF to XML (Using Grobid)

In [None]:
config_path = "config.json"

In [None]:
!python3 -m pip install grobid-client-python

In [None]:
from grobid_client.grobid_client import GrobidClient, ServerUnavailableException  

In [None]:
def process_batches(config_path, input_folder, output_folder, num_files_per_batch=50, max_retries=3):
    """
    Run GROBID full-text processing over a folder, retrying on failure.

    :param config_path: path to the GROBID client configuration file.
    :param input_folder: folder passed to the client as input.
    :param output_folder: folder for the results; created if missing.
    :param num_files_per_batch: forwarded to the client as ``n``.
    :param max_retries: number of attempts before giving up.
    """
    try:
        client = GrobidClient(config_path=config_path)
    except ServerUnavailableException:
        print("Error: Unable to connect to the GROBID server.")
        return
    except Exception as e:
        print(f"Unexpected error during client initialization: {e}")
        return

    os.makedirs(output_folder, exist_ok=True)

    attempts_made = 0
    while attempts_made < max_retries:
        attempts_made += 1
        try:
            client.process(
                "processFulltextDocument",
                input_folder,
                output=output_folder,
                n=num_files_per_batch,
                segment_sentences=True
            )
        except Exception as e:
            print(f"Error during processing (attempt {attempts_made}/{max_retries}): {e}")
            if attempts_made < max_retries:
                print("Retrying...")
                # Wait a minute before the next attempt.
                time.sleep(60)
            else:
                print(f"Skipping batch after {max_retries} failed attempts.")
        else:
            # Processing succeeded; no further attempts needed.
            break

In [None]:
input_folder = "PDF"
output_folder = "XML"
process_batches(config_path, input_folder, output_folder)

# Get citing

## Get citing from XML

In [None]:
import xml.etree.ElementTree as ET

In [None]:
input_dir = "XML"
output_file = 'citation_arxiv.json'

In [None]:
def calculate_label_ref(ref_positions, sent):
    """
    Compute the ``label_ref`` class from the spacing between reference tags.

    Tags have the form ``[#<ref_id>]``. Only tags whose id is listed in
    ``ref_positions`` are considered, and they are processed in the order in
    which they occur in the sentence (every occurrence counts).

    :param ref_positions: list of ref ids in the sentence (e.g. ["b42", "b41", "b29"]).
    :param sent: sentence text containing the reference tags.
    :return: ``None`` when no listed tag occurs in the sentence; ``0`` for a
        single tag; ``1`` when every pair of consecutive tags is at most one
        character apart; ``2`` when every pair is more than one character
        apart; ``3`` for a mix of both.
    """
    if not ref_positions:
        return None

    wanted_ids = set(ref_positions)

    # Real (start, end) spans of the tags, in sentence order. Looking each id
    # up with str.find would always return the first occurrence and follow the
    # argument order, giving negative distances for repeated or unordered ids.
    spans = [
        (match.start(), match.end())
        for match in re.finditer(r'\[#([a-zA-Z0-9]+)\]', sent)
        if match.group(1) in wanted_ids
    ]

    if not spans:
        return None

    if len(spans) == 1:
        return 0

    # Number of characters between the end of one tag and the start of the next.
    gaps = [
        next_start - previous_end
        for (_, previous_end), (next_start, _) in zip(spans, spans[1:])
    ]

    if all(gap <= 1 for gap in gaps):
        return 1
    if all(gap > 1 for gap in gaps):
        return 2
    return 3

In [None]:
def extract_citations(input_file):
    """Extract citing sentences and their bibliography entries from a TEI file.

    The citation id is the part of the file name before the first underscore.
    Every ``<s>`` element inside a ``<p>`` of the body that contains at least
    one ``<ref type="bibr">`` yields one record holding the sentence (with the
    reference text replaced by a ``[<target>]`` marker), its neighbouring
    sentences in the same paragraph, the referenced ids, the label computed by
    ``calculate_label_ref`` and the matching bibliography entries.

    :param input_file: path to a TEI XML file (as produced by GROBID with
        sentence segmentation).
    :return: tuple ``(cite_id, citation_sentences)``.
    """
    tei_uri = 'http://www.tei-c.org/ns/1.0'
    ns = {'tei': tei_uri}
    ref_tag = f'{{{tei_uri}}}ref'
    xml_id_attr = '{http://www.w3.org/XML/1998/namespace}id'

    cite_id = os.path.basename(input_file).split('_')[0]  # Extract citation ID from filename
    root = ET.parse(input_file).getroot()

    def render(element, leading_text):
        """Concatenate an element's text, swapping bibr ref text for its marker."""
        pieces = [leading_text]
        for child in element:
            is_marker = (
                child.tag == ref_tag
                and child.get('type') == 'bibr'
                and child.get('target')
                and child.text
            )
            # Substitute the marker at the reference's own position. A global
            # str.replace on the reference text would also rewrite identical
            # text elsewhere in the sentence, and an empty reference text
            # would insert the marker between every character.
            child_lead = f"[{child.get('target')}]" if is_marker else (child.text or '')
            pieces.append(render(child, child_lead))
            pieces.append(child.tail or '')
        return ''.join(pieces)

    def clean_sentence_text(sentence):
        """Return the sentence text with reference markers, stripped."""
        return (render(sentence, sentence.text or '') + (sentence.tail or '')).strip()

    def parse_bibl_struct(biblStruct):
        """Build the reference record for one bibliography entry."""
        ref_id = biblStruct.get(xml_id_attr)

        # Prefer the article-level title, then the monograph-level one.
        title_analytic = biblStruct.find('.//tei:analytic/tei:title[@type="main"]', ns)
        title_monogr = biblStruct.find('.//tei:monogr/tei:title[@type="main"]', ns)
        if title_analytic is not None:
            title = title_analytic.text
        elif title_monogr is not None:
            title = title_monogr.text
        else:
            title = "No title"

        authors = []
        for person in biblStruct.findall('.//tei:analytic/tei:author/tei:persName', ns):
            surname_el = person.find('tei:surname', ns)
            forename_el = person.find('tei:forename', ns)
            surname = surname_el.text if surname_el is not None else ""
            forename = forename_el.text if forename_el is not None else ""
            authors.append(f"{forename} {surname}".strip())

        date_el = biblStruct.find('.//tei:imprint/tei:date[@type="published"]', ns)
        year = date_el.get('when') if date_el is not None else "Unknown Year"

        # Use the arXiv idno when present, otherwise fall back to the first
        # idno of any type.
        arxiv_el = biblStruct.find('.//tei:idno[@type="arXiv"]', ns)
        if arxiv_el is not None:
            arxiv_id = arxiv_el.text
        else:
            arxiv_id = "No arXiv ID"
            idno_alternate = biblStruct.find('.//tei:idno', ns)
            if idno_alternate is not None:
                arxiv_id = idno_alternate.text

        return {
            "id": ref_id,
            "title": title,
            "authors": authors,
            "year": year,
            "arXiv": arxiv_id
        }

    references = {}
    for biblStruct in root.findall('.//tei:listBibl/tei:biblStruct', ns):
        entry = parse_bibl_struct(biblStruct)
        references[entry["id"]] = entry

    citation_sentences = []
    number = 0
    for body in root.findall('.//tei:body', ns):
        for paragraph in body.findall('.//tei:p', ns):
            sentences = paragraph.findall('.//tei:s', ns)

            for i, sentence in enumerate(sentences):
                refs = sentence.findall('.//tei:ref[@type="bibr"]', ns)
                if not refs:
                    continue

                ref_ids = [ref.get('target').replace('#', '') for ref in refs if ref.get('target') is not None]
                cleaned_sent_text = clean_sentence_text(sentence)

                # Neighbours are taken from the same paragraph only.
                sent_before = clean_sentence_text(sentences[i - 1]) if i > 0 else None
                sent_after = clean_sentence_text(sentences[i + 1]) if i < len(sentences) - 1 else None

                merge_sent = f"{sent_before if sent_before else ''} {cleaned_sent_text} {sent_after if sent_after else ''}".strip()

                number += 1
                citation_sentences.append({
                    "Number": number,
                    "sent": cleaned_sent_text,
                    "sent_before": sent_before,
                    "sent_after": sent_after,
                    "merge_sent": merge_sent,
                    "ids": ref_ids,
                    "label": calculate_label_ref(ref_ids, cleaned_sent_text),
                    "references": [
                        references.get(ref_id, {"id": ref_id, "title": "Unknown", "authors": [], "year": "Unknown Year"})
                        for ref_id in ref_ids
                    ],
                })

    return cite_id, citation_sentences

In [None]:
def process_directory(input_dir):
    """Run ``extract_citations`` on every ``.xml`` file below a directory.

    :param input_dir: directory that is walked recursively.
    :return: list of ``{"cite_id", "citation_sentences"}`` dictionaries, one
        per XML file, in ``os.walk`` order.
    """
    all_citations = []
    for folder, _subfolders, filenames in os.walk(input_dir):
        xml_names = [name for name in filenames if name.endswith('.xml')]
        for name in xml_names:
            cite_id, citation_sentences = extract_citations(os.path.join(folder, name))
            all_citations.append({
                "cite_id": cite_id,
                "citation_sentences": citation_sentences
            })

    return all_citations

In [None]:
def update_json_file(output_file, new_data):
    """Append records to a JSON list stored on disk.

    The existing content of ``output_file`` is loaded when the file exists; a
    file that cannot be decoded as JSON is treated as an empty list. The
    combined list is then written back as UTF-8 with 4-space indentation.

    :param output_file: path of the JSON file to create or extend.
    :param new_data: iterable of records appended to the stored list.
    """
    combined = []
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as source:
            try:
                combined = json.load(source)
            except json.JSONDecodeError:
                combined = []

    combined.extend(new_data)

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(combined, target, ensure_ascii=False, indent=4)

In [None]:
# Process the input directory
all_citations = process_directory(input_dir)
# Update the JSON file with new data without overwriting the old data
update_json_file(output_file, all_citations)

## merge citing and cited

In [None]:
def normalize_author_name(author_name):
    """Return an author name with collapsed whitespace and capitalised words.

    The name is stripped and lower-cased, split on whitespace, and each part
    is capitalised. An empty or whitespace-only name yields an empty string
    instead of raising ``IndexError``.

    :param author_name: raw author name.
    :return: the normalised name.
    """
    parts = author_name.strip().lower().split()
    # Capitalising every part already upper-cases a single-letter initial, so
    # no special case is needed for it.
    return " ".join(part.capitalize() for part in parts)

def normalize_authors(authors_list):
    """Normalise a list of author names and build a dotted-initial variant.

    Entries that are not strings, or that are blank, are ignored (blank names
    previously raised ``IndexError``).

    :param authors_list: iterable of raw author names.
    :return: tuple ``(normalised_names, dotted_names)``. ``dotted_names``
        equals ``normalised_names`` except that a leading single-letter
        initial gets a trailing dot (``"J Smith"`` becomes ``"J. Smith"``).
    """
    norm_author_name = [
        normalize_author_name(author)
        for author in authors_list
        if isinstance(author, str) and author.strip()
    ]

    ref_authors_dot = []
    for author in norm_author_name:
        first, *rest = author.split()
        if len(first) == 1:
            ref_authors_dot.append(first.upper() + "." + " " + " ".join(rest))
        else:
            ref_authors_dot.append(author)

    return norm_author_name, ref_authors_dot

def authors_match(ref_authors, article_authors):
    """Tell whether a reference and an article share at least one author.

    The dotted-initial form of each reference author is looked up among the
    normalised article author names.

    :param ref_authors: author names taken from the bibliography entry.
    :param article_authors: author names of the cited article.
    :return: ``True`` on the first shared name, ``False`` otherwise or when
        either list is empty.
    """
    if not (ref_authors and article_authors):
        return False

    _, dotted_ref_names = normalize_authors(ref_authors)
    article_names, _ = normalize_authors(article_authors)

    return any(name in article_names for name in dotted_ref_names)

def map_citation_sentences(file1_path, file2_path, output_file):
    """Attach to each article the sentences in citing papers that reference it.

    A bibliography entry of a citing sentence counts as a reference to the
    article when its arXiv id equals the article's id (with or without the
    ``arXiv:`` prefix), when ``authors_match`` reports a shared author, or
    when its title equals the article title (case-insensitive).

    :param file1_path: JSON file with a list of
        ``{"cite_id", "citation_sentences"}`` records.
    :param file2_path: JSON file with one article record, or a list of them,
        each shaped ``{"article_id": {...}, "cite_id": [...]}``.
    :param output_file: path of the JSON file to write. The output is always
        a list of article records; articles with matches gain a
        ``"citation_sentences"`` entry whose citing sentences carry ``sent``,
        ``id``, ``sent_before`` and ``sent_after``.
    """
    with open(file1_path, 'r', encoding='utf-8') as f1:
        data_file1 = json.load(f1)

    with open(file2_path, 'r', encoding='utf-8') as f2:
        data_file2 = json.load(f2)

    # The notebook stores a single article record (a dict); iterating it
    # directly would walk over its keys. Wrap it so both shapes work.
    if isinstance(data_file2, dict):
        data_file2 = [data_file2]

    # Group the citing sentences by citing paper once instead of rescanning
    # the whole list for every cite id.
    sentences_by_cite_id = {}
    for item in data_file1:
        sentences_by_cite_id.setdefault(item.get("cite_id"), []).extend(
            item.get("citation_sentences", [])
        )

    for article in data_file2:
        # The metadata lookup may have failed and left a null record.
        article_info = article.get('article_id') or {}
        article_arxiv_id = article_info.get("Arxiv")
        article_authors = article_info.get("authors", [])
        article_title = (article_info.get("title") or "").strip().lower()
        cite_ids = article.get("cite_id", [])

        accepted_arxiv_ids = (article_arxiv_id, f"arXiv:{article_arxiv_id}")

        def refers_to_article(ref):
            # Only compare ids when the article has one; otherwise a missing
            # id on both sides (None == None) would count as a match.
            if article_arxiv_id and ref.get("arXiv") in accepted_arxiv_ids:
                return True
            if authors_match(ref.get("authors", []), article_authors):
                return True
            # Compare the reference title with the article title (it used to
            # be compared against the list of author names).
            ref_title = (ref.get("title") or "").strip().lower()
            return bool(ref_title) and ref_title == article_title

        article_citation_sentences = []

        for cite_id in cite_ids:
            citing_sentences = []

            for sentence in sentences_by_cite_id.get(cite_id, []):
                reference_ids = [
                    ref.get("id")
                    for ref in sentence.get("references", [])
                    if refers_to_article(ref)
                ]

                if reference_ids:
                    citing_sentences.append({
                        "sent": sentence["sent"],
                        "id": reference_ids,
                        # Keep the context so later grouping can build the
                        # merged sentence.
                        "sent_before": sentence.get("sent_before"),
                        "sent_after": sentence.get("sent_after")
                    })

            if citing_sentences:
                article_citation_sentences.append({
                    "cite_id": cite_id,
                    "citing_sentences": citing_sentences
                })

        if article_citation_sentences:
            article["citation_sentences"] = article_citation_sentences

    with open(output_file, 'w', encoding='utf-8') as output_f:
        json.dump(data_file2, output_f, ensure_ascii=False, indent=4)
    print(f"Data merged and saved into {output_file}")


In [None]:
# Paths to the input JSON files
file1_path = 'citation_arxiv.json'  # Citation
file2_path = "normalized_merged_article_cite_id.json"  # dict{article_id, cite_id}
output_file_1 = "merged_output_ref_arxiv_author_title.json" 

# Run the mapping
map_citation_sentences(file1_path, file2_path, output_file_1)

In [None]:
import re

def update_label_ref(citing_sentences):
    """Set ``label_ref`` on each citing sentence from the layout of its tags.

    Tags have the form ``[#<id>]``. Each dictionary is updated in place.

    With exactly one id in ``id``:
      * ``0`` when the sentence contains exactly one tag in total;
      * ``1`` when every occurrence of the tag is directly preceded by ``]``
        and directly followed by ``[`` (this also holds when the tag does not
        occur at all);
      * ``2`` otherwise.

    With several ids in ``id``:
      * ``1`` when some occurrence of a listed tag directly touches a tag with
        a different id;
      * ``2`` otherwise.

    With an empty ``id`` list, ``label_ref`` is set to ``2`` unless the entry
    already has one.

    :param citing_sentences: list of dictionaries with ``sent`` and ``id``.
    """
    tag_pattern = re.compile(r'\[#(\w+)\]')

    for citing_sentence in citing_sentences:
        sent = citing_sentence['sent']
        ids = citing_sentence['id']

        # (id, start, end) of every tag in the sentence.
        all_tags = [(m.group(1), m.start(), m.end()) for m in tag_pattern.finditer(sent)]

        if len(ids) == 1:
            own_tag = '[#' + ids[0] + ']'
            spans = [(m.start(), m.end()) for m in re.finditer(re.escape(own_tag), sent)]

            is_surrounded_by_tags = all(
                start > 0 and sent[start - 1] == ']'
                and end < len(sent) and sent[end] == '['
                for start, end in spans
            )

            if len(all_tags) == 1:
                citing_sentence['label_ref'] = 0
            elif is_surrounded_by_tags:
                citing_sentence['label_ref'] = 1
            else:
                citing_sentence['label_ref'] = 2

        elif len(ids) > 1:
            found_adjacent = False
            for ref_id in ids:
                own_tag = '[#' + ref_id + ']'
                for match in re.finditer(re.escape(own_tag), sent):
                    # Two tags touch when one ends exactly where the other
                    # starts. Comparing start offsets against this tag's own
                    # length is wrong when the neighbour comes first and has
                    # a different length (e.g. [#b9][#b10]).
                    if any(
                        other_id != ref_id
                        and (other_start == match.end() or other_end == match.start())
                        for other_id, other_start, other_end in all_tags
                    ):
                        found_adjacent = True
                        break
                if found_adjacent:
                    break

            citing_sentence['label_ref'] = 1 if found_adjacent else 2

        if 'label_ref' not in citing_sentence:
            citing_sentence['label_ref'] = 2
         

In [None]:
def process_json(input_file, output_file="label_ref_merged_output_ref_arxiv_author_title.json"):
    """Add ``label_ref`` to every citing sentence of a merged JSON file.

    :param input_file: JSON file holding a list of article records; each
        record's ``citation_sentences`` entries are passed to
        ``update_label_ref`` through their ``citing_sentences`` list.
    :param output_file: path of the labelled JSON file; defaults to the name
        that was previously hard-coded.
    """
    # The input is written as UTF-8 with non-ASCII characters kept verbatim,
    # so read and write it the same way instead of relying on the platform's
    # default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for article in data:
        for citation_sentence in article.get("citation_sentences", []):
            update_label_ref(citation_sentence["citing_sentences"])

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

process_json("merged_output_ref_arxiv_author_title.json")

In [None]:
def conv_with_group(file_input, file_output):
    """Flatten labelled citing sentences into grouped records and save them.

    Within each article, citing sentences are grouped by
    ``(article arXiv id, citing paper id, tuple of referenced ids)``. The
    first sentence of a group creates the record; later sentences of the same
    group only append their text to ``GPT_cite_text``. ``Number`` is a running
    counter over all sentences of the file, taken when the group is created.
    Articles without any grouped sentence are left out of the output.

    :param file_input: JSON file holding a list of article records.
    :param file_output: path of the JSON file to write.
    """
    with open(file_input, 'r', encoding='utf-8') as source:
        records = json.load(source)

    converted = []
    counter = 0

    for record in records:
        id_info = record.get("article_id", {})
        article_id = id_info.get("Arxiv", None) if isinstance(id_info, dict) else None

        groups = {}
        for block in record.get("citation_sentences", []):
            cite_id = block.get("cite_id", None)

            for entry in block.get("citing_sentences", []):
                counter += 1
                text = entry.get("sent", None)
                ref_ids = entry.get("id", None)
                before = entry.get("sent_before", None)  # previous sentence
                after = entry.get("sent_after", None)  # next sentence

                # Context and sentence joined, skipping missing parts.
                merged_text = " ".join(filter(None, [before, text, after]))

                group_key = (
                    article_id,
                    cite_id,
                    tuple(ref_ids) if ref_ids else None,
                )

                if group_key in groups:
                    groups[group_key]["GPT_cite_text"] += " " + text
                    continue

                groups[group_key] = {
                    "Number": counter,
                    "refer_ID": article_id,
                    "refer_ids": ref_ids,
                    "label_ref": entry.get("label_ref", None),
                    "cite_ID": cite_id,
                    "GPT_cite_text": text,
                    "prev_curr_sent": before,
                    "curr_next_sent": after,
                    "merge_sent": merged_text,
                }

        grouped_records = list(groups.values())
        if grouped_records:
            converted.append({
                "article_id": article_id,
                "citation_sentences": grouped_records
            })

    with open(file_output, 'w', encoding='utf-8') as target:
        json.dump(converted, target, ensure_ascii=False, indent=4)

In [None]:
input_file = "label_ref_merged_output_ref_arxiv_author_title.json"
output_file = "label_ref_arxiv_citation.json"

conv_with_group(input_file, output_file)