In [22]:
import os
import pandas as pd

def extract_abstract_from_abs_file(filepath):
    try:
        # Read the file content
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # The abstract is between the second '\\' and the final '\\'
        # Split content by '\\' and get the part containing the abstract
        sections = content.split('\\\\')
        if len(sections) >= 3:
            # The abstract is typically in the last section before the final '\\'
            abstract = sections[-2].strip()
            return abstract
        else:
            return "Abstract not found."

    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return f"An error occurred: {str(e)}"


def add_abstracts_to_csv(csv_path, abs_root_folder, output_path):
    # Load CSV
    df = pd.read_csv(csv_path,dtype={'node_id': str})

    abstracts = []
    for i, row in df.iterrows():
        paper_id = str(row['id']).zfill(7)  # Ensure ID is 7 characters
        year_prefix = int(paper_id[:2])
        # Map to correct folder year
        print(year_prefix)
        if year_prefix >= 92 and year_prefix <= 99:
            folder_year = f"19{year_prefix}"
        elif year_prefix <= 3:
            folder_year = f"20{year_prefix:02d}"
        else:
            print(f"⚠️ Unknown year prefix in ID: {paper_id}")
            abstracts.append(None)
            continue

        abs_file_path = os.path.join(abs_root_folder, folder_year, f"{paper_id}.abs")
        abstract = extract_abstract_from_abs_file(abs_file_path)
        if abstract is None:
            print(f"❌ Missing: {abs_file_path}")
        abstracts.append(abstract)

    df['abstract'] = abstracts
    df.to_csv(output_path, index=False)
    print(f"✅ Done. Saved to {output_path}")




In [24]:
# === Example Usage ===
csv_path = "combined_doi_questions_embeddings.csv"  # your CSV path
abs_root_folder = ""                # root folder with year folders like 1993/, 1994/, etc.
output_path = "combined_with_abstract.csv"

add_abstracts_to_csv(csv_path, abs_root_folder, output_path)

97
1
1
0
99
2
97
0
2
0
2
2
93
2
1
99
1
0
2
1
2
0
1
0
3
1
1
0
0
1
1
1
0
99
2
1
0
98
1
0
1
0
2
98
2
0
96
1
0
94
94
2
2
2
2
2
1
1
1
3
1
2
1
1
98
2
2
1
2
0
0
97
1
2
1
0
2
1
1
2
96
2
3
0
2
2
97
2
2
1
0
0
0
96
1
1
98
0
3
0
0
1
2
0
2
1
1
1
1
1
1
1
1
1
1
0
1
2
2
1
2
2
2
1
1
2
0
2
2
1
0
2
1
98
94
0
96
0
0
1
1
0
2
1
99
2
2
1
1
2
2
1
0
98
0
0
0
2
0
1
0
1
0
0
0
2
1
3
0
2
1
0
1
94
1
3
98
2
1
2
1
2
1
1
0
1
0
0
1
96
1
2
2
0
2
0
2
0
96
3
99
1
99
1
0
1
2
2
2
3
2
0
0
0
2
1
2
1
0
0
0
1
2
2
99
2
2
99
1
2
2
0
0
2
2
1
2
2
0
2
0
0
3
99
96
2
0
1
2
1
97
2
1
1
1
96
0
1
2
2
0
1
1
2
1
0
2
96
0
0
95
0
0
1
2
0
0
0
0
2
98
2
0
1
1
1
98
97
96
1
2
0
0
2
0
1
2
0
1
2
1
0
3
1
2
0
1
0
0
2
1
1
2
2
98
1
97
2
2
3
1
2
99
2
2
1
0
0
1
0
0
99
2
2
1
1
2
2
0
2
2
2
0
2
2
1
98
1
2
1
1
2
0
3
0
2
3
2
1
2
2
0
0
0
0
0
0
97
1
99
1
2
0
0
1
0
2
97
99
1
2
1
1
2
2
0
1
1
2
0
2
3
1
1
1
0
2
0
2
0
1
99
0
0
1
2
99
✅ Done. Saved to combined_with_abstract.csv


Performance Tables for Transformer Models and Fine-Tuning
Table 1: Transformer Model with Mapping Network (Split by ID)



Model
Test Cosine Loss
Test Cosine Similarity



BAAI/bge-large-en-v1.5
0.7926
0.2152


intfloat/e5-large-v2
0.8214
0.1894


Table 2: Transformer Model with Mapping Network (Split by Query)



Model
Test Cosine Loss
Test Cosine Similarity



BAAI/bge-large-en-v1.5
0.0255
0.9738


intfloat/e5-large-v2
0.0483
0.9507


Table 3: Direct Fine-Tuning (Split by Query)



Model
Test Cosine Loss
Test Cosine Similarity



BAAI/bge-large-en-v1.5
0.0451
0.5265


intfloat/e5-large-v2
0.0684
0.4507


Table 4: Fine-Tuning (Split by ID)



Model
Test Cosine Loss
Test Cosine Similarity



BAAI/bge-large-en-v1.5
0.0711
0.1182


intfloat/e5-large-v2
0.0665
0.1233


