In [13]:
import json

# Read a JSON file
def read_json_file(file_path):
    """Load a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file to open.

    Returns:
        The decoded JSON value, or None when the file does not exist or
        its content is not valid JSON (a message is printed in both cases).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        print(f"File {file_path} không tồn tại")
    except json.JSONDecodeError:
        print("Lỗi định dạng JSON")
    # Reached only after one of the handled errors above.
    return None

# Example usage
file_path = 'data_train.json'
json_data = read_json_file(file_path)

# Print the JSON structure.
# Compare against None (the failure value of read_json_file) instead of relying
# on truthiness, so a valid but empty document such as [] or {} is still shown.
# ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX escapes.
if json_data is not None:
    print(json.dumps(json_data, indent=4, ensure_ascii=False))

[
    {
        "doi": "9905326",
        "final_output": [
            {
                "question": "What is the chiral quark-soliton model (CQSM)?",
                "answer": "The Chiral Quark Soliton Model (CQSM) is a field theoretical model based on a non-renormalizable lagrangian. It describes the mesonic properties in terms of quark fields and dynamically generates the soliton, which is the intrinsic object representing a baryon."
            },
            {
                "question": "What are the available data on the ratios E2/M1 and C2/M1?",
                "answer": "The paper mentions that the available data from [1, 2] show that the ratios of electric and coulomb quadrupole amplitudes to the magnetic dipole amplitude, E2/M1 and C2/M1 respectively, do not establish a de\ufb01nite picture for these ratios, although the magnitudes of both ratios seem to underestimate the latest experimental results."
            },
            {
                "question": "How does the CQ

In [33]:
import pandas as pd

# Read the CSV file
try:
    df = pd.read_csv('graphsage_embeddings.csv')
except FileNotFoundError:
    print("Error: graphsage_embeddings.csv not found.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down (losing all state) rather than simply aborting this cell.
    raise

# Check if node_id column exists
if 'node_id' not in df.columns:
    print("Error: 'node_id' column not found. Available columns:", df.columns.tolist())
    raise KeyError('node_id')

# Print original data for debugging
print("Original node_id sample:", df['node_id'].head().tolist())
print("Original node_id data type:", df['node_id'].dtype)

# Convert node_id to string and pad with leading zeros to 7 characters.
# NOTE(review): missing values would become the text 'nan' before padding —
# confirm the column never contains NaN.
df['node_id'] = df['node_id'].astype(str).str.strip().str.zfill(7)

# Print modified data for verification
print("Modified node_id sample:", df['node_id'].head().tolist())

# Save to a new CSV file
try:
    df.to_csv('graphsage_embeddings_padded.csv', index=False)
    print("Modified CSV saved as 'graphsage_embeddings_padded.csv'.")
except Exception as e:
    print("Error saving file:", e)
    # Do not continue: the verification below would otherwise read a stale
    # file left over from a previous run.
    raise

# Print the first few rows of the modified DataFrame
print("Modified DataFrame head:")
print(df.head())

# Verify the saved file. node_id is read back as text so the leading zeros survive.
try:
    df_new = pd.read_csv('graphsage_embeddings_padded.csv', dtype={'node_id': str})
    print("Verified node_id sample from saved file:", df_new['node_id'].head().tolist())
except FileNotFoundError:
    print("Error: Output file was not created.")
    # Later cells use df_new, so stop here instead of failing later with a NameError.
    raise

Original node_id sample: [1001, 1002, 1003, 1004, 1005]
Original node_id data type: int64
Modified node_id sample: ['0001001', '0001002', '0001003', '0001004', '0001005']
Modified CSV saved as 'graphsage_embeddings_padded.csv'.
Modified DataFrame head:
   Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0  node_id         0         1  \
0             0             0           0  0001001  0.344993  0.034560   
1             1             1           1  0001002  0.226590 -0.027121   
2             2             2           2  0001003 -0.256757 -0.228419   
3             3             3           3  0001004 -0.215732 -0.207303   
4             4             4           4  0001005  0.305593  0.128245   

          2         3         4         5  ...       118       119       120  \
0  0.127779 -0.164674  0.105189 -0.184557  ...  0.348033 -0.064623 -0.118074   
1 -0.114100  0.196185  0.202683 -0.065651  ...  0.076618 -0.051625  0.059363   
2 -0.896574 -0.148592  0.161573  0.016892  ... -0.004523  0.38

In [36]:
# Gather every column whose label starts with 'Unnamed' (presumably index
# columns left over from earlier CSV round trips — TODO confirm) and drop them.
# NOTE(review): only `df` is cleaned here; `df_new` still carries these columns.
unnamed_cols = list(filter(lambda label: label.startswith('Unnamed'), df.columns))
if len(unnamed_cols) > 0:
    print("Dropping Unnamed columns:", unnamed_cols)
    df = df.drop(columns=unnamed_cols)
# Show the dtype of node_id after the clean-up.
print(df['node_id'].dtypes)

Dropping Unnamed columns: ['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0']
object


In [38]:
import pandas as pd
import json

# If JSON is in a file:
# with open("data_train.json", "r") as f:
#     json_data = json.load(f)

# Collect the "doi" value of every record in the loaded JSON data
dois = [record["doi"] for record in json_data]
print("DOIs in JSON:", dois)

# Use the frame re-read from the padded CSV as the embeddings table
df_embeddings = df_new

DOIs in JSON: ['9905326', '9710225', '0105135', '9906374', '0110129', '0012147', '9501406', '9812403', '9912277', '0206301', '0207040', '9210255', '9306264', '9605399', '9612341', '9909294', '9801239', '9910428', '9706222', '9909464', '0009147', '9408350', '0205050', '9504282', '0001130', '9310264', '9807335', '0203198', '9804314', '9405224', '9406297', '0205209', '9909566', '9803345', '9312208', '9702371', '9404297', '9910481', '9907468', '9411409', '0210225', '0111151', '9911214', '9304270', '9705455', '9907264', '9607451', '9408333', '0105075', '9905521', '9606212', '9709481', '9601239', '9910403', '9409420', '0005102', '9606268', '9511412', '9406314', '9811388', '0208076', '9303255', '9312346', '9309323', '0211416', '9811468', '0110351', '9311352', '9608219', '9710467', '9612362', '9608409', '0101022', '9707464', '0212207', '9905256', '0007290', '9909439', '9407278', '9609319', '0005231', '9907496', '0105207', '0003062', '0301122', '9705307', '0110288', '9404294', '9211205', '93123

In [39]:
# Build one record per JSON entry: the stripped "doi" plus all of its
# questions joined with ';'. Entries lacking either key are reported and skipped.
doi_questions = []
for entry in json_data:
    has_required_keys = "doi" in entry and "final_output" in entry
    if not has_required_keys:
        print(f"Skipping invalid entry: {entry}")
        continue
    doi = str(entry["doi"]).strip()
    questions = [qa["question"] for qa in entry["final_output"] if "question" in qa]
    doi_questions.append({"doi": doi, "questions": ";".join(questions)})

df_questions = pd.DataFrame(doi_questions)
print("DOIs in JSON:", df_questions["doi"].tolist())

# Merge: keep only ids present in both tables (inner join of doi against node_id)
df_merged = df_questions.merge(
    df_embeddings,
    how="inner",
    left_on="doi",
    right_on="node_id",
)
print("Merged DataFrame rows:", len(df_merged))
print("Merged DataFrame sample:", df_merged.head())

# Prepare output: write the file only when all 128 embedding columns are available
embedding_cols = [str(i) for i in range(128)]
if set(embedding_cols).issubset(df_embeddings.columns):
    selected_cols = ["doi", "questions", *embedding_cols]
    df_final = df_merged[selected_cols].rename(columns={"doi": "id"})
    df_final.to_csv("combined_doi_questions_embeddings.csv", index=False)
    print("Output CSV created: combined_doi_questions_embeddings.csv")
else:
    print("Error: Embedding columns (0 to 127) not found in embeddings CSV.")
# Layout of 'combined_doi_questions_embeddings.csv':
# - 'id': the "doi" value from the JSON entry, which matched 'node_id' in df_embeddings.
# - 'questions': the entry's questions from 'final_output', joined into one string with ';'.
# - Columns '0' to '127': the 128 embedding values for that id, taken from df_embeddings.
# Each row is an id found in both the JSON data and the embeddings table; ids missing
# from either side are dropped by the inner merge.


DOIs in JSON: ['9905326', '9710225', '0105135', '9906374', '0110129', '0012147', '9501406', '9812403', '9912277', '0206301', '0207040', '9210255', '9306264', '9605399', '9612341', '9909294', '9801239', '9910428', '9706222', '9909464', '0009147', '9408350', '0205050', '9504282', '0001130', '9310264', '9807335', '0203198', '9804314', '9405224', '9406297', '0205209', '9909566', '9803345', '9312208', '9702371', '9404297', '9910481', '9907468', '9411409', '0210225', '0111151', '9911214', '9304270', '9705455', '9907264', '9607451', '9408333', '0105075', '9905521', '9606212', '9709481', '9601239', '9910403', '9409420', '0005102', '9606268', '9511412', '9406314', '9811388', '0208076', '9303255', '9312346', '9309323', '0211416', '9811468', '0110351', '9311352', '9608219', '9710467', '9612362', '9608409', '0101022', '9707464', '0212207', '9905256', '0007290', '9909439', '9407278', '9609319', '0005231', '9907496', '0105207', '0003062', '0301122', '9705307', '0110288', '9404294', '9211205', '93123

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
from itertools import combinations

# Read the CSV file. The 'id' column holds zero-padded identifiers (e.g. '0105135'),
# so it must be read as text; the default type inference would turn it into an
# integer and silently drop the leading zeros.
df = pd.read_csv('combined_doi_questions_embeddings.csv', dtype={'id': str})

# Extract the embedding columns (columns '0' to '127')
embedding_cols = [str(i) for i in range(128)]
embeddings = df[embedding_cols].to_numpy()

# Compute the square matrix of pairwise Euclidean distances between the rows
dist_matrix = squareform(pdist(embeddings, metric='euclidean'))

# Find the cluster (3 to 5 rows by default) with the smallest total pairwise distance
def find_closest_cluster(df, dist_matrix, cluster_size_range=(3, 5)):
    """Exhaustively search for the group of rows with the smallest total distance.

    Every combination of row positions whose size lies in the inclusive
    range ``cluster_size_range`` is scored by the sum of the pairwise entries
    of ``dist_matrix`` between its members; the first combination with the
    strictly smallest score wins.

    Args:
        df: DataFrame with 'id' and 'questions' columns; row position ``i``
            corresponds to row/column ``i`` of ``dist_matrix``.
        dist_matrix: Square matrix of pairwise distances between the rows.
        cluster_size_range: ``(min_size, max_size)``, both inclusive.

    Returns:
        ``(cluster, total)`` where ``cluster`` is a list of
        ``{'id': ..., 'questions': ...}`` records and ``total`` is its summed
        pairwise distance. ``(None, inf)`` is returned when no combination
        could be evaluated (for example fewer rows than ``min_size``).

    Note:
        The cost of the search grows combinatorially with the number of rows.
        NOTE(review): because the score is a *sum*, a larger group can never
        beat its own sub-groups when distances are non-negative, so the result
        always has ``min_size`` members. If larger clusters are wanted, a
        size-normalised score (e.g. mean pairwise distance) is needed.
    """
    dist_matrix = np.asarray(dist_matrix)
    smallest, largest = cluster_size_range[0], cluster_size_range[1]

    # With non-negative entries every group of more than `smallest` members
    # contains a `smallest`-sized sub-group whose total is no larger, so the
    # strict "<" below can never select it. Skipping those sizes returns the
    # same answer without enumerating the (far more numerous) larger combinations.
    # Matrices with negative or NaN entries fall back to the full search.
    if np.all(dist_matrix >= 0):
        largest = min(largest, smallest)

    # combinations() yields positions in ascending order, so the pairwise
    # distances of a combination are exactly the entries it selects from the
    # strict upper triangle; compute that triangle once instead of per combination.
    upper = np.triu(dist_matrix, 1)

    min_total_dist = float('inf')
    best_indices = None
    for size in range(smallest, largest + 1):
        for indices in combinations(range(len(df)), size):
            total_dist = upper[np.ix_(indices, indices)].sum()
            if total_dist < min_total_dist:
                min_total_dist = total_dist
                best_indices = indices

    if best_indices is None:
        return None, min_total_dist

    # Build the records once, for the winning combination only.
    best_cluster = df.iloc[list(best_indices)][['id', 'questions']].to_dict('records')
    return best_cluster, min_total_dist

# Find the cluster
cluster, total_distance = find_closest_cluster(df, dist_matrix)

# Print the result. find_closest_cluster returns None for the cluster when no
# combination could be evaluated (e.g. too few rows), so guard before len().
if cluster is None:
    print("Không tìm thấy cụm phù hợp: không đủ bài báo để tạo cụm.")
else:
    print(f"Cụm {len(cluster)} bài báo có tổng khoảng cách nhỏ nhất ({total_distance:.4f}):")
    for article in cluster:
        print(f"DOI: {article['id']}")
        print(f"Questions: {article['questions']}")
        print("-" * 50)