# Merge ICLR and NeurIPS

In [None]:
import pandas as pd


# Load the processed ICLR 2024 and NeurIPS 2023 JSON files (in that order).
df_iclr, df_neurips = (
    pd.read_json(json_path)
    for json_path in (
        '../data/processed/iclr-2024.json',
        '../data/processed/neurips-2023.json',
    )
)

# Render both frames, ICLR first.
display(df_iclr, df_neurips)

In [None]:
import random

# Use a private, seeded generator so the 50-paper samples are reproducible
# across kernel restarts. A dedicated Random instance (instead of the global
# random.seed) also makes this cell idempotent: re-running it yields the same IDs.
rng = random.Random(42)

# Randomly select 50 unique submission IDs from df_iclr.
# sample() raises ValueError if fewer than 50 unique IDs are available.
selected_iclr_ids = rng.sample(df_iclr['submission_id'].unique().tolist(), 50)
# Keep every row belonging to a selected submission. .copy() detaches the
# subset from df_iclr so later column assignments do not raise
# SettingWithCopyWarning.
df_iclr_50 = df_iclr[df_iclr['submission_id'].isin(selected_iclr_ids)].copy()

# Randomly select 50 unique submission IDs from df_neurips (same generator,
# drawn after the ICLR sample, so the result is still deterministic).
selected_neurips_ids = rng.sample(df_neurips['submission_id'].unique().tolist(), 50)
df_neurips_50 = df_neurips[df_neurips['submission_id'].isin(selected_neurips_ids)].copy()

# Display the resulting dataframes
display(df_iclr_50)
display(df_neurips_50)

In [None]:
# Tag every row with its source venue.
# .assign returns a new frame instead of writing into the filtered subsets,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
df_iclr_50 = df_iclr_50.assign(venue='iclr')
df_neurips_50 = df_neurips_50.assign(venue='neurips')

# Stack the two samples (ICLR rows first) under a fresh 0..n-1 index.
df_merged = pd.concat([df_iclr_50, df_neurips_50], ignore_index=True)

# Display the merged dataframe
display(df_merged)

In [None]:
# Persist the ICLR + NeurIPS sample as CSV; the row index is not written.
df_merged.to_csv(
    path_or_buf='../data/processed/50iclr-50neurips-merged.csv',
    index=False,
)

# Merge F1000 and Semantic Web Journal

In [None]:
import pandas as pd

# Load the processed F1000 (file1) and Semantic Web (file2) CSV files.
file1, file2 = map(
    pd.read_csv,
    ('../data/processed/f1000.csv', '../data/processed/semanticweb.csv'),
)

# Render both frames, file1 first.
display(file1, file2)

In [None]:
import random

# Candidate identifiers to sample from.
# The filters below match file1 on 'title' and file2 on 'paper_id', so those
# columns are used as the per-paper identifiers; missing values are excluded.
# NOTE(review): these two names were not defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel. If the removed cell applied an
# extra selection criterion, restore it here.
selected_paper_ids_file1 = file1['title'].dropna().unique()
selected_paper_ids_file2 = file2['paper_id'].dropna().unique()

# Private seeded generator: reproducible across restarts and idempotent on re-run.
rng = random.Random(42)

# Randomly select 50 unique paper IDs from each file.
# sample() raises ValueError if fewer than 50 identifiers are available.
selected_paper_ids_file1_sample = rng.sample(list(selected_paper_ids_file1), 50)
selected_paper_ids_file2_sample = rng.sample(list(selected_paper_ids_file2), 50)

# Filter the original dataframes to include all rows for the selected paper IDs.
# .copy() detaches the subsets so later column edits do not raise
# SettingWithCopyWarning.
df_50_sample_file1 = file1[file1['title'].isin(selected_paper_ids_file1_sample)].copy()
df_50_sample_file2 = file2[file2['paper_id'].isin(selected_paper_ids_file2_sample)].copy()

# Display the resulting dataframes
display(df_50_sample_file1)
display(df_50_sample_file2)

In [None]:
# Build the cleaned samples with method chains that return new frames. This
# avoids writing into filtered subsets (SettingWithCopyWarning) and lets the
# cell be re-run without errors.
#   1. add the 'venue' column,
#   2. strip the '_llamaV3-2' suffix from every column name,
#   3. (file2 only) drop 'paper_id'.
df_50_sample_file1 = (
    df_50_sample_file1
    .assign(venue='f1000')
    .rename(columns=lambda col: col.replace('_llamaV3-2', ''))
)
df_50_sample_file2 = (
    df_50_sample_file2
    .assign(venue='semanticweb')
    .rename(columns=lambda col: col.replace('_llamaV3-2', ''))
    # errors='ignore': on a second run 'paper_id' is already gone, and a plain
    # drop would raise KeyError.
    .drop(columns=['paper_id'], errors='ignore')
)

# Stack the two samples (file1 rows first) under a fresh 0..n-1 index.
merged_df = pd.concat([df_50_sample_file1, df_50_sample_file2], ignore_index=True)

# Display the merged dataframe
display(merged_df)

In [None]:
# Persist the Semantic Web + F1000 sample as 50swj-50f1000-merged.csv;
# the row index is not written.
merged_df.to_csv(
    path_or_buf='../data/processed/50swj-50f1000-merged.csv',
    index=False,
)

# Merge All for Human Annotation

In [None]:
import pandas as pd

# Reload the two intermediate merges written to ../data/processed/.
ali_df = pd.read_csv(filepath_or_buffer='../data/processed/50iclr-50neurips-merged.csv')
soroush_df = pd.read_csv(filepath_or_buffer='../data/processed/50swj-50f1000-merged.csv')

# Preview the first five rows of each frame as a sanity check.
display(ali_df.head(n=5), soroush_df.head(n=5))

In [None]:
# Print each frame's column index under its own heading so the two schemas
# can be compared before concatenation.
print("Columns in ali_df:", ali_df.columns, sep="\n")
print("\nColumns in soroush_df:", soroush_df.columns, sep="\n")

In [None]:
# Stack both halves row-wise (ali_df first). Columns are not sorted and the
# result gets a fresh 0..n-1 index.
merged_df = pd.concat(
    objs=[ali_df, soroush_df],
    sort=False,
    ignore_index=True,
)

# Show the combined frame.
display(merged_df)

In [None]:
# Assign one integer paper_id per unique (title, abstract) combination.
# dropna=False keeps rows whose title or abstract is missing in a group of
# their own; with the default (dropna=True) such rows are excluded from the
# grouping and ngroup() leaves them without a valid id.
merged_df['paper_id'] = merged_df.groupby(['title', 'abstract'], dropna=False).ngroup()

# Reorder the columns to make paper_id the first column
merged_df = merged_df[['paper_id'] + [col for col in merged_df.columns if col != 'paper_id']]

# Display the updated dataframe
display(merged_df)

In [None]:
# Gather the summary numbers for the paper_id column first ...
unique_paper_ids = merged_df['paper_id'].nunique()
paper_id_min = merged_df['paper_id'].min()
paper_id_max = merged_df['paper_id'].max()

# ... then report the distinct count followed by the smallest and largest id.
print(f"Number of unique paper IDs: {unique_paper_ids}")
print(f"Range of paper IDs: {paper_id_min} to {paper_id_max}")

In [None]:
# Put the four key columns first; all remaining columns follow in their
# current order.
columns_order = ['paper_id', 'title', 'abstract', 'review_text']
# The right-hand list is fully built before it is appended, so the membership
# test only sees the four leading names.
columns_order += [col for col in merged_df.columns if col not in columns_order]
merged_df = merged_df[columns_order]

# Show the reordered frame.
display(merged_df)

In [27]:
# Write the final merged table to CSV; the row index is not written.
merged_df.to_csv(
    path_or_buf='../data/processed/merged-200-papers.csv',
    index=False,
)

In [2]:
import pandas as pd


# Reload the merged CSV from ../data/processed/.
df = pd.read_csv('../data/processed/merged-200-papers.csv')
# A bare `df` in the middle of a cell is never rendered (only a cell's last
# expression is), so show a preview explicitly.
display(df.head())

# Export the same table as JSON Lines: one JSON record per row.
df.to_json('../data/processed/merged-200-papers.json', orient='records', lines=True)