# Filter the data you want to study

You can extract one or more years of papers, together with all of their references.

## I. Libraries

In [None]:
import os
import pandas as pd
import glob
import re
import ast
from tqdm import tqdm
import numpy as np
import ast

## II. Data Location

In [11]:
# Directory that holds the pre-processed batch CSV files
files_directory = "../data/dblp_batched_processed"

# Collect every batch file in that directory; they are sorted by batch number
# later, right before they are processed
batch_pattern = os.path.join(files_directory, "processed_papers_batch_*.csv")
batch_files = glob.glob(batch_pattern)

In [12]:
# Directory the filtered CSV files produced by this notebook are written to
output_directory = "../data/"

## III. Filter & extract data

In [13]:
# Extract batch number and sort files numerically
def extract_batch_number(filename):
    """Return the numeric batch index embedded in a batch file name.

    Only the final path component is inspected. It is searched for
    ``papers_batch_<digits>.csv`` and the digits are returned as an int.
    File names that do not contain that pattern yield 0.
    """
    found = re.search(r'papers_batch_(\d+)\.csv', os.path.basename(filename))
    return int(found.group(1)) if found else 0

In [14]:
# Publication years to extract: a paper is kept when its 'year' value is in this list
years = [2020]

In [None]:
# Sort the batch files by their numerical batch number
batch_files.sort(key=extract_batch_number)

print(f"Found {len(batch_files)} batch files to process")

# Human-readable list of the requested years. Building it once keeps the
# f-strings below free of nested double quotes, which are a SyntaxError on
# Python versions older than 3.12.
years_label = ",".join(str(year) for year in years)

# Only these columns are loaded from each batch file to save memory
wanted_columns = ['id', 'title', 'year', 'references', 'abstract']

# Per-batch matches are collected in a list and concatenated once after the
# loop (more efficient than calling concat inside the loop)
dfs_filtered = []

# Process each batch file in numerical order with a progress bar
for batch_file in tqdm(batch_files, desc="Processing batch files"):
    batch_number = extract_batch_number(batch_file)

    try:
        data = pd.read_csv(
            batch_file,
            sep=';',
            usecols=lambda x: x in wanted_columns
        )

        if 'year' in data.columns:
            year_filtered_papers = data[data['year'].isin(years)].copy()
            if not year_filtered_papers.empty:
                # 'references' is stored as text and parsed back into a Python
                # object. ast.literal_eval raises on non-string cells (e.g. a
                # missing value read as NaN), which would make the except
                # clause below discard the whole batch; such cells are
                # treated as "no references" instead.
                year_filtered_papers['references'] = year_filtered_papers['references'].apply(
                    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else []
                )
                dfs_filtered.append(year_filtered_papers)

        # Clean up to free memory
        del data
    except Exception as e:
        # Best effort: report the failing batch and continue with the next one
        tqdm.write(f"Error retrieving batch #{batch_number}: {e}")

# Combine all matching papers at once
if dfs_filtered:
    df_filtered = pd.concat(dfs_filtered, ignore_index=True)

    # Save the combined dataframe of papers from the requested years
    df_filtered_output = os.path.join(output_directory, "dblp_papers_filtered.csv")
    df_filtered.to_csv(df_filtered_output, index=False, sep=';')

    print("\nProcessing complete!")
    print(f"Total papers from {years_label}: {len(df_filtered)}")
    print(f"{years_label} papers saved to: {df_filtered_output}")
else:
    print(f"\nNo papers from {years_label}  found.")

Found 49 batch files to process


Processing batch files:   0%|          | 0/49 [00:00<?, ?it/s]

Processing batch files: 100%|██████████| 49/49 [01:03<00:00,  1.30s/it]



Processing complete!
Total papers from 2020: 18213
2020 papers saved to: ../data/dblp_papers_filtered.csv


In [21]:
# Mean number of references per filtered paper
avg_out_len = df_filtered['references'].map(len).mean()

print("Average length of outCitations:", avg_out_len)

Average length of outCitations: 13.466974139351013


## IV. (Optional) Filter the data by a threshold percentage of reference papers that are actually available in the dataset

In [22]:
# threshold = 1  # Set your desired threshold
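# NOTE: `data` is deleted at the end of each batch iteration above, so `all_ids`
# must be rebuilt from the batch files before this cell can be re-enabled.
# all_ids = data['id'].unique()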

# def fraction_in_combined(cites):
#     if len(cites) == 0:
#         return True
#     match_count = sum(int(cid) in all_ids for cid in cites)
#     return (match_count / len(cites)) >= threshold

# df_filtered_filtered = df_filtered[df_filtered['references'].apply(fraction_in_combined)].copy(deep=True)
# print(len(df_filtered_filtered))
# avg_out_len = df_filtered_filtered['references'].apply(len).mean()

# print("Average length of outCitations:", avg_out_len)

## V. (Optional) Only keep a random sample of the filtered data for faster processing

In [24]:
# When True, the next cell draws a random subset of the filtered papers instead of keeping all of them
sampling = True

In [25]:
# Choose the "seed" papers whose references are collected in the next section.
if sampling:
    # Cap the sample size at the number of available rows: DataFrame.sample
    # raises a ValueError when n exceeds the population (sampling without
    # replacement). random_state makes the draw reproducible.
    sub_df_filtered = df_filtered.sample(n=min(100, len(df_filtered)), random_state=42)
else:
    # Without sampling, every filtered paper is a seed paper. Binding the same
    # names in both branches keeps the following cells working either way
    # (previously they raised NameError when sampling was disabled).
    sub_df_filtered = df_filtered

# Distinct ids cited by the seed papers
out_cited_ids = np.unique(np.concatenate(sub_df_filtered['references'].values))

if sampling:
    avg_out_len = sub_df_filtered['references'].apply(len).mean()
    print("Average length of outCitations:", avg_out_len)

df_filtered_with_refs_list = [sub_df_filtered]

Average length of outCitations: 14.75


## VI. Get all references of filtered data

In [26]:
# Seed papers plus the distinct ids they cite (out_cited_ids was de-duplicated with np.unique)
print("Expected total number of papers:", len(sub_df_filtered)+len(out_cited_ids))

Expected total number of papers: 1564


In [28]:
# Seed papers: the random sample when sampling is enabled, otherwise every
# filtered paper. Selecting them here (instead of hard-coding sub_df_filtered)
# lets this cell run with either setting of `sampling`.
seed_papers = sub_df_filtered if sampling else df_filtered

# Distinct ids cited by the seed papers; identical to the value computed in the
# sampling cell when sampling is enabled.
out_cited_ids = np.unique(np.concatenate(seed_papers['references'].values))

# Start the result list with the seed papers; matching reference papers from
# each batch are appended and everything is concatenated once after the loop.
df_filtered_with_refs_list = [seed_papers]

# Comma-separated years for the summary message (a plain "" join would glue
# several years together, e.g. "20202021").
years_label = ",".join(str(year) for year in years)

# Sort the batch files by their numerical batch number. extract_batch_number is
# the helper defined earlier in the notebook; it is not redefined here.
batch_files.sort(key=extract_batch_number)

# Process each batch file in numerical order with a progress bar
for batch_file in tqdm(batch_files, desc="Processing batch files"):
    batch_number = extract_batch_number(batch_file)

    try:
        # Read the CSV file - only read necessary columns to save memory
        data = pd.read_csv(
            batch_file,
            sep=';',
            usecols=lambda x: x in ['id', 'title', 'year', 'references', 'abstract']
        )

        # Keep the papers whose id is cited by one of the seed papers
        if 'id' in data.columns:
            ref_data = data[data['id'].isin(out_cited_ids)].copy()
            if not ref_data.empty:
                # Reference papers get an empty reference list, so only the
                # seed papers carry outgoing citations in the output.
                # NOTE(review): a cited paper that is itself a seed paper ends
                # up in the output twice - confirm whether that is intended.
                ref_data['references'] = ref_data['references'].apply(lambda x: [])
                df_filtered_with_refs_list.append(ref_data)

        # Clean up to free memory
        del data
    except Exception as e:
        # Best effort: report the failing batch and continue with the next one
        tqdm.write(f"Error processing batch #{batch_number}: {e}")

# The list always holds at least the seed papers, so no emptiness check is needed
df_filtered_with_refs = pd.concat(df_filtered_with_refs_list, ignore_index=True)

# Save the seed papers together with the reference papers that were found
df_filtered_with_refs_output = os.path.join(output_directory, "dblp_papers_filtered_sample_with_refs.csv")
df_filtered_with_refs.to_csv(df_filtered_with_refs_output, index=False, sep=';')

print("\nProcessing complete!")
print(f"Sample {len(seed_papers)} papers from {years_label} with Reference papers: {len(df_filtered_with_refs)} papers in total")
print(f"Saved to: {df_filtered_with_refs_output}")

Processing batch files: 100%|██████████| 49/49 [01:01<00:00,  1.25s/it]


Processing complete!
Sample 100 papers from 2020 with Reference papers: 1564 papers in total
Saved to: ../data/dblp_papers_filtered_sample_with_refs.csv



