In [1]:
import os
import shutil
import re
import pandas as pd

## Sort Papers by Year

In [2]:
# Input folder holding the papers, and output folder for the per-year copies
source_dir = '../papers'
target_dir = '../papers_by_year'

# Candidate publication years: 1985 through 2024 inclusive
years = [*range(1985, 2025)]

# Make sure one sub-folder per candidate year exists under the target directory
for year in years:
    year_dir = os.path.join(target_dir, f"{year}")
    # exist_ok=True lets this cell be re-run without failing on existing folders
    os.makedirs(year_dir, exist_ok=True)

# Function to find the year in the file name
def find_year(filename, valid_years=None):
    """Return the first standalone four-digit year found in ``filename``.

    A candidate is a run of exactly four digits that is not part of a longer
    digit run, so identifiers such as ``119852`` are not mistaken for 1985.
    Candidates are examined left to right, and the first one that is a member
    of ``valid_years`` is returned.

    Parameters
    ----------
    filename : str
        File name (not the full path) to inspect.
    valid_years : iterable of int, optional
        Years that count as a match. Defaults to the notebook-level ``years``
        list when omitted.

    Returns
    -------
    int or None
        The matched year, or ``None`` when the name contains no valid year.
    """
    if valid_years is None:
        valid_years = years
    allowed = set(valid_years)

    # Digit look-arounds are used instead of \b because "_" is a word
    # character, so \b would fail on names like "smith_1994_paper.pdf".
    for match in re.finditer(r'(?<!\d)\d{4}(?!\d)', filename):
        candidate = int(match.group())
        if candidate in allowed:
            return candidate
    return None

# Source files whose names contain no recognisable year
unsorted_files = []
# Destination path -> source file that claimed it during this run. Used to
# notice two different source files that would land on the same destination.
claimed_targets = {}
# (kept source, skipped source) pairs for such same-name collisions
name_collisions = []

# Walk the source tree. Sorting directories (in place, as os.walk allows for
# top-down walks) and files makes it deterministic which duplicate is kept.
for subdir, dirs, files in os.walk(source_dir):
    dirs.sort()
    for file in sorted(files):
        file_path = os.path.join(subdir, file)
        year = find_year(file)

        if year is None:
            unsorted_files.append(file_path)
            continue

        target_path = os.path.join(target_dir, str(year), file)

        # A second source file mapping to the same destination would otherwise
        # be dropped silently by the existence check below; record it instead.
        if target_path in claimed_targets:
            name_collisions.append((claimed_targets[target_path], file_path))
            continue
        claimed_targets[target_path] = file_path

        # Files already copied by an earlier run are left alone, so the cell
        # can be re-executed without re-copying everything.
        if not os.path.exists(target_path):
            shutil.copy2(file_path, target_path)

# Report anything that did not end up in a year folder
if unsorted_files:
    print("Unsorted files:")
    for unsorted_file in unsorted_files:
        print(unsorted_file)

if name_collisions:
    print("Skipped files (another file with the same name was sorted into the same year):")
    for kept_file, skipped_file in name_collisions:
        print(f"{skipped_file} (kept: {kept_file})")

if not unsorted_files and not name_collisions:
    print("All files have been sorted.")

All files have been sorted.


## Get Paper Counts for Each Year

In [3]:
# Start every candidate year at zero; keys are the year folder names (strings)
paper_counts = dict.fromkeys(map(str, years), 0)

# Walk the sorted tree and add each folder's file count to its year's tally
for folder, _, filenames in os.walk(target_dir):
    # The folder's own name identifies the year; other folders are ignored
    folder_year = os.path.basename(folder)
    if folder_year in paper_counts:
        paper_counts[folder_year] += len(filenames)

# One (year, count) row per candidate year, in chronological order
count_rows = list(paper_counts.items())
paper_counts_df = pd.DataFrame(count_rows, columns=['Year', 'Paper Count'])

paper_counts_df

Unnamed: 0,Year,Paper Count
0,1985,1
1,1986,0
2,1987,0
3,1988,0
4,1989,0
5,1990,0
6,1991,0
7,1992,1
8,1993,0
9,1994,2
