### Imports Per File

In [1]:
import os
import re
import pandas as pd

In [2]:
def extract_package_names(text):
    """Return the distinct package names loaded by quoted library()/require() calls.

    Scans ``text`` for calls shaped like ``library("pkg")`` or
    ``require('pkg')``. Matching is case-insensitive and tolerates
    whitespace around the parentheses and around the quoted argument.

    Args:
        text: Source code to scan, as a single string.

    Returns:
        A list of the unique package names, sorted in ascending order.
        Sorting keeps the result reproducible: the iteration order of a
        bare ``set`` of strings can change from one interpreter run to the
        next because of hash randomization.

    Note:
        Only quoted, single-argument calls are recognised. Unquoted calls
        such as ``library(pkg)`` and calls with extra arguments such as
        ``library("pkg", quietly = TRUE)`` are not matched.
    """
    pattern = r'\b(?:library|require)\s*\(\s*["\']([^"\']+)["\']\s*\)'
    packages = re.findall(pattern, text, flags=re.IGNORECASE)
    # De-duplicate, then sort so callers always see a stable order.
    return sorted(set(packages))

def process_file(file_path):
    """Read one file and list the packages it loads or installs.

    The file is decoded as latin1. Before scanning, every occurrence of
    "install.packages" is rewritten to "library" so that install calls are
    picked up by the same pattern as library()/require() calls.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(file_path, packages)`` tuple, where ``packages`` is whatever
        ``extract_package_names`` returns for the rewritten text.
    """
    with open(file_path, mode='r', encoding="latin1") as handle:
        source_text = handle.read()

    # Treat installs as imports by making them look like library() calls.
    normalised = source_text.replace("install.packages", "library")
    return file_path, extract_package_names(normalised)

def process_folder(folder_path):
    """Run ``process_file`` on every file found beneath ``folder_path``.

    Args:
        folder_path: Directory to walk recursively.

    Returns:
        A list with one ``process_file`` result per file, in the order in
        which ``os.walk`` yields the files. Empty when the walk yields no
        files.
    """
    return [
        process_file(os.path.join(parent, filename))
        for parent, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
    ]

In [3]:
# Top of the directory tree that is scanned for scripts
root_folder = 'scripts/'

# Scan every file under the root, then tabulate one row per file:
# the file's path and the list of packages found in it
data = process_folder(root_folder)
df = pd.DataFrame(
    data=data,
    columns=['path_name', 'imports'],
)

In [4]:
# Derive the dataverse name and the repository id from each file path.
# Both patterns match literal '/' separators around a
# "<dataverse>_datasets_files/<repo_id>/" path segment; paths that do not
# contain such a segment yield NaN.
df['dataverse'] = df['path_name'].str.extract(r'/([^/]+)_datasets_files/', expand=False)
df['repo_id'] = df['path_name'].str.extract(r'_datasets_files/([^/]+)/', expand=False)

In [5]:
# Keep only the files in which at least one package was found, and save them
has_imports = df['imports'].apply(lambda pkgs: len(pkgs) > 0)
df[has_imports].to_csv("file_imports.csv", index=False)

In [6]:
# Merge the per-file package lists into one list per (dataverse, repo_id).
# The nested comprehension flattens each group in a single pass, whereas
# sum(lists, []) would copy the growing result once for every file.
# NOTE(review): a package used by several files of the same repository
# appears once per file in the merged list — confirm this is intended
# before reading downstream counts as "number of repositories".
repo_level = (
    df.groupby(['dataverse', 'repo_id'])['imports']
    .apply(lambda file_lists: [pkg for pkgs in file_lists for pkg in pkgs])
    .reset_index()
)

In [7]:
# Tally how often each package occurs across the repository-level lists:
# explode() gives one row per list element, value_counts() counts them.
# NOTE(review): the column labels produced by value_counts().reset_index()
# differ between pandas 1.x and 2.x — confirm what CSV consumers expect.
cites_df = pd.DataFrame(
    repo_level["imports"]
    .explode()
    .value_counts()
    .reset_index()
)

In [9]:
# Save the per-package tallies, leaving out the row index
cites_df.to_csv(path_or_buf="imports_per_package.csv", index=False)