### Imports Per File

In [15]:
import os
import re
import ast
import pandas as pd
import logging
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set up logging configuration
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

# Create a logger
logger = logging.getLogger(__name__)

# Create a file handler and set the logging level to ERROR
file_handler = logging.FileHandler('error.log')
file_handler.setLevel(logging.ERROR)

# Create a console handler and set the logging level to INFO
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Create a formatter and add it to both handlers
formatter = logging.Formatter('%(levelname)s: %(message)s')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)

### Extract R and Python Imports

In [3]:
def extract_imported_r_libs(text):
    pattern = r'\b(?:library|require)\s*\(\s*["\']?([^"\']+?)["\']?\s*\)'
    libs = re.findall(pattern, text, flags=re.IGNORECASE)
    return list(set(libs))

def extract_imported_py_libs(text):
    logger = logging.getLogger(__name__)

    libs = []
    
    try:
        tree = ast.parse(text)
    except SyntaxError as e:
        logger.warning(f"{e}")
        
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                libs.append(alias.name)
        elif isinstance(node, ast.ImportFrom):  
            libs.append(node.module)

    return list(set(libs))

In [4]:
def get_imports(file_path):

  with open(file_path, 'r', encoding='latin1') as f:
    text = f.read()

  if file_path.endswith(('.R', '.Rscript')):
    imports = extract_imported_r_libs(text)
    language = 'R'

  elif file_path.endswith(('.py', '.ipynb')):
    imports = extract_imported_py_libs(text)
    language = 'Python'

  else:
    return None

  df = pd.DataFrame({'package': imports, 
                     'language': [language]*len(imports),
                     'file_path': [file_path]*len(imports)})
  return df

In [5]:
def extract_imports_from_folder(folder_path):
    dfs = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            
            try:
                df = get_imports(file_path)
            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                continue
    
            if df is not None:
                dfs.append(df)

    imports_df = pd.concat(dfs, ignore_index=True)
  
    return imports_df

In [16]:
# Specify the root folder where to start the search
root_folder = 'scripts/'

# Process the files and store the results in a DataFrame
data = extract_imports_from_folder(root_folder)

ERROR: Error processing scripts/isq_datasets_files/PSOFAU/searchAllCountryMentionsUS.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_datasets_files/PSOFAU/searchAllCountryMentionsUS.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_datasets_files/PSOFAU/EUscrape14jul2010.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_datasets_files/PSOFAU/EUscrape14jul2010.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_datasets_files/PSOFAU/EU-finalize-briefs14jul2010.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_datasets_files/PSOFAU/EU-finalize-briefs14jul2010.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_datasets_files/PSOFAU/EU-NameAndSlice14jul2010.py: local variable 'tree' referenced before assignment
ERROR: Error processing scripts/isq_da

In [7]:
data['language'].value_counts()

language
R         54561
Python     2775
Name: count, dtype: int64

In [8]:
data

Unnamed: 0,package,language,file_path
0,ggplot2,R,scripts/FPA_datasets_files/CUUC5A/Mediation Fi...
1,foreign,R,scripts/FPA_datasets_files/6S3SHE/Chung FPA Re...
2,memisc,R,scripts/FPA_datasets_files/6S3SHE/Chung FPA Re...
3,mediation,R,scripts/FPA_datasets_files/6S3SHE/Chung FPA Re...
4,tidyverse,R,scripts/FPA_datasets_files/LO7A8I/Main Documen...
...,...,...,...
57331,dplyr,R,scripts/isec_datasets_files/VF9X6A/replication...
57332,here,R,scripts/isec_datasets_files/VF9X6A/replication...
57333,ggplot2,R,scripts/isec_datasets_files/VF9X6A/replication...
57334,data.table,R,scripts/isec_datasets_files/VF9X6A/replication...


In [9]:
data['dataverse'] = data['file_path'].str.extract(r'/([^/]+)_datasets_files/')
data['repo_id'] = data['file_path'].str.extract(r'_datasets_files/([^/]+)/')

In [10]:
data.to_csv("../data/file_imports.csv", index = False)

In [11]:
data

Unnamed: 0,package,language,file_path,dataverse,repo_id
0,ggplot2,R,scripts/FPA_datasets_files/CUUC5A/Mediation Fi...,FPA,CUUC5A
1,foreign,R,scripts/FPA_datasets_files/6S3SHE/Chung FPA Re...,FPA,6S3SHE
2,memisc,R,scripts/FPA_datasets_files/6S3SHE/Chung FPA Re...,FPA,6S3SHE
3,mediation,R,scripts/FPA_datasets_files/6S3SHE/Chung FPA Re...,FPA,6S3SHE
4,tidyverse,R,scripts/FPA_datasets_files/LO7A8I/Main Documen...,FPA,LO7A8I
...,...,...,...,...,...
57331,dplyr,R,scripts/isec_datasets_files/VF9X6A/replication...,isec,VF9X6A
57332,here,R,scripts/isec_datasets_files/VF9X6A/replication...,isec,VF9X6A
57333,ggplot2,R,scripts/isec_datasets_files/VF9X6A/replication...,isec,VF9X6A
57334,data.table,R,scripts/isec_datasets_files/VF9X6A/replication...,isec,VF9X6A


### Let's count an import per repo. only once

In [12]:
repo_level = data.groupby(['dataverse', 'repo_id'])['package'].agg(set).reset_index()
repo_level.to_csv("../data/repo_level.csv", index = False)
repo_level

Unnamed: 0,dataverse,repo_id,package
0,BJPolS,0BFF0K,"{Matching, stargazer, lfe, ggplot2}"
1,BJPolS,11V2P6,"{lmtest, sandwich, sensemakr, plotrix, cluster..."
2,BJPolS,1B1MXY,"{car, plyr, reshape, ast, mokken, dplyr, itert..."
3,BJPolS,1QFIA1,"{patchwork, sensemakr, lme4, margins, estimatr..."
4,BJPolS,1WZRY2,"{spatialreg, broom, splm, dplyr, stringr, read..."
...,...,...,...
2721,xps,YIZEA7,{data.table}
2722,xps,YNXJZO,"{modelsummary, esc, cobalt, ggeffects, mediati..."
2723,xps,YTZZIT,"{rdd, plyr, rdrobust, ggplot2, foreign}"
2724,xps,YZPGSV,"{estimatr, broom.mixed, ggplot2, jtools, plm, ..."


In [13]:
# Assuming for now that python and R imports don't clash
cites_df = repo_level.explode('package').groupby('package')\
           .size().reset_index(name='count').sort_values(by='count', ascending=False)
cites_df

Unnamed: 0,package,count
891,ggplot2,1322
769,foreign,1009
1871,stargazer,901
640,dplyr,789
1978,tidyverse,720
...,...,...
1056,ivDiag,1
168,MapColoring,1
1053,iterators,1
1052,iter,1


In [14]:
cites_df.to_csv("../data/imports_per_package.csv", index = False)