In [1]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import random
import urllib.request
import os
from dotenv import load_dotenv
from pathlib import Path

# Load settings from the `.czi.env` file in the current working directory
# into the process environment.
env_path = Path('.') / '.czi.env'
load_dotenv(dotenv_path=env_path)


# Configure the data dir path in .czi.env located in this directory
DATA_DIR_PATH = os.environ.get("DATA_DIR_PATH")
if not DATA_DIR_PATH:
    # Fail here with an actionable message: an unset value would otherwise be
    # interpolated into every path below as the literal text "None".
    raise RuntimeError(
        f'DATA_DIR_PATH is not set; define it in {env_path} or in the environment.'
    )
print(f'Using data directory {DATA_DIR_PATH}')

# File name of the sample CSV; it is joined onto DATA_DIR_PATH when loaded.
SAMPLE_1000 = 'sample_1000_counts.csv'

Using data directory <retracted>


In [2]:
# Column name -> dtype mapping for the sample dataset.
# Columns are grouped by dtype, in the same order as the mapping is meant to
# list them; each group expands to one `column: dtype` entry per name.

_meta = {
    **dict.fromkeys(['Unnamed: 0', 'Unnamed: 0.1'], 'object'),
    **dict.fromkeys(['license', 'location'], 'str'),
    **dict.fromkeys(
        ['pmcid', 'pmid', 'doi', 'pubdate', 'source', 'number'],
        'object',
    ),
    **dict.fromkeys(
        ['text', 'software', 'software_upper', 'version', 'ID', 'curation_label'],
        'str',
    ),
    'mention_counts': 'int64',
}

In [3]:
# Read the sample CSV into a Dask dataframe, applying the dtypes from `_meta`.

import dask.dataframe as dd

sample_csv_path = f'{DATA_DIR_PATH}/{SAMPLE_1000}'
ddf_sample = dd.read_csv(sample_csv_path, dtype=_meta)

In [4]:
# Read the five normalized "linked" CSVs as Dask dataframes.
# The paths are listed in the same order as the names they are unpacked into.

cran_df, pypi_df, bioconductor_df, github_df, scicrunch_df = map(
    dd.read_csv,
    (
        f'{DATA_DIR_PATH}/linked/linked/normalized/cran_df.csv',
        f'{DATA_DIR_PATH}/linked/linked/normalized/pypi_df.csv',
        f'{DATA_DIR_PATH}/linked/linked/normalized/bioconductor_df.csv',
        f'{DATA_DIR_PATH}/linked/linked/normalized/github_df.csv',
        f'{DATA_DIR_PATH}/linked/linked/normalized/scicrunch_df.csv',
    ),
)

In [5]:
from functools import reduce

# Dask series holding the sample's IDs (kept under its original name and type).
id_series = ddf_sample.ID

# Materialise the IDs once, up front. Handing the Dask series itself to
# pandas' `isin` forces pandas to iterate it, which triggers a Dask
# computation on every call inside the loop below.
sample_ids = id_series.compute()

data_frames = [ddf_sample,
               cran_df,
               github_df,
               pypi_df,
               bioconductor_df,
               scicrunch_df
              ]

# Bring every frame into pandas and keep only the rows whose ID occurs in the
# sample.
pd_dfs = []

for data_frame in data_frames:
    data_frame = data_frame.compute()
    data_frame = data_frame[data_frame['ID'].isin(sample_ids)]
    pd_dfs.append(data_frame)


def _merge_on_id(left, right):
    """Outer-join two pandas frames on their 'ID' column.

    Overlapping non-key columns are disambiguated with suffixes built from
    the row counts of the two inputs.

    NOTE(review): the row-count suffixes carry no meaning for a reader of the
    output, and two merge steps that happen to produce equal counts would get
    identical suffixes. Suffixes named after the source dataset would be
    clearer, but would rename columns in the written CSV, so the existing
    scheme is kept here.
    """
    return pd.merge(
        left,
        right,
        on=['ID'],
        how='outer',
        suffixes=(f'_{left.index.size + right.index.size}', f'_{right.index.size}'),
    )


# Fold the frames together left to right, starting from the sample itself.
# The inputs are already pandas objects, so pandas' own merge is used; this
# keeps the result a pandas frame for the column assignment and single-file
# CSV write below.
df_merged = reduce(_merge_on_id, pd_dfs)

df_merged = df_merged.drop_duplicates('ID')  # Linking may have introduced extra rows for the same ID

# Empty columns added to the output, one per annotation field.
annotation_columns = [
    'MAIN','QA','QA_retrieval','preprint','software_paper','confidence','programming_language','repository_platform','discuss_in_paper','annotator','comments'
]

for new_column in annotation_columns:
    df_merged[new_column] = ""

# NOTE(review): the index is written as an extra unnamed column; pass
# index=False if downstream readers do not need it — confirm before changing.
df_merged.to_csv(f'{DATA_DIR_PATH}/sample_1000_with_links.csv')