In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from IPython.display import clear_output
import os
from time import time

import matplotlib.pyplot as plt
%matplotlib inline

# Load data

In [2]:
# Article restrictions applied when filtering the PMC article database.
# Journals whose articles are retained (matched against 'Journal Title').
journals_keep = [
    'Nat Commun',
    'Nat Neurosci',
    'Nat Methods',
    'PLoS One',
    'PLoS Comput Biol',
    'Proc Natl Acad Sci U S A',
]
# Columns of the article database carried forward.
cols_keep = ['Journal Title', 'Year', 'PMCID', 'PMID']
# Earliest publication year retained (inclusive, compared with >=).
year_min = 2014

In [3]:
# Load database of available articles
df = pd.read_csv('/gh/data/opencode/PMC-ids.csv')

# Rows from the journals of interest published in year_min or later
in_scope = df['Journal Title'].isin(journals_keep) & (df['Year'] >= year_min)

# Select rows and columns in a single .loc call, then drop articles without
# a PMID (not read) and renumber the index from 0. Every step returns a new
# frame, so nothing is mutated in place: this avoids calling
# dropna(inplace=True) on a frame derived by slicing `df`, and makes the
# cell safe to re-run.
df_keep = (
    df.loc[in_scope, cols_keep]
      .dropna(subset=['PMID'])
      .reset_index(drop=True)
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Load information scraped from the articles.
# The scraped results are stored in one CSV per (term, chunk); each file is
# named by the term and a file number taken from `file_nums`.
N_articles = 147359
N_chunk = 10000
# File numbers: every multiple of N_chunk below N_articles, then N_articles itself
file_nums = np.append(np.arange(N_chunk, N_articles, N_chunk), N_articles)
# Full candidate list was: ['python', 'matlab', 'public', 'open', 'code', 'source', 'github']
terms = ['python', 'matlab', 'open', 'code', 'github']
other = ['aff', 'subject']

csv_template = '/gh/data2/opencode/june29/{:s}_{:d}.csv'

# Map each key (search term or extra field) to one concatenated DataFrame
dfs = defaultdict(list)
for key in terms + other:
    chunk_frames = [
        pd.read_csv(csv_template.format(key, num), index_col=0)
        for num in file_nums
    ]
    dfs[key] = pd.concat(chunk_frames)

# Only keep articles with MATLAB or python

In [5]:
# PMCIDs of articles with at least one 'matlab' or 'python' row
ids_mat = dfs['matlab']['PMCID'].unique()
ids_py = dfs['python']['PMCID'].unique()
ids_code = np.union1d(ids_mat, ids_py)

# Restrict every scraped table to those articles
dfs_code = {
    key: frame[frame['PMCID'].isin(ids_code)]
    for key, frame in dfs.items()
}

# Same restriction on the article database, with the index renumbered from 0
keep_mask = df_keep['PMCID'].isin(ids_code)
df_articles = df_keep[keep_mask].reset_index(drop=True)

# Classify each article from a journal as sharing or not

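Input (typed at the prompt shown for each sentence):
* y = shared code
* n = undetermined
* s = skip (note: the labeling cell below does not accept `s`; only `y`, `n`, and `q` are valid)
* q = quit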

In [6]:
# Interactive labeling of one journal's articles.
#
# For each not-yet-labeled article, sentences are shown one at a time in the
# order given by `category_order`. The user answers:
#   y = shared code  -> article is classified, move on to the next article
#   n = undetermined -> show the next sentence
#   q = quit         -> stop labeling; the unfinished article is not recorded
# Labels are written to labels/<journal>.csv after every completed article,
# so an interrupted session can be resumed by re-running this cell.

# Define journal and categories of interest
# journal = 'Proc Natl Acad Sci U S A'
journal = 'Nat Methods'
# Note previous order for PNAS was ['github', 'code', 'python', 'matlab', 'open']
category_order = ['github', 'python', 'matlab', 'code', 'open']

valid_answers = ('y', 'n', 'q')
label_dir = 'labels'
label_path = os.path.join(label_dir, journal + '.csv')

# Define articles of interest
article_ids = df_articles[df_articles['Journal Title']==journal]['PMCID'].values

# If the file already exists, then load it and skip the article ids covered
if os.path.isfile(label_path):
    df_old = pd.read_csv(label_path)
    dict_class = {col: list(df_old[col]) for col in df_old.columns}
    article_ids = np.setdiff1d(article_ids, df_old['PMCID'].unique())
else:
    dict_class = defaultdict(list)

# Print 1 category and 1 sentence at a time
quit_requested = False
for i_article, aid in enumerate(article_ids):
    # Rows for this article are buffered and only committed once the article
    # is finished, so quitting mid-article leaves dict_class untouched.
    article_rows = []
    classified = False
    for cat in category_order:
        cat_df = dfs_code[cat]
        all_sentences = cat_df[cat_df['PMCID']==aid]['sentence'].values
        for i_sent, sent in enumerate(all_sentences):
            start_time = time()
            header = 'Sentence {:d}/{:d}\nCategory: {:s}\nArticle {:d}/{:d}, id {:s}'.format(
                i_sent+1, len(all_sentences), cat, i_article+1, len(article_ids), aid)

            # Prompt user; re-prompt on a typo instead of aborting the session
            ans = None
            while ans not in valid_answers:
                if ans is not None:
                    print('Invalid answer: {!r} (expected y, n or q)'.format(ans))
                print(header)
                ans = input(sent).strip().lower()
                clear_output()

            if ans == 'q':
                quit_requested = True
                break

            # time_ms covers the whole prompt, including any re-prompts
            article_rows.append({
                'PMCID': aid,
                'category': cat,
                'sentence': i_sent,
                'label': ans,
                'time_ms': int((time()-start_time)*1000),
            })

            if ans == 'y':
                classified = True
                break

        if classified or quit_requested:
            break

    if quit_requested:
        print('Quit by user. Labels for article {:s} were not recorded.'.format(aid))
        break

    # Save classification (skipped when the article produced no prompts, so
    # an empty label file is never written)
    if article_rows:
        for row in article_rows:
            for col, val in row.items():
                dict_class[col].append(val)
        os.makedirs(label_dir, exist_ok=True)
        pd.DataFrame(dict_class).to_csv(label_path, index=False)

# Sharing findings for PNAS

In [7]:
# Manually assigned labels for this journal; label 'y' marks code sharing
journal = 'Proc Natl Acad Sci U S A'
df_labeled = pd.read_csv('labels/'+journal+'.csv')
is_shared = df_labeled['label'] == 'y'
ids_share_labeled = df_labeled.loc[is_shared, 'PMCID'].values

# Articles of this journal that have at least one 'github' row
df_pnas = df_articles[df_articles['Journal Title']==journal]
ids_pnas = df_pnas['PMCID']
ids_github = dfs['github']['PMCID'].unique()
df_github = df_pnas[ids_pnas.isin(ids_github)]
ids_share_github = df_github['PMCID'].values

# Counts reported below
n_articles = len(ids_pnas)
n_github = len(ids_share_github)
n_shared = len(ids_share_labeled)
n_github_shared = len(np.intersect1d(ids_share_github, ids_share_labeled))

print('{:d}/{:d} articles contain "github"'.format(n_github, n_articles))
print('{:d}/{:d} articles share code'.format(n_shared, n_articles))
print('{:d}/{:d} "github" articles share code'.format(n_github_shared, n_github))
print('^NOTE: Some of those were not identified by the "github" link\nConclude: We underestimate code sharing by ~1/3 when just looking for the github string')

17/95 articles contain "github"
25/95 articles share code
17/17 "github" articles share code
^NOTE: Some of those were not identified by the "github" link
Conclude: We underestimate code sharing by ~1/3 when just looking for the github string


# Sharing findings for Nat Methods

In [8]:
# Manually assigned labels for this journal; label 'y' marks code sharing
journal = 'Nat Methods'
df_labeled = pd.read_csv('labels/'+journal+'.csv')
shared_rows = df_labeled[df_labeled['label']=='y']
ids_share_labeled = shared_rows['PMCID'].values

# Articles of this journal that have at least one 'github' row.
# NOTE: the names df_pnas / ids_pnas are kept from the PNAS cell, but here
# they hold the rows for `journal` (Nat Methods).
journal_mask = df_articles['Journal Title']==journal
df_pnas = df_articles[journal_mask]
ids_pnas = df_pnas['PMCID']
ids_github = dfs['github']['PMCID'].unique()
github_mask = df_pnas['PMCID'].isin(ids_github)
df_github = df_pnas[github_mask]
ids_share_github = df_github['PMCID'].values

# Articles that both contain "github" and were labeled as sharing code
ids_github_and_shared = np.intersect1d(ids_share_github, ids_share_labeled)

summary_lines = [
    '{:d}/{:d} articles contain "github"'.format(len(ids_share_github), len(ids_pnas)),
    '{:d}/{:d} articles share code'.format(len(ids_share_labeled), len(ids_pnas)),
    '{:d}/{:d} "github" articles share code'.format(len(ids_github_and_shared), len(ids_share_github)),
    '^NOTE: Some of those were not identified by the "github" link\nConclude: We underestimate code sharing by ~1/3 when just looking for the github string',
]
for line in summary_lines:
    print(line)

36/105 articles contain "github"
56/105 articles share code
35/36 "github" articles share code
^NOTE: Some of those were not identified by the "github" link
Conclude: We underestimate code sharing by ~1/3 when just looking for the github string
