# CORD19 - enriching sample with DOIs/PIDs

The current two samples for the top 100, and 1000 items without the top 100, do not contain DOIs in their original form.

This notebook adds one DOI per item and writes the samples to a new version file.

What it does in detail is:

1. Collect all PIDs for all instances of the exact software name from the original CORD-19 SM dataset
2. Pick one random PID
3. Add it to the sample dataset(s)
4. Trim the name strings in the process
5. Write the files

In [None]:
import csv
import pandas as pd
import numpy as np
import random

In [None]:
# Input locations: the CORD-19 software-mentions CSV and the two sample CSVs
csm_path = '../data/cord-19/CORD19_software_mentions.csv'
random1k_path = '../data/sampling/1000sample_without_top_100.csv'
top100_path = '../data/sampling/top_100.csv'

In [None]:
# Load the software-mentions data and both samples from their CSV files.
df_csm = pd.read_csv(csm_path)
df_top100 = pd.read_csv(top100_path)
df_rand1k = pd.read_csv(random1k_path)

# Give every sample row its own (independent) empty list in a new 'urls' column.
df_top100['urls'] = [[] for _ in range(len(df_top100))]
df_rand1k['urls'] = [[] for _ in range(len(df_rand1k))]

In [None]:
# Amend URLs in datasets, iterate the CSM once only!
#
# Build a lookup from software name to the sample rows carrying that name,
# so each CSM row costs a few dict lookups instead of a full scan over both
# sample dataframes.
sample_rows_by_name = {}
for sample_df in (df_top100, df_rand1k):
    for i, sample_name in sample_df['software'].items():
        sample_rows_by_name.setdefault(sample_name, []).append((sample_df, i))

for j, names_str, url in zip(df_csm.index, df_csm['software'], df_csm['url']):
    if j % 10000 == 0:
        print('On iteration ', j)
    # A missing 'software' cell is read as a float NaN and has no names to match.
    if not isinstance(names_str, str):
        continue
    # The 'software' cell holds a stringified list such as "['a', 'b']":
    # drop brackets and quotes, split on commas and trim each name.
    raw_names = names_str.replace('[', '').replace(']', '').replace("'", '').split(',')
    # Use a set so a name repeated within one CSM row adds its URL only once
    # per sample row, exactly like a plain membership test would.
    names = {s.strip() for s in raw_names}
    for name in names:
        # Append this mention's URL to the 'urls' list of every matching sample row.
        for sample_df, i in sample_rows_by_name.get(name, ()):
            sample_df.at[i, 'urls'].append(url)

In [None]:
# Pick a random entry in 'urls' and add to column
#
# The result is assigned as a whole column: writing to the row objects yielded
# by iterrows() is not a reliable way to modify the dataframe (the row may be a
# copy), so 'rand_url' could be missing from the saved files.
# Rows whose 'urls' list is empty get None, because random.choice() raises
# IndexError on an empty sequence.
df_top100['rand_url'] = [
    random.choice(urls) if urls else None for urls in df_top100['urls']
]
df_rand1k['rand_url'] = [
    random.choice(urls) if urls else None for urls in df_rand1k['urls']
]

In [None]:
# Write each enriched sample next to its input file: the 4-character
# extension is cut off the input path and '_urls.csv' is appended instead.
for frame, source_path in ((df_top100, top100_path), (df_rand1k, random1k_path)):
    target_path = source_path[:-4] + '_urls.csv'
    frame.to_csv(target_path, encoding='utf8')