### Assemble COVID19 articles with IDs, abstracts and full texts

In [2]:
import pandas as pd
import numpy as np
import json
import glob

In [65]:
latest_data_path = "/Users/rmartinshort/Documents/DS_projects/covid/dataset/2020-03-13"

In [66]:
sourcefiles = glob.glob("{}/*".format(data_source))

In [69]:
sourcefiles

['../dataset/2020-03-13/pmc_custom_license',
 '../dataset/2020-03-13/json_schema.txt',
 '../dataset/2020-03-13/noncomm_use_subset',
 '../dataset/2020-03-13/all_sources_metadata_2020-03-13.csv',
 '../dataset/2020-03-13/all_sources_metadata_2020-03-13.readme',
 '../dataset/2020-03-13/biorxiv_medrxiv',
 '../dataset/2020-03-13/COVID.DATA.LIC.AGMT.pdf',
 '../dataset/2020-03-13/comm_use_subset']

In [68]:
latest_data_path

'/Users/rmartinshort/Documents/DS_projects/covid/dataset/2020-03-13'

### Take a look at the metadata file 

Also see https://www.kaggle.com/tanulsingh077/a-comprehensive-resource-notebook-for-beginners

In [75]:
metafile = sorted([f for f in sourcefiles if "metadata" in f])[0]

In [76]:
metadata = pd.read_csv(metafile)

In [77]:
metadata.isna().sum()

sha                            12080
source_x                           0
title                            370
doi                             3143
pmcid                           2163
pubmed_id                      12770
license                        11808
abstract                        2947
publish_time                   11252
authors                          946
journal                        11709
Microsoft Academic Paper ID    28366
WHO #Covidence                 28264
has_full_text                  12080
dtype: int64

In [78]:
def detect_time_format(input_time_string):
    
    """
    Fix the format of the publish_time column so
    that pd.datetime can read it
    """
    
    return 0
    

In [81]:
def doi_url(d):
    
    """
    Create links to the DOIs so that papers 
    can be found online 
    """
    
    if d.startswith('http://'):
        return d
    elif d.startswith('doi.org'):
        return f'http://{d}'
    else:
        return f'http://doi.org/{d}'

In [79]:
metadata["publish_time"]

0        2020
1        2020
2        2020
3        2020
4        2020
         ... 
29495     NaN
29496     NaN
29497     NaN
29498     NaN
29499     NaN
Name: publish_time, Length: 29500, dtype: object

In [80]:
metadata.describe(include="all")

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
count,17420,29500,29130,26357,27337,16730.0,17692,26553,18248.0,28554,17791,1134.0,1236,17420
unique,17398,4,24654,22203,23222,,15,22453,4499.0,23860,1732,,1223,2
top,72a5640aa0c307fbe171ca7ad55d3fda48b53988,PMC,Articles of Significant Interest Selected from...,http://dx.doi.org/10.1371/journal.pcbi.1006483,PMC6224041,,CC BY,The automated comparison of protein-ligand bin...,2020.0,"['Ehrt, Christiane', 'Brinkjost, Tobias', 'Koc...",PLoS One,,#1890,True
freq,4,27337,67,35,35,,11575,35,1148.0,35,2204,,2,13219
mean,,,,,,26267590.0,,,,,,2697968000.0,,
std,,,,,,4697935.0,,,,,,487358600.0,,
min,,,,,,67173.0,,,,,,39126300.0,,
25%,,,,,,23634810.0,,,,,,2366820000.0,,
50%,,,,,,27046580.0,,,,,,3002534000.0,,
75%,,,,,,29951280.0,,,,,,3005536000.0,,


In [31]:
metadata.drop_duplicates(subset=["title"]).drop_duplicates(subset=["sha"],inplace=True)

In [29]:
metadata['doi_url'] = metadata["doi"].fillna('').apply(doi_url)

In [82]:
metadata.to_csv("processed_metadata.csv",index=False)

### Gather all the full texts and join them with the metadata

Also see https://www.kaggle.com/fmitchell259/create-corona-csv-file

In [30]:
corona_features = {"doc_id": [None], "source": [None], "title": [None],
                  "abstract": [None], "text_body": [None]}

corona_df = pd.DataFrame.from_dict(corona_features)

In [83]:
json_filenames = glob.glob(f'{latest_data_path}/**/*.json', recursive=True)

This is the total number of articles for which we have full texts, across all the sources

In [85]:
len(json_filenames)

13202

In [50]:
json_filenames[:10]

['../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/8f8eb4f004c2002face0723f2f58cc411954d36e.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/63f7049d200896290b38b38711113054f7ea1b50.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/4df45b8404d9de0b376a8ae3c282a517df36fe51.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/e0737ee93afe7b0bf06b1e3f9adf21d541dd10f0.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/3c3572ba243d61e7631725669c8f88347fdbd5bc.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/b277e521eb43fedeb50a08be126e76dd9bc7314a.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/4cb9c6ef889605b3149ab8b59c8258074067ba04.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/11a21be0569b11edf62c871f9e2561a2a5389006.json',
 '../dataset/2020-03-13/pmc_custom_license/pmc_custom_license/cf3640a2e06457c47beac679ac651bc69f7c9521.json',
 '../datas

In [86]:
def return_corona_df(json_filenames, df):
    
    """
    Create dataframe containing the article texts
    """

    for file_name in json_filenames:

        row = {"doc_id": None, "source": None, "title": None,
              "abstract": None, "text_body": None}
        
        source_name = file_name.split('/')[-2]

        with open(file_name) as json_data:
            data = json.load(json_data)

            row['doc_id'] = data['paper_id']
            row['title'] = data['metadata']['title']

            # Now need all of abstract. Put it all in 
            # a list then use str.join() to split it
            # into paragraphs. 

            abstract_list = [data['abstract'][x]['text'] for x in range(len(data['abstract']) - 1)]
            abstract = "\n ".join(abstract_list)

            row['abstract'] = abstract

            # And lastly the body of the text.
            
            body_list = []
            for _ in range(len(data['body_text'])):
                try:
                    body_list.append(data['body_text'][_]['text'])
                except:
                    pass

            body = "\n ".join(body_list)
            
            row['text_body'] = body
            
            row['source'] = source_name
            
            df = df.append(row, ignore_index=True)
    
    return df.iloc[1:]
    

In [87]:
corona_df_all_sources = return_corona_df(json_filenames, corona_df)

In [89]:
corona_df_all_sources.shape

(13202, 5)

In [90]:
corona_df_all_sources.isna().sum()

doc_id       0
source       0
title        0
abstract     0
text_body    0
dtype: int64

In [91]:
corona_df_all_sources.to_csv("corona_all_articles.csv",index=False)

### Join the datasets and save

In [93]:
cols_to_use = metadata.columns.difference(corona_df_all_sources.columns)

In [94]:
cols_to_use

Index(['Microsoft Academic Paper ID', 'WHO #Covidence', 'authors', 'doi',
       'has_full_text', 'journal', 'license', 'pmcid', 'publish_time',
       'pubmed_id', 'sha', 'source_x'],
      dtype='object')

In [95]:
covid_dataset = metadata[cols_to_use].\
    merge(corona_df_all_sources,left_on="sha",right_on="doc_id",how="left").\
    drop("doc_id",axis=1)

In [96]:
covid_dataset.to_csv("covid_full_processed.csv",index=False)