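# Summary

Build the master list of PMC files to download. Three file lists (NCBI manuscripts, NCBI OA packages, and EBI PMC supplementary archives) are reduced to a common set of columns, concatenated, de-duplicated on `pmc_id`, and written to `to_download/to_download.csv.gz`.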

# Imports

In [1]:
%run _imports.ipynb

Setting the PYTHON_VERSION environment variable.
Setting the SPARK_MASTER environment variable.
Setting the DB_TYPE environment variable.
Setting the DB_PORT environment variable.


2017-12-08 13:54:34.338850


In [2]:
# Name of this notebook; also used as the directory that receives its output file.
NOTEBOOK_NAME = 'to_download'
# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

# Parameters

In [3]:
# Common output schema: every per-source frame is reduced to exactly these
# columns, in this order, before the frames are concatenated.
COLUMNS = [
    'pmc_type',
    'pmc_id',
    'source_url',
    'archive_file_path',
    'output_file_path',
]
COLUMNS

['pmc_type', 'pmc_id', 'source_url', 'archive_file_path', 'output_file_path']

# Functions

In [4]:
import hashlib

In [5]:
def get_pmc_dir(pmc_id):
    """Return the two-level shard directory for a PMC identifier.

    The identifier is UTF-8 encoded and hashed with MD5; the first two pairs
    of hex digits of the digest become nested directory names.

    Parameters
    ----------
    pmc_id : str
        Identifier to shard, e.g. ``'PMC2136438'``.

    Returns
    -------
    str
        Relative directory of the form ``'ab/cd/'`` (trailing slash included).
    """
    hex_digest = hashlib.md5(pmc_id.encode('utf-8')).hexdigest()
    outer, inner = hex_digest[:2], hex_digest[2:4]
    return '/'.join((outer, inner)) + '/'

# Load file lists

## Manuscripts (NCBI)

#### `ncbi_manuscript_df`

In [6]:
# Read the NCBI PMC manuscript file list (one row per manuscript XML file)
# from the local downloads directory.
ncbi_manuscript_df = pd.read_csv('../downloads/ncbi/pmc/manuscript/filelist.csv')

In [7]:
# Preview the raw manuscript file list, then report how many rows it has.
display(ncbi_manuscript_df.head(5))
print(ncbi_manuscript_df.shape[0])

Unnamed: 0,File,PMCID,PMID,MID
0,PMC0021XXXXX/PMC2136438.xml,PMC2136438,19023455,NIHMS10936
1,PMC0021XXXXX/PMC2185066.xml,PMC2185066,18769527,NIHMS31656
2,PMC0022XXXXX/PMC2268633.xml,PMC2268633,17661176,NIHMS37031
3,PMC0022XXXXX/PMC2287164.xml,PMC2287164,18389087,NIHMS38164
4,PMC0022XXXXX/PMC2288569.xml,PMC2288569,18650957,NIHMS40219


457231


In [8]:
# Sanity check: every PMCID value is exactly 10 characters long (as in 'PMC2136438').
assert ncbi_manuscript_df['PMCID'].str.len().eq(10).all()

In [9]:
# Tag each row with its source collection and expose the PMCID under the
# common `pmc_id` column name.
ncbi_manuscript_df['pmc_type'] = 'ncbi_manuscript'
ncbi_manuscript_df['pmc_id'] = ncbi_manuscript_df['PMCID']

# The bulk archive URL is built from the first six characters of `File`
# (e.g. 'PMC002') followed by 'XXXXXX' and the '.xml.tar.gz' suffix.
ncbi_manuscript_df['source_url'] = (
    'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/'
    + ncbi_manuscript_df['File'].str.slice(stop=6)
    + 'XXXXXX'
    + '.xml.tar.gz'
)

# Path recorded for the member inside the archive.
ncbi_manuscript_df['archive_file_path'] = 'manuscript/' + ncbi_manuscript_df['File']

# Output location: hash-derived shard directory, then '<pmc_id>.zip'.
ncbi_manuscript_df['output_file_path'] = (
    ncbi_manuscript_df['pmc_id'].map(get_pmc_dir)
    + ncbi_manuscript_df['pmc_id']
    + '.zip'
)

In [10]:
# Keep only the shared output columns, drop rows that are now identical,
# then preview the result and report its row count.
ncbi_manuscript_df = ncbi_manuscript_df.loc[:, COLUMNS].drop_duplicates()
display(ncbi_manuscript_df.head(5))
print(ncbi_manuscript_df.shape[0])

Unnamed: 0,pmc_type,pmc_id,source_url,archive_file_path,output_file_path
0,ncbi_manuscript,PMC2136438,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/PMC002XXXXXX.xml.tar.gz,manuscript/PMC0021XXXXX/PMC2136438.xml,ea/c2/PMC2136438.zip
1,ncbi_manuscript,PMC2185066,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/PMC002XXXXXX.xml.tar.gz,manuscript/PMC0021XXXXX/PMC2185066.xml,7d/ed/PMC2185066.zip
2,ncbi_manuscript,PMC2268633,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/PMC002XXXXXX.xml.tar.gz,manuscript/PMC0022XXXXX/PMC2268633.xml,9b/74/PMC2268633.zip
3,ncbi_manuscript,PMC2287164,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/PMC002XXXXXX.xml.tar.gz,manuscript/PMC0022XXXXX/PMC2287164.xml,fe/dc/PMC2287164.zip
4,ncbi_manuscript,PMC2288569,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/PMC002XXXXXX.xml.tar.gz,manuscript/PMC0022XXXXX/PMC2288569.xml,76/32/PMC2288569.zip


457231


## OA Package (NCBI)

#### `ncbi_oa_package_df`

In [11]:
# Read the NCBI PMC OA file list from the local downloads directory and drop
# rows that are exact duplicates of one another.
ncbi_oa_package_df = pd.read_csv('../downloads/ncbi/pmc/oa_file_list.csv').drop_duplicates()

In [12]:
# Preview the raw OA file list, then report how many rows it has.
display(ncbi_oa_package_df.head(5))
print(ncbi_oa_package_df.shape[0])

Unnamed: 0,File,Article Citation,Accession ID,Last Updated (YYYY-MM-DD HH:MM:SS),PMID,License
0,oa_package/08/e0/PMC13900.tar.gz,Breast Cancer Res. 2001 Nov 2; 3(1):55-60,PMC13900,2017-04-26 12:15:50,11250746.0,NO-CC CODE
1,oa_package/b0/ac/PMC13901.tar.gz,Breast Cancer Res. 2001 Nov 9; 3(1):61-65,PMC13901,2016-01-20 10:58:46,11250747.0,NO-CC CODE
2,oa_package/f7/98/PMC13902.tar.gz,Breast Cancer Res. 2001 Nov 8; 3(1):66-75,PMC13902,2006-02-02 19:37:52,11250748.0,NO-CC CODE
3,oa_package/9c/7f/PMC13911.tar.gz,Breast Cancer Res. 2000 Nov 16; 2(1):59-63,PMC13911,2013-03-17 14:00:52,11056684.0,NO-CC CODE
4,oa_package/c6/fb/PMC13912.tar.gz,Breast Cancer Res. 2000 Dec 6; 2(1):64-72,PMC13912,2013-03-17 14:00:52,11400682.0,NO-CC CODE


1780586


In [13]:
# Keep only rows whose `File` path starts with 'oa_package' and that carry an
# accession ID.
#   * na=False treats a missing `File` as "no match" instead of producing a
#     NaN in the mask, which boolean indexing would reject.
#   * Dropping rows without an 'Accession ID' here is equivalent to dropping
#     rows without `pmc_id` later, because `pmc_id` is a straight copy of it.
#   * .copy() yields an independent frame, so the column assignments below do
#     not write into a slice of the original (SettingWithCopyWarning).
ncbi_oa_package_df = (
    ncbi_oa_package_df[
        ncbi_oa_package_df['File'].str.startswith('oa_package', na=False)
    ]
    .dropna(subset=['Accession ID'])
    .copy()
)

ncbi_oa_package_df['pmc_type'] = 'ncbi_oa_package'
ncbi_oa_package_df['pmc_id'] = ncbi_oa_package_df['Accession ID']
ncbi_oa_package_df['source_url'] = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/' + ncbi_oa_package_df['File']
# NOTE(review): the 'manuscript/' prefix is the same one used for the
# manuscript list and yields paths like 'manuscript/oa_package/08/e0/...'.
# It looks copy-pasted; left unchanged because the consumer of
# `archive_file_path` is not visible here. Confirm the intended value.
ncbi_oa_package_df['archive_file_path'] = 'manuscript/' + ncbi_oa_package_df['File']

# Output location: hash-derived shard directory, then '<pmc_id>.zip'.
ncbi_oa_package_df['output_file_path'] = (
    ncbi_oa_package_df['pmc_id'].apply(get_pmc_dir) +
    ncbi_oa_package_df['pmc_id'] +
    '.zip')

In [14]:
# Keep only the shared output columns, drop rows that are now identical,
# then preview the result and report its row count.
ncbi_oa_package_df = ncbi_oa_package_df.loc[:, COLUMNS].drop_duplicates()
display(ncbi_oa_package_df.head(5))
print(ncbi_oa_package_df.shape[0])

Unnamed: 0,pmc_type,pmc_id,source_url,archive_file_path,output_file_path
0,ncbi_oa_package,PMC13900,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/PMC13900.tar.gz,manuscript/oa_package/08/e0/PMC13900.tar.gz,b5/c0/PMC13900.zip
1,ncbi_oa_package,PMC13901,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b0/ac/PMC13901.tar.gz,manuscript/oa_package/b0/ac/PMC13901.tar.gz,39/28/PMC13901.zip
2,ncbi_oa_package,PMC13902,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f7/98/PMC13902.tar.gz,manuscript/oa_package/f7/98/PMC13902.tar.gz,04/bc/PMC13902.zip
3,ncbi_oa_package,PMC13911,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/9c/7f/PMC13911.tar.gz,manuscript/oa_package/9c/7f/PMC13911.tar.gz,69/8a/PMC13911.zip
4,ncbi_oa_package,PMC13912,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c6/fb/PMC13912.tar.gz,manuscript/oa_package/c6/fb/PMC13912.tar.gz,ed/e5/PMC13912.zip


1779122


## PMC Suppl (EBI)

#### `ebi_pmc_suppl_df`

In [15]:
# Load the stored collection of EBI PMC supplementary-file paths.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source. Presumably it is written by
# another notebook in this project; confirm before running on a shared copy.
# (The path is a plain string: the original f-prefix had no placeholders.)
with open('statistics/pmc_suppl_files.pickle', 'rb') as fin:
    ebi_pmc_suppl_files = pickle.load(fin)

In [16]:
# Wrap the unpickled paths in a frame with a single column named 'file'.
# NOTE(review): assumes `ebi_pmc_suppl_files` is a flat sequence of path
# strings; confirm against whatever writes the pickle.
ebi_pmc_suppl_df = pd.DataFrame(ebi_pmc_suppl_files, columns=['file'])

In [17]:
# Preview the raw EBI supplementary file list, then report how many rows it has.
display(ebi_pmc_suppl_df.head(5))
print(ebi_pmc_suppl_df.shape[0])

Unnamed: 0,file
0,/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102254.zip
1,/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102269.zip
2,/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102275.zip
3,/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102277.zip
4,/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102329.zip


1623854


In [18]:
# Leading part of every listed path; it is stripped to obtain the path
# relative to the EBI PMC root. Replaces the bare slice offset 19, which is
# this prefix's length.
EBI_PMC_ROOT = '/pub/databases/pmc/'

ebi_pmc_suppl_df['pmc_type'] = 'ebi_suppl'

# Validate the two assumptions the slicing below relies on. na=False makes a
# missing path fail the check instead of being silently skipped.
assert ebi_pmc_suppl_df['file'].str.endswith('.zip', na=False).all()
assert ebi_pmc_suppl_df['file'].str.startswith(EBI_PMC_ROOT, na=False).all()

# The PMC id is the file name without its '.zip' extension.
ebi_pmc_suppl_df['pmc_id'] = ebi_pmc_suppl_df['file'].apply(op.basename).str[:-len('.zip')]

ebi_pmc_suppl_df['source_url'] = (
    'ftp://ftp.ebi.ac.uk/pub/databases/pmc/' +
    ebi_pmc_suppl_df['file'].str[len(EBI_PMC_ROOT):])

ebi_pmc_suppl_df['archive_file_path'] = ebi_pmc_suppl_df['file'].str[len(EBI_PMC_ROOT):]

# Output location: hash-derived shard directory, then '<pmc_id>.zip'.
ebi_pmc_suppl_df['output_file_path'] = (
    ebi_pmc_suppl_df['pmc_id'].apply(get_pmc_dir) +
    ebi_pmc_suppl_df['pmc_id'] +
    '.zip')

In [19]:
# Keep only the shared output columns, drop rows that are now identical,
# then preview the result and report its row count.
ebi_pmc_suppl_df = ebi_pmc_suppl_df.loc[:, COLUMNS].drop_duplicates()
display(ebi_pmc_suppl_df.head(5))
print(ebi_pmc_suppl_df.shape[0])

Unnamed: 0,pmc_type,pmc_id,source_url,archive_file_path,output_file_path
0,ebi_suppl,PMC102254,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102254.zip,suppl/NON-OA/PMC101900-PMC105899/PMC102254.zip,d9/4e/PMC102254.zip
1,ebi_suppl,PMC102269,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102269.zip,suppl/NON-OA/PMC101900-PMC105899/PMC102269.zip,4b/49/PMC102269.zip
2,ebi_suppl,PMC102275,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102275.zip,suppl/NON-OA/PMC101900-PMC105899/PMC102275.zip,83/5a/PMC102275.zip
3,ebi_suppl,PMC102277,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102277.zip,suppl/NON-OA/PMC101900-PMC105899/PMC102277.zip,fb/e9/PMC102277.zip
4,ebi_suppl,PMC102329,ftp://ftp.ebi.ac.uk/pub/databases/pmc/suppl/NON-OA/PMC101900-PMC105899/PMC102329.zip,suppl/NON-OA/PMC101900-PMC105899/PMC102329.zip,ac/01/PMC102329.zip


1623854


# To download

In [20]:
# Row count of the manuscript list going into the combined frame.
len(ncbi_manuscript_df.index)

457231

In [21]:
# Row count of the OA package list going into the combined frame.
len(ncbi_oa_package_df.index)

1779122

In [22]:
# Row count of the EBI supplementary list going into the combined frame.
len(ebi_pmc_suppl_df.index)

1623854

In [23]:
# Stack the three per-source lists row-wise into one frame with a fresh
# 0..n-1 index. OA packages come first, then manuscripts, then EBI
# supplementary archives.
to_download_df = pd.concat(
    objs=[ncbi_oa_package_df, ncbi_manuscript_df, ebi_pmc_suppl_df],
    axis=0,
    ignore_index=True,
)

In [24]:
# Preview the combined list, then report its total row count.
display(to_download_df.head(5))
print(to_download_df.shape[0])

Unnamed: 0,pmc_type,pmc_id,source_url,archive_file_path,output_file_path
0,ncbi_oa_package,PMC13900,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/e0/PMC13900.tar.gz,manuscript/oa_package/08/e0/PMC13900.tar.gz,b5/c0/PMC13900.zip
1,ncbi_oa_package,PMC13901,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/b0/ac/PMC13901.tar.gz,manuscript/oa_package/b0/ac/PMC13901.tar.gz,39/28/PMC13901.zip
2,ncbi_oa_package,PMC13902,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f7/98/PMC13902.tar.gz,manuscript/oa_package/f7/98/PMC13902.tar.gz,04/bc/PMC13902.zip
3,ncbi_oa_package,PMC13911,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/9c/7f/PMC13911.tar.gz,manuscript/oa_package/9c/7f/PMC13911.tar.gz,69/8a/PMC13911.zip
4,ncbi_oa_package,PMC13912,ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/c6/fb/PMC13912.tar.gz,manuscript/oa_package/c6/fb/PMC13912.tar.gz,ed/e5/PMC13912.zip


3860207


In [25]:
# Keep one row per PMC id. The first occurrence wins, so the row order of
# `to_download_df` decides which source is kept when an id appears more than once.
to_download_df = to_download_df.drop_duplicates(subset='pmc_id', keep='first')

In [27]:
# Row count after de-duplicating on `pmc_id`.
print(to_download_df.shape[0])

2396266


In [29]:
# Persist the final download list as a gzip-compressed CSV inside this
# notebook's output directory; the frame index is not written.
to_download_df.to_csv(
    NOTEBOOK_NAME + '/to_download.csv.gz',
    index=False,
    compression='gzip',
)