# Create ENCODE import JSON

Create a JSON file to import ENCODE data into the web application using the command:

    python manage.py load_encode /path/to/encode_json.json
    
    

In [None]:
from django.conf import settings

from collections import defaultdict
import json
import os
import pandas as pd
import numpy as np

In [None]:
# Load the "Metadata" sheet of the cleaned ENCODE spreadsheet into a DataFrame.
fn = os.path.abspath('./data/cleaned_encode_list.xlsx')
assert os.path.exists(fn)
# The sheet is passed positionally on purpose: the keyword was spelled
# `sheetname` in old pandas releases and `sheet_name` in current ones (the old
# spelling has been removed), while the positional form works with both.
df = pd.read_excel(fn, "Metadata")

In [None]:
# Absolute path of the md5 listing file; stop immediately if it does not exist.
md5_fn = os.path.abspath('data/md5list.txt')
assert os.path.exists(md5_fn)

## Create list with local file paths

We have a list of bigWig files and md5 values for all files. We now need to match these files to the entries in the Excel crosswalk:

In [None]:
# Root directory stripped from every path so stored paths are relative to it.
encode_root = "/apps/encodeTracks/"


def getFileLocationDict(fn):
    """Build a crosswalk of file name -> {md5: relative path} from an md5 list.

    Each non-blank line of the file at `fn` is expected to hold an md5 value
    followed by a file path, separated by whitespace.

    Args:
        fn: path of the md5 listing file to read.

    Returns:
        A ``defaultdict(dict)`` keyed by the file's base name; each value maps
        an md5 string to the file path with `encode_root` removed.
    """
    cw = defaultdict(dict)

    with open(fn, 'r') as f:
        for ln in f:
            if not ln.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            # split only once so that a path containing spaces stays intact
            parts = ln.split(None, 1)
            if len(parts) != 2:
                print('Skipping malformed line: {}'.format(ln.strip()))
                continue

            md5 = parts[0]
            file_path = parts[1].strip()
            name = os.path.basename(file_path)
            cw[name][md5] = file_path.replace(encode_root, '')  # remove root

    return cw

# Module-level crosswalk (file name -> {md5: relative path}) read by getMatchingPath.
cw = getFileLocationDict(md5_fn)

In [None]:
def getMatchingPath(name, md5=None):
    """Return the crosswalk path for a bigWig file name, or None if unresolved.

    Resolution order:
      1. If `name` has no entry in the module-level ``cw`` crosswalk, report
         it and return None.
      2. If `md5` matches one of the entries recorded for `name`, return it.
      3. If no md5 was supplied and exactly one file carries this name,
         return that file's path.
    Anything else is reported on stdout and None is returned.

    Args:
        name: base file name to look up.
        md5: md5 value expected for the file, or None when unknown.

    Returns:
        The stored path string, or None when no unambiguous match exists.
    """
    # Use .get() instead of cw[name]: cw is a defaultdict, so indexing would
    # silently insert an empty entry for every unknown name.
    files = cw.get(name)

    # first, see if we're missing a name in the crosswalk
    if not files:
        print('Missing name: {}'.format(name))
        return None

    # next, try to get using MD5
    if md5 in files:
        return files[md5]

    if md5:
        # an MD5 was supplied but none of the known files has it
        print('Unmatched MD5: {} - our MD5: {}, from db: {}'.format(
            name, md5, '|'.join(files.keys())
        ))

    # next, if no MD5 but only one name, use this value
    if md5 is None and len(files) == 1:
        return list(files.values())[0]

    print('Unmatched: {} {}'.format(name, md5))
    return None

In [None]:
def func(d, fld, md5fld):
    """Resolve the local bigWig path for one row.

    Args:
        d: row-like mapping (e.g. a DataFrame row) indexable by column name.
        fld: column holding the bigWig file name.
        md5fld: column holding the md5 value for that file.

    Returns:
        The path returned by ``getMatchingPath``, or None when the file name
        is missing or no path could be matched.
    """
    name = d[fld]
    md5 = d[md5fld]

    # pd.isnull() instead of `is np.NaN`: an identity test only recognises the
    # np.nan singleton (other NaN floats slip through), and the `np.NaN` alias
    # no longer exists in NumPy 2.0.
    if pd.isnull(md5):
        md5 = None
    if pd.isnull(name):
        return None

    path = getMatchingPath(name, md5)
    return path if path else None

# Add a `_<prefix>_bigwig_fn` column holding the resolved path for each of the
# plus / minus / ambig bigWig columns.
for prefix in ('plus', 'minus', 'ambig'):
    df['_{}_bigwig_fn'.format(prefix)] = df.apply(
        func,
        axis=1,
        args=('{}_bigwig'.format(prefix), '{}_md5sum'.format(prefix)),
    )

## Cleanup content in Excel file

In [None]:
# Replace missing values (NaN) in text columns with the empty string.
# Commented-out names are columns that are excluded from this step.
fields = [
    #'Name',
    'Description',
    #'plus_bigwig',
    #'minus_bigwig',
    #'ambig_bigwig',
    #'genome_assembly',
    #'dataType',
    'cell',
    'antibody',
    'rnaExtract',
    'phase',
    'treatment',
    'localization',
    'labExpId',
    'dccAccession',
    'controlId',
    'project',
    'ambig_md5sum',
    'ambig_view',
    #'subId',
    'protocol',
    #'replicate',
    'lab',
    #'type',
    'ambig_tableName',
    'geoSampleAccession',
    'setType',
    #'dateUnrestricted',
    #'dataVersion',
    'ambig_size',
    'composite',
    #'grant',
    #'dateSubmitted',
    'origAssembly',
    'labVersion',
    'control',
    #'dateResubmitted',
    'plus_md5sum',
    'readType',
    'plus_tableName',
    'plus_view',
    'minus_md5sum',
    'minus_tableName',
    'minus_size',
    'plus_size',
    'donorId',
    'bioRep',
    'minus_view',
    'seqPlatform',
    'spikeInPool',
    'sex',
    'mapAlgorithm',
    'platform',
    'submittedDataVersion',
    #'insertLength',
    #'expId',
    'labProtocolId',
    'uniqueness',
    'sourceObj',
    'softwareVersion',
    'age',
    'strain',
    'tissueSourceType',
]
for fld in fields:
    # columns absent from the spreadsheet are skipped rather than created
    if fld not in df.columns:
        continue
    # Assign the result back instead of fillna(inplace=True) on df[fld]: the
    # in-place call acts on an intermediate object and is not guaranteed to
    # update df (it does not under pandas Copy-on-Write).
    df[fld] = df[fld].fillna(value='')

In [None]:
# Convert date fields to ordinal
def func(d, fld):
    """Return the ordinal day number of the date stored in ``d[fld]``.

    Args:
        d: row-like mapping (e.g. a DataFrame row) indexable by column name.
        fld: name of the date column to convert.

    Returns:
        ``val.toordinal()`` for date-like values; None when the value is
        missing (None / NaN / NaT) or has no ``toordinal`` method, in which
        case the offending value is also printed.
    """
    val = d[fld]

    # pd.isnull() covers None, any NaN float and NaT. The previous identity
    # tests against np.NaN only caught the np.nan singleton, and the `np.NaN`
    # alias no longer exists in NumPy 2.0.
    if pd.isnull(val):
        return None

    try:
        return val.toordinal()
    except AttributeError:
        # invalid date
        print("Invalid date: {}".format(val))
        return None

# Overwrite each date column with the value func computes for it, row by row.
for date_fld in ('dateUnrestricted', 'dateSubmitted', 'dateResubmitted'):
    df[date_fld] = df.apply(func, axis=1, args=(date_fld,))

In [None]:
# For numeric fields, set NaN to None
fields = [
    'subId',
    'replicate',
    'dataVersion',
    'insertLength',
    'expId',
    'dateUnrestricted',
    'dateSubmitted',
    'dateResubmitted',
]
for fld in fields:
    # Cast to object dtype first: on a float column recent pandas versions
    # coerce `None` straight back to NaN, which would turn this step into a
    # no-op. An object column can hold None as-is.
    column = df[fld].astype(object)
    df[fld] = column.where(pd.notnull(column), other=None)

In [None]:
# Store dataVersion as text throughout (the column has some datetimes mixed in).
df['dataVersion'] = df['dataVersion'].astype(str)

### Create extra content field

In [None]:
# Columns that getExtraContent copies into each record's "extra_content"
# dictionary (only values that are present and truthy are kept).
extra_content_fields = [
    'labExpId',
    'dccAccession',
    'controlId',
    'project',
    'ambig_md5sum',
    'ambig_view',
    'subId',
    'protocol',
    'replicate',
    'lab',
    'type',
    'ambig_tableName',
    'geoSampleAccession',
    'setType',
    'dateUnrestricted',
    'dataVersion',
    'ambig_size',
    'composite',
    'grant',
    'dateSubmitted',
    'origAssembly',
    'labVersion',
    'control',
    'dateResubmitted',
    'plus_md5sum',
    'readType',
    'plus_tableName',
    'plus_view',
    'minus_md5sum',
    'minus_tableName',
    'minus_size',
    'plus_size',
    'donorId',
    'bioRep',
    'minus_view',
    'seqPlatform',
    'spikeInPool',
    'sex',
    'mapAlgorithm',
    'platform',
    'submittedDataVersion',
    'insertLength',
    'expId',
    'labProtocolId',
    'uniqueness',
    'sourceObj',
    'softwareVersion',
    'age',
    'strain',
    'tissueSourceType',
]

dtype_datetime = np.dtype('datetime64[ns]')


def getExtraContent(d):
    """Collect the non-empty extra metadata values of one row.

    Args:
        d: a row of ``df``, indexable by column name.

    Returns:
        A dict mapping each name in ``extra_content_fields`` to the row's
        value, omitting missing (None / NaN / NaT) and falsy values. Values
        from datetime columns are stored as ordinal day numbers.
    """
    content = {}
    for fld in extra_content_fields:
        val = d[fld]

        # Test for missing values before truthiness, and with pd.isnull()
        # rather than `is np.NaN` / `is pd.NaT`: identity only recognises the
        # singletons, and the `np.NaN` alias no longer exists in NumPy 2.0.
        if pd.isnull(val) or not val:
            continue

        # dtypes are compared by equality; `is` relies on object caching
        if df[fld].dtype == dtype_datetime:
            content[fld] = val.toordinal()
        else:
            content[fld] = val
    return content

# Create JSON file for import

In [None]:
# One import record per spreadsheet row, in row order.
dicts = []
for _, row in df.iterrows():
    record = {
        "name": row["Name"],
        "genome_assembly": row["genome_assembly"],
        "data_type": row["dataType"],
        "cell_type": row["cell"],
        "antibody": row["antibody"],
        "rna_extract": row["rnaExtract"],
        "treatment": row["treatment"],
        "phase": row["phase"],
        "localization": row["localization"],
        "extra_content": getExtraContent(row),
        "plus_bigwig": row["_plus_bigwig_fn"],
        "minus_bigwig": row["_minus_bigwig_fn"],
        "ambig_bigwig": row["_ambig_bigwig_fn"],
    }
    dicts.append(record)

In [None]:
# Serialise the records to the JSON file consumed by the load_encode command.
fn = os.path.abspath('./data/load_encode.json')
with open(fn, 'w') as f:
    json.dump(dicts, f, indent=4, separators=(',', ': '))