In [1]:
import os
import pandas as pd

from analysis import models

In [2]:
# Destructive: delete every existing EncodeDataset record before importing.
# NOTE(review): presumably done so that re-running the notebook does not
# create duplicate rows — confirm this is intended for the target database.
models.EncodeDataset.objects.all().delete()

In [3]:
# Resolve both metadata spreadsheets to absolute paths, then fail fast if
# either one is missing from the working directory's ./data folder.
hg19, mm9 = (
    os.path.abspath(relative)
    for relative in ('./data/hg19_metadata.xlsx', './data/mm9_metadata.xlsx')
)
assert os.path.exists(hg19)
assert os.path.exists(mm9)

In [4]:
def toName(d):
    """Return the ``fn`` attribute of ``d`` with its last seven characters removed.

    ``d`` is any object exposing ``fn`` as an attribute (e.g. a DataFrame row).
    NOTE(review): the fixed length of 7 presumably matches a file extension
    such as ``.bigWig`` — confirm against the metadata files.
    """
    suffix_length = 7
    filename = d.fn
    return filename[:-suffix_length]

def loadAndCleanDF(fn):
    """Read a metadata spreadsheet and prepare it for import.

    Prints the frame summary (``df.info()``), replaces missing values with
    empty strings in each descriptive field that is present, prints the sorted
    unique values of each of those fields, and adds a ``_name`` column built
    by applying ``toName`` to every row.

    Parameters
    ----------
    fn : str
        Path of the Excel file, passed to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, including the extra ``_name`` column. Read that
        column with ``row['_name']``; attribute access on a row Series
        collides with pandas' own internal ``_name`` attribute.
    """
    df = pd.read_excel(fn)
    df.info()

    fields = [
        'dataType',
        'cell',
        'antibody',
        'rnaExtract',
        'treatment',
        'phase',
        'localization',
    ]
    for fld in fields:
        # Not every spreadsheet carries every field; skip the absent ones.
        if fld not in df.columns:
            continue
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `df[fld]`: the in-place form acts on an intermediate object and
        # does not update `df` when pandas Copy-on-Write is active.
        df[fld] = df[fld].fillna('')
        print('\n{}\n-----'.format(fld))
        print('\n'.join(sorted(df[fld].unique())))

    df['_name'] = df.apply(toName, axis=1)

    return df

In [5]:
# Load and clean the hg19 spreadsheet; the call also prints the frame summary
# and the unique values of each descriptive field it finds.
dfHg = loadAndCleanDF(hg19)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4424 entries, 0 to 4423
Data columns (total 48 columns):
antibody                2209 non-null object
bioRep                  1212 non-null object
cell                    4414 non-null object
composite               4396 non-null object
control                 1241 non-null object
controlId               2350 non-null object
dataType                4424 non-null object
dataVersion             4424 non-null object
dateResubmitted         894 non-null object
dateSubmitted           4418 non-null object
dateUnrestricted        4348 non-null object
dccAccession            4417 non-null object
donorId                 297 non-null object
expId                   21 non-null float64
fn                      4424 non-null object
geoSampleAccession      4096 non-null object
grant                   4424 non-null object
insertLength            177 non-null object
lab                     4424 non-null object
labExpId                3147 non-null obje

In [6]:
def maxLengthChecks(df):
    """Print the length of the longest string value in each descriptive field.

    One line is printed per field, in the order dataType, cell, antibody,
    rnaExtract, treatment, phase, localization. ``phase`` is skipped when the
    frame has no such column; any other missing column raises ``KeyError``.
    A column that contains no string values reports a length of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the metadata columns to inspect.
    """
    def getMaxLength(df, fld):
        # default=0 keeps max() from raising ValueError when the column has
        # no string values at all (for example when it is entirely NaN).
        len_ = max(
            (len(x) for x in df[fld].unique() if isinstance(x, str)),
            default=0,
        )
        print("{} maximum length: {}".format(fld, len_))

    fields = (
        'dataType',
        'cell',
        'antibody',
        'rnaExtract',
        'treatment',
        'phase',
        'localization',
    )
    for fld in fields:
        # 'phase' is the only optional column.
        if fld == 'phase' and fld not in df.columns:
            continue
        getMaxLength(df, fld)

In [7]:
# Print the longest string length per descriptive field for the hg19 frame.
# NOTE(review): presumably used to confirm the values fit the model's field
# sizes — verify against analysis.models.
maxLengthChecks(dfHg)

dataType maximum length: 10
cell maximum length: 23
antibody maximum length: 22
rnaExtract maximum length: 12
treatment maximum length: 24
phase maximum length: 4
localization maximum length: 11


In [8]:
# Create one public EncodeDataset record per row of the hg19 metadata frame.
# Columns are read with d['...'] rather than attribute access: a pandas Series
# stores its own label in an internal `_name` attribute, so `d._name` yields
# the row's index label instead of the value of the '_name' column.
for i, d in dfHg.iterrows():
    object_ = models.EncodeDataset.objects.create(
        name=d['_name'],
        public=True,
        genome_assembly=models.HG19,
        data_type=d['dataType'],
        cell_type=d['cell'],
        antibody=d['antibody'],
        rna_extract=d['rnaExtract'],
        treatment=d['treatment'],
        phase=d['phase'],
        localization=d['localization'],
    )

In [9]:
# Load and clean the mm9 spreadsheet; the call also prints the frame summary
# and the unique values of each descriptive field it finds.
dfMm = loadAndCleanDF(mm9)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1123 entries, 0 to 1122
Data columns (total 46 columns):
age                     1118 non-null object
antibody                541 non-null object
bioRep                  120 non-null object
cell                    1118 non-null object
composite               1123 non-null object
control                 541 non-null object
controlId               541 non-null object
dataType                1123 non-null object
dataVersion             1123 non-null object
dateResubmitted         224 non-null object
dateSubmitted           1118 non-null object
dateUnrestricted        1054 non-null object
dccAccession            1118 non-null object
fn                      1123 non-null object
geoSampleAccession      1065 non-null object
grant                   1123 non-null object
insertLength            8 non-null float64
lab                     1123 non-null object
labExpId                376 non-null object
labVersion              689 non-null object
lo

In [10]:
# Print the longest string length per descriptive field for the mm9 frame.
# NOTE(review): presumably used to confirm the values fit the model's field
# sizes — verify against analysis.models.
maxLengthChecks(dfMm)

dataType maximum length: 10
cell maximum length: 26
antibody maximum length: 22
rnaExtract maximum length: 12
treatment maximum length: 16
localization maximum length: 4


In [11]:
# Create one public EncodeDataset record per row of the mm9 metadata frame.
# Columns are read with d['...'] rather than attribute access: a pandas Series
# stores its own label in an internal `_name` attribute, so `d._name` yields
# the row's index label instead of the value of the '_name' column.
# `phase` is stored as '' rather than read from the frame.
# NOTE(review): the recorded mm9 length check prints no phase line, so the
# column is apparently absent from this spreadsheet — confirm.
for i, d in dfMm.iterrows():
    object_ = models.EncodeDataset.objects.create(
        name=d['_name'],
        public=True,
        genome_assembly=models.MM9,
        data_type=d['dataType'],
        cell_type=d['cell'],
        antibody=d['antibody'],
        rna_extract=d['rnaExtract'],
        treatment=d['treatment'],
        phase='',
        localization=d['localization'],
    )