# Helper to download histone ChIP-seq data from ENCODE 

This notebook loads the metadata associated with the files.txt obtained from the ENCODE cart, filters it down to the files of interest, creates a metadata summary, and writes the download URLs to one text file per histone mark so that the files can be downloaded from the terminal.

In [1]:
import numpy as np
import pandas as pd
import urllib.request

## Human ChIP-Seq

In [2]:
# Organism sub-directory name used when building the metadata and data paths.
organism = 'human'

### Tissue

In [3]:
# Biosample-type sub-directory name used when building the metadata and data paths.
data_type = 'tissue'

#### Metadata summary

In [4]:
# Load the tab-separated file-level metadata table for the current
# organism / data type; its first column becomes the row index.
metadata_path = '/'.join(['histone_metadata', organism, data_type, 'metadata.tsv'])
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0, low_memory=False)

In [13]:
#metadata = metadata[metadata['Audit ERROR'].isnull()]

In [5]:
# Keep only the rows whose file type is bigWig.
is_bigwig = metadata['File type'] == 'bigWig'
metadata = metadata[is_bigwig]

In [6]:
#metadata = metadata[metadata['Output type'] == 'fold change over control']

In [7]:
# Keep only the files mapped to the GRCh38 assembly.
is_grch38 = metadata['File assembly'] == 'GRCh38'
metadata = metadata[is_grch38]

In [8]:
# Keep only the files whose analysis status is 'released'.
is_released = metadata['File analysis status'] == 'released'
metadata = metadata[is_released]

In [9]:
# Keep only the rows with no entry in 'Biosample treatments'.
has_no_treatment = metadata['Biosample treatments'].isna()
metadata = metadata[has_no_treatment]

In [11]:
def _replicate_numbers(field):
    """Turn a comma-separated replicate field into a list of ints.

    Every comma-separated entry is parsed (the previous code used
    ``split(',', 1)`` and so kept at most two entries), and the whole number
    after the last underscore is used rather than only its final character
    (so replicate 10 is no longer read as 0).

    NOTE(review): assumes entries look like '1' (biological) or '1_2'
    (technical, replicate number after the underscore) -- confirm against
    the ENCODE metadata format.
    """
    return [int(entry.strip().rsplit('_', 1)[-1]) for entry in str(field).split(',')]


# Replace the raw replicate strings with lists of replicate numbers.
metadata['Technical replicate(s)'] = metadata['Technical replicate(s)'].apply(_replicate_numbers)
metadata['Biological replicate(s)'] = metadata['Biological replicate(s)'].apply(_replicate_numbers)

In [12]:
# Columns carried over from the file metadata into the summary table.
summary_columns = [
    'Biosample term name',
    'Experiment accession',
    'Experiment target',
    'Biological replicate(s)',
    'Technical replicate(s)',
    'Audit WARNING',
    'Audit NOT_COMPLIANT',
    'Audit ERROR',
]
metadata_summary = metadata[summary_columns]

In [13]:
# Load the tab-separated experiment report. The column names are taken from
# the second line of the file and the second column becomes the row index
# (presumably the experiment accession, since later cells look rows up by
# 'Experiment accession' -- confirm against the report file).
report_path = '/'.join(['histone_metadata', organism, data_type, 'report.tsv'])
report = pd.read_csv(report_path, sep='\t', header=1, index_col=1, low_memory=False)

In [14]:
normal_gestational_week = 40


def _leading_number(age_text):
    """Return the number that starts an age string such as '12 weeks'.

    A range such as '2-4 days' cannot be parsed by ``float`` directly; it is
    replaced by the midpoint of its two bounds (3.0 for '2-4').
    """
    token = age_text.split(' ', 1)[0]
    try:
        return float(token)
    except ValueError:
        low, dash, high = token.partition('-')
        if not (dash and low and high):
            raise
        return (float(low) + float(high)) / 2


def _age_in_years(raw_age):
    """Convert one 'Biosample age' entry into an age in years.

    * Numbers (Python or NumPy scalars, including NaN) and '... years'
      strings without 'above' are taken as years; positive whole numbers are
      shifted by +0.5 (presumably to place the sample mid-year).
    * '... weeks' and '... days' strings are offset by
      ``normal_gestational_week`` before conversion to years, so anything
      younger than that many weeks comes out negative (presumably these are
      gestational ages -- confirm).
    * Anything else becomes NaN.
    """
    # isinstance (rather than type(...) ==) also accepts NumPy scalars, and
    # float() guarantees .is_integer() exists on every supported Python.
    if isinstance(raw_age, (int, float, np.integer, np.floating)):
        age = float(raw_age)
        return age + 0.5 if age > 0 and age.is_integer() else age
    if 'years' in raw_age and 'above' not in raw_age:
        age = _leading_number(raw_age)
        return age + 0.5 if age > 0 and age.is_integer() else age
    if 'weeks' in raw_age:
        return (_leading_number(raw_age) - normal_gestational_week) * 7 / 365
    if 'days' in raw_age:
        return (_leading_number(raw_age) - normal_gestational_week * 7) / 365
    return np.nan


# One-column frame ('age'), indexed like `report`.
ages = report['Biosample age'].map(_age_in_years).astype(float).to_frame('age')

In [15]:
def _sex_from_summary(summary):
    """Return 'F', 'M' or NaN depending on the text of a biosample summary.

    'female' is tested first because 'male' is a substring of it.
    Non-string entries (e.g. a missing summary read as NaN) yield NaN
    instead of raising TypeError.
    """
    if not isinstance(summary, str):
        return np.nan
    if 'female' in summary:
        return 'F'
    if 'male' in summary:
        return 'M'
    return np.nan


# One-column frame ('gender'), indexed like `report`. Building it in one pass
# avoids writing strings row by row into an uninitialised float frame.
gender = report['Biosample summary'].map(_sex_from_summary).to_frame('gender')

In [16]:
# Look up the per-experiment values once for every file row, in the row order
# of metadata_summary ...
experiment_of_file = metadata_summary['Experiment accession']
ages = ages.loc[experiment_of_file]
gender = gender.loc[experiment_of_file]
description = report.loc[experiment_of_file, 'Description']
biosample = report.loc[experiment_of_file, 'Biosample accession']

# ... then relabel them with the file-level index so they can be joined
# column-wise with metadata_summary.
for per_file_values in (ages, gender, description, biosample):
    per_file_values.index = metadata_summary.index

In [17]:
# Join age, gender, the selected metadata columns, biosample accession and
# description side by side, in that column order.
summary_parts = [ages, gender, metadata_summary, biosample, description]
metadata_summary = pd.concat(summary_parts, axis='columns')

In [18]:
# Discard the rows for which no age could be determined.
has_age = metadata_summary['age'].notna()
metadata_summary = metadata_summary[has_age]

In [19]:
# Persist the summary table next to the raw metadata files.
summary_path = '/'.join(['histone_metadata', organism, data_type, 'metadata_summary.pkl'])
metadata_summary.to_pickle(summary_path)

#### Download files

In [20]:
# Restrict the file metadata to the rows that survived in the summary table.
kept_files = metadata_summary.index
metadata = metadata.loc[kept_files]

In [21]:
import os

# Write one text file of download URLs per experiment target, so the files can
# be fetched from the terminal.
for histone_mark in np.unique(metadata['Experiment target']):

    metadata_mark = metadata[metadata['Experiment target'] == histone_mark]

    # Drop the last six characters of the target label (presumably the
    # '-human' organism suffix -- confirm before using another organism).
    mark_name = histone_mark[:-6]
    raw_data_dir = 'histone_data/' + organism + '/' + data_type + '/' + mark_name + '/raw_data'

    # Create the target directory if needed instead of failing in open().
    os.makedirs(raw_data_dir, exist_ok=True)

    # The context manager closes the file even if a write fails.
    with open(raw_data_dir + '/files_' + mark_name + '.txt', 'w') as url_file:
        url_file.writelines(url + '\n' for url in metadata_mark['File download URL'])

### Primary Cell

In [22]:
# Switch the biosample-type sub-directory used when building the paths.
data_type = 'primary_cell'

#### Metadata summary

In [23]:
# Load the tab-separated file-level metadata table for the current
# organism / data type; its first column becomes the row index.
metadata_path = '/'.join(['histone_metadata', organism, data_type, 'metadata.tsv'])
metadata = pd.read_csv(metadata_path, sep='\t', index_col=0, low_memory=False)

In [24]:
#metadata = metadata[metadata['Audit ERROR'].isnull()]

In [25]:
# Keep only the rows whose file type is bigWig.
is_bigwig = metadata['File type'] == 'bigWig'
metadata = metadata[is_bigwig]

In [26]:
#metadata = metadata[metadata['Output type'] == 'fold change over control']

In [27]:
# Keep only the files mapped to the GRCh38 assembly.
is_grch38 = metadata['File assembly'] == 'GRCh38'
metadata = metadata[is_grch38]

In [28]:
# Keep only the files whose analysis status is 'released'.
is_released = metadata['File analysis status'] == 'released'
metadata = metadata[is_released]

In [29]:
# Keep only the rows with no entry in 'Biosample treatments'.
has_no_treatment = metadata['Biosample treatments'].isna()
metadata = metadata[has_no_treatment]

In [30]:
def _replicate_numbers(field):
    """Turn a comma-separated replicate field into a list of ints.

    Every comma-separated entry is parsed (the previous code used
    ``split(',', 1)`` and so kept at most two entries), and the whole number
    after the last underscore is used rather than only its final character
    (so replicate 10 is no longer read as 0).

    NOTE(review): assumes entries look like '1' (biological) or '1_2'
    (technical, replicate number after the underscore) -- confirm against
    the ENCODE metadata format.
    """
    return [int(entry.strip().rsplit('_', 1)[-1]) for entry in str(field).split(',')]


# Replace the raw replicate strings with lists of replicate numbers.
metadata['Technical replicate(s)'] = metadata['Technical replicate(s)'].apply(_replicate_numbers)
metadata['Biological replicate(s)'] = metadata['Biological replicate(s)'].apply(_replicate_numbers)

In [31]:
# Columns carried over from the file metadata into the summary table.
summary_columns = [
    'Biosample term name',
    'Experiment accession',
    'Experiment target',
    'Biological replicate(s)',
    'Technical replicate(s)',
    'Audit WARNING',
    'Audit NOT_COMPLIANT',
    'Audit ERROR',
]
metadata_summary = metadata[summary_columns]

In [32]:
# Load the tab-separated experiment report. The column names are taken from
# the second line of the file and the second column becomes the row index
# (presumably the experiment accession, since later cells look rows up by
# 'Experiment accession' -- confirm against the report file).
report_path = '/'.join(['histone_metadata', organism, data_type, 'report.tsv'])
report = pd.read_csv(report_path, sep='\t', header=1, index_col=1, low_memory=False)

In [33]:
normal_gestational_week = 40


def _leading_number(age_text):
    """Return the number that starts an age string such as '12 weeks'.

    A range such as '2-4 days' cannot be parsed by ``float`` directly; it is
    replaced by the midpoint of its two bounds, so '2-4 days' is still coded
    as 3 days, and other ranges no longer raise ValueError.
    """
    token = age_text.split(' ', 1)[0]
    try:
        return float(token)
    except ValueError:
        low, dash, high = token.partition('-')
        if not (dash and low and high):
            raise
        return (float(low) + float(high)) / 2


def _age_in_years(raw_age):
    """Convert one 'Biosample age' entry into an age in years.

    * Numbers (Python or NumPy scalars, including NaN) and '... years'
      strings without 'above' are taken as years; positive whole numbers are
      shifted by +0.5 (presumably to place the sample mid-year).
    * '... weeks' and '... days' strings are offset by
      ``normal_gestational_week`` before conversion to years, so anything
      younger than that many weeks comes out negative (presumably these are
      gestational ages -- confirm).
    * Anything else becomes NaN.
    """
    # isinstance (rather than type(...) ==) also accepts NumPy scalars, and
    # float() guarantees .is_integer() exists on every supported Python.
    if isinstance(raw_age, (int, float, np.integer, np.floating)):
        age = float(raw_age)
        return age + 0.5 if age > 0 and age.is_integer() else age
    if 'years' in raw_age and 'above' not in raw_age:
        age = _leading_number(raw_age)
        return age + 0.5 if age > 0 and age.is_integer() else age
    if 'weeks' in raw_age:
        return (_leading_number(raw_age) - normal_gestational_week) * 7 / 365
    if 'days' in raw_age:
        return (_leading_number(raw_age) - normal_gestational_week * 7) / 365
    return np.nan


# One-column frame ('age'), indexed like `report`.
ages = report['Biosample age'].map(_age_in_years).astype(float).to_frame('age')

In [34]:
def _sex_from_summary(summary):
    """Return 'F', 'M' or NaN depending on the text of a biosample summary.

    'female' is tested first because 'male' is a substring of it.
    Non-string entries (e.g. a missing summary read as NaN) yield NaN
    instead of raising TypeError.
    """
    if not isinstance(summary, str):
        return np.nan
    if 'female' in summary:
        return 'F'
    if 'male' in summary:
        return 'M'
    return np.nan


# One-column frame ('gender'), indexed like `report`. Building it in one pass
# avoids writing strings row by row into an uninitialised float frame.
gender = report['Biosample summary'].map(_sex_from_summary).to_frame('gender')

In [35]:
# Look up the per-experiment values once for every file row, in the row order
# of metadata_summary ...
experiment_of_file = metadata_summary['Experiment accession']
ages = ages.loc[experiment_of_file]
gender = gender.loc[experiment_of_file]
description = report.loc[experiment_of_file, 'Description']
biosample = report.loc[experiment_of_file, 'Biosample accession']

# ... then relabel them with the file-level index so they can be joined
# column-wise with metadata_summary.
for per_file_values in (ages, gender, description, biosample):
    per_file_values.index = metadata_summary.index

In [36]:
# Join age, gender, the selected metadata columns, biosample accession and
# description side by side, in that column order.
summary_parts = [ages, gender, metadata_summary, biosample, description]
metadata_summary = pd.concat(summary_parts, axis='columns')

In [37]:
# Discard the rows for which no age could be determined.
has_age = metadata_summary['age'].notna()
metadata_summary = metadata_summary[has_age]

In [38]:
# Persist the summary table next to the raw metadata files.
summary_path = '/'.join(['histone_metadata', organism, data_type, 'metadata_summary.pkl'])
metadata_summary.to_pickle(summary_path)

#### Download files

In [39]:
# Restrict the file metadata to the rows that survived in the summary table.
kept_files = metadata_summary.index
metadata = metadata.loc[kept_files]

In [40]:
import os

# Write one text file of download URLs per experiment target, so the files can
# be fetched from the terminal.
for histone_mark in np.unique(metadata['Experiment target']):

    metadata_mark = metadata[metadata['Experiment target'] == histone_mark]

    # Drop the last six characters of the target label (presumably the
    # '-human' organism suffix -- confirm before using another organism).
    mark_name = histone_mark[:-6]
    raw_data_dir = 'histone_data/' + organism + '/' + data_type + '/' + mark_name + '/raw_data'

    # Create the target directory if needed instead of failing in open().
    os.makedirs(raw_data_dir, exist_ok=True)

    # The context manager closes the file even if a write fails.
    with open(raw_data_dir + '/files_' + mark_name + '.txt', 'w') as url_file:
        url_file.writelines(url + '\n' for url in metadata_mark['File download URL'])