In [1]:
import os
import requests
import pandas as pd

In [44]:
projects = requests.get("https://dcc.icgc.org/api/v1/projects?size=1000").json()["hits"]
print "Found {} total ICGC projects".format(len(projects))

projects_with_ssm = [p for p in projects if "availableDataTypes" in p and "ssm" in p["availableDataTypes"]]
print "Found {} with ssm".format(len(projects_with_ssm))

data_types = ["donor", "simple_somatic_mutation.open"]
files = [{"name": "{1}.{0}.tsv.gz".format(p["id"], d),
          "url": "https://dcc.icgc.org/api/v1/download?fn=/release_22/Projects/{0}/{1}.{0}.tsv.gz".format(p["id"], d)}
         for p in projects_with_ssm for d in data_types]
files_to_download = [f for f in files if not os.path.isfile("/data/icgc/{}".format(f["name"]))]
print "Files to download", len(files_to_download)

Found 70 total ICGC projects
Found 60 with ssm
Files to download 0


In [45]:
for f in files_to_download:
    print("Downloading {}".format(f["name"]))
    r = requests.get(f["url"], allow_redirects=True, verify=False)
    if r.status_code == requests.codes.ok:
        with open("/data/icgc/{}".format(f["name"]), 'wb+') as tar:
            tar.write(r.content)
    else:
        print("Problems downloading {}: {}".format(f["name"], r.status_code))

In [5]:
donors = pd.read_table("/data/icgc/donor.ALL-US.tsv.gz")
donors.head()

Unnamed: 0,icgc_donor_id,project_code,study_donor_involved_in,submitted_donor_id,donor_sex,donor_vital_status,disease_status_last_followup,donor_relapse_type,donor_age_at_diagnosis,donor_age_at_enrollment,...,donor_relapse_interval,donor_diagnosis_icd10,donor_tumour_staging_system_at_diagnosis,donor_tumour_stage_at_diagnosis,donor_tumour_stage_at_diagnosis_supplemental,donor_survival_time,donor_interval_of_last_followup,prior_malignancy,cancer_type_prior_malignancy,cancer_history_first_degree_relative
0,DO2,ALL-US,,TARGET-10-PAIXFI,male,,,,5,5,...,2865,C91.0,,,,2865,2920,,,
1,DO1,ALL-US,,TARGET-10-PAIXDK,male,deceased,relapse,progression (liquid tumours),7,7,...,1169,C91.0,,,,2037,2037,,,
2,DO4,ALL-US,,TARGET-10-PAIXGP,female,alive,,,2,2,...,5087,C91.0,,,,5087,5087,,,
3,DO3,ALL-US,,TARGET-10-PAIXFN,male,deceased,relapse,progression (liquid tumours),15,15,...,650,C91.0,,,,1068,1068,,,
4,DO6,ALL-US,,TARGET-10-PAIXPN,male,alive,,,1,1,...,4946,C91.0,,,,4946,4946,,,


In [25]:
donors.columns

Index([u'icgc_donor_id', u'project_code', u'study_donor_involved_in',
       u'submitted_donor_id', u'donor_sex', u'donor_vital_status',
       u'disease_status_last_followup', u'donor_relapse_type',
       u'donor_age_at_diagnosis', u'donor_age_at_enrollment',
       u'donor_age_at_last_followup', u'donor_relapse_interval',
       u'donor_diagnosis_icd10', u'donor_tumour_staging_system_at_diagnosis',
       u'donor_tumour_stage_at_diagnosis',
       u'donor_tumour_stage_at_diagnosis_supplemental', u'donor_survival_time',
       u'donor_interval_of_last_followup', u'prior_malignancy',
       u'cancer_type_prior_malignancy',
       u'cancer_history_first_degree_relative'],
      dtype='object')

In [50]:
donors[donors["icgc_donor_id"] == "DO162"]

Unnamed: 0,icgc_donor_id,project_code,study_donor_involved_in,submitted_donor_id,donor_sex,donor_vital_status,disease_status_last_followup,donor_relapse_type,donor_age_at_diagnosis,donor_age_at_enrollment,...,donor_relapse_interval,donor_diagnosis_icd10,donor_tumour_staging_system_at_diagnosis,donor_tumour_stage_at_diagnosis,donor_tumour_stage_at_diagnosis_supplemental,donor_survival_time,donor_interval_of_last_followup,prior_malignancy,cancer_type_prior_malignancy,cancer_history_first_degree_relative
491,DO162,ALL-US,,TARGET-10-PALETF,female,deceased,relapse,progression (liquid tumours),8,8,...,650,C91.0,,,,768,768,,,


In [7]:
variants = pd.read_table("/data/icgc/simple_somatic_mutation.open.ALL-US.tsv.gz")
variants.head()

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,...,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
0,MU4819276,DO5,ALL-US,SP873,SA565,SA167,TARGET-10-PAIXPH-03A-01D,TARGET-10-PAIXPH-10A-01D,12,25271550,...,Paired End,WXS,CASAVA,BWA,SNVMix2,,,CGHub,27e629e7-b3c8-4d3b-8ddc-0ff89433e96d,
1,MU4819276,DO5,ALL-US,SP873,SA565,SA167,TARGET-10-PAIXPH-03A-01D,TARGET-10-PAIXPH-10A-01D,12,25271550,...,Paired End,WXS,CASAVA,BWA,SNVMix2,,,CGHub,27e629e7-b3c8-4d3b-8ddc-0ff89433e96d,
2,MU4819276,DO5,ALL-US,SP873,SA565,SA167,TARGET-10-PAIXPH-03A-01D,TARGET-10-PAIXPH-10A-01D,12,25271550,...,Paired End,WXS,CASAVA,BWA,SNVMix2,,,CGHub,27e629e7-b3c8-4d3b-8ddc-0ff89433e96d,
3,MU4819276,DO5,ALL-US,SP873,SA565,SA167,TARGET-10-PAIXPH-03A-01D,TARGET-10-PAIXPH-10A-01D,12,25271550,...,Paired End,WXS,CASAVA,BWA,SNVMix2,,,CGHub,27e629e7-b3c8-4d3b-8ddc-0ff89433e96d,
4,MU4819276,DO5,ALL-US,SP873,SA565,SA167,TARGET-10-PAIXPH-03A-01D,TARGET-10-PAIXPH-10A-01D,12,25271550,...,Paired End,WXS,CASAVA,BWA,SNVMix2,,,CGHub,27e629e7-b3c8-4d3b-8ddc-0ff89433e96d,


In [21]:
variants.columns

Index([u'icgc_mutation_id', u'icgc_donor_id', u'project_code',
       u'icgc_specimen_id', u'icgc_sample_id', u'matched_icgc_sample_id',
       u'submitted_sample_id', u'submitted_matched_sample_id', u'chromosome',
       u'chromosome_start', u'chromosome_end', u'chromosome_strand',
       u'assembly_version', u'mutation_type', u'reference_genome_allele',
       u'mutated_from_allele', u'mutated_to_allele', u'quality_score',
       u'probability', u'total_read_count', u'mutant_allele_read_count',
       u'verification_status', u'verification_platform',
       u'biological_validation_status', u'biological_validation_platform',
       u'consequence_type', u'aa_mutation', u'cds_mutation', u'gene_affected',
       u'transcript_affected', u'gene_build_version', u'platform',
       u'experimental_protocol', u'sequencing_strategy',
       u'base_calling_algorithm', u'alignment_algorithm',
       u'variation_calling_algorithm', u'other_analysis_algorithm',
       u'seq_coverage', u'raw_data_re

In [14]:
submissions = variants.groupby(["icgc_donor_id", "icgc_sample_id"])

In [54]:
for donor_sample, variants in submissions:
    print donor_sample
    print len(variants.head())
    print donors[donors["icgc_donor_id"] == donor_sample[0]][["donor_sex", "donor_age_at_diagnosis"]]

('DO162', 'SA172')
1
    donor_sex  donor_age_at_diagnosis
491    female                       8
('DO162', 'SA596')
5
    donor_sex  donor_age_at_diagnosis
491    female                       8
('DO200', 'SA577')
4
    donor_sex  donor_age_at_diagnosis
576      male                       3
('DO208', 'SA579')
5
    donor_sex  donor_age_at_diagnosis
579      male                      13
('DO210', 'SA622')
3
    donor_sex  donor_age_at_diagnosis
571    female                       6
('DO22', 'SA30')
5
   donor_sex  donor_age_at_diagnosis
64      male                      14
('DO228', 'SA629')
3
    donor_sex  donor_age_at_diagnosis
570      male                      14
('DO282', 'SA581')
1
    donor_sex  donor_age_at_diagnosis
536      male                       2
('DO376', 'SA133')
3
    donor_sex  donor_age_at_diagnosis
619    female                      13
('DO378', 'SA135')
5
    donor_sex  donor_age_at_diagnosis
621      male                       8
('DO378', 'SA394')
4
    donor_sex

In [89]:
# Submissions dict keyed by icgc_sample_id
submissions = {id: {} for id in expression.analyzed_sample_id.append(variants.analyzed_sample_id).unique()}
print "Found {} samples with either or both expression and variant data".format(len(submissions))
    
# for sample_id in expression.icgc_sample_id.unique():
#     submissions[sample_id]["expression"] = expression[expression["icgc_sample_id"] == sample_id][
#         ["gene_id", "normalized_expression_value"]]
#     submissions[sample_id]["icgc_donor_id"] = expression[expression["icgc_sample_id"] == sample_id].iloc[0]

# for sample_id in variants.icgc_sample_id.unique():
#     submissions[sample_id]["variants"] = variants[variants["icgc_sample_id"] == sample_id][
#         ["chromosome", "chromosome_start", "mutated_from_allele", "mutated_to_allele"]]
#     submissions[sample_id]["icgc_donor_id"] = \
#         variants[variants["icgc_sample_id"] == sample_id]["icgc_donor_id"].get_value
# submissions["SA565"]


AttributeError: 'DataFrame' object has no attribute 'analyzed_sample_id'