# Create a manifest file

The *manifest.txt* file created here is intended to be used with [gdc-client](https://gdc.cancer.gov/access-data/gdc-data-transfer-tool)

Please refer to [gdc API](https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/) for reference about query done here

In [23]:
import requests
import json
import pandas as pd
import os
import sys


PATH = os.getcwd()

In [24]:
# use file endopint
files_endpt = "https://api.gdc.cancer.gov/files"

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = [
    "file_name",
    #"analysis.workflow_type",
    #"experimental_strategy",
    "cases.project.primary_site",
    "cases.project.disease_type",
    "cases.project.project_id",
    "cases.submitter_id",
    "cases.samples.portions.analytes.aliquots.submitter_id",
    "cases.demographic.vital_status",
    "cases.demographic.days_to_birth",
    "cases.demographic.days_to_death",
    "cases.demographic.gender",
    "cases.diagnoses.primary_diagnosis",
    "cases.diagnoses.tumor_stage",
    "cases.diagnoses.tumor_grade",
    "cases.diagnoses.last_known_disease_status",
    "cases.diagnoses.progression_or_recurrence"
    "cases.diagnoses.days_to_last_follow_up",
    "cases.diagnoses.age_at_diagnosis",
    "cases.diagnoses.morphology",
    "cases.diagnoses.tissue_or_organ_of_origin",
    "cases.exposures.years_smoked",
    "cases.exposures.cigarettes_per_day",
    "cases.samples.longest_dimension",
    'cases.samples.sample_type',
    "samples.portions.analytes.aliquots.submitter_id",
    "samples.sample_type"
]

fields = ','.join(fields)

Filter gene expression data obtained with *RNA-Seq HTSeq* in txt format either from **BRCA** or **LUAD** and **LUSC**

Don't forget to select the correct filter for *cases.project.project_id*

In [25]:
# Tissue project filters
filters = {
    "op": "and",
    "content":[
        {
        "op": "in",
        "content":{
            "field": "files.data_type",
            "value": ["Gene Expression Quantification"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "files.experimental_strategy",
            "value": ["RNA-Seq"]
            }
        },
        {
        "op": "in",
        "content":{
               "field": "files.analysis.workflow_type",
                "value": ["HTSeq - FPKM"]
                }
        },
        {
        "op": "in",
        "content":{
            "field": "files.data_format",
            "value": ["TXT"]
            }
        },
        {
        "op": "in",
        "content":{
            "field": "cases.project.project_id",
            "value": ['TCGA-LUAD',] 
            }
        }
        
    ]
}

In [26]:
# A POST is used, so the filter parameters can be passed directly as a Dict object.
##remove manifest to have file fields
params = {
   "return_type": "manifest",
    "filters": filters,
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }

In [27]:
# The parameters are passed to 'json' rather than 'params' in this case
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)

#print(response.content.decode("utf-8"))

In [28]:
with open(f"{PATH}/manifest.txt","w") as manifest:
    manifest.write(response.content.decode("utf-8"))

# Files

Write a file with all metadata, useful in next analyses

In [29]:
params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "TSV",
    "size": "15000"
    }
response = requests.post(files_endpt, headers = {"Content-Type": "application/json"}, json = params)
with open(f"{PATH}/files.txt","w") as files:
    files.write(response.content.decode("utf-8"))

In [30]:
df_files = pd.read_csv(f"{PATH}/files.txt", sep='\t')
df_files.drop("id", axis=1, inplace=True)
df_files.set_index("file_name", inplace=True)
df_files.head(3)

Unnamed: 0_level_0,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.gender,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.last_known_disease_status,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.primary_diagnosis,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.exposures.0.cigarettes_per_day,cases.0.exposures.0.years_smoked,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.project.project_id,cases.0.samples.0.longest_dimension,cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id,cases.0.samples.0.sample_type,cases.0.submitter_id
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
12fff07f-74d6-4ee0-9f4f-36a4d194d24f.FPKM.txt.gz,-25069.0,161.0,male,Dead,25069.0,not reported,8140/3,"Adenocarcinoma, NOS","Upper lobe, lung",not reported,1.369863,15.0,Lung Adenocarcinoma,Lung,TCGA-LUAD,,TCGA-MP-A4T8-01A-11R-A24X-07,Primary Tumor,TCGA-MP-A4T8
05d2b1e9-22a4-4369-b622-63846d672875.FPKM.txt.gz,-27993.0,1790.0,female,Dead,27993.0,not reported,8140/3,"Adenocarcinoma, NOS","Lower lobe, lung",not reported,2.739726,,Lung Adenocarcinoma,Lung,TCGA-LUAD,,TCGA-MP-A4T6-01A-32R-A262-07,Primary Tumor,TCGA-MP-A4T6
c8fb3217-2cc9-47c0-84b6-b666c07624e6.FPKM.txt.gz,-18762.0,896.0,female,Dead,18762.0,not reported,8140/3,"Adenocarcinoma, NOS","Upper lobe, lung",not reported,3.041096,,Lung Adenocarcinoma,Lung,TCGA-LUAD,0.9,TCGA-49-4510-01A-01R-1206-07,Primary Tumor,TCGA-49-4510


In [31]:
import numpy as np

df_files.drop(df_files.index[df_files['cases.0.samples.0.sample_type'] == 'Recurrent Tumor'], inplace=True)
df_files['type'] = df_files['cases.0.samples.0.sample_type']
df_files.drop(columns = ['cases.0.samples.0.sample_type'], inplace=True)


In [32]:
df_files.columns

Index(['cases.0.demographic.days_to_birth',
       'cases.0.demographic.days_to_death', 'cases.0.demographic.gender',
       'cases.0.demographic.vital_status',
       'cases.0.diagnoses.0.age_at_diagnosis',
       'cases.0.diagnoses.0.last_known_disease_status',
       'cases.0.diagnoses.0.morphology',
       'cases.0.diagnoses.0.primary_diagnosis',
       'cases.0.diagnoses.0.tissue_or_organ_of_origin',
       'cases.0.diagnoses.0.tumor_grade',
       'cases.0.exposures.0.cigarettes_per_day',
       'cases.0.exposures.0.years_smoked', 'cases.0.project.disease_type',
       'cases.0.project.primary_site', 'cases.0.project.project_id',
       'cases.0.samples.0.longest_dimension',
       'cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id',
       'cases.0.submitter_id', 'type'],
      dtype='object')

In [33]:
# save file to dat
df_files.to_csv(f"{PATH}/files_tcga.dat", header=True)