*Quick reminder:* To run the code in a cell, either click on the "play" icon above, or press Shift+Enter. Cells can be run as many times as needed.

In [None]:
import os

# Generate File for Move Archive Tool
<i><font size=2 color="grey">Version 1.0, last modified September 2017<br>
Python 3.6<br>
</font></i>
  
  
This notebook contains code to generate the file required by the 'Move Archive tool' -- to associate archived files with experiment sample accessions.    

**<u>Expected input:</u>**  
  
- Database upload report received after upload of the files to move.  
- Mapping of Experiment Sample Accession to file name - please provide a **plain text file**.  
  
**<u>Output:</u>**  

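Output mapping file, `SDYacc_EXPacc_ES_ACC_2_FID.txt` (where `SDYacc` and `EXPacc` are the study and experiment accessions given below), written to the directory containing the input mapping file.   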
  
**<u>Parameters:</u>**  
<font color="DarkRed"><strong>Please</strong></font> change the following parameters by commenting out or editing accordingly.

In [None]:
## Study accession (first part of the output file name):
sdy_acc = "SDYxxx"

## Experiment accession (second part of the output file name):
exp_acc = "EXPxxxxx"

## Path to upload database report -- this is the file in the confirmation email of successful upload
upload_report = "placeholder/path/to/upload_report/ImmPort.report.Database.xxxx.txt"

## What value to use for file detail? It is written as the last column of every line of the output file.
## Keep exactly one of the following lines uncommented.
file_detail = "Custom assay result"
#file_detail = "Gene expression result"
#file_detail = "RNA sequencing result"
#file_detail = "Genotyping result"
#file_detail = "TPM"
#file_detail = "RPKM"
#file_detail = "FPKM"
#file_detail = "Custom"
#file_detail = "FASTQ"
#file_detail = "Illumina BeadArray"
#file_detail = "Illumina GA"

## Path to mapping file containing correspondence between Experiment Sample Accession and File Name.
## The output file is written to the directory containing this file.
mapping_file = "placeholder/path/to/mapping_file"

## How are the fields separated in the mapping file? (tab by default, comma as alternative)
sep_mapping = "\t"
#sep_mapping = ","

## Which column contains the experiment sample accession in the mapping file? Count starts at 1.
exp_sample_acc_mapping = 1

## Which column contains the filename in the mapping file? Count starts at 1.
filename_mapping = 2

## Does the mapping file contain a header line? If True, the first line of the mapping file is skipped.
mapping_header = True
#mapping_header = False 

The following lines of code go through the mapping file to associate Experiment Sample Accession to corresponding file names.

In [None]:
## Set up empty containers:
##   file_names: file name -> Experiment Sample Accession (filled in this cell)
##   file_info:  file name -> File Info ID (filled when the database report is read)
##   no_mapping: uploaded files that have no Experiment Sample Accession
file_info = {}
file_names = {}
no_mapping = []

## Set up parameters (convert the 1-based column numbers to 0-based indices):
es_index = exp_sample_acc_mapping - 1
filename_index = filename_mapping - 1

## Path for output mapping file (same directory as the input mapping file):
output_path_file = os.path.split(mapping_file)[0]
fid_mapping_filename = "_".join([sdy_acc, exp_acc, "ES_ACC_2_FID.txt"])
fid_mapping_path = os.path.join(output_path_file, fid_mapping_filename)

## Go through mapping file:
try:
    ## Column numbers start at 1; a smaller value would silently read columns
    ## counted from the end of the line instead of failing.
    if es_index < 0 or filename_index < 0:
        raise ValueError("Column numbers must be 1 or greater.")
    with open(mapping_file, "r") as mf:
        if mapping_header:
            mf.readline()
        first_line_number = 2 if mapping_header else 1
        for line_number, line in enumerate(mf, start=first_line_number):
            stripped = line.strip()
            ## Skip blank lines (typically a trailing empty line at the end of the file)
            ## instead of aborting the whole parse on them.
            if not stripped:
                continue
            fields = stripped.split(sep_mapping)
            if max(es_index, filename_index) >= len(fields):
                raise IndexError("Line %d has only %d field(s)." % (line_number, len(fields)))
            file_names[fields[filename_index]] = fields[es_index]
## Only catch the errors expected from a wrong path, separator or column number,
## so that anything else (e.g. interrupting the kernel) is not silently hidden.
except (OSError, IndexError, ValueError) as err:
    print("Could not get information from the mapping file. Please check parameters above.")
    print("Given path for the mapping file was %s" % (mapping_file))
    print("Details: %s" % (err))

The following lines of code go through the database report and generate the file mapping Experiment Sample Accessions to File Info ID, based on filenames.

In [None]:
## Start from empty results so that re-running this cell does not keep
## entries (or duplicate file names) from a previous run.
file_info = {}
no_mapping = []

try:
    ## Go through database report
    with open(upload_report, "r") as ur:
        for line in ur:
            if line.startswith("Stored in:file_info"):
                fields = line.split()
                ## The 4th whitespace-separated field is the file name; the File Info ID
                ## is the second-to-last dot-separated part of the 5th field.
                name = fields[3]
                fid = fields[4].split(".")[-2]
                file_info[name] = fid
except (OSError, ValueError) as err:
    print("The database report could not be read. Please check the path above.")
    print("Details: %s" % (err))
except IndexError:
    ## A "Stored in:file_info" line did not have the expected layout.
    print("The database report could not be parsed.")
    print("Please check that the database report is the right one.")
else:
    if not file_info:
        print("Warning: no 'Stored in:file_info' line was found in the database report.")
        print("Please check that the database report is the right one.")
    try:
        ## Generate mapping file experiment sample - file info ID
        with open(fid_mapping_path, "w") as fm:
            for name, fid in file_info.items():
                if name in file_names:
                    fm.write("\t".join([fid, file_names[name], file_detail]) + "\n")
                else:
                    no_mapping.append(name)
    except (OSError, TypeError) as err:
        print("The file mapping File Info ID to Experiment Sample Accession could not be generated.")
        print("Please check that the database report is the right one.")
        print("Details: %s" % (err))
    else:
        print("The file to associate File Info IDs to Experiment Sample Accession was generated:")
        print(fid_mapping_path)
        if no_mapping:
            ## Adjacent literals are joined without the indentation spaces that a
            ## backslash continuation inside the string would put into the message.
            print("The following files were uploaded but don't have a corresponding "
                  "Experiment Sample Accession in the file provided:")
            print("\n".join(no_mapping))