*Quick reminder:* To run the code in a cell, either click on the "play" icon above, or press Shift+Enter. Cells can be run as many times as needed.

In [None]:
import os
import pandas as pd

# Split multi-sample file -- Many samples per row, 1 column per sample
<i><font size=2 color="grey">Version 1.0, last modified September 2017<br>
Pandas 0.19.2<br>
Python 3.6<br>
</font></i>
  
  
This notebook contains code to split a file containing data for multiple samples. <font color="red"><strong>Expected Format:</strong></font> many samples per row, 1 column per sample.   

**<u>Expected input:</u>**  
  
Input file to split, and mapping between user-defined ID and ImmPort Experiment Sample Accession. Input file must be a **text file**. The expectation is that the headers contain the <strong>user-defined ID <font color="red">ONLY</font></strong>.
  
**<u>Output:</u>**  

Directory containing one file per experiment sample accession.   
  
**<u>Parameters:</u>**  
<font color="DarkRed"><strong>Please</strong></font> change the following parameters by commenting out or editing accordingly.

In [None]:
## User-editable parameters read by the cells below.

## Study accession (used as a directory name and as a prefix of each output file name):
sdy_acc = "SDYxxx"

## Experiment accession (used as a sub-directory name and in each output file name):
exp_acc = "EXPxxxxx"

## What type of data is contained in the file to split?
## This value is used, followed by ".txt", as the suffix of each output file name.
file_type = "gene_expression"
#file_type = "microbiome_results"
#file_type = "RNA_seq"
#file_type = "microarray_results"

## Path to file to split:
## The output directory is created next to this file.
input_file = "placeholder/path/to/input_file"

## Is the file to split tab or comma separated?
## Must be "tsv" or "csv": the value selects the pandas reader used to load the file.
input_file_format = "tsv"
#input_file_format = "csv"

## How many columns contain information about probe, type of probe, gene, gene names etc?
## These columns will be included in each of the individual sample files.
## They are taken from the left of the input file (columns 1 to info_column).
info_column = 1

## What is the best description of the value represented?
## This becomes the header of the sample column in each output file.
measure = "Value"
#measure = "Count"
#measure = "Signal"

## Path to mapping file
mapping_file = "placeholder/path/to/mapping_file"

## How are the fields separated in the mapping file?
sep_mapping = "\t"
#sep_mapping = ","

## Which column contains the user-defined ID in the mapping file? Count starts at 1.
exp_sample_id_mapping = 1

## Which column contains the experiment sample accession in the mapping file? Count starts at 1.
exp_sample_acc_mapping = 2

## Does the mapping file contain a header line?
## When True, the first line of the mapping file is skipped.
mapping_header = True
#mapping_header = False
#mapping_header = False

The following lines of code go through the mapping file to build a dictionary that maps each user-defined ID to its experiment sample accession, and to create an output file path for each accession.

In [None]:
## Build the user-defined ID -> experiment sample accession mapping (esIDs)
## and the user-defined ID -> output file path mapping (es_files).

## Create empties
esIDs = {}
es_files = {}
no_match = []

## Set up parameters
## Column numbers given by the user start at 1; list indices start at 0.
uid_idx = exp_sample_id_mapping - 1
es_acc_idx = exp_sample_acc_mapping - 1
extension = file_type + ".txt"
read_fctns = {"tsv": pd.read_table, "csv": pd.read_csv}

## Create path to output directory:
## directory will be ./SDYxxx/EXPxxxxx/ where . is the directory containing the input file
output_path_file = os.path.split(os.path.realpath(input_file))[0]
sdy_dir = os.path.join(output_path_file, sdy_acc)
exp_dir = os.path.join(sdy_dir, exp_acc)
## makedirs creates the intermediate SDYxxx directory as well.
os.makedirs(exp_dir, exist_ok=True)

## Go through mapping file:
## Lines that do not have enough fields are collected and reported once at the
## end, instead of being reported as a failure to open the file.
bad_lines = []
first_data_line = 2 if mapping_header else 1
try:
    with open(mapping_file, "r") as mf:
        if mapping_header:
            mf.readline()
        for line_nb, line in enumerate(mf, start=first_data_line):
            ## Ignore blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            ## Only remove the end-of-line characters before splitting, so that
            ## an empty first or last field does not shift the column indices.
            fields = line.rstrip("\r\n").split(sep_mapping)
            try:
                uid = fields[uid_idx].strip().strip("\"")
                es_acc = fields[es_acc_idx].strip().strip("\"")
            except IndexError:
                bad_lines.append(line_nb)
                continue
            esIDs[uid] = es_acc

            ## assuming one line per experiment sample, create file paths here.
            es_file_name = "_".join([sdy_acc, exp_acc, es_acc, extension])
            es_file_path = os.path.join(exp_dir, es_file_name)
            es_files[uid] = es_file_path
except (OSError, UnicodeDecodeError) as err:
    print("Couldn't open the mapping file. Please check parameters indicated above.")
    print("Reason: %s" % (err))

if bad_lines:
    print("%d line(s) of the mapping file did not contain the expected columns and were skipped." % (len(bad_lines)))
    print("Please check the separator and column numbers indicated above.")
    print("Line number(s): %s" % (", ".join(str(nb) for nb in bad_lines)))

The following lines of code populate the files that were just created.

In [None]:
## Go through file to split:
## Write one tab-separated file per user-defined ID found in the input file
## headers. Each file contains the first `info_column` columns followed by
## that sample's column, renamed to `measure`.

## Start from an empty list so that re-running this cell does not report the
## same unmatched IDs several times.
no_match = []

## Reading is handled separately from splitting so that only genuine read
## problems are reported as "Couldn't read file to split".
df = None
reader = read_fctns.get(input_file_format)
if reader is None:
    print("Unknown input file format '%s'. Please use one of: %s"
          % (input_file_format, ", ".join(sorted(read_fctns))))
else:
    try:
        df = reader(input_file)
    ## pandas parser errors and decoding errors are subclasses of ValueError.
    except (OSError, ValueError) as err:
        print("Couldn't read file to split. Please check path or file format in the parameters above.")
        print("Given path was %s" % (input_file))
        print("Reason: %s" % (err))

if df is not None:
    ## subset the first few informational columns that will appear in each file:
    df_info = df.iloc[:, 0:info_column]

    ## For each accession in the mapping file, populate the files:
    for ids in es_files:
        if ids in df.columns:
            ## Both pieces come from df and share its index, so no explicit
            ## alignment is needed (the join_axes argument no longer exists
            ## in pandas >= 1.0).
            df_subset = pd.concat([df_info, df[ids]], axis=1)
            df_subset.rename(columns={df_subset.columns[info_column]: measure}, inplace=True)
            df_subset.to_csv(es_files[ids], sep="\t", index=False)
        else:
            no_match.append(ids)

    nb_created = len(es_files) - len(no_match)
    print("%s files were created in %s\n" % (nb_created, exp_dir))
    if nb_created:
        print("Files are ready for upload.")
    if no_match:
        ## no_match holds IDs read from the mapping file that have no
        ## corresponding column header in the input file.
        print("The following user-defined IDs from the mapping file were not found in the input file headers:")
        print("(No files corresponding to these IDs were generated)")
        print("\n".join(no_match))

If files are uploaded after experiment sample accessions were generated, each individual file will need to be remapped to its corresponding experiment sample accession. The following code generates the file containing the mapping between Experiment Sample Accession and File Info ID for the 'Move Archive' Tool.  
<font color="DarkRed"><strong>Please</strong></font> change the parameters first:

In [None]:
## Path to upload database report -- this is the file in the confirmation email of successful upload
## The report is scanned for lines starting with "Stored in:file_info".
upload_report = "placeholder/path/to/upload_report/ImmPort.report.Database.xxxx.txt"

## What value to use for file detail?
## This value is written as the third column of every line of the generated mapping file.
file_detail = "Custom assay result"
#file_detail = "Gene expression result"
#file_detail = "RNA sequencing result"
#file_detail = "Genotyping result"
#file_detail = "TPM"
#file_detail = "RPKM"
#file_detail = "FPKM"
#file_detail = "Custom"
#file_detail = "FASTQ"
#file_detail = "Illumina BeadArray"
#file_detail = "Illumina GA"

In [None]:
## Generate the tab-separated file associating each File Info ID with its
## experiment sample accession and the chosen file detail.

## Set up empty
file_info = {}

## Path for mapping file:
## file will be in directory containing the SDYxxx directory
fid_mapping_filename = "_".join([sdy_acc, exp_acc, "ES_ACC_2_FID.txt"])
fid_mapping_path = os.path.join(output_path_file, fid_mapping_filename)

report_ok = False
try:
    ## Go through data base report
    with open(upload_report, "r") as ur:
        for line in ur:
            if line.startswith("Stored in:file_info"):
                ## The fifth whitespace-separated token is taken as the stored file path.
                path = line.split()[4]
                filename = os.path.split(path)[1]
                ## File names are SDY_EXP_ES_<type>.txt, so the accession is the third "_" field.
                es = filename.split("_")[2]
                ## The File Info ID is taken as the second-to-last "."-separated part of the path.
                fid = path.split(".")[-2]
                file_info[es] = fid
    report_ok = True
except (OSError, UnicodeDecodeError) as err:
    print("The database report could not be read. Please check the path above.")
    print("Reason: %s" % (err))
except IndexError:
    print("The database report could not be read. Please check the path above.")
    print("Reason: a 'Stored in:file_info' line did not have the expected format.")

if report_ok:
    ## Check every accession before opening the output file, so that a missing
    ## File Info ID never leaves a partially written mapping file behind.
    missing = [esIDs[uid] for uid in es_files if esIDs[uid] not in file_info]
    if missing:
        print("The file mapping File Info ID to Experiment Sample Accession could not be generated.")
        print("Please check that the database report is the right one.")
        print("No File Info ID was found for the following accession(s):")
        print("\n".join(missing))
    else:
        rows = ["\t".join([file_info[esIDs[uid]], esIDs[uid], file_detail]) + "\n"
                for uid in es_files]
        try:
            ## Generate mapping file experiment sample - file info ID
            with open(fid_mapping_path, "w") as fm:
                fm.writelines(rows)
        except OSError as err:
            print("The file mapping File Info ID to Experiment Sample Accession could not be generated.")
            print("Reason: %s" % (err))
        else:
            print("The file to associate File Info IDs to Experiment Sample Accession was generated:")
            print(fid_mapping_path)