In [None]:
# This cell sits above the main import cell, so import `os` here to keep the
# notebook runnable top-to-bottom on a fresh kernel (Restart & Run All).
import os

# Show the kernel's current working directory.
os.getcwd()

In [1]:
### standard python packages

import csv
import glob
import os
import re
import shutil
import subprocess

### custom python

from generate_summary_csv import sample_summary

In [3]:
sample_dict = {'H4':'ko3','A5':'ko4','F3':'ko1','G3':'ctl1','H3':'ko2','A4':'ctl2'}

# sample_dict = {'H4':'ko3'}

In [4]:
chemistry_dict = {'ko3':'v3','ko4':'v3','ko1':'v3','ctl1':'v3','ko2':'v3','ctl2':'v3'}

# chemistry_dict = {'ko3':'v3'}

In [5]:
parameter_dict = {'transcriptome_path':'XXX',
                  'barcode_rank_threshold':500,
                  'droplet_utils_FDR':0.01,
                  'mito_threshold':20}

In [6]:
class sequence_project:
    """Stage FASTQ files and drive the Cell Ranger / R steps for one project.

    On construction the object globs ``directory_path`` for ``*.fastq.gz``
    files, groups them by the barcodes of the module-level ``sample_dict`` and,
    unless ``auto_run`` is false, starts the interactive pipeline (which
    prompts on stdin via ``input()``).

    Parameters
    ----------
    directory_path : str
        Directory holding the raw ``*.fastq.gz`` files.
    project_name : str
        Name of the project; also the last component of the output directory.
    auto_run : bool, optional
        When true (default, the historical behaviour) ``run_the_pipeline()`` is
        called at the end of ``__init__``. Pass ``False`` to build the object
        without prompting, e.g. to inspect ``file_dict`` first.

    Attributes
    ----------
    sample_dict, chemistry_dict, parameter_dict : dict
        References to the module-level dictionaries of the same names.
    sub_dict : dict
        Barcode -> sample name for the samples selected in ``run_the_pipeline``.
    output_directory : str
        Everything before the last '/' of ``directory_path`` plus
        ``'/' + project_name``.
    file_list : list of str
        FASTQ paths found in ``directory_path``.
    file_dict : dict
        Barcode -> list of FASTQ paths whose file name contains '-<barcode>-'.
    date_lane_dict : dict
        Six-character file-name token (called ``date`` in this code) -> lane
        label such as 'L001'.
    """

    def __init__(self, directory_path, project_name, auto_run=True):
        self.directory_path = directory_path
        self.project_name = project_name
        self.sample_dict = sample_dict
        self.chemistry_dict = chemistry_dict
        self.sub_dict = {}
        self.parameter_dict = parameter_dict
        self.output_directory = self.directory_path.rsplit('/', 1)[0] + '/' + self.project_name
        self.file_list = self.get_file_list()
        self.file_dict = {}
        self.generate_file_dict()
        self.date_lane_dict = {}
        self.generate_date_to_lane_dict()
        if auto_run:
            self.run_the_pipeline()

    @staticmethod
    def _search_token(pattern, text):
        """Return the first match of ``pattern`` in ``text``.

        Raises
        ------
        ValueError
            If ``text`` does not contain ``pattern`` (instead of the opaque
            ``AttributeError`` produced by calling ``.group`` on ``None``).
        """
        match = re.search(pattern, text)
        if match is None:
            raise ValueError(
                "file name {!r} does not contain the expected pattern {!r}".format(text, pattern)
            )
        return match.group(0)

    def get_file_list(self):
        """Return the sorted paths of all ``*.fastq.gz`` files in ``directory_path``."""
        glob_input = self.directory_path + '/*.fastq.gz'
        # glob order is arbitrary; sort so every downstream step is reproducible.
        return sorted(glob.glob(glob_input))

    def generate_file_dict(self):
        """Fill ``file_dict`` with barcode -> FASTQ paths named '...-<barcode>-...'."""
        for key in self.sample_dict:
            search_string = '-' + key + '-'
            # Match on the file name only, so a directory that happens to
            # contain '-<barcode>-' cannot pull in every file.
            self.file_dict[key] = [
                path for path in self.file_list
                if search_string in os.path.basename(path)
            ]

    def generate_date_to_lane_dict(self):
        """Fill ``date_lane_dict`` with one lane label per distinct date token.

        The token is the six characters between the first pair of underscores
        that are exactly six characters apart in the file name. Tokens are
        numbered in sorted order ('L001', 'L002', ...). Files without such a
        token are ignored here.
        """
        dates = set(self.date_lane_dict)
        for path in self.file_list:
            match = re.search('_......_', os.path.basename(path))
            if match is not None:
                dates.add(match.group(0)[1:-1])
        for lane_num, date in enumerate(sorted(dates), start=1):
            # 'L0' + at-least-two-digit number, e.g. L001, L010.
            self.date_lane_dict[date] = 'L0{:02d}'.format(lane_num)

    def make_output_dir(self):
        """Create ``output_directory`` if needed and make it the working directory."""
        os.makedirs(self.output_directory, exist_ok=True)
        # Side effect kept from the original: later steps run from this directory.
        os.chdir(self.output_directory)

    def rename_and_move_files(self):
        """Copy the selected samples' FASTQ files into ``output_directory``.

        Each copy is named ``<sample>_S1_L001_<read>_001.fastq.gz``, where
        ``<sample>`` comes from ``sub_dict`` and ``<read>`` is the two-character
        token between underscores in the original file name. Files of samples
        that are not in ``sub_dict`` are skipped. The originals are left in
        place (this is a copy, not a move).

        Raises
        ------
        ValueError
            If a selected file name lacks the 'GA-<barcode>-' or '_<read>_' token.
        """
        # NOTE(review): the lane is fixed to L001 (the date-based lane lookup is
        # disabled), so two files differing only by date map to the same target
        # name and the later copy overwrites the earlier one -- confirm intent.
        lane_string = '_' + 'L001' + '_'
        end_string = '001.fastq.gz'

        copy_pairs = []
        for key, old_path_list in self.file_dict.items():
            if key not in self.sub_dict:
                # Not selected by the user; previously this caused a KeyError.
                continue
            for old_file_path in old_path_list:
                old_file = os.path.basename(old_file_path)
                sample_name = self._search_token('GA-..-', old_file)[3:-1]
                if sample_name not in self.sub_dict:
                    continue
                sample_string = self.sub_dict[sample_name]
                read_string = self._search_token('_.._', old_file)[1:-1] + '_'
                new_file_name = sample_string + '_S1' + lane_string + read_string + end_string
                new_file_path = self.output_directory + '/' + new_file_name
                copy_pairs.append((old_file_path, new_file_path))

        for old_file_path, new_file_path in copy_pairs:
            print('{} -> {}'.format(old_file_path, new_file_path))
            # shutil.copy handles paths with spaces and raises on failure.
            shutil.copy(old_file_path, new_file_path)

    def run_cellranger(self):
        """Run ``cellranger count`` once per selected sample.

        The command is printed and then executed without a shell, in the
        current working directory, with ``output_directory`` as ``--fastqs``.
        """
        transcriptome = self.parameter_dict['transcriptome_path']
        fastqs = self.output_directory
        for key in self.sub_dict:
            sample = self.sub_dict[key]
            cellranger_args = [
                'cellranger', 'count',
                '--id={}'.format(sample),
                '--include-introns',
                '--transcriptome={}'.format(transcriptome),
                '--fastqs={}'.format(fastqs),
                '--sample={}'.format(sample),
            ]
            print(" ".join(cellranger_args))
            # Argument list (no shell), consistent with the Rscript calls below.
            subprocess.call(cellranger_args)

    def cellranger_csv_summary(self):
        """Combine the samples' ``metrics_summary.csv`` files and plot them.

        Calls ``sample_summary`` with the per-sample metrics paths, the combined
        output path and ``chemistry_dict``, then runs
        ``cellranger_summary_graph.r`` on the combined file.
        """
        csv_summary_paths = [
            self.output_directory + '/' + name + '/outs/metrics_summary.csv'
            for name in self.sub_dict.values()
        ]
        output_location = self.output_directory + '/' + self.project_name + '_cellranger_summary.csv'
        sample_summary(csv_summary_paths, output_location, self.chemistry_dict)
        subprocess.call(['Rscript', 'cellranger_summary_graph.r', output_location])   ### generate a graph in R

    def pre_process(self):
        """Run ``pre_process.r`` once per selected sample.

        Arguments passed to the script, in order: the sample's
        ``outs/raw_feature_bc_matrix`` directory, barcode rank threshold,
        DropletUtils FDR, mito threshold, sample name, the
        ``pre_process_objects`` directory and the project output directory.
        """
        project_directory = self.output_directory
        barcode_rank_threshold = str(self.parameter_dict['barcode_rank_threshold'])
        droplet_utils_fdr = str(self.parameter_dict['droplet_utils_FDR'])
        mito_threshold = str(self.parameter_dict['mito_threshold'])
        objects_directory = '/'.join([self.output_directory, 'pre_process_objects'])
        os.makedirs(objects_directory, exist_ok=True)
        for sample in self.sub_dict.values():
            sample_directory = '/'.join([self.output_directory, sample, 'outs/raw_feature_bc_matrix'])
            subprocess.call(['Rscript', 'pre_process.r',
                             sample_directory,
                             barcode_rank_threshold,
                             droplet_utils_fdr,
                             mito_threshold,
                             sample,
                             objects_directory,
                             project_directory])

    def merge_and_align(self):
        """Run ``merge_and_align.r`` on the ``pre_process_objects`` directory."""
        objects_directory = '/'.join([self.output_directory, 'pre_process_objects'])
        subprocess.call(['Rscript', 'merge_and_align.r', objects_directory, self.output_directory])

    def run_the_pipeline(self):
        """Interactively choose samples and a starting step, then run the steps.

        Prompts twice on stdin: first for the samples ('all' or comma separated
        indices from the printed table), then for the starting step (1, 2 or 3).

        Raises
        ------
        ValueError
            If an index or the step number is not an integer.
        IndexError
            If an index is outside the printed table.
        """
        sample_list = list(self.sample_dict.values())
        barcode_list = list(self.sample_dict.keys())
        print("""Hello user. Here are the samples available for processing today: \n""")
        print("""Sample\tBarcode\tIndex""")
        for index, (sample, barcode) in enumerate(zip(sample_list, barcode_list)):
            print('{}:\t{}:\t{}'.format(sample, barcode, index))
        user_index = input("\n"
                           "What samples would you like to process? Please enter one of the following: \n"
                           "a) a comma separated list of indices\n"
                           "b) all, to process all samples\n"
                           "\n")

        if user_index.strip() == 'all':
            # Copy so later edits to sub_dict cannot alter the shared sample_dict.
            self.sub_dict = dict(self.sample_dict)
        else:
            for index in [int(x) for x in user_index.split(',')]:
                self.sub_dict[barcode_list[index]] = sample_list[index]

        process_select = input("\n"
                               "Terrific, where should we start (enter a #)?:\n"
                               "1 = start from scratch\n"
                               "2 = start from aligned cell-ranger input\n"
                               "3 = start from seurat processed input\n"
                               "\n")

        print("\n")

        # The steps cascade: starting at 1 falls through to 2 and then 3.
        start_value = int(process_select)
        if start_value == 1:
            self.make_output_dir()
            self.rename_and_move_files()
            self.run_cellranger()
            # Currently disabled: self.cellranger_csv_summary()
            print(self.file_list)
            print(self.directory_path)
            start_value += 1
        if start_value == 2:
            start_value += 1
            # Currently disabled: self.pre_process()
        if start_value == 3:
            print('done')

In [None]:
# Constructing the project object also launches the interactive pipeline:
# sequence_project.__init__ ends by calling run_the_pipeline(), which prompts via input().
nr1_prelim = sequence_project(
    directory_path='XXX',
    project_name='nr1_all_processed_flx_expand',
)