The raw data contain sequencing results from both Nextseq PE150 and Novaseq SE75 runs.

Among them, 
    1. S6, S18, S19, S20 are mock samples containing 4 bacterial species, with only Nextseq sequencing results.
    2. The others are stool samples
        44-14: S14, with both Nextseq and Novaseq results
        44-52: S31, S32, S34, with only Nextseq sequencing results
        44-150: S5, S7, S15, S27, with both Nextseq and Novaseq results
        44-111: S8, S9, with both Nextseq and Novaseq results
        44-171: S10, with both Nextseq and Novaseq results
        44-172: S11, S12, with both Nextseq and Novaseq results
        44-224: S33, S35, with only Nextseq sequencing results

In [None]:
import time
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import shutil
# Root folder of the project; every other data folder below is a sub-folder of it.
base_dirt = "./sequencing_microbeseq/"
# Folder created for adapter-trimmed reads.
trimmed_dir_base = "{}trimmed/".format(base_dirt)
# Folder receiving the per-barcode files that pass the read-count cutoffs.
R1R2_filtered_dir_base = "{}filtered_selected/".format(base_dirt)
# Folder receiving the per-barcode files written by the barcode sorting step.
output_dirt = "{}filtered/".format(base_dirt)
# Excel file with the barcode 1 and barcode 2 sequences (columns 'bc1' and 'bc2').
barcode_dirt = "./bc1andbc2.xlsx"

In [None]:
# Barcode Information
# Only the first bc1_number / bc2_number rows of the barcode sheet are used.
bc1_number = 96
bc2_number = 384
barcode_temp = pd.read_excel(barcode_dirt)

# Barcode 2: the fixed window [31:39] (8 nt) of each oligo in column 'bc2'.
bc2_temp = barcode_temp['bc2'].values
bc2 = [str(bc2_temp[i][31:39]) for i in range(bc2_number)]

# Barcode 1: starts at position 23 of each oligo in column 'bc1' and ends right
# before the base that precedes the adapter sequence searched for below.
# NOTE(review): find() returns -1 when the adapter is absent, which would
# silently give a wrong slice - confirm every oligo contains the adapter.
bc1_temp = barcode_temp['bc1'].values
bc1 = []
for i in range(bc1_number):
    loc = bc1_temp[i].find("AGATCGGAAGAGCGTCGTGTAGGGAAAGAG")
    bc1.append(str(bc1_temp[i][23:loc-1]))

# Total number of barcode combinations (one combination = one potential cell).
number_barcode = bc1_number * bc2_number
# sample_ID_multiplier is the smallest power of ten that is strictly larger than
# number_barcode (e.g. 36864 combinations -> 100000). Output file IDs are
# Sample_index * sample_ID_multiplier + barcode index, so the sample number and
# the barcode index never overlap in the ID.
i = 0
sample_ID_multiplier = 10**i
while sample_ID_multiplier <= number_barcode:
    i += 1
    sample_ID_multiplier = 10**i

In [None]:
# function count_match
# input: two strings a and b with same length
# output: number of positions at which a and b carry the same character
def count_match(a, b):
    """Return how many positions of ``a`` and ``b`` hold the same character.

    Strings of different length are treated as not comparable and give 0.
    """
    if len(a) != len(b):
        return 0
    return sum(1 for left, right in zip(a, b) if left == right)

# returns reverse complement of a sequence
def reverseComplement(s):
    """Return the reverse complement of the DNA sequence ``s``.

    Only the upper-case bases A, C, G, T and N are known; any other character
    (lower-case base, newline, ...) raises KeyError.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    # One join over the reversed sequence instead of prepending to a string
    # base by base, which copies the growing string on every step.
    return ''.join(complement[base] for base in reversed(s))

# returns the Hamming distance of two strings
def hamdist(str1, str2):
    """Count the positions at which ``str1`` and ``str2`` differ.

    The comparison stops at the end of the shorter string, so surplus
    characters of the longer one are not counted as differences.
    """
    return sum(1 for ch1, ch2 in zip(str1, str2) if ch1 != ch2)
    
# returns the start position of W1 in the read: the first of the tested positions whose
# Hamming distance to W1 is below threshold; returns 0 when no tested position qualifies
def find_W1_loc(read, W1 = "GAGTGATTGCTTGTGACGCCTT", threshold = 5):
    """Locate the W1 sequence near the start of ``read``.

    The four start positions 8, 9, 10 and 11 are tried in that order and the
    first one where ``read`` differs from ``W1`` in fewer than ``threshold``
    positions is returned. 0 is returned when none qualifies; 0 is never a
    tested position, so callers can use the result as a truth value.
    """
    first_candidate = 8
    candidate_count = 4
    linker_length = len(W1)
    for position in range(first_candidate, first_candidate + candidate_count):
        window = read[position:position + linker_length]
        if hamdist(window, W1) < threshold:
            return position
    return 0

def closest_index(str1, ref_list, mismatch_allow = 1):
    """Return the indices of the entries of ``ref_list`` closest to ``str1``.

    Closeness is the Hamming distance computed by hamdist. All entries sharing
    the smallest distance are returned (several indices mean an ambiguous
    match). An empty list is returned when ``ref_list`` is empty or when the
    smallest distance exceeds ``mismatch_allow``.
    """
    distances = [hamdist(str1, candidate) for candidate in ref_list]
    if not distances:
        return []
    smallest = min(distances)
    if smallest > mismatch_allow:
        return []
    return [position for position, distance in enumerate(distances) if distance == smallest]

-------------------
### First, process the Nextseq results

In [None]:
# with input read file, output each sorted file, each output file is a barcode/single cell
# All input fastq files should have this format
#     S*_R1.fastq, and S*_R2.fastq
def fastq_filter(Sample_index, input_dirt, output_dirt, R1_file_ending = "_R1.fastq", R2_file_ending = "_R2.fastq"):
    """Sort the paired-end reads of sample ``S<Sample_index>`` by barcode.

    Every Read 1 is checked for the expected structure: W1 is located with
    find_W1_loc (offset ``loc``), barcode 1 is ``read[:loc]`` and barcode 2 is
    ``read[loc+22:loc+30]``; both are reverse-complemented and looked up in the
    module-level lists ``bc1`` and ``bc2``. A barcode is accepted on an exact
    match or when closest_index reports exactly one closest candidate.
    Accepted reads are written to ``<output_dirt><ID>_R1.fastq`` (sequence and
    quality cut to ``[loc+63:]``) and their mates, taken from the same record
    position of the Read 2 file, unchanged to ``<output_dirt><ID>_R2.fastq``,
    where ``ID = Sample_index * sample_ID_multiplier + index1 * bc2_number + index2``.

    Parameters:
        Sample_index: sample number; input files are ``S<Sample_index>`` +
            ``R1_file_ending`` / ``R2_file_ending`` inside ``input_dirt``.
        input_dirt: input folder, including the trailing path separator.
        output_dirt: existing output folder, including the trailing separator.
        R1_file_ending, R2_file_ending: file name endings of the two inputs.

    Returns None. An (initially empty) file pair is created for every one of
    the ``number_barcode`` combinations, and summary counts are printed.
    """
    sample_number_ID = Sample_index * sample_ID_multiplier
    R1_ori_dir = input_dirt + "S" + str(Sample_index) + R1_file_ending
    R2_ori_dir = input_dirt + "S" + str(Sample_index) + R2_file_ending
    # how many reads to take before output (this is a balance between I/O speed and RAM use)
    storage_read_number = 1000000

    def barcode_path(barcode_index, suffix):
        # Output file of one barcode combination of this sample.
        return output_dirt + str(barcode_index + sample_number_ID) + suffix

    def flush(buffers, suffix):
        # Append the buffered FASTQ lines to their barcode files, then empty the buffer.
        for barcode_index, lines in buffers.items():
            with open(barcode_path(barcode_index, suffix), 'a') as foutput:
                foutput.writelines(lines)
        buffers.clear()

    def resolve(observed, reference):
        # Return (index, corrected): an exact hit, else the single closest
        # barcode accepted by closest_index, else (-1, False).
        if observed in reference:
            return reference.index(observed), False
        candidates = closest_index(observed, reference)
        if len(candidates) == 1:
            return candidates[0], True
        return -1, False

    # Create (or truncate) every output file up front: barcodes without reads
    # still get an empty file and a rerun does not append to old output.
    # Both R1 and R2 files are output into the same folder, this will be easier for later adapter trim
    for i in range(number_barcode):
        for suffix in ('_R1.fastq', '_R2.fastq'):
            with open(barcode_path(i, suffix), 'w'):
                pass

    hit_map = [] # per read: barcode index if it passes structure and barcode check, -1 means no pass
    indicator_W1fail = 0
    indicator_BC1fail = 0
    indicator_BC2fail = 0
    indicator_BC1_mismatch_allowed = 0
    indicator_BC2_mismatch_allowed = 0
    indicator_R1_short = 0

    # ---- Read 1: classify every read and write the kept part of accepted reads ----
    buffers = {} # barcode index -> FASTQ lines waiting to be written
    with open(R1_ori_dir) as finput:
        while True:
            line_loc = finput.readline()
            line_read = finput.readline()
            line_hold = finput.readline()
            line_quality = finput.readline()
            if len(line_read) == 0:
                break
            flag = -1 # barcode index of this read, -1 means not a good read
            if len(line_read) <= 80:
                # It's important to use 80 instead of 41 here,
                # as part of Read1 is going to be written in the file
                # if Read 1 is too short, output would be empty, leading to error
                indicator_R1_short += 1
                print("{} length less then 80".format(sample_number_ID))
            else:
                loc = find_W1_loc(line_read)
                if not loc:
                    indicator_W1fail += 1
                else:
                    bc1_temp = reverseComplement(line_read[:loc])
                    bc2_temp = reverseComplement(line_read[loc+22:loc+30])
                    index1_temp, bc1_corrected = resolve(bc1_temp, bc1)
                    if index1_temp < 0:
                        indicator_BC1fail += 1
                    else:
                        if bc1_corrected:
                            indicator_BC1_mismatch_allowed += 1
                        index2_temp, bc2_corrected = resolve(bc2_temp, bc2)
                        if index2_temp < 0:
                            indicator_BC2fail += 1
                        else:
                            if bc2_corrected:
                                indicator_BC2_mismatch_allowed += 1
                            flag = index1_temp * bc2_number + index2_temp
                            buffers.setdefault(flag, []).extend(
                                [line_loc, line_read[loc+63:], line_hold, line_quality[loc+63:]])
            hit_map.append(flag)
            if len(hit_map) % storage_read_number == 0:
                flush(buffers, '_R1.fastq')
    # write out remaining data
    flush(buffers, '_R1.fastq')

    print("Sample number is: {}".format(Sample_index))
    print("Total read number for this sample is: {}".format(len(hit_map)))
    print(('Total read pass structure check is {}, BC1 fail is {}, BC1 one mismatch is {},'
           '    W1 fail is {}, BC2 fail is {}, BC2 one mismatch is {}.\n')
          .format(len(hit_map) - indicator_BC1fail - indicator_W1fail - indicator_BC2fail - indicator_R1_short,
                  indicator_BC1fail, indicator_BC1_mismatch_allowed, indicator_W1fail,
                  indicator_BC2fail, indicator_BC2_mismatch_allowed))

    # ---- Read 2: based on the labeling of each read, sort Read 2 according to barcodes ----
    with open(R2_ori_dir) as finput:
        for reads_done, index_temp in enumerate(hit_map, 1):
            line_loc = finput.readline()
            line_read = finput.readline()
            line_hold = finput.readline()
            line_quality = finput.readline()
            if index_temp != -1:
                buffers.setdefault(index_temp, []).extend(
                    [line_loc, line_read, line_hold, line_quality])
            if reads_done % storage_read_number == 0:
                flush(buffers, '_R2.fastq')
    # write in remaining data
    flush(buffers, '_R2.fastq')

In [None]:
# Create the output folders that do not exist yet (their parent must exist).
for directory in (R1R2_filtered_dir_base, output_dirt, trimmed_dir_base):
    if not os.path.isdir(directory):
        os.mkdir(directory)

In [None]:
%%time
# Mock samples (Nextseq results only)
sample_list_total = [18, 19, 20, 6]
for Sample_index in sample_list_total:
    fastq_filter(
        Sample_index,
        input_dirt=base_dirt + "original_files/",
        output_dirt=output_dirt,
    )

In [None]:
%%time
# Stool samples that have both Nextseq and Novaseq results (Nextseq part)
sample_list_total = [5, 7, 8, 9, 10, 11, 12, 14, 15, 27]
for Sample_index in sample_list_total:
    fastq_filter(
        Sample_index,
        input_dirt=base_dirt + "original_files/",
        output_dirt=output_dirt,
    )

In [None]:
%%time
# Stool samples with Nextseq results only
sample_list_total = [31, 32, 33, 34, 35]
for Sample_index in sample_list_total:
    fastq_filter(
        Sample_index,
        input_dirt=base_dirt + "original_files/",
        output_dirt=output_dirt,
    )

-------------------
### Then, process the Novaseq results

In [None]:
# with input read file, output each sorted file, each output file is a barcode/single cell
# All input fastq files should have this format
# S*_R1.fastq, and S*_R2.fastq
def fastq_filter_SE(Sample_index, input_dirt, output_dirt, R1_file_ending = "_R1.fastq", R2_file_ending = "_R2.fastq"):
    """Sort the Read 2 records of sample ``S<Sample_index>`` by barcode.

    Read 1 is used only to identify the barcodes: W1 is located with
    find_W1_loc (offset ``loc``), barcode 1 is ``read[:loc]`` and barcode 2 is
    ``read[loc+22:loc+30]``; both are reverse-complemented and looked up in the
    module-level lists ``bc1`` and ``bc2``. A barcode is accepted on an exact
    match or when closest_index reports exactly one closest candidate.
    The Read 2 record at the same position as an accepted Read 1 is written
    unchanged to ``<output_dirt><ID>_R2_nova.fastq``, where
    ``ID = Sample_index * sample_ID_multiplier + index1 * bc2_number + index2``.
    No Read 1 output is written.

    Parameters:
        Sample_index: sample number; input files are ``S<Sample_index>`` +
            ``R1_file_ending`` / ``R2_file_ending`` inside ``input_dirt``.
        input_dirt: input folder, including the trailing path separator.
        output_dirt: existing output folder, including the trailing separator.
        R1_file_ending, R2_file_ending: file name endings of the two inputs.

    Returns None. An (initially empty) output file is created for every one
    of the ``number_barcode`` combinations, and summary counts are printed.
    """
    sample_number_ID = Sample_index * sample_ID_multiplier
    R1_ori_dir = input_dirt + "S" + str(Sample_index) + R1_file_ending
    R2_ori_dir = input_dirt + "S" + str(Sample_index) + R2_file_ending
    output_suffix = '_R2_nova.fastq'
    # how many reads to take before output (this is a balance between I/O speed and RAM use)
    storage_read_number = 1000000

    def barcode_path(barcode_index):
        # Output file of one barcode combination of this sample.
        return output_dirt + str(barcode_index + sample_number_ID) + output_suffix

    def flush(buffers):
        # Append the buffered FASTQ lines to their barcode files, then empty the buffer.
        for barcode_index, lines in buffers.items():
            with open(barcode_path(barcode_index), 'a') as foutput:
                foutput.writelines(lines)
        buffers.clear()

    def resolve(observed, reference):
        # Return (index, corrected): an exact hit, else the single closest
        # barcode accepted by closest_index, else (-1, False).
        if observed in reference:
            return reference.index(observed), False
        candidates = closest_index(observed, reference)
        if len(candidates) == 1:
            return candidates[0], True
        return -1, False

    # Create (or truncate) every output file up front: barcodes without reads
    # still get an empty file and a rerun does not append to old output.
    for i in range(number_barcode):
        with open(barcode_path(i), 'w'):
            pass

    hit_map = [] # per read: barcode index if it passes structure and barcode check, -1 means no pass
    indicator_W1fail = 0
    indicator_BC1fail = 0
    indicator_BC2fail = 0
    indicator_BC1_mismatch_allowed = 0
    indicator_BC2_mismatch_allowed = 0
    indicator_R1_short = 0

    # ---- Read 1: classify every read (nothing is written in this pass) ----
    with open(R1_ori_dir) as finput:
        while True:
            finput.readline() # header line, not needed for classification
            line_read = finput.readline()
            finput.readline() # separator line
            finput.readline() # quality line
            if len(line_read) == 0:
                break
            flag = -1 # barcode index of this read, -1 means not a good read
            if len(line_read) <= 41: # if too short, no need to keep this read
                indicator_R1_short += 1
                print("{} length less then 41".format(sample_number_ID))
            else:
                loc = find_W1_loc(line_read)
                if not loc:
                    indicator_W1fail += 1
                else:
                    bc1_temp = reverseComplement(line_read[:loc])
                    bc2_temp = reverseComplement(line_read[loc+22:loc+30])
                    index1_temp, bc1_corrected = resolve(bc1_temp, bc1)
                    if index1_temp < 0:
                        indicator_BC1fail += 1
                    else:
                        if bc1_corrected:
                            indicator_BC1_mismatch_allowed += 1
                        index2_temp, bc2_corrected = resolve(bc2_temp, bc2)
                        if index2_temp < 0:
                            indicator_BC2fail += 1
                        else:
                            if bc2_corrected:
                                indicator_BC2_mismatch_allowed += 1
                            flag = index1_temp * bc2_number + index2_temp
            hit_map.append(flag)

    print("Sample number is: {}".format(Sample_index))
    print("\nTotal read number for this sample is: {}".format(len(hit_map)))
    print(('Total read pass structure check is {}, BC1 fail is {}, BC1 one mismatch is {},'
           '    W1 fail is {}, BC2 fail is {}, BC2 one mismatch is {}.')
          .format(len(hit_map) - indicator_BC1fail - indicator_W1fail - indicator_BC2fail - indicator_R1_short,
                  indicator_BC1fail, indicator_BC1_mismatch_allowed, indicator_W1fail,
                  indicator_BC2fail, indicator_BC2_mismatch_allowed))

    # ---- Read 2: based on the labeling of each read, sort Read 2 according to barcodes ----
    buffers = {} # barcode index -> FASTQ lines waiting to be written
    with open(R2_ori_dir) as finput:
        for reads_done, index_temp in enumerate(hit_map, 1):
            line_loc = finput.readline()
            line_read = finput.readline()
            line_hold = finput.readline()
            line_quality = finput.readline()
            if index_temp != -1:
                buffers.setdefault(index_temp, []).extend(
                    [line_loc, line_read, line_hold, line_quality])
            if reads_done % storage_read_number == 0:
                flush(buffers)
    # write in remaining data
    flush(buffers)

In [None]:
# Folder holding the Novaseq input fastq files.
base_dirt_nova = "./sequencing_microbeseq/original_files/Novaseq/"
# The Novaseq output folders are built from base_dirt, i.e. they are the same
# folders as the ones used for the Nextseq results.
trimmed_dir_base_nova = base_dirt + "trimmed/"
R1R2_filtered_dir_base_nova = base_dirt + "filtered_selected/"
output_dirt_nova = base_dirt + "filtered/"
# Create the folders that do not exist yet (their parent must exist).
for directory in (R1R2_filtered_dir_base_nova, output_dirt_nova, trimmed_dir_base_nova):
    if not os.path.isdir(directory):
        os.mkdir(directory)

In [None]:
%%time
# Stool samples that have both Nextseq and Novaseq results (Novaseq part)
sample_list_total = [5, 7, 8, 9, 10, 11, 12, 14, 15, 27]
for Sample_index in sample_list_total:
    fastq_filter_SE(
        Sample_index,
        input_dirt=base_dirt_nova,
        output_dirt=output_dirt_nova,
    )

-------

In [None]:
# Count the reads in every per-barcode Read 1 file of one sample.
# takes Sample_index and the folder (input_dirt) holding the <ID>_R1.fastq files
# returns the read number of each barcode and also saves them as a csv file
def read_number_count(Sample_index, input_dirt):
    """Return the number of reads in each ``<ID>_R1.fastq`` file of a sample.

    IDs run over the ``number_barcode`` consecutive values starting at
    ``Sample_index * sample_ID_multiplier``. A read is counted for every
    4-line group whose second line is present. The counts are also written,
    next to their IDs (columns 'barcode_list' and 'read_number'), to
    ``<base_dirt>S<Sample_index>_read_count_all_barcodes.csv``.
    """
    first_barcode_ID = Sample_index * sample_ID_multiplier
    sort_read_counts = []
    for barcode_ID in range(first_barcode_ID, first_barcode_ID + number_barcode):
        reads_in_file = 0
        with open(input_dirt + str(barcode_ID) + '_R1.fastq') as finput:
            for line_number, _ in enumerate(finput):
                # the sequence is the second line of every 4-line record
                if line_number % 4 == 1:
                    reads_in_file += 1
        sort_read_counts.append(reads_in_file)
    counts_table = pd.DataFrame({'barcode_list': range(first_barcode_ID, first_barcode_ID + number_barcode)})
    counts_table['read_number'] = sort_read_counts
    csv_path = base_dirt + 'S' + str(Sample_index) + "_read_count_all_barcodes.csv"
    counts_table.to_csv(csv_path, index=False)
    return sort_read_counts

In [None]:
# Read counts of every barcode, one list per sample, in the order of sample_list_total.
sample_list_total =  [5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 27, 31, 32, 33, 34, 35]
sort_read_counts_total = [read_number_count(Sample_index, output_dirt)
                          for Sample_index in sample_list_total]

In [None]:
# Read-count window of each sample, in the order of sample_list_total: a barcode is
# kept as a cell only if its read number is strictly between the two cutoffs.
min_cutoff_list = [5000, 10000, 2000, 4000, 3500, 4000, 4000, 3500, 4000, 5000, 11000, 8000, 7000, 3000, 50000, 15000, 10000, 20000, 15000]
max_cutoff_list = [40000, 75000, 20000, 25000, 20000, 40000, 40000, 40000, 30000, 30000, 80000, 100000, 75000, 40000, 400000, 250000, 80000, 200000, 90000]
sample_list_total =  [5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 27, 31, 32, 33, 34, 35]
# Samples for which no *_R2_nova.fastq file is moved (Nextseq results only).
nextseq_only_samples = [6, 18, 19, 20, 31, 32, 33, 34, 35]
for sample_index in range(len(sample_list_total)):
    # sample_index is the position in the lists above, not the sample number
    sample_number = sample_list_total[sample_index]
    print("This is sample S {}".format(sample_number))
    read_max = max_cutoff_list[sample_index]
    read_min = min_cutoff_list[sample_index]
    read_count_sample = pd.read_csv(base_dirt+'S'+\
                                    str(sample_number)+\
                                    "_read_count_all_barcodes.csv")
    read_numbers = read_count_sample['read_number']
    total_reads = read_numbers.sum() + 0.0

    # barcodes above the lower cutoff: number of barcodes, their reads, fraction of all reads
    above_min = read_numbers > read_min
    barcode_count = above_min.sum()
    read_count = read_numbers[above_min].sum()
    print(" ".join(map(str, (barcode_count, read_count, read_count/total_reads))))

    # barcodes above the upper cutoff: same three numbers
    above_max = read_numbers > read_max
    barcode_count = above_max.sum()
    read_count = read_numbers[above_max].sum()
    print(" ".join(map(str, (barcode_count, read_count, read_count/total_reads))))

    # barcodes strictly inside the window are the cells to keep
    inside_window = above_min & (read_numbers < read_max)
    keep_barcode_list = read_count_sample['barcode_list'][inside_window].tolist()
    print('{} cells are kept'.format(len(keep_barcode_list)))

    # move the files of the kept cells from the sorted folder to the selected folder
    file_endings = ['_R1.fastq', '_R2.fastq']
    if sample_number not in nextseq_only_samples:
        file_endings.append('_R2_nova.fastq')
    for i in keep_barcode_list:
        for file_ending in file_endings:
            dirt_temp_from = output_dirt + str(i) + file_ending
            dirt_temp_to = R1R2_filtered_dir_base + str(i) + file_ending
            shutil.move(dirt_temp_from, dirt_temp_to)