Import Packages and Useful Functions

In [1]:
import pandas as pd
from labtools.adtools import sort
import seaborn as sns
import matplotlib.pyplot as plt
# Function for writing dictionary to csv file (used for loss tables)
import csv

# Experiment label; interpolated into the file name of every CSV written in the
# final cell of this notebook (e.g. "/path/{name}_activities.csv").
name = 'Sort-seq experiment X'

def write_dict_to_csv(file_path, dictionary):
    """Write *dictionary* to *file_path* as a two-row CSV.

    The first row holds the keys and the second row the matching values, in
    the dictionary's iteration order. An existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself.
    with open(file_path, 'w', newline='') as handle:
        csv.writer(handle).writerows((dictionary.keys(), dictionary.values()))
# Standard genetic code: upper-case DNA codon -> one-letter amino acid.
# Stop codons (TAA, TAG, TGA) are written as '_'.
_CODON_TABLE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}


def _translate_dna(dnaseq):
    """Translate one in-frame DNA string; return '' if its length is not a multiple of 3."""
    if len(dnaseq) % 3 != 0:
        return ""
    # Walk the whole sequence (not a fixed 120 nt window) one codon at a time.
    # Codons missing from the table (e.g. containing N or lower-case bases)
    # contribute nothing to the protein, as before.
    return "".join(
        _CODON_TABLE.get(dnaseq[i:i + 3], '')
        for i in range(0, len(dnaseq), 3)
    )


# Function for translating DNA sequences
def translate_seq(df):
    """Add an ``AAseq`` column holding the translation of each ``DNAseq`` entry.

    Each DNA string is read in frame from its first base to its last, whatever
    its length. Sequences whose length is not divisible by 3 are translated to
    an empty string. Stop codons appear as ``'_'``; codons not present in the
    (upper-case) codon table are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DNAseq`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place with the new
        ``AAseq`` column.
    """
    df["AAseq"] = [_translate_dna(dnaseq) for dnaseq in df["DNAseq"]]
    return df

Initialize the sort

In [2]:
# One FASTQ of paired reads per sorted bin.
# NOTE(review): presumably listed in the same bin order as the counts and
# values below -- confirm against labtools' Sort.
# (if processing 8 bin ratio sort, also add:
#   '/path/paired_reads_bin5_(if-ratio).fastq', '/path/paired_reads_bin6_(if-ratio).fastq',
#   '/path/paired_reads_bin7_(if-ratio).fastq', '/path/paired_reads_bin8_(if-ratio).fastq')
bin_fastq_paths = [
    '/path/paired_reads_bin1.fastq',
    '/path/paired_reads_bin2.fastq',
    '/path/paired_reads_bin3.fastq',
    '/path/paired_reads_bin4.fastq',
]

# Number of cells sorted in each bin
# (if processing 8 bin ratio sort, append: 250000, 250000, 250000, 250000)
cells_sorted_per_bin = [250000, 250000, 250000, 250000]

# Median fluorescence value for each bin
# (if processing 8 bin ratio sort, append: 55222, 79875, 131603, 262143)
bin_median_fluorescence = [285, 2109, 3769, 8782]

my_sort = sort.Sort(
    bin_fastq_paths,
    bin_counts=cells_sorted_per_bin,
    bin_values=bin_median_fluorescence,
    # CSV with all designed sequences in the library
    design_file="/path/Gcn4_Design.csv",
)

Process

In [None]:
# Run the sort pipeline. The four results are reshaped, QC'd and saved by the
# cells that follow; loss_table is later indexed by 'ad_preceder',
# 'design_file' and 'thresh'.
# NOTE(review): parameter meanings are defined by labtools' Sort.process --
# presumably thresh is a minimum read count and ad_preceder the sequence
# directly upstream of the AD; confirm before changing the values.
(activities, total_reads, reads_per_bin, loss_table) = my_sort.process(
    thresh=40,
    ad_preceder="GCTAGC",
    ad_length=120,
    barcoded=False,
)

Renaming dataframe columns

In [None]:
# activities: move the index into a regular column and call it 'DNAseq'.
activities.reset_index(inplace=True)
activities.rename(columns={'index': 'DNAseq'}, inplace=True)

# total_reads: turn the Series into a two-column frame ('AD', 'numreads_per_AD').
total_reads = (
    total_reads
    .reset_index(name='numreads_per_AD')
    .rename(columns={'index': 'AD'})
)

# reads_per_bin: name the index 'AD' and label the bin columns 0..n_bins-1.
# NOTE(review): the index is named only after the reset_index calls above; if
# these objects share an index, naming it earlier would change the column name
# reset_index produces -- keep this order unless that is ruled out.
reads_per_bin.index.name = 'AD'
n_bins = 4  # use 8 (if processing 8 bin ratio sort)
reads_per_bin.columns = list(range(n_bins))
reads_per_bin_reset = reads_per_bin.reset_index()

QC Analyses

In [None]:
# Report the included reads and each loss_table entry as a percentage of all reads.
included_reads = total_reads['numreads_per_AD'].sum()
print('sum included reads: ', included_reads)

# Denominator: included reads plus the reads recorded under each loss_table key.
loss_keys = ('ad_preceder', 'design_file', 'thresh')
total_sum = sum([included_reads] + [loss_table[key] for key in loss_keys])

print('% included reads: ', 100*included_reads / total_sum)
loss_labels = (
    '% reads lost ad_preceder: ',
    '% reads lost design_file: ',
    '% reads lost thresh: ',
)
for label, key in zip(loss_labels, loss_keys):
    print(label, 100*loss_table[key] / total_sum)

Saving Data

In [None]:
# Build one table per sequence: DNAseq, AAseq, total reads, reads per bin, activity.
bin_columns = [0, 1, 2, 3]  # append 4, 5, 6, 7 (if processing 8 bin ratio sort)

# Attach the total read counts to the activities, then discard the bin columns
# that came with activities along with the duplicate 'AD' column.
# NOTE(review): this first merge joins on the integer index left by the earlier
# reset_index calls, i.e. by row position -- confirm both frames share the same
# row order.
merge = activities.merge(total_reads, left_index=True, right_index=True).drop(
    columns=bin_columns + ['AD']
)

# Bring in the per-bin read counts, matched on the sequence itself.
merge2 = merge.merge(reads_per_bin_reset, left_on='DNAseq', right_on='AD')
merge2.drop(columns=['AD'], inplace=True)

# Add the protein translation and put the columns in their final order.
merge2 = translate_seq(merge2)
column_order = ['DNAseq', 'AAseq', 'numreads_per_AD', *bin_columns, 'Activity']
merge2 = merge2.reindex(columns=column_order)
merge2.head()

In [8]:
# Per-table CSV outputs, written in this order with the index included.
csv_outputs = {
    f"/path/{name}_activities.csv": activities,
    f"/path/{name}_readsperbin.csv": reads_per_bin,
    f"/path/{name}_totalreads.csv": total_reads,
}
for out_path, frame in csv_outputs.items():
    frame.to_csv(out_path, index=True)

# The loss table is dict-like, so it goes through the two-row CSV helper.
write_dict_to_csv(f'/path/{name}_loss.csv', loss_table)

# Combined per-sequence table.
merge2.to_csv(f'/path/{name}_combined-dataframe.csv', index=True)