## Convert the .txt file to ordered .csv 

Now that we have our individual .txt files from epitope_analysis.ipynb, we will translate each sequence and count the number of times each amino acid sequence appears. Additional data will be added to each file, including the days post infection, the animal ID, the epitope name, the replicate, and the frequency of each observed amino acid sequence in that sample. The resulting .csv files can then be used for further downstream analysis.

In [None]:
import collections
import os
import glob
import pandas as pd
import Bio.Seq
from matplotlib import pyplot as plt

In [None]:
# Enter the working directory where your .txt files are located.
WD = "~/Epitope_analysis/outputs"

# Make sure you are in the right location. File names should use '-' instead
# of '_' between animal, dpi, and epitope.
# os.chdir does not expand '~' on its own, so resolve the home directory first.
os.chdir(os.path.expanduser(WD))

### Define required functions 

In [None]:
def remove_empty_files(files, min_reads=1000):
    """Classify count files by the total number of reads they contain.

        Each file is read as a tab-separated table and its 'barcode_count'
        column is summed. A summary of the three categories is printed.

        Args:
            files = list of tab-separated .txt count files
            min_reads = total count a file needs in order to be treated as
                having sufficient reads (default 1000)
        Returns:
            four lists: files with at least min_reads reads, files that have
                rows but fewer reads, files with no data rows (including
                zero-byte files), and the total count of every file in input
                order so the distribution can be plotted.
    """

    contains_seqs = []
    empties = []
    low_seqs = []
    num_seqs = []
    for path in files:
        try:
            df = pd.read_csv(path, sep='\t')
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; count it as empty
            # instead of aborting the whole scan.
            df = None

        if df is None or len(df) == 0:
            num_seqs.append(0)
            empties.append(path)
            continue

        total = int(df['barcode_count'].sum())
        num_seqs.append(total)
        # ">=" so that a file with exactly min_reads reads lands in the
        # "min_reads+" group that the summary below reports.
        if total >= min_reads:
            contains_seqs.append(path)
        else:
            low_seqs.append(path)

    print("There are {} files with {}+ epitope sequences detected, \n"
          "{} with <{} epitope sequences detected, \nand "
          "{} empty files.".format(len(contains_seqs), min_reads,
                                   len(low_seqs), min_reads, len(empties)))
    return contains_seqs, low_seqs, empties, num_seqs

def identify_animals(files):
    """Collect the unique animal IDs that appear in a list of file names.

       The animal ID is taken to be everything before the first '-' or '_'
       in the file name.

       Args:
           files = list of .txt file names from your data set

       Returns:
           sorted list of the unique animal IDs, so the order is the same
           on every run

       Raises:
           ValueError if a file name contains neither '-' nor '_'
    """

    animals = set()
    for name in files:
        # Treat '_' and '-' as the same separator.
        normalized = name.replace('_', '-')
        animals.add(normalized[:normalized.index('-')])
    # A bare list(set) has arbitrary order; sort for reproducible output.
    return sorted(animals)

def translate_and_group(file):
    """Translate every sequence in a count file and merge identical amino
       acid sequences.

       The file is read as a tab-separated table, each entry of its
       'barcode_sequence' column is replaced by the result of
       Bio.Seq.translate, and rows sharing the same translated sequence are
       summed together.

       Args:
           file = the file you want translated, in .txt form

       Returns:
           pandas data frame indexed by translated sequence, with the
           remaining columns summed per sequence
    """

    df = pd.read_csv(file, sep='\t')

    # Assign a plain list rather than a DataFrame: the values are placed
    # row by row regardless of the table's index, and a table with no data
    # rows is handled without error.
    df['barcode_sequence'] = [Bio.Seq.translate(seq)
                              for seq in df['barcode_sequence']]

    return df.groupby(['barcode_sequence']).sum()

def sort_by_animal(transl_files, animal_list):
    """Group translated files by animal and return one list per animal.

       A file belongs to an animal when the part of its name before the
       first '-' or '_' equals the animal ID exactly, so an ID that is a
       prefix of another (e.g. 'r1' and 'r10') does not pick up the other
       animal's files.

       Args:
           transl_files = list of (file name, translated table) pairs, as
               built from translate_and_group
           animal_list = list of unique animal IDs, as made by
               identify_animals
       Returns:
           list of lists, one inner list per entry of animal_list and in the
           same order; an animal with no files gets an empty list
    """

    by_animal = {}
    for entry in transl_files:
        # Same ID rule as identify_animals: text before the first separator.
        animal_id = entry[0].replace('_', '-').split('-', 1)[0]
        by_animal.setdefault(animal_id, []).append(entry)

    # Copy each group so repeated IDs in animal_list do not share a list.
    return [list(by_animal.get(a, [])) for a in animal_list]

def identify_epitope(sorted_list_name, epitopes=("GW9", "RM9")):
    """Take a file from the sorted list and label which epitope it holds

        Args:
            sorted_list_name = (file name, table) pair from the list sorted
                by animal; only the file name is inspected
            epitopes = epitope names to look for in the file name, checked
                in order (default "GW9", then "RM9")

        Returns:
            the first epitope name that occurs in the file name

        Raises:
            ValueError if none of the epitope names occurs in the file name
    """
    name = sorted_list_name[0]
    for epitope_name in epitopes:
        if epitope_name in name:
            return epitope_name
    # Fail with a clear message instead of an unbound local variable.
    raise ValueError("No known epitope ({}) found in file name: {!r}".format(
        ", ".join(epitopes), name))

def identify_dpi(sorted_list_name):
    """Pull the time point label out of a sorted-list entry's file name

        Underscores in the file name are treated as dashes. The label is the
        text that starts four characters after the first separator and ends
        just before the first '.'.
        NOTE(review): this fixed offset assumes a specific file naming
        layout - confirm against your file names.

        Args:
            sorted_list_name = (file name, table) pair from the list sorted
                by animal

        Returns:
            the time point substring of the file name """
    label = sorted_list_name[0].replace('_', '-')
    start = label.index('-') + 4
    stop = label.index('.')
    return label[start:stop]

def identify_replicate(sorted_list_name):
    """Pull the replicate label out of a sorted-list entry's file name

        The label is the four characters that begin one character before the
        first occurrence of 'ep' in the file name (e.g. 'rep1').

        Args:
            sorted_list_name = (file name, table) pair from the list sorted
                by animal

        Returns:
            the replicate substring of the file name
    """
    name = sorted_list_name[0]
    start = name.index('ep') - 1
    stop = name.index('ep') + 3
    return name[start:stop]

def identify_animal(sorted_list_name):
    """Pull the animal ID out of a sorted-list entry's file name

        The ID is everything before the first '-' when the file name
        contains a '-'; otherwise it is everything before the first '_'.

        Args:
            sorted_list_name = (file name, table) pair from the list sorted
                by animal

        Returns:
            animal ID
    """
    name = sorted_list_name[0]
    # A dash anywhere in the name takes precedence over underscores.
    separator = '-' if '-' in name else '_'
    return name[:name.index(separator)]

def write_translated_csv(sorted_file):
    """Write the table of a sorted-list entry to a .csv file

        The output is written to the current directory and named
        <animal>_<dpi>_<epitope>.csv, with each part derived from the
        entry's file name.

        Args:
            sorted_file = (file name, table) pair from the list sorted by
                animal

        Returns:
            nothing; the .csv file is written as a side effect
    """
    animal = identify_animal(sorted_file)
    dpi = identify_dpi(sorted_file)
    epitope = identify_epitope(sorted_file)
    filename = "{}_{}_{}.csv".format(animal, dpi, epitope)
    sorted_file[1].to_csv(filename)
    
def write_translated_csv_LOW(sorted_file):
    """Write the table of a low-count (<1000) sorted-list entry to a .csv file

        The output is written to the current directory and named
        <animal>_<dpi>_<epitope>_<replicate>_LOW.csv, with each part derived
        from the entry's file name.

        Args:
            sorted_file = (file name, table) pair from the low-count list
                sorted by animal

        Returns:
            nothing; the .csv file is written as a side effect
    """
    animal = identify_animal(sorted_file)
    dpi = identify_dpi(sorted_file)
    epitope = identify_epitope(sorted_file)
    replicate = identify_replicate(sorted_file)
    filename = "{}_{}_{}_{}_LOW.csv".format(animal, dpi, epitope, replicate)
    sorted_file[1].to_csv(filename)

### Read in the .txt files, remove the empty files, and plot the distribution of number of identified sequences 

In [None]:
# Find every count file in the working directory and classify it by depth.
txtfiles = glob.glob("*_counts.txt")
all_files = remove_empty_files(txtfiles)
# First item of the result: the files that passed the read threshold.
viable_files = all_files[0]

# Fourth item: total reads per file, plotted from largest to smallest.
reads_per_file = sorted(all_files[3], reverse=True)
plt.plot(reads_per_file)
plt.xlabel("Number of Files")
plt.ylabel("Number of Sequences")

In [None]:
# Pair each viable file name with its translated, grouped count table.
translated_files = [(v, translate_and_group(v)) for v in viable_files]

In [None]:
# Animal IDs are taken from every count file found, not only the viable ones.
animals = identify_animals(txtfiles)
# Group the (file name, translated table) pairs into one inner list per animal.
sorted_list = sort_by_animal(translated_files, animals)

print(sorted_list)

In [None]:
# Label every translated table with its sample metadata, add each sequence's
# share of the sample's total count, and write the table to a .csv file.
for animal_files in sorted_list:
    for entry in animal_files:
        table = entry[1]
        table['animal'] = identify_animal(entry)
        table['epitope'] = identify_epitope(entry)
        table['dpi'] = identify_dpi(entry)
        # The replicate label ('rep', via identify_replicate) is not added here.
        counts = table['barcode_count']
        table['freq'] = counts / counts.sum()
        write_translated_csv(entry)

In [None]:
# Files with low sequence counts are kept as well, but written out with a
# "_LOW" suffix so their results can be treated with caution, because low
# coverage may have skewed them.

low_seqs = [(path, translate_and_group(path)) for path in all_files[1]]

sorted_list_low = sort_by_animal(low_seqs, animals)

for animal_files in sorted_list_low:
    for entry in animal_files:
        table = entry[1]
        table['animal'] = identify_animal(entry)
        table['epitope'] = identify_epitope(entry)
        table['dpi'] = identify_dpi(entry)
        # The replicate label ('rep', via identify_replicate) is not added here.
        write_translated_csv_LOW(entry)

In [None]:
# Here we keep only the epitope sequences whose frequency is above 1% of the
# sample total; rarer sequences are left out of the filtered tables.
# NOTE(review): the original note says the rare sequences will be grouped
# into an "other" group - that grouping is not done in this cell.

filtered_list = []
merged_list = []
for s in sorted_list:
    filtered_animal = []
    for t in s:
        df = t[1].loc[t[1]['freq'] > 0.01]
        # Same base name as the input file (text before the first '.').
        filtered_name = t[0][0:t[0].index(".")] + ".filtered.csv"
        filtered_animal.append(df)
        df.to_csv(filtered_name)

    filtered_list.append(filtered_animal)

    # pd.concat raises ValueError on an empty list. An animal can have no
    # tables here, so use an empty frame to keep merged_list aligned with
    # sorted_list.
    if filtered_animal:
        merged_animal = pd.concat(filtered_animal)
    else:
        merged_animal = pd.DataFrame()
    merged_list.append(merged_animal)