# Recap

After inferring split sites on DNA and amino acid sequences, two csv files containing fastq IDs and split sites info are generated for each target protein.

Sub-steps:  
1. Concatenate the 2 CSV files (paired-end reads) together.
2. Remove any records where the forward and reverse reads reported different split/insertion sites.  
3. Deduplicate any records where the forward and reverse reads pointed to the same split/insertion site.
4. Remove sites mapped beyond the permitted transposition windows. 
5. Work out the protein split sites that are missing from the library.
6. Calculate percentage coverage on DNA and amino acid sequence levels.

For sub-steps 4-6, information regarding the DNA and amino acid sequence transposition windows is retrieved from a csv file named `target_protein_info.csv`, stored at the root directory.

This notebook file uses the complete data of the mCherry library and therefore takes a lot of time to run.

# Setup

In [1]:
import pandas as pd
import os

# Function to clean up data (Steps 3.2 - 3.4)

In [2]:
def clean_up_aligned_data(target_protein, all_reads_df,
                          five_prime_trans_border, three_prime_trans_border):
    """Reduce concatenated paired-end alignment records to one in-window record per read.

    Parameters
    ----------
    target_protein
        Name of the target protein. Not used by the filtering itself; kept so
        existing callers do not need to change.
    all_reads_df : pandas.DataFrame
        Records from both read files. Must contain the columns ``fastq_id``,
        ``insertion_orientation`` and ``dna_split_site_middle``.
    five_prime_trans_border, three_prime_trans_border
        Inclusive lower and upper bounds on ``dna_split_site_middle``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``all_reads_df`` (original index labels preserved) after
        dropping reads whose two records disagree, keeping the first record
        of each remaining read, and dropping sites outside the window.

    Raises
    ------
    RuntimeError
        If any ``fastq_id`` occurs more than twice.
    """
    # to check if the split sites of the two records are identical,
    # one only needs to check insertion orientation and dna split site
    compared_cols = ['insertion_orientation', 'dna_split_site_middle']

    # Step 2: Remove reads with multiple insertions
    # Collect every record whose ID occurs more than once. The stable sort keeps
    # the original row order within an ID, so after sorting the records of each
    # read sit next to each other.
    has_dup_id = all_reads_df['fastq_id'].duplicated(keep=False)
    data_with_dup_IDs = all_reads_df[has_dup_id].sort_values('fastq_id', kind='mergesort')

    # Each read may contribute at most one record per read file.
    id_counts = data_with_dup_IDs['fastq_id'].value_counts(dropna=False)
    overrepresented = id_counts[id_counts != 2]
    if len(overrepresented) > 0:
        examples = ", ".join(str(i) for i in overrepresented.index[:10])
        raise RuntimeError("More than two duplicated IDs: " + examples)

    # With exactly two records per ID, even positions hold the first record of
    # each pair and odd positions the second, so the pairs can be compared
    # column-wise in one pass instead of querying the table once per ID.
    first_records = data_with_dup_IDs.iloc[0::2]
    second_records = data_with_dup_IDs.iloc[1::2]
    differs = False
    for col in compared_cols:
        differs = differs | (first_records[col].to_numpy() != second_records[col].to_numpy())
    ids_with_multiple_insertion = first_records.loc[differs, 'fastq_id']

    # Keep only the records of reads that do not report conflicting insertions
    conflicting = all_reads_df['fastq_id'].isin(ids_with_multiple_insertion)
    data_single_insertions = all_reads_df[~conflicting]

    # Step 3: Remove reads with same IDs and same insertions
    # Remaining records still contain duplicated reads from paired ends that point to the same split site
    data_deduplicated = data_single_insertions.drop_duplicates(subset=['fastq_id'], keep='first')

    # Step 4: Remove reads out of transposition windows (both borders inclusive)
    sites = data_deduplicated['dna_split_site_middle']
    in_window = (sites >= five_prime_trans_border) & (sites <= three_prime_trans_border)
    data_final = data_deduplicated[in_window]

    return data_final

# Read target protein information from csv file

In [3]:
# Load the per-protein metadata table; the first CSV column becomes the DataFrame index.
# Later cells select a row by `target_protein` and read the transposition-window border columns from it.
all_target_protein_info = pd.read_csv("target_protein_info.csv", index_col=0)

# Steps 3.1 - 3.4

In [4]:
library_n_IDs = [
    "1_mCherry" # Expand this list or use fnmatch for batch processing
]

# Create the output directory up front so the export cannot fail on a missing
# folder after the (slow) clean-up has already run.
os.makedirs("results_per_target_protein", exist_ok=True)

for library_n_ID in library_n_IDs:

    # The second "_"-separated field of the library ID is the target protein name
    target_protein = library_n_ID.split("_")[1]

    # Get target protein info from imported csv file
    target_protein_info_row = all_target_protein_info.query('target_protein==@target_protein')
    if target_protein_info_row.empty:
        raise ValueError("No entry for target protein '" + target_protein
                         + "' in target_protein_info.csv")
    # Only the DNA-level window borders are needed for the clean-up step
    five_prime_trans_border = target_protein_info_row.iloc[0]['five_prime_trans_border']
    three_prime_trans_border = target_protein_info_row.iloc[0]['three_prime_trans_border']

    # Read data: one CSV per read file, suffixed _1 and _2
    base_filename = "IBM_NGS_aligned_reads_" + library_n_ID
    csv_filenames = [base_filename + "_" + str(i+1) + ".csv" for i in range(2)]
    aligned_reads_dfs = [pd.read_csv(os.path.join("results_per_fastq", filename)) for filename in csv_filenames]
    all_reads_df = pd.concat(aligned_reads_dfs, ignore_index=True, verify_integrity=True)

    # Clean up data
    cleaned_up_data = clean_up_aligned_data(target_protein, all_reads_df,
                          five_prime_trans_border, three_prime_trans_border)
    output_filename = "IBM_NGS_cleaned_up_reads_" + library_n_ID +  ".csv"
    output_filepath = os.path.join("results_per_target_protein", output_filename)
    cleaned_up_data.to_csv(output_filepath)

# Steps 3.5 - 3.6

In [5]:
def convert_split_site_name(list_of_sites):
    """Render half-integer split sites as a readable "N/N+1, " sequence.

    Each site ``x`` becomes ``"<int(x-0.5)>/<int(x+0.5)>"``; every entry,
    including the last one, is followed by ``", "``. An empty input gives
    an empty string.
    """
    return "".join(
        f"{int(site - 0.5)}/{int(site + 0.5)}, "
        for site in list_of_sites
    )

In [6]:
for library_n_ID in library_n_IDs:

    # Reimport exported data
    target_protein = library_n_ID.split("_")[1]
    output_filename = "IBM_NGS_cleaned_up_reads_" + library_n_ID +  ".csv"
    output_filepath = os.path.join("results_per_target_protein", output_filename)
    cleaned_up_data = pd.read_csv(output_filepath)

    # Get target protein info from imported csv file
    target_protein_info_row = all_target_protein_info.query('target_protein==@target_protein')
    five_prime_trans_border = target_protein_info_row.iloc[0]['five_prime_trans_border']
    three_prime_trans_border = target_protein_info_row.iloc[0]['three_prime_trans_border']
    n_trans_border = target_protein_info_row.iloc[0]['n_trans_border']
    c_trans_border = target_protein_info_row.iloc[0]['c_trans_border']

    # Step 5 - 6: Find missing insertions in protein sequence and calculate aa sequence coverage
    productive_data = cleaned_up_data.query('productive_insertion==True')['aa_split_site_middle']
    aa_sites_hitted = productive_data.drop_duplicates().sort_values().to_list()
    # Set lookup keeps the missing-site scan linear in the number of sites
    aa_sites_hitted_lookup = set(aa_sites_hitted)
    # Every half-integer site between the N- and C-terminal borders, inclusive
    theoretical_all_aa_sites = [i+0.5 for i in range(int(n_trans_border-0.5), int(c_trans_border+0.5))]
    missing_aa_sites = [site for site in theoretical_all_aa_sites if site not in aa_sites_hitted_lookup]
    aa_seq_coverage = round((len(theoretical_all_aa_sites)-len(missing_aa_sites))/len(theoretical_all_aa_sites), 3)

    # Step 6: Calculate DNA sequence coverage, for both FW and RV orientations instead of either one value for each strand
    # A site counts once per orientation, hence the factor 2 in the theoretical total
    num_of_DNA_sites_hitted = sum(
        len(cleaned_up_data.loc[cleaned_up_data['insertion_orientation'] == orientation,
                                'dna_split_site_middle'].drop_duplicates())
        for orientation in ("FW", "RV")
    )
    num_of_theoretical_all_dna_sites = 2 * int(three_prime_trans_border - five_prime_trans_border + 1)
    dna_seq_coverage = round(num_of_DNA_sites_hitted / num_of_theoretical_all_dna_sites, 3)

    # Write result summary to text file
    summary_lines = [
        "-----------------------------------------",
        "target protein = " + target_protein,
        "aligned read counts = " +  str(len(cleaned_up_data)),
        "possible DNA insertion sites count =  " + str(num_of_theoretical_all_dna_sites),
        "missing DNA insertion sites count =  " + str(num_of_theoretical_all_dna_sites - num_of_DNA_sites_hitted),
        "DNA sequence insertion coverage =  " + str(dna_seq_coverage),
        "possible amino acid split/insertion sites count =  " + str(len(theoretical_all_aa_sites)),
        "missing aa split/insertion sites count= " + str(len(missing_aa_sites)),
        "missing aa split/insertion sites = " + convert_split_site_name(missing_aa_sites),
        "aa sequence coverage =  " + str(aa_seq_coverage),
    ]
    # Append mode: results accumulate across libraries and across reruns.
    # The context manager closes the file even if a write fails.
    with open("results_per_target_protein/NGS_all_aligned_results_summary.txt", "a+") as f:
        f.write("\n".join(summary_lines) + "\n")

# Inspect output

In [7]:
# Print the accumulated summary file; the context manager closes the handle
# even if reading or printing raises.
with open("results_per_target_protein/NGS_all_aligned_results_summary.txt", 'r') as f:
    print(f.read())

-----------------------------------------
target protein = mCherry
aligned read counts = 1012384
possible DNA insertion sites count =  1138
missing DNA insertion sites count =  125
DNA sequence insertion coverage =  0.89
possible amino acid split/insertion sites count =  189
missing aa split/insertion sites count= 11
missing aa split/insertion sites = 30/31, 36/37, 40/41, 51/52, 78/79, 127/128, 130/131, 166/167, 170/171, 193/194, 216/217, 
aa sequence coverage =  0.942

