This notebook analyzes the sequence and RNA secondary-structure contexts of the final off-target hits in a group of samples. It also analyzes guide alignment proximal to the off-target hits.

In [1]:
#####################
# import statements #
#####################

import os
import pandas as pd
import numpy as np
import tqdm as tqdm
from Bio.Seq import Seq
from Bio import Align
import pysam
import math
from functools import reduce
import RNA
import warnings

In [None]:
##########################
# User-Defined Variables #
##########################
# - define all variables below with paths to the required files
# - this should be the only cell that requires modification

# full path to ALTER-code/5_tutorial-workflows/1_degs-off-tgts
# (sample-map.tsv, init-processing/ and salmon-results/ are resolved relative to this directory)
proj_dir = ''
# full path to reference genome gtfgz
# (opened with pysam.TabixFile further down; presumably needs to be bgzip-compressed with a
#  tabix index next to it - TODO confirm)
gtfgz_path = ''
# full path to reference genome fastagz (opened with pysam.Fastafile further down)
genome_fastagz_path = ''
# full path to reference genome transcript fastagz
transcript_fastagz_path = ''


# guide sequence 5'>3' as a string, the g1 base is removed as it does not pair to the target
guide_seq = 'AGAAGACGGACGGCGGCAUU'

# number of replicates; only used here to build the '<replicates>-reps' results directory name
replicates = 3

# value of the sample-map 'condition' column that is skipped when off-target tables are loaded
wt_condition = '01_transfection.control'
# number of bases taken on each side of an off-target position for the sequence context,
# i.e. every context window is (2 * seq_offset) + 1 bases long
seq_offset = 10
# NOTE(review): not used in the cells shown here; presumably the flank size for the guide
# alignment analysis later in the notebook - confirm
alignment_offset = 25

# NOTE(review): presumably the (ref, alt) bases of the intended edit - confirm against its use
target_snp = ('C','T')

In [3]:
# ---- input locations, all resolved relative to proj_dir ----
sample_map_path = os.path.join(proj_dir, 'sample-map.tsv')
salmon_dir = os.path.join(proj_dir, 'salmon-results')
trans_raw_path = os.path.join(salmon_dir, 'combined-results', 'transcript-raw.tsv')
proc_dir = os.path.join(proj_dir, 'init-processing')
off_tgt_dir = os.path.join(proc_dir, 'off-tgt-analysis', f'{replicates}-reps')

# ---- output location for this notebook ----
out_dir = os.path.join(off_tgt_dir, 'seq-analysis')
os.makedirs(out_dir, exist_ok=True)

# ---- sample map: one row per sample with its condition and replicate number ----
sample_map_df = pd.read_csv(sample_map_path, sep='\t')
biocondition_set = sample_map_df['condition'].unique()

# ---- one off-target table per condition; the control condition has none ----
off_tgt_df_dict = {}
for biocondition in biocondition_set:
    if biocondition == wt_condition:
        continue

    off_tgt_condition_dir = os.path.join(off_tgt_dir, biocondition)
    off_tgt_path = os.path.join(off_tgt_condition_dir, f'{biocondition}-var_VOI_DP_GQ_non-wt.tsv.gz')
    off_tgt_df_dict[biocondition] = pd.read_csv(off_tgt_path, sep='\t', index_col='Unnamed: 0')

    print(f'{biocondition}:\t{len(off_tgt_df_dict[biocondition])} Off Targets')

# ---- reference handles and transcript-level counts ----
genome_fasta = pysam.Fastafile(genome_fastagz_path)
gtfgz = pysam.TabixFile(gtfgz_path)
trans_raw_df = pd.read_csv(trans_raw_path, sep='\t', low_memory=False)

display(sample_map_df)

02_ALTER.4.PPIB.14L8:	380 Off Targets


Unnamed: 0,sample,condition,rep
0,R9761,01_transfection.control,1
1,R9762,01_transfection.control,2
2,R9763,01_transfection.control,3
3,R9764,02_ALTER.4.PPIB.14L8,1
4,R9765,02_ALTER.4.PPIB.14L8,2
5,R9766,02_ALTER.4.PPIB.14L8,3


In [4]:
# for every off-target table, add one '<condition>_mean_pct_snp' column per condition in the
# sample map: the row-wise sum of that condition's '<sample>_pct_snp' columns divided by the
# number of those columns
for off_tgt_biocondition, off_tgt_df in off_tgt_df_dict.items():
    mean_pct_snp_col_list = []

    for biocondition in sample_map_df['condition'].unique():
        # per-sample pct_snp columns that belong to this condition
        search_mask = sample_map_df['condition'] == biocondition
        biocondition_samples_list = list(sample_map_df.loc[search_mask, 'sample'])
        pct_snp_col_list = [f'{sample_name}_pct_snp' for sample_name in biocondition_samples_list]

        mean_col_name = f'{biocondition}_mean_pct_snp'
        replicate_sum = off_tgt_df[pct_snp_col_list].sum(axis=1)
        off_tgt_df[mean_col_name] = replicate_sum/len(pct_snp_col_list)
        mean_pct_snp_col_list.append(mean_col_name)

    print(f'{off_tgt_biocondition}:\t{len(off_tgt_df_dict[off_tgt_biocondition])} Off Targets')
    display_cols = ['chrom', 'pos', 'ref', 'alt', 'exon_id'] + mean_pct_snp_col_list
    display(off_tgt_df[display_cols])

    off_tgt_df_dict[off_tgt_biocondition] = off_tgt_df

02_ALTER.4.PPIB.14L8:	380 Off Targets


Unnamed: 0,chrom,pos,ref,alt,exon_id,01_transfection.control_mean_pct_snp,02_ALTER.4.PPIB.14L8_mean_pct_snp
1081,chr1,2229554,C,T,ENSE00001477855.2,0.0,13.303333
1819,chr1,8865298,C,T,"ENSE00003576214.1,ENSE00003490109.1,ENSE000034...",0.0,19.830000
1969,chr1,9604274,C,T,ENSE00001371282.5,0.0,12.580000
2722,chr1,15734167,C,T,"ENSE00004034602.1,ENSE00004034602.1,ENSE000040...",0.0,26.940000
3378,chr1,17044825,C,T,"ENSE00001846268.1,ENSE00003728285.1,ENSE000040...",0.0,19.800000
...,...,...,...,...,...,...,...
241323,chrX,106986043,C,T,"ENSE00001680547.1,ENSE00001680547.1",0.0,13.126667
241639,chrX,119468602,C,T,"ENSE00004257176.1,ENSE00001257624.6",0.0,17.080000
242106,chrX,130065235,C,T,"ENSE00001843723.2,ENSE00001870045.1",0.0,19.843333
242578,chrX,136206573,C,T,"ENSE00003577269.1,ENSE00003577269.1,ENSE000035...",0.0,23.226667


In [5]:
# build exon_map df
# index: exon_id; columns: exon_id, exon_number, exon_chrom, exon_start, exon_end, transcript_id, gene_id, gene_name, strand
# exons are numbered 1-n 5'-3' for the transcript

def pull_gtf_info_from_rec(gtfgz_rec, tgt_info):
    ###################################################################################################
    # Purpose: pull the value of one attribute from the attribute column of a gtf feature
    # Inputs: 1. gtfgz_rec - a row of a gtf file, already split into its tab separated fields;
    #                        field 8 holds the ';' separated 'key value' attributes
    #         2. tgt_info - the name (key) of the target attribute
    # Output: the value of the first attribute whose key is exactly tgt_info, without surrounding
    #         quotes or whitespace; '' (after printing a warning) if there is no such attribute
    # Notes: - the key is matched as a whole attribute name at the start of the column or right
    #          after a ';', so asking for 'gene_id' can no longer pick up e.g. 'ref_gene_id'
    #        - quoted (gene_id "X";) and unquoted (exon_number 1;) values are both accepted for
    #          every key, so a quoted exon_number is returned without its quotes
    ###################################################################################################
    import re  # local import: `re` is not among the notebook's top-level imports

    gtfgz_info = gtfgz_rec[8].strip()

    # <start or ';'> <key> <whitespace> then either "quoted value" or a bare token
    attr_pattern = r'(?:^|;)\s*' + re.escape(tgt_info) + r'\s+(?:"([^"]*)"|([^;\s]+))'
    attr_match = re.search(attr_pattern, gtfgz_info)

    if attr_match is None:
        print(f'\n[WARNING] {tgt_info} not found in:\n{gtfgz_rec}\n')
        return ''

    quoted_value, bare_value = attr_match.groups()
    return (quoted_value if quoted_value is not None else bare_value).strip()


def build_exon_id_map(gtfgz_path):
    ###################################################################################################
    # Purpose: build a dataframe with one row per 'exon' feature of the reference gtf
    #          index: exon_id;
    #          columns: exon_id, exon_number, exon_chrom, exon_start, exon_end, transcript_id,
    #                   gene_id, gene_name, strand
    # Inputs: 1. gtfgz_path - path of the reference gtf file; it is opened with pysam.TabixFile
    # Output: the exon id map dataframe (exon_id is kept both as the index and as a column)
    # Raises: ValueError if an exon record has a missing or non-integer exon_number, start or end
    # Notes: - the file named by gtfgz_path is opened (and closed) here, the function does not
    #          depend on a handle defined elsewhere in the notebook
    #        - lines with fewer than 9 tab separated fields are skipped
    ###################################################################################################
    column_order = [
        'exon_id', 'exon_number', 'exon_chrom', 'exon_start', 'exon_end',
        'transcript_id', 'gene_id', 'gene_name', 'strand',
    ]
    exon_records = []

    gtf_handle = pysam.TabixFile(gtfgz_path)
    try:
        for gtfgz_line in gtf_handle.fetch():
            gtfgz_rec = gtfgz_line.strip().split('\t')
            if len(gtfgz_rec) < 9 or gtfgz_rec[2] != 'exon':
                continue

            exon_records.append({
                'exon_id':       pull_gtf_info_from_rec(gtfgz_rec=gtfgz_rec, tgt_info='exon_id'),
                'exon_chrom':    gtfgz_rec[0].strip(),
                'exon_start':    int(gtfgz_rec[3].strip()),
                'exon_end':      int(gtfgz_rec[4].strip()),
                'exon_number':   int(pull_gtf_info_from_rec(gtfgz_rec=gtfgz_rec, tgt_info='exon_number')),
                'transcript_id': pull_gtf_info_from_rec(gtfgz_rec=gtfgz_rec, tgt_info='transcript_id'),
                'gene_id':       pull_gtf_info_from_rec(gtfgz_rec=gtfgz_rec, tgt_info='gene_id'),
                'gene_name':     pull_gtf_info_from_rec(gtfgz_rec=gtfgz_rec, tgt_info='gene_name'),
                'strand':        gtfgz_rec[6].strip(),
            })
    finally:
        gtf_handle.close()

    # columns= fixes the column order and keeps the expected columns even when no exon was found
    map_df = pd.DataFrame(exon_records, columns=column_order)
    map_df = map_df.set_index(map_df['exon_id'])
    return map_df
            
# build the exon lookup table from the reference gtf and show it;
# the sequence context functions further down take it as their exon_map argument
exon_map = build_exon_id_map(gtfgz_path=gtfgz_path)
display(exon_map)

Unnamed: 0_level_0,exon_id,exon_number,exon_chrom,exon_start,exon_end,transcript_id,gene_id,gene_name,strand
exon_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSE00004248723.1,ENSE00004248723.1,1,chr1,11121,11211,ENST00000832824.1,ENSG00000290825.2,DDX11L16,+
ENSE00004248721.1,ENSE00004248721.1,1,chr1,11125,11211,ENST00000832825.1,ENSG00000290825.2,DDX11L16,+
ENSE00004248726.1,ENSE00004248726.1,1,chr1,11410,11671,ENST00000832826.1,ENSG00000290825.2,DDX11L16,+
ENSE00004248710.1,ENSE00004248710.1,1,chr1,11411,11671,ENST00000832827.1,ENSG00000290825.2,DDX11L16,+
ENSE00004248702.1,ENSE00004248702.1,1,chr1,11426,11671,ENST00000832828.1,ENSG00000290825.2,DDX11L16,+
...,...,...,...,...,...,...,...,...,...
ENSE00004096553.1,ENSE00004096553.1,2,KI270753.1,44107,44491,ENST00000751368.1,ENSG00000297844.1,ENSG00000297844,+
ENSE00004262525.1,ENSE00004262525.1,1,KI270755.1,26350,26743,ENST00000839853.1,ENSG00000309258.1,ENSG00000309258,+
ENSE00004262526.1,ENSE00004262526.1,2,KI270755.1,27337,27723,ENST00000839853.1,ENSG00000309258.1,ENSG00000309258,+
pRFL382_ALTER-4-PPIB-14L8.1,pRFL382_ALTER-4-PPIB-14L8.1,1,pRFL382_ALTER-4-PPIB-14L8,1281,5078,pRFL382_ALTER-4-PPIB-14L8.1,pRFL382_ALTER-4-PPIB-14L8,pRFL382_ALTER-4-PPIB-14L8,-


In [6]:
# add a frequency metric (transcript_reads/gene_reads, summed over all samples)
# to trans_raw_df that can be used to find the most common transcript of a gene

# every column whose name contains '_reads' is treated as a per-sample read count
read_cols = [col_name for col_name in trans_raw_df.columns if '_reads' in col_name]

# reads per gene: first summed over the gene's transcripts, then over all samples
total_gene_reads_df = trans_raw_df[['gene_id'] + read_cols].groupby(['gene_id']).sum()
total_gene_reads_df['total_gene_reads'] = total_gene_reads_df[read_cols].sum(axis=1)

# reads per transcript over all samples, and the total of the gene each transcript belongs to
total_trans_reads = trans_raw_df[read_cols].sum(axis=1)
total_gene_reads = trans_raw_df['gene_id'].apply(
    lambda gene_id: total_gene_reads_df.loc[gene_id, 'total_gene_reads']
)

# genes without any reads give 0/0, i.e. NaN
trans_raw_df['transcript_freq'] = total_trans_reads.div(total_gene_reads)
trans_raw_df = trans_raw_df.sort_values(by=['transcript_freq'], ascending=False)

display(trans_raw_df)

Unnamed: 0,transcript_id,R9761_tpm,R9761_reads,R9762_tpm,R9762_reads,R9763_tpm,R9763_reads,R9764_tpm,R9764_reads,R9765_tpm,R9765_reads,R9766_tpm,R9766_reads,gene_id,gene_name,transcript_freq
386587,pRFL462_transfection-control.1,42119.170383,442174.441,43658.462120,467343.028,37540.145506,388934.815,0.000000,0.000,0.000000,0.000,0.000000,0.000,pRFL462_transfection-control,pRFL462_transfection-control,1.0
156319,ENST00000575018.1,2.538206,2.000,3.644525,3.000,0.000000,0.000,1.095934,1.001,0.000000,0.000,0.000000,0.000,ENSG00000262519.1,TXNP4,1.0
156156,ENST00000574681.1,1.495503,16.000,1.373664,15.000,1.512829,16.000,1.407754,17.000,0.888295,12.000,0.483467,6.000,ENSG00000262703.1,ENSG00000262703,1.0
156168,ENST00000574705.1,1.588380,34.983,1.492548,32.628,2.281548,48.076,1.967395,46.749,1.451980,38.648,2.802039,66.941,ENSG00000262712.1,ENSG00000262712,1.0
156172,ENST00000574716.1,0.204353,4.001,0.499214,10.000,0.257340,5.003,0.045745,1.002,0.042334,1.032,0.088270,2.000,ENSG00000263096.1,ENSG00000263096,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386527,ENST00000850887.1,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,ENSG00000310561.1,ENSG00000310561,
386528,ENST00000850888.1,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,ENSG00000300293.2,ENSG00000300293,
386529,ENST00000850889.1,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,ENSG00000310562.1,ENSG00000310562,
386535,ENST00000850897.1,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,0.000000,0.000,ENSG00000310564.1,ENSG00000310564,


In [7]:
# determine the most transcribed transcript at each variant location and store the result
# in a new 'most_transcribed' column of off_tgt_df

def get_most_transcribed(off_tgt_row, trans_raw_df):
    ###################################################################################################
    # Purpose: in off_tgt_df, gene, transcript, and exon IDs are stored as matched comma separated
    #          lists of all transcripts that the location could be a part of. This function
    #          rebuilds the matched (gene, transcript, exon) tuples and returns the one whose
    #          transcript has the highest transcript_freq
    # Inputs: 1. off_tgt_row - subset of an off_tgt_df row passed with pd.apply(axis=1), needs the
    #                          'gene_id', 'transcript_id' and 'exon_id' entries
    #         2. trans_raw_df - transcript raw counts df indexed by transcript_id with a
    #                           'transcript_freq' column
    # Output: string 'gene_id,transcript_id,exon_id' for the most transcribed hit at the location,
    #         '' if the row has no exon_id
    # Notes: - duplicates are removed in first-seen order and ties go to the tuple listed first, so
    #          the result is the same on every run (a set() would order the tuples by string hash)
    #        - transcripts missing from trans_raw_df rank lowest (-1); a NaN frequency (gene with
    #          no reads at all) is ranked as 0 so that it can never beat a real frequency
    ###################################################################################################
    if pd.isna(off_tgt_row['exon_id']):
        return ''

    gene_id_list = off_tgt_row['gene_id'].split(',')
    transc_id_list = off_tgt_row['transcript_id'].split(',')
    exon_id_list = off_tgt_row['exon_id'].split(',')

    # positional matching, driven by the gene list
    id_tuple_list = [
        (gene_id_list[idx], transc_id_list[idx], exon_id_list[idx])
        for idx in range(len(gene_id_list))
    ]
    # dict.fromkeys drops duplicates but keeps the original order
    id_tuple_list = list(dict.fromkeys(id_tuple_list))

    freq_list = []
    for _gene_id, transc_id, _exon_id in id_tuple_list:
        try:
            transc_freq = trans_raw_df.loc[transc_id, 'transcript_freq']
        except KeyError:
            transc_freq = -1
        else:
            if pd.isna(transc_freq):
                transc_freq = 0.0
        freq_list.append(transc_freq)

    # list.index returns the first position of the maximum -> ties go to the first listed tuple
    most_freq_idx = freq_list.index(max(freq_list))

    return ','.join(id_tuple_list[most_freq_idx])

for biocondition, off_tgt_df in off_tgt_df_dict.items():
    # only the three matched id columns are handed to the row function
    matched_id_cols = off_tgt_df[['gene_id', 'transcript_id', 'exon_id']]
    # transcript-indexed copy so frequencies can be looked up by transcript id
    freq_by_transcript = trans_raw_df.set_index('transcript_id')

    off_tgt_df['most_transcribed'] = matched_id_cols.apply(
        get_most_transcribed, axis=1, trans_raw_df=freq_by_transcript
    )

    print(f'{biocondition}:\t{len(off_tgt_df_dict[biocondition])} Off Targets')
    off_tgt_df_dict[biocondition] = off_tgt_df
    display(off_tgt_df[['chrom', 'pos', 'ref', 'alt', 'most_transcribed']])

02_ALTER.4.PPIB.14L8:	380 Off Targets


Unnamed: 0,chrom,pos,ref,alt,most_transcribed
1081,chr1,2229554,C,T,"ENSG00000157933.11,ENST00000378536.5,ENSE00001..."
1819,chr1,8865298,C,T,"ENSG00000074800.16,ENST00000234590.10,ENSE0000..."
1969,chr1,9604274,C,T,"ENSG00000188807.13,ENST00000340305.9,ENSE00001..."
2722,chr1,15734167,C,T,"ENSG00000116786.15,ENST00000375793.3,ENSE00004..."
3378,chr1,17044825,C,T,"ENSG00000117118.12,ENST00000375499.8,ENSE00003..."
...,...,...,...,...,...
241323,chrX,106986043,C,T,"ENSG00000133131.15,ENST00000355610.9,ENSE00001..."
241639,chrX,119468602,C,T,"ENSG00000005022.6,ENST00000317881.9,ENSE000012..."
242106,chrX,130065235,C,T,"ENSG00000102034.18,ENST00000335997.11,ENSE0000..."
242578,chrX,136206573,C,T,"ENSG00000022267.19,ENST00000651929.2,ENSE00003..."


In [8]:
# determine the sequence context for each off-tgt hit in a window size determined by the offset
# variable set at the beginning of the notebook

def get_exon_seq(transcript_id, exon_num, exon_map, genome_fasta):
    ###################################################################################################
    # Purpose: look up one exon of a transcript by its number and return its genomic start, genomic
    #          end and sequence
    # Inputs: 1. transcript_id - target transcript
    #         2. exon_num - exon number within the specified transcript
    #         3. exon_map - dataframe indexed by exon_ids and holding exon information
    #         4. genome_fasta - reference genome fasta imported with pysam.Fastafile()
    # Output: 1. exon_start - genomic location of the exon start, -infinity if the transcript has no
    #                         exon with that number
    #         2. exon_end - genomic location of the exon end, infinity if the transcript has no exon
    #                       with that number
    #         3. exon_seq - exon sequence, reverse complemented for '-' strand exons; '' if the
    #                       transcript has no exon with that number
    ###################################################################################################
    not_found = (-(np.inf), np.inf, '')

    # exon numbers start at 1, anything lower is before the transcript beginning
    if exon_num < 1:
        return not_found

    is_wanted_exon = (exon_map['transcript_id'] == transcript_id) & (exon_map['exon_number'] == exon_num)
    candidate_rows = exon_map[is_wanted_exon]

    # no row: the requested number is past the last exon of the transcript
    if candidate_rows.empty:
        return not_found

    exon_row = candidate_rows.loc[candidate_rows.index[0]]
    chrom_name = str(exon_row['exon_chrom'])
    first_base = int(exon_row['exon_start'])
    last_base = int(exon_row['exon_end'])

    # gtf coordinates are 1-based and inclusive, fetch is 0-based with an exclusive end
    exon_seq = genome_fasta.fetch(chrom_name, first_base - 1, last_base)
    if str(exon_row['strand']) == '-':
        exon_seq = str(Seq(exon_seq).reverse_complement())

    return first_base, last_base, exon_seq

def _collect_neighbor_bases(transcript_id, exon_num, step, n_bases, exon_map, genome_fasta):
    # helper for get_seq_context: gather n_bases of transcript-oriented sequence from the exons next
    # to exon_num, walking towards the 5' end (step=-1) or the 3' end (step=+1) of the transcript;
    # the result is padded with '-' on the outer side once the transcript runs out of exons
    if n_bases <= 0:
        return ''

    gathered = ''
    neighbor_num = exon_num
    while len(gathered) < n_bases:
        neighbor_num += step
        neighbor_start, _neighbor_end, neighbor_seq = get_exon_seq(
            transcript_id=transcript_id, exon_num=neighbor_num,
            exon_map=exon_map, genome_fasta=genome_fasta
        )
        # get_exon_seq reports a non-existent exon with infinite coordinates
        if np.isinf(neighbor_start):
            break
        gathered = (neighbor_seq + gathered) if step < 0 else (gathered + neighbor_seq)

    if step < 0:
        # keep the bases closest to the target exon (the end of the upstream sequence)
        return gathered[-n_bases:].rjust(n_bases, '-')
    # keep the bases closest to the target exon (the start of the downstream sequence)
    return gathered[:n_bases].ljust(n_bases, '-')


def get_seq_context(genome_fasta, exon_map, chrom, pos, exon_id, transcript_id, offset):
    ###################################################################################################
    # Purpose: get the spliced sequence context of a specified position in a window size set by the
    #          offset variable
    # Inputs: 1. genome_fasta - reference genome fasta imported with pysam.Fastafile()
    #         2. exon_map - dataframe indexed by exon_ids and holding exon information
    #         3. chrom - chromosome the target location is on
    #         4. pos - chromosomal position of the target location (1-based)
    #         5. exon_id - id of the target exon
    #         6. transcript_id - id of the target transcript
    #         7. offset - the number of nucleotides to each side of the location to be returned
    # Output: a string of the desired sequence context in transcript orientation (reverse
    #         complemented for '-' strand exons); bases that fall outside the target exon are taken
    #         from the neighboring exons of the transcript, and if the window exceeds the transcript
    #         it is padded with '-'
    # Notes: the window counts as inside the exon only if its last base (pos + offset) is <= exon_end;
    #        a window ending one base past the exon now gets that base from the next exon instead of
    #        from the intron
    ###################################################################################################

    # window in 1-based inclusive genomic coordinates
    win_first = pos - offset
    win_last = pos + offset

    search_mask = exon_map['transcript_id'] == transcript_id
    exon_map_row = exon_map[search_mask].loc[exon_id]

    exon_range = (exon_map_row['exon_start'], exon_map_row['exon_end'])
    strand = exon_map_row['strand']

    if (win_first >= exon_range[0]) and (win_last <= exon_range[1]):
        # whole window inside the exon
        # genome indices are index 1 but fetch is index 0 with an exclusive end
        tgt_seq = genome_fasta.fetch(chrom, win_first - 1, win_last)
        if strand == '-':
            tgt_seq = str(Seq(tgt_seq).reverse_complement())
        return tgt_seq

    exon_num = exon_map_row['exon_number']

    # part of the window that lies inside the target exon
    init_tgt_seq = genome_fasta.fetch(
        chrom,
        max(win_first, exon_range[0]) - 1,
        min(win_last, exon_range[1]),
    ) # genome indices are index 1 but fetch is index 0
    if strand == '-':
        init_tgt_seq = str(Seq(init_tgt_seq).reverse_complement())

    # number of window bases beyond the exon on the low / high genomic coordinate side
    low_side_overflow = max(exon_range[0] - win_first, 0)
    high_side_overflow = max(win_last - exon_range[1], 0)

    # exons are numbered 5'-3' along the transcript, so on the '-' strand the previous exon
    # lies on the high coordinate side
    if strand == '-':
        bases_from_prev, bases_from_next = high_side_overflow, low_side_overflow
    else:
        bases_from_prev, bases_from_next = low_side_overflow, high_side_overflow

    addn_from_prev = _collect_neighbor_bases(
        transcript_id=transcript_id, exon_num=exon_num, step=-1, n_bases=bases_from_prev,
        exon_map=exon_map, genome_fasta=genome_fasta
    )
    addn_from_next = _collect_neighbor_bases(
        transcript_id=transcript_id, exon_num=exon_num, step=1, n_bases=bases_from_next,
        exon_map=exon_map, genome_fasta=genome_fasta
    )

    tgt_seq = addn_from_prev + init_tgt_seq + addn_from_next

    if len(tgt_seq) != (1 + offset + offset):
        print(f'\n[WARNING] wrong length, {len(tgt_seq)} bases')
        print(f'{chrom}\t{pos}\t{tgt_seq}\t{exon_id}\t{exon_num}\t{exon_range}\t{transcript_id}')
        print(tgt_seq)
    return tgt_seq

def get_row_seq_context(off_tgt_row, genome_fasta, exon_map, offset):
    ###################################################################################################
    # Purpose: row-wise wrapper around get_seq_context for use with pd.apply(axis=1)
    # Inputs: 1. off_tgt_row - subset of an off_tgt_df row, needs the 'chrom', 'pos' and
    #                          'most_transcribed' ('gene_id,transcript_id,exon_id') entries
    #         2. genome_fasta - reference genome fasta imported with pysam.Fastafile()
    #         3. exon_map - dataframe indexed by exon_ids and holding exon information
    #         4. offset - the number of nucleotides to each side of the location to be returned
    # Output: the sequence context string returned by get_seq_context, or '' for rows whose
    #         'most_transcribed' entry is ''
    ###################################################################################################
    most_transcribed = off_tgt_row['most_transcribed']

    # no transcript assigned to this location -> no context
    if most_transcribed == '':
        return ''

    # most_transcribed is 'gene_id,transcript_id,exon_id'; the gene id is not needed here
    id_fields = most_transcribed.split(',')

    return get_seq_context(
        genome_fasta=genome_fasta,
        exon_map=exon_map,
        chrom=off_tgt_row['chrom'],
        pos=off_tgt_row['pos'],
        exon_id=id_fields[2],
        transcript_id=id_fields[1],
        offset=offset,
    )


for biocondition, off_tgt_df in off_tgt_df_dict.items():
    # columns handed to the row function
    context_inputs = off_tgt_df[['chrom', 'pos', 'strand', 'most_transcribed']]

    off_tgt_df['seq_context'] = context_inputs.apply(
        get_row_seq_context,
        axis=1,
        genome_fasta=genome_fasta,
        exon_map=exon_map,
        offset=seq_offset,
    )
    off_tgt_df_dict[biocondition] = off_tgt_df

In [9]:
# divide the sequence contexts into character lists and store in a dataframe (one column per window position)
# calculate the lowest mfe secondary structure of each context sequence and do the same with the dot-bracket notation
# result: idx_df_dict[biocondition] = [seq_df, struct_df, no_idx_struct_df]

idx_df_dict = {}
# one column per position of the context window, which is (2 * seq_offset) + 1 bases long
idx_col_list = [f'idx_{i}' for i in range((2*seq_offset) + 1)]
info_col_list = ['chrom', 'pos', 'ref', 'alt']

for biocondition, off_tgt_df in off_tgt_df_dict.items():

    # NOTE(review): mean_pct_snp_col_list is not defined in this cell; it is whatever the earlier
    # mean pct_snp loop left behind on its last iteration, so that cell has to be run first
    seq_df           = off_tgt_df[info_col_list + mean_pct_snp_col_list].copy()
    struct_df        = off_tgt_df[info_col_list + mean_pct_snp_col_list].copy()
    no_idx_struct_df = off_tgt_df[info_col_list + mean_pct_snp_col_list].copy()

    # one character of seq_context per idx_* column
    # NOTE(review): a seq_context that is '' or not exactly len(idx_col_list) characters long would
    # not expand to the expected number of columns - verify such rows cannot reach this cell
    seq_df[idx_col_list] = off_tgt_df.apply(lambda x: list(x['seq_context']), axis=1, result_type='expand')
    
    no_idx_struct_df['seq_context'] = off_tgt_df['seq_context']
    # element 0 of the RNA.fold result is kept as the structure string
    # (presumably ViennaRNA returns (dot-bracket structure, mfe) - confirm)
    no_idx_struct_df['struct_context'] = no_idx_struct_df['seq_context'].apply(lambda x: str(RNA.fold(x)[0]))

    # one character of struct_context per idx_* column
    struct_df[idx_col_list] = no_idx_struct_df.apply(lambda x: list(x['struct_context']), axis=1, result_type='expand')

    idx_df_dict[biocondition] = [seq_df, struct_df, no_idx_struct_df]
    
    print(f'Condition: {biocondition}')
    display(seq_df)
    display(no_idx_struct_df)
    display(struct_df)

Condition: 02_ALTER.4.PPIB.14L8


Unnamed: 0,chrom,pos,ref,alt,01_transfection.control_mean_pct_snp,02_ALTER.4.PPIB.14L8_mean_pct_snp,idx_0,idx_1,idx_2,idx_3,...,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19,idx_20
1081,chr1,2229554,C,T,0.0,13.303333,G,T,G,G,...,G,C,A,C,A,A,G,G,C,C
1819,chr1,8865298,C,T,0.0,19.830000,A,G,T,C,...,A,A,G,G,A,C,T,A,C,C
1969,chr1,9604274,C,T,0.0,12.580000,A,G,C,C,...,G,C,G,C,T,G,G,C,C,A
2722,chr1,15734167,C,T,0.0,26.940000,T,C,G,C,...,A,G,A,G,G,G,T,C,C,G
3378,chr1,17044825,C,T,0.0,19.800000,T,G,C,C,...,G,A,T,G,G,G,A,C,C,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241323,chrX,106986043,C,T,0.0,13.126667,A,C,A,G,...,T,G,G,A,A,T,G,T,G,T
241639,chrX,119468602,C,T,0.0,17.080000,T,A,G,C,...,G,A,G,C,G,G,G,T,C,A
242106,chrX,130065235,C,T,0.0,19.843333,A,G,T,T,...,A,A,G,G,C,T,A,A,G,A
242578,chrX,136206573,C,T,0.0,23.226667,G,C,A,A,...,G,G,T,G,C,G,G,A,C,T


Unnamed: 0,chrom,pos,ref,alt,01_transfection.control_mean_pct_snp,02_ALTER.4.PPIB.14L8_mean_pct_snp,seq_context,struct_context
1081,chr1,2229554,C,T,0.0,13.303333,GTGGTGCACTCGCACAAGGCC,...((((....))))......
1819,chr1,8865298,C,T,0.0,19.830000,AGTCCTTCATCAAGGACTACC,(((((((....)))))))...
1969,chr1,9604274,C,T,0.0,12.580000,AGCCAGCCCTCGCGCTGGCCA,.((((((......))))))..
2722,chr1,15734167,C,T,0.0,26.940000,TCGCTGGGGGCAGAGGGTCCG,.......((((.....)))).
3378,chr1,17044825,C,T,0.0,19.800000,TGCCATCTATCGATGGGACCC,..(((((....))))).....
...,...,...,...,...,...,...,...,...
241323,chrX,106986043,C,T,0.0,13.126667,ACAGACCTATCTGGAATGTGT,(((..((.....))..)))..
241639,chrX,119468602,C,T,0.0,17.080000,TAGCGCCCATCGAGCGGGTCA,....((((.......))))..
242106,chrX,130065235,C,T,0.0,19.843333,AGTTTGAGGGCAAGGCTAAGA,(((((.......)))))....
242578,chrX,136206573,C,T,0.0,23.226667,GCAAGCCCATCGGTGCGGACT,((..(((....))))).....


Unnamed: 0,chrom,pos,ref,alt,01_transfection.control_mean_pct_snp,02_ALTER.4.PPIB.14L8_mean_pct_snp,idx_0,idx_1,idx_2,idx_3,...,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19,idx_20
1081,chr1,2229554,C,T,0.0,13.303333,.,.,.,(,...,),),),),.,.,.,.,.,.
1819,chr1,8865298,C,T,0.0,19.830000,(,(,(,(,...,),),),),),),),.,.,.
1969,chr1,9604274,C,T,0.0,12.580000,.,(,(,(,...,.,.,),),),),),),.,.
2722,chr1,15734167,C,T,0.0,26.940000,.,.,.,.,...,.,.,.,.,.,),),),),.
3378,chr1,17044825,C,T,0.0,19.800000,.,.,(,(,...,),),),),),.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241323,chrX,106986043,C,T,0.0,13.126667,(,(,(,.,...,.,),),.,.,),),),.,.
241639,chrX,119468602,C,T,0.0,17.080000,.,.,.,.,...,.,.,.,.,),),),),.,.
242106,chrX,130065235,C,T,0.0,19.843333,(,(,(,(,...,.,),),),),),.,.,.,.
242578,chrX,136206573,C,T,0.0,23.226667,(,(,.,.,...,),),),),),.,.,.,.,.


In [10]:
# using the index dataframes tabulate position counts and frequency for sequence and structure

def generate_count_df(df, value_name, tgt_col_list):
    ###################################################################################################
    # Purpose: tabulate, for each target column, how often every distinct value occurs
    # Inputs: 1. df - dataframe holding the columns to tabulate
    #         2. value_name - header for the column that lists the distinct values
    #         3. tgt_col_list - list of columns of df whose values are tabulated
    # Output: a dataframe with columns value_name + tgt_col_list; one row per distinct value found
    #         anywhere in the target columns (sorted), holding that value's count in each column
    #         (0 where the value never occurs), followed by a final 'total' row of per-column sums
    ###################################################################################################
    # start from an empty frame indexed by every distinct value seen in any target column
    observed_values = pd.unique(df[tgt_col_list].values.ravel('K'))
    tally_df = pd.DataFrame({value_name: list(np.sort(observed_values))}).set_index(value_name)

    # outer-merge one column of counts at a time; values missing from a column stay NaN for now
    for col_name in tgt_col_list:
        col_counts = df[col_name].value_counts().rename_axis(value_name).rename(col_name)
        tally_df = pd.merge(left=tally_df, right=col_counts, left_index=True, right_index=True, how='outer')

    tally_df = tally_df.reset_index()

    # append the 'total' row before zero-filling (the column sums skip NaN either way)
    column_totals = tally_df[tgt_col_list].sum()
    tally_df.loc[len(tally_df)] = ['total', *column_totals]

    return tally_df.fillna(0)

# per condition: [nucleotide table, structure table] for counts and for frequencies
count_df_dict = {}
freq_df_dict = {}
for biocondition, idx_df_list in idx_df_dict.items():
    condition_count_dfs = []
    condition_freq_dfs = []
    count_df_dict[biocondition] = condition_count_dfs
    freq_df_dict[biocondition] = condition_freq_dfs

    # slot 0 of idx_df_list is tabulated as nucleotides, slot 1 as dot-bracket structure symbols
    for i, value_name in enumerate(['nucleotide', 'structure (dot-bracket)']):
        count_df = generate_count_df(idx_df_list[i], value_name, idx_col_list)

        # the last row of count_df is its 'total' row; dividing every row by it gives the
        # per-position frequencies, after which the (all-ones) total row is dropped
        total_row_label = len(count_df) - 1
        total_row = count_df.loc[total_row_label, idx_col_list]
        freq_df = count_df[value_name].to_frame()
        freq_df[idx_col_list] = count_df[idx_col_list].div(total_row)
        freq_df = freq_df.drop(index=total_row_label)

        condition_count_dfs.append(count_df)
        condition_freq_dfs.append(freq_df)

        print(f'Condition:{biocondition}')
        display(count_df)
        display(freq_df)

Condition:02_ALTER.4.PPIB.14L8


Unnamed: 0,nucleotide,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19,idx_20
0,-,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A,103.0,74.0,94.0,81.0,61.0,60.0,30.0,29.0,264.0,...,129.0,72.0,40.0,82.0,86.0,53.0,107.0,108.0,60.0,84.0
2,C,91.0,144.0,86.0,118.0,134.0,146.0,232.0,287.0,39.0,...,8.0,90.0,128.0,100.0,100.0,128.0,96.0,86.0,140.0,124.0
3,G,87.0,91.0,123.0,109.0,135.0,122.0,25.0,22.0,18.0,...,197.0,120.0,150.0,127.0,103.0,126.0,104.0,76.0,112.0,89.0
4,T,97.0,69.0,76.0,71.0,49.0,52.0,93.0,42.0,59.0,...,46.0,98.0,62.0,71.0,91.0,73.0,73.0,110.0,68.0,83.0
5,total,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0


Unnamed: 0,nucleotide,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19,idx_20
0,-,0.005263,0.005263,0.002632,0.002632,0.002632,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A,0.271053,0.194737,0.247368,0.213158,0.160526,0.157895,0.078947,0.076316,0.694737,...,0.339474,0.189474,0.105263,0.215789,0.226316,0.139474,0.281579,0.284211,0.157895,0.221053
2,C,0.239474,0.378947,0.226316,0.310526,0.352632,0.384211,0.610526,0.755263,0.102632,...,0.021053,0.236842,0.336842,0.263158,0.263158,0.336842,0.252632,0.226316,0.368421,0.326316
3,G,0.228947,0.239474,0.323684,0.286842,0.355263,0.321053,0.065789,0.057895,0.047368,...,0.518421,0.315789,0.394737,0.334211,0.271053,0.331579,0.273684,0.2,0.294737,0.234211
4,T,0.255263,0.181579,0.2,0.186842,0.128947,0.136842,0.244737,0.110526,0.155263,...,0.121053,0.257895,0.163158,0.186842,0.239474,0.192105,0.192105,0.289474,0.178947,0.218421


Condition:02_ALTER.4.PPIB.14L8


Unnamed: 0,structure (dot-bracket),idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19,idx_20
0,(,102.0,183.0,233.0,275.0,305.0,306.0,256.0,61.0,26.0,...,6.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,),0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,...,180.0,226.0,295.0,283.0,247.0,235.0,150.0,88.0,36.0,18.0
2,.,278.0,197.0,147.0,105.0,75.0,74.0,124.0,314.0,349.0,...,194.0,152.0,84.0,96.0,133.0,145.0,230.0,292.0,344.0,362.0
3,total,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0


Unnamed: 0,structure (dot-bracket),idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19,idx_20
0,(,0.268421,0.481579,0.613158,0.723684,0.802632,0.805263,0.673684,0.160526,0.068421,...,0.015789,0.005263,0.002632,0.002632,0.0,0.0,0.0,0.0,0.0,0.0
1,),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.013158,...,0.473684,0.594737,0.776316,0.744737,0.65,0.618421,0.394737,0.231579,0.094737,0.047368
2,.,0.731579,0.518421,0.386842,0.276316,0.197368,0.194737,0.326316,0.826316,0.918421,...,0.510526,0.4,0.221053,0.252632,0.35,0.381579,0.605263,0.768421,0.905263,0.952632


In [11]:
# produce outputs for sequence context: per-condition gzipped TSVs plus one shared Excel workbook
excel_path = os.path.join(out_dir, 'off-tgt-seq-counts.xlsx')
value_names_list = ['nt','struct']

with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    for biocondition, idx_df_list in idx_df_dict.items():
        biocon_out_dir = os.path.join(out_dir, biocondition)
        os.makedirs(biocon_out_dir, exist_ok=True)

        count_df_list = count_df_dict[biocondition]
        freq_df_list = freq_df_dict[biocondition]

        # all warnings raised while writing are silenced
        # NOTE(review): presumably this hides the Excel sheet-name length warning -- confirm
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for idx, val_name in enumerate(value_names_list):
                idx_df   = idx_df_list[idx]
                count_df = count_df_list[idx]
                freq_df  = freq_df_list[idx]

                # the per-site index table is written to TSV only
                tsv_path = os.path.join(biocon_out_dir, f'{biocondition}-{val_name}-idx.tsv.gz')
                idx_df.to_csv(tsv_path, sep='\t', compression='gzip', index=False)

                # count and frequency tables go to both a TSV and a workbook sheet
                for table_kind, table_df in (('count', count_df), ('freq', freq_df)):
                    tsv_path = os.path.join(biocon_out_dir, f'{biocondition}-{val_name}-{table_kind}.tsv.gz')
                    table_df.to_csv(tsv_path, sep='\t', compression='gzip', index=False)
                    table_df.to_excel(writer, sheet_name=f'{biocondition}-{val_name}-{table_kind}', index=False)

            # the third entry of idx_df_list is exported as the per-condition summary
            summary_df = idx_df_list[2]
            tsv_path = os.path.join(biocon_out_dir, f'{biocondition}-summary.tsv.gz')
            summary_df.to_csv(tsv_path, sep='\t', compression='gzip', index=False)
            summary_df.to_excel(writer, sheet_name=f'{biocondition}-summary', index=False)

In [12]:
# create a partial copy of the off_tgt dfs to store information for alignment analysis

alignment_df_dict = {}

for biocondition, off_tgt_df in off_tgt_df_dict.items():
    # keep the site descriptors plus every mean % SNP column that does not belong to the control
    non_wt_pct_cols = [
        col for col in off_tgt_df.columns
        if ('_mean_pct_snp' in col) and (wt_condition not in col)
    ]
    alignment_df = off_tgt_df[['chrom', 'pos', 'ref', 'alt', 'strand', 'most_transcribed'] + non_wt_pct_cols].copy()

    # gene_id is the first comma-separated field of most_transcribed; it becomes the leading column
    alignment_df.insert(0, 'gene_id', alignment_df['most_transcribed'].apply(lambda x: x.split(',')[0]))

    # re-extract the sequence context with the wider alignment_offset window
    alignment_df['seq_context'] = off_tgt_df[['chrom', 'pos', 'strand', 'most_transcribed']].apply(
        get_row_seq_context,
        genome_fasta=genome_fasta,
        exon_map=exon_map,
        offset=alignment_offset,
        axis=1
    )
    alignment_df_dict[biocondition] = alignment_df

    print(f'Condition:{biocondition}')
    display(alignment_df)

Condition:02_ALTER.4.PPIB.14L8


Unnamed: 0,gene_id,chrom,pos,ref,alt,strand,most_transcribed,02_ALTER.4.PPIB.14L8_mean_pct_snp,seq_context
1081,ENSG00000157933.11,chr1,2229554,C,T,+,"ENSG00000157933.11,ENST00000378536.5,ENSE00001...",13.303333,CCGCCGCACAAGTTCGTGGTGCACTCGCACAAGGCCCTGGAGAACC...
1819,ENSG00000074800.16,chr1,8865298,C,T,-,"ENSG00000074800.16,ENST00000234590.10,ENSE0000...",19.830000,TGGCTGACCTGTACAAGTCCTTCATCAAGGACTACCCAGTGGTGTC...
1969,ENSG00000188807.13,chr1,9604274,C,T,+,"ENSG00000188807.13,ENST00000340305.9,ENSE00001...",12.580000,ATTTATAATAACCAGAGCCAGCCCTCGCGCTGGCCAGGATCCTCCT...
2722,ENSG00000116786.15,chr1,15734167,C,T,+,"ENSG00000116786.15,ENST00000375793.3,ENSE00004...",26.940000,GGTAGCGAATGGAGGTCGCTGGGGGCAGAGGGTCCGAGCCCTGTGG...
3378,ENSG00000117118.12,chr1,17044825,C,T,-,"ENSG00000117118.12,ENST00000375499.8,ENSE00003...",19.800000,CCGTATCAAGAAATTTGCCATCTATCGATGGGACCCAGACAAGGCT...
...,...,...,...,...,...,...,...,...,...
241323,ENSG00000133131.15,chrX,106986043,C,T,-,"ENSG00000133131.15,ENST00000355610.9,ENSE00001...",13.126667,TGTTGGACTTCTATCACAGACCTATCTGGAATGTGTCCAGGCCCAG...
241639,ENSG00000005022.6,chrX,119468602,C,T,ambiguous,"ENSG00000005022.6,ENST00000317881.9,ENSE000012...",17.080000,TCTCCAAGACGGCGGTAGCGCCCATCGAGCGGGTCAAGCTGCTGCT...
242106,ENSG00000102034.18,chrX,130065235,C,T,-,"ENSG00000102034.18,ENST00000335997.11,ENSE0000...",19.843333,GCCTCCCTCCTGCCAAGTTTGAGGGCAAGGCTAAGAAATGTCAGCC...
242578,ENSG00000022267.19,chrX,136206573,C,T,+,"ENSG00000022267.19,ENST00000651929.2,ENSE00003...",23.226667,CCTGTGTGGAATGCCGCAAGCCCATCGGTGCGGACTCCAAGGAGGT...


In [13]:
# set aligner parameters (passed as constructor keywords, which set the same attributes)
aligner = Align.PairwiseAligner(
    mode='local',
    match_score=2,
    mismatch_score=-2,
    open_gap_score=-4,    # Penalty for opening a gap (first base)
    extend_gap_score=0,   # Penalty for extending a gap (subsequent bases)
)

In [14]:
# add guide sequence information to alignment dfs and run alignment, store alignments in alignment dfs

# The guide is supplied as RNA (it may contain U) while the sequence contexts are DNA.
# Converting U -> T before reverse-complementing keeps the alignment query in the DNA alphabet
# no matter how the installed Biopython release complements sequences that contain U.
# These values do not depend on the condition, so they are computed once, outside the loop.
guide_seq = guide_seq.upper()
guide_comp_seq = str(Seq(guide_seq.replace('U', 'T')).reverse_complement())

display_cols = [
    'gene_id',
    'seq_context',
    'alignments'
]

for biocondition, alignment_df in alignment_df_dict.items():
    # alignment_df is the same object stored in alignment_df_dict, so the columns added here
    # are visible through the dict without re-assigning the entry
    alignment_df['guide_seq'] = guide_seq
    alignment_df['guide_rc_seq'] = guide_comp_seq

    # target = sequence context, query = reverse complement of the guide;
    # each cell holds the object returned by aligner.align for that row
    alignment_df['alignments'] = alignment_df['seq_context'].apply(lambda x: aligner.align(x, guide_comp_seq))

    display(alignment_df[display_cols])

Unnamed: 0,gene_id,seq_context,alignments
1081,ENSG00000157933.11,CCGCCGCACAAGTTCGTGGTGCACTCGCACAAGGCCCTGGAGAACC...,<Bio.Align.PairwiseAlignments object at 0x16db...
1819,ENSG00000074800.16,TGGCTGACCTGTACAAGTCCTTCATCAAGGACTACCCAGTGGTGTC...,<Bio.Align.PairwiseAlignments object at 0x16db...
1969,ENSG00000188807.13,ATTTATAATAACCAGAGCCAGCCCTCGCGCTGGCCAGGATCCTCCT...,<Bio.Align.PairwiseAlignments object at 0x16db...
2722,ENSG00000116786.15,GGTAGCGAATGGAGGTCGCTGGGGGCAGAGGGTCCGAGCCCTGTGG...,<Bio.Align.PairwiseAlignments object at 0x16db...
3378,ENSG00000117118.12,CCGTATCAAGAAATTTGCCATCTATCGATGGGACCCAGACAAGGCT...,<Bio.Align.PairwiseAlignments object at 0x16db...
...,...,...,...
241323,ENSG00000133131.15,TGTTGGACTTCTATCACAGACCTATCTGGAATGTGTCCAGGCCCAG...,<Bio.Align.PairwiseAlignments object at 0x16c8...
241639,ENSG00000005022.6,TCTCCAAGACGGCGGTAGCGCCCATCGAGCGGGTCAAGCTGCTGCT...,<Bio.Align.PairwiseAlignments object at 0x16c8...
242106,ENSG00000102034.18,GCCTCCCTCCTGCCAAGTTTGAGGGCAAGGCTAAGAAATGTCAGCC...,<Bio.Align.PairwiseAlignments object at 0x16c8...
242578,ENSG00000022267.19,CCTGTGTGGAATGCCGCAAGCCCATCGGTGCGGACTCCAAGGAGGT...,<Bio.Align.PairwiseAlignments object at 0x16c8...


In [15]:
# compile information and position counts for best alignment

def proccess_alignment(alignment):
    ###################################################################################################
    # Purpose: parse the printed (text) form of an alignment into a dictionary describing it
    # Inputs: 1. alignment - alignment object (or its text representation); str() is applied to it
    # Output: a dictionary with the following key/value pairs
    #         - target - [<start index of alignment>, <aligned sequence>, <end index of alignment>] for
    #                    the context sequence (all three are strings taken from the text)
    #         - query - [<start index of alignment>, <aligned sequence>, <end index of alignment>] for
    #                   the guide sequence (all three are strings taken from the text)
    #         - alignment - notation representation of the alignment, string
    #         - mismatches - indexes of mismatched positions ('.') in the alignment notation
    #         - gaps - indexes of gap positions ('-') in the alignment notation
    # Notes: only tokens 1-3, 5 and 8-10 of the text are read, i.e. a single
    #        "target / notation / query" block.
    #        NOTE(review): assumes the printed alignment never wraps onto further blocks - confirm
    ###################################################################################################
    # tokens are the non-empty pieces between spaces, newlines and commas
    flattened_text = str(alignment).replace('\n', ' ').replace(',', ' ')
    tokens = [token for token in flattened_text.split(' ') if token]

    alignment_dict = {
        'target': [tokens[1], tokens[2], tokens[3]],
        'query': [tokens[8], tokens[9], tokens[10]],
        'alignment': tokens[5],
    }

    # '|' is a match; '.' and '-' are recorded; anything else is reported but otherwise ignored
    mismatch_idx_list = []
    gap_idx_list = []
    idx_list_by_symbol = {'.': mismatch_idx_list, '-': gap_idx_list}
    for position, symbol in enumerate(alignment_dict['alignment']):
        if symbol == '|':
            continue
        if symbol in idx_list_by_symbol:
            idx_list_by_symbol[symbol].append(position)
        else:
            print('[WARNING] New char:')
            print(alignment_dict)

    alignment_dict['mismatches'] = mismatch_idx_list
    alignment_dict['gaps'] = gap_idx_list

    return alignment_dict

def normalize_alignment(seq_context_row, ref_col, align_strand):
    ###################################################################################################
    # Purpose: normalize alignment notations to full length of the guide or context sequence by padding
    #          with '_'
    # Inputs: 1. seq_context_row - subset of a row of an alignment df; must contain 'alignments' and
    #                              the column named by ref_col
    #         2. ref_col - column containing the sequence whose length the notation is normalized to
    #         3. align_strand - 'target' or 'query' depending on which is the relevant strand in alignments
    # Output: a string of the alignment matched to its position in the full sequence length and padded
    #         to that length with '_'
    # Notes: only the first (best) alignment of the row is used. If the result ends up longer than
    #        the reference sequence, diagnostics are printed and the over-long string is returned.
    ###################################################################################################

    alignments = seq_context_row['alignments']
    len_seq_context = len(seq_context_row[ref_col])
    best_alignment = alignments[0]

    best_alignment_dict = proccess_alignment(best_alignment)
    # [start, aligned sequence, end] for the chosen strand; start is a string parsed from text
    alignment_start, ref_seq = best_alignment_dict[align_strand][:2]
    notation = best_alignment_dict['alignment']

    # drop the notation columns where this strand itself shows a gap: those columns do not
    # correspond to any base of this strand, so they must not take up a position
    strand_gap_positions = {
        gap_idx for gap_idx in best_alignment_dict['gaps'] if ref_seq[gap_idx] == '-'
    }
    norm_alignment = ''.join(
        symbol for position, symbol in enumerate(notation) if position not in strand_gap_positions
    )

    # shift the notation to where the alignment starts, then pad the tail to the full length
    alignment_prefix = '_' * int(alignment_start)
    norm_alignment = (alignment_prefix + norm_alignment).ljust(len_seq_context, '_')

    if len(norm_alignment) != len_seq_context:
        print(seq_context_row.name)
        print(best_alignment)
        # lengths are computed outside the f-string: quotes nested inside an f-string of the
        # same quote type are a SyntaxError before Python 3.12
        print(f'{len(alignment_prefix)}\t{len(notation)}\t{len(norm_alignment)}\t{len_seq_context}')
        print(norm_alignment)

    return norm_alignment




info_col_list = ['gene_id', 'chrom', 'pos', 'ref', 'alt', 'strand']

# per-condition results, keyed by biocondition
context_idx_df_dict = {}
guide_idx_df_dict = {}
context_count_df_dict = {}
context_freq_df_dict = {}
guide_count_df_dict = {}
guide_freq_df_dict = {}

for biocondition, alignment_df in alignment_df_dict.items():
    alignment_df['context_norm_alignment_notation'] = alignment_df[['seq_context', 'alignments']].apply(normalize_alignment, ref_col='seq_context', align_strand='target', axis=1)
    alignment_df['guide_norm_alignment_notation']   = alignment_df[['guide_seq', 'alignments']].apply(normalize_alignment, ref_col='guide_seq', align_strand='query', axis=1)
    # the alignment query is the guide's reverse complement, so the notation is flipped
    # to read in the guide's own orientation
    alignment_df['guide_norm_alignment_notation']   = alignment_df['guide_norm_alignment_notation'].apply(lambda x: x[::-1])

    # parse each row's best alignment once and reuse it for all four coordinate columns;
    # the coordinates are kept as the strings produced by proccess_alignment
    best_alignment_info = [proccess_alignment(x[0]) for x in alignment_df['alignments']]
    alignment_df['context_align_start']             = [info['target'][0] for info in best_alignment_info]
    alignment_df['context_align_end']               = [info['target'][2] for info in best_alignment_info]
    alignment_df['query_align_start']               = [info['query'][0] for info in best_alignment_info]
    alignment_df['query_align_end']                 = [info['query'][2] for info in best_alignment_info]

    # context: one column per position of the (2 * alignment_offset + 1)-wide window
    # NOTE: idx_col_list is rebound here and again below, replacing any earlier value of that name
    idx_col_list = [f'idx_{i}' for i in range(((2*alignment_offset) + 1))]
    context_idx_df = alignment_df[info_col_list + ['seq_context', 'context_norm_alignment_notation']].copy()
    context_idx_df[idx_col_list] = alignment_df.apply(lambda x: list(x['context_norm_alignment_notation']), axis=1, result_type='expand')
    context_count_df = generate_count_df(df=context_idx_df, value_name='alignment_notation', tgt_col_list=idx_col_list)
    # frequencies = counts divided by the trailing 'total' row, which is then dropped
    context_freq_df = pd.DataFrame(context_count_df['alignment_notation'])
    total_row = context_count_df.loc[len(context_count_df) - 1, idx_col_list]
    context_freq_df[idx_col_list] = context_count_df[idx_col_list].div(total_row)
    context_freq_df = context_freq_df.drop(index=(len(context_freq_df) - 1))

    # guide: one column per guide position
    idx_col_list = [f'idx_{i}' for i in range(len(guide_seq))]
    guide_idx_df = alignment_df[info_col_list + ['guide_seq', 'guide_norm_alignment_notation']].copy()
    guide_idx_df[idx_col_list] = alignment_df.apply(lambda x: list(x['guide_norm_alignment_notation']), axis=1, result_type='expand')
    guide_count_df = generate_count_df(df=guide_idx_df, value_name='alignment_notation', tgt_col_list=idx_col_list)
    guide_freq_df = pd.DataFrame(guide_count_df['alignment_notation'])
    total_row = guide_count_df.loc[len(guide_count_df) - 1, idx_col_list]
    guide_freq_df[idx_col_list] = guide_count_df[idx_col_list].div(total_row)
    guide_freq_df = guide_freq_df.drop(index=(len(guide_freq_df) - 1))

    context_idx_df_dict[biocondition]   = context_idx_df
    guide_idx_df_dict[biocondition]     = guide_idx_df
    context_count_df_dict[biocondition] = context_count_df
    context_freq_df_dict[biocondition]  = context_freq_df
    guide_count_df_dict[biocondition]   = guide_count_df
    guide_freq_df_dict[biocondition]    = guide_freq_df

    # each table is displayed exactly once, after the condition header
    print(f'Condition:{biocondition}')
    display(context_idx_df)
    display(guide_idx_df)
    display(context_count_df)
    display(context_freq_df)
    display(guide_count_df)
    display(guide_freq_df)

Unnamed: 0,alignment_notation,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_41,idx_42,idx_43,idx_44,idx_45,idx_46,idx_47,idx_48,idx_49,idx_50
0,-,0.0,0.0,0.0,8.0,17.0,27.0,42.0,50.0,60.0,...,60.0,46.0,34.0,30.0,19.0,9.0,4.0,0.0,0.0,0.0
1,.,0.0,0.0,7.0,10.0,6.0,3.0,9.0,5.0,12.0,...,14.0,11.0,5.0,5.0,8.0,4.0,3.0,2.0,0.0,0.0
2,_,350.0,330.0,310.0,300.0,283.0,270.0,254.0,239.0,225.0,...,253.0,263.0,269.0,280.0,290.0,305.0,314.0,330.0,346.0,363.0
3,|,30.0,50.0,63.0,62.0,74.0,80.0,75.0,86.0,83.0,...,53.0,60.0,72.0,65.0,63.0,62.0,59.0,48.0,34.0,17.0
4,total,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0


Condition:02_ALTER.4.PPIB.14L8


Unnamed: 0,gene_id,chrom,pos,ref,alt,strand,seq_context,context_norm_alignment_notation,idx_0,idx_1,...,idx_41,idx_42,idx_43,idx_44,idx_45,idx_46,idx_47,idx_48,idx_49,idx_50
1081,ENSG00000157933.11,chr1,2229554,C,T,+,CCGCCGCACAAGTTCGTGGTGCACTCGCACAAGGCCCTGGAGAACC...,|||||------||.|||_____________________________...,|,|,...,_,_,_,_,_,_,_,_,_,_
1819,ENSG00000074800.16,chr1,8865298,C,T,-,TGGCTGACCTGTACAAGTCCTTCATCAAGGACTACCCAGTGGTGTC...,________________||||.||.||____________________...,_,_,...,_,_,_,_,_,_,_,_,_,_
1969,ENSG00000188807.13,chr1,9604274,C,T,+,ATTTATAATAACCAGAGCCAGCCCTCGCGCTGGCCAGGATCCTCCT...,______|||-------|||-|||.||-||----------||.||__...,_,_,...,.,|,|,_,_,_,_,_,_,_
2722,ENSG00000116786.15,chr1,15734167,C,T,+,GGTAGCGAATGGAGGTCGCTGGGGGCAGAGGGTCCGAGCCCTGTGG...,_______|||----|.|||-----------.|||||----------...,_,_,...,-,-,-,-,-,-,-,|,|,|
3378,ENSG00000117118.12,chr1,17044825,C,T,-,CCGTATCAAGAAATTTGCCATCTATCGATGGGACCCAGACAAGGCT...,||||-------------||.|||_______________________...,|,|,...,_,_,_,_,_,_,_,_,_,_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241323,ENSG00000133131.15,chrX,106986043,C,T,-,TGTTGGACTTCTATCACAGACCTATCTGGAATGTGTCCAGGCCCAG...,_______|||||__________________________________...,_,_,...,_,_,_,_,_,_,_,_,_,_
241639,ENSG00000005022.6,chrX,119468602,C,T,ambiguous,TCTCCAAGACGGCGGTAGCGCCCATCGAGCGGGTCAAGCTGCTGCT...,__________________||||----------|||___________...,_,_,...,_,_,_,_,_,_,_,_,_,_
242106,ENSG00000102034.18,chrX,130065235,C,T,-,GCCTCCCTCCTGCCAAGTTTGAGGGCAAGGCTAAGAAATGTCAGCC...,|||.||.|||-----------------------------|||____...,|,|,...,|,_,_,_,_,_,_,_,_,_
242578,ENSG00000022267.19,chrX,136206573,C,T,+,CCTGTGTGGAATGCCGCAAGCCCATCGGTGCGGACTCCAAGGAGGT...,_________|||||||-----||.||.||_________________...,_,_,...,_,_,_,_,_,_,_,_,_,_


Unnamed: 0,gene_id,chrom,pos,ref,alt,strand,guide_seq,guide_norm_alignment_notation,idx_0,idx_1,...,idx_10,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19
1081,ENSG00000157933.11,chr1,2229554,C,T,+,AGAAGACGGACGGCGGCAUU,_____|||.|||||||____,_,_,...,|,|,|,|,|,|,_,_,_,_
1819,ENSG00000074800.16,chr1,8865298,C,T,-,AGAAGACGGACGGCGGCAUU,_||.||.||||_________,_,|,...,|,_,_,_,_,_,_,_,_,_
1969,ENSG00000188807.13,chr1,9604274,C,T,+,AGAAGACGGACGGCGGCAUU,_||.||||||.|||||||||,_,|,...,.,|,|,|,|,|,|,|,|,|
2722,ENSG00000116786.15,chr1,15734167,C,T,+,AGAAGACGGACGGCGGCAUU,___||||||||.|||.||||,_,_,...,|,.,|,|,|,.,|,|,|,|
3378,ENSG00000117118.12,chr1,17044825,C,T,-,AGAAGACGGACGGCGGCAUU,___|||.||||||_______,_,_,...,|,|,|,_,_,_,_,_,_,_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241323,ENSG00000133131.15,chrX,106986043,C,T,-,AGAAGACGGACGGCGGCAUU,|||||_______________,|,|,...,_,_,_,_,_,_,_,_,_,_
241639,ENSG00000005022.6,chrX,119468602,C,T,ambiguous,AGAAGACGGACGGCGGCAUU,________|||||||_____,_,_,...,|,|,|,|,|,_,_,_,_,_
242106,ENSG00000102034.18,chrX,130065235,C,T,-,AGAAGACGGACGGCGGCAUU,____||||||.||.|||___,_,_,...,.,|,|,.,|,|,|,_,_,_
242578,ENSG00000022267.19,chrX,136206573,C,T,+,AGAAGACGGACGGCGGCAUU,_____||.||.|||||||||,_,_,...,.,|,|,|,|,|,|,|,|,|


Unnamed: 0,alignment_notation,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_41,idx_42,idx_43,idx_44,idx_45,idx_46,idx_47,idx_48,idx_49,idx_50
0,-,0.0,0.0,0.0,8.0,17.0,27.0,42.0,50.0,60.0,...,60.0,46.0,34.0,30.0,19.0,9.0,4.0,0.0,0.0,0.0
1,.,0.0,0.0,7.0,10.0,6.0,3.0,9.0,5.0,12.0,...,14.0,11.0,5.0,5.0,8.0,4.0,3.0,2.0,0.0,0.0
2,_,350.0,330.0,310.0,300.0,283.0,270.0,254.0,239.0,225.0,...,253.0,263.0,269.0,280.0,290.0,305.0,314.0,330.0,346.0,363.0
3,|,30.0,50.0,63.0,62.0,74.0,80.0,75.0,86.0,83.0,...,53.0,60.0,72.0,65.0,63.0,62.0,59.0,48.0,34.0,17.0
4,total,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0


Unnamed: 0,alignment_notation,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_41,idx_42,idx_43,idx_44,idx_45,idx_46,idx_47,idx_48,idx_49,idx_50
0,-,0.0,0.0,0.0,0.021053,0.044737,0.071053,0.110526,0.131579,0.157895,...,0.157895,0.121053,0.089474,0.078947,0.05,0.023684,0.010526,0.0,0.0,0.0
1,.,0.0,0.0,0.018421,0.026316,0.015789,0.007895,0.023684,0.013158,0.031579,...,0.036842,0.028947,0.013158,0.013158,0.021053,0.010526,0.007895,0.005263,0.0,0.0
2,_,0.921053,0.868421,0.815789,0.789474,0.744737,0.710526,0.668421,0.628947,0.592105,...,0.665789,0.692105,0.707895,0.736842,0.763158,0.802632,0.826316,0.868421,0.910526,0.955263
3,|,0.078947,0.131579,0.165789,0.163158,0.194737,0.210526,0.197368,0.226316,0.218421,...,0.139474,0.157895,0.189474,0.171053,0.165789,0.163158,0.155263,0.126316,0.089474,0.044737


Unnamed: 0,alignment_notation,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_10,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19
0,-,0.0,0.0,0.0,0.0,3.0,9.0,34.0,26.0,29.0,...,49.0,31.0,22.0,29.0,18.0,6.0,3.0,0.0,0.0,0.0
1,.,0.0,0.0,16.0,32.0,11.0,38.0,63.0,46.0,15.0,...,95.0,28.0,21.0,47.0,37.0,16.0,7.0,14.0,0.0,0.0
2,_,278.0,188.0,169.0,137.0,105.0,98.0,90.0,82.0,68.0,...,73.0,79.0,78.0,92.0,111.0,124.0,148.0,199.0,274.0,318.0
3,|,102.0,192.0,195.0,211.0,261.0,235.0,193.0,226.0,268.0,...,163.0,242.0,259.0,212.0,214.0,234.0,222.0,167.0,106.0,62.0
4,total,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,...,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0


Unnamed: 0,alignment_notation,idx_0,idx_1,idx_2,idx_3,idx_4,idx_5,idx_6,idx_7,idx_8,...,idx_10,idx_11,idx_12,idx_13,idx_14,idx_15,idx_16,idx_17,idx_18,idx_19
0,-,0.0,0.0,0.0,0.0,0.007895,0.023684,0.089474,0.068421,0.076316,...,0.128947,0.081579,0.057895,0.076316,0.047368,0.015789,0.007895,0.0,0.0,0.0
1,.,0.0,0.0,0.042105,0.084211,0.028947,0.1,0.165789,0.121053,0.039474,...,0.25,0.073684,0.055263,0.123684,0.097368,0.042105,0.018421,0.036842,0.0,0.0
2,_,0.731579,0.494737,0.444737,0.360526,0.276316,0.257895,0.236842,0.215789,0.178947,...,0.192105,0.207895,0.205263,0.242105,0.292105,0.326316,0.389474,0.523684,0.721053,0.836842
3,|,0.268421,0.505263,0.513158,0.555263,0.686842,0.618421,0.507895,0.594737,0.705263,...,0.428947,0.636842,0.681579,0.557895,0.563158,0.615789,0.584211,0.439474,0.278947,0.163158


In [16]:
# tabulate counts and frequencies by guide region
guide_region_count_df_dict = {}
guide_region_freq_df_dict = {}

# guide positions grouped into regions: idx_0-7 seed, idx_8-11 central, idx_12-end supplementary
seed_col_list = [f'idx_{i}' for i in range(0,8)]
central_col_list = [f'idx_{i}' for i in range(8,12)]
supp_col_list = [f'idx_{i}' for i in range(12,len(guide_seq))]

for biocondition in alignment_df_dict.keys():
    # use this condition's own guide count table rather than whichever table the
    # previous cell's loop happened to leave behind in `guide_count_df`
    guide_count_df = guide_count_df_dict[biocondition]

    # sum the per-position counts of each alignment symbol across the positions of each region
    region_count_df = guide_count_df[['alignment_notation']].copy()
    region_count_df['seed'] = guide_count_df[seed_col_list].sum(axis=1)
    region_count_df['central'] = guide_count_df[central_col_list].sum(axis=1)
    region_count_df['supplementary'] = guide_count_df[supp_col_list].sum(axis=1)

    # frequencies: every symbol row divided by the region's value in the trailing 'total' row
    region_freq_df = region_count_df[region_count_df['alignment_notation'] != 'total'].copy()
    for region_col in region_count_df.columns:
        if region_col != 'alignment_notation':
            total_count = region_count_df.loc[len(region_count_df) - 1, region_col]
            region_freq_df[region_col] = region_freq_df[region_col]/total_count

    guide_region_count_df_dict[biocondition] = region_count_df
    guide_region_freq_df_dict[biocondition] = region_freq_df

    display(region_count_df)
    display(region_freq_df)

Unnamed: 0,alignment_notation,seed,central,supplementary
0,-,72.0,138.0,78.0
1,.,206.0,173.0,142.0
2,_,1147.0,291.0,1344.0
3,|,1615.0,918.0,1476.0
4,total,3040.0,1520.0,3040.0


Unnamed: 0,alignment_notation,seed,central,supplementary
0,-,0.023684,0.090789,0.025658
1,.,0.067763,0.113816,0.046711
2,_,0.377303,0.191447,0.442105
3,|,0.53125,0.603947,0.485526


In [17]:
# produce outputs for alignment analysis: per-condition gzipped TSVs plus one shared Excel workbook

excel_path = os.path.join(out_dir, 'alignment-counts.xlsx')
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    for biocondition in alignment_df_dict.keys():

        biocon_out_dir = os.path.join(out_dir, biocondition)
        os.makedirs(biocon_out_dir, exist_ok=True)

        context_idx_df          = context_idx_df_dict[biocondition]
        guide_idx_df            = guide_idx_df_dict[biocondition]
        context_count_df        = context_count_df_dict[biocondition]
        context_freq_df         = context_freq_df_dict[biocondition]
        guide_count_df          = guide_count_df_dict[biocondition]
        guide_freq_df           = guide_freq_df_dict[biocondition]
        guide_region_count_df   = guide_region_count_df_dict[biocondition]
        guide_region_freq_df    = guide_region_freq_df_dict[biocondition]
        summary_df              = alignment_df_dict[biocondition][info_col_list + ['seq_context', 'guide_seq', 'context_norm_alignment_notation', 'guide_norm_alignment_notation']]

        # (TSV file tag, table, Excel sheet tag); a sheet tag of None means "TSV only".
        # The order below is the order in which files and sheets are written.
        export_plan = [
            ('align-cont-idx',   context_idx_df,        None),
            ('align-guid-idx',   guide_idx_df,          None),
            ('align-cont-count', context_count_df,      'contex-count'),
            ('align-guid-count', guide_count_df,        'guid-count'),
            ('align-cont-freq',  context_freq_df,       'contex-freq'),
            ('align-guid-freq',  guide_freq_df,         'guid-freq'),
            ('align-reg-count',  guide_region_count_df, 'reg-count'),
            ('align-reg-freq',   guide_region_freq_df,  'reg-freq'),
            ('align',            summary_df,            'summary'),
        ]

        # all warnings raised while writing are silenced
        # NOTE(review): presumably this hides the Excel sheet-name length warning -- confirm
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            for file_tag, table_df, sheet_tag in export_plan:
                tsv_path = os.path.join(biocon_out_dir, f'{biocondition}-{file_tag}.tsv.gz')
                table_df.to_csv(tsv_path, sep='\t', compression='gzip', index=False)
                if sheet_tag is not None:
                    table_df.to_excel(writer, sheet_name=f'{biocondition}-{sheet_tag}', index=False)