# Sequence Level Features and Analysis

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import ttest_1samp, poisson
from Bio import pairwise2
from Bio.pairwise2 import format_alignment



# Display settings: render floats in scientific notation with 10 decimal
# places, and truncate long cell values (e.g. sequences) to 30 characters.
pd.options.display.float_format = '{:.10e}'.format
pd.options.display.max_colwidth = 30

In [2]:
# Relative path to the input CSV (resolved against the working directory).
csv_file = "../Data/R12-clean.csv"
# Read the CSV into the notebook-level frame `df`, which later cells rely on.
df = pd.read_csv(csv_file)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Sequence,Copy Num,Length
0,AGTGCCATCGTGCGTATCCTTCACTC...,91,98
1,AGTGCCATCGTGCGTATCCTTCACGT...,86,98
2,AGTGCCATCGTGCGTATCCTGAACAT...,83,98
3,AGTGCCATCGTGCGTATCCCGCTCCG...,80,98
4,AGTGCCATCGTGCGTATCCTGAACAT...,78,98


### 1. k-mer Extraction

In [3]:
def extract_kmers(seq, k):
    """Return every overlapping k-mer of ``seq`` paired with its start offset.

    A window of width ``k`` is slid along ``seq`` one position at a time, so
    k-mers from all reading frames are collected. Each entry of the returned
    list is a ``(position, kmer)`` tuple, in order of increasing position;
    keeping the offset preserves order and distance information for downstream
    secondary structure analysis.
    """
    window_count = len(seq) - k + 1
    positioned_kmers = []
    for start in range(window_count):
        positioned_kmers.append((start, seq[start:start + k]))
    return positioned_kmers

In [5]:
def getkmers(data, k):
    """Extract k-mers from every row of ``data`` and return a long-form frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Sequence' column holding the sequences to scan.
    k : int
        k-mer length, passed through to ``extract_kmers``.

    Returns
    -------
    pandas.DataFrame
        One row per extracted k-mer, with columns 'Sequence_ID' (index label
        of the originating row in ``data``), 'Position' (start offset of the
        k-mer within its sequence) and 'k-mer'.
    """
    columns = ['Sequence_ID', 'Position', 'k-mer']
    # Iterate over the frame that was passed in. The previous version looped
    # over the notebook-level `df`, silently ignoring the `data` argument.
    kmer_data = [
        {'Sequence_ID': idx, 'Position': position, 'k-mer': kmer}
        for idx, seq in data['Sequence'].items()
        for position, kmer in extract_kmers(seq, k)
    ]
    # Naming the columns keeps the schema stable even when no k-mers are found
    # (empty input, or every sequence shorter than k).
    return pd.DataFrame(kmer_data, columns=columns)

In [6]:
# Preview the first rows of the long-form 10-mer table built from df.
getkmers(data=df, k=10).head()

Unnamed: 0,Sequence_ID,Position,k-mer
0,0,0,AGTGCCATCG
1,0,1,GTGCCATCGT
2,0,2,TGCCATCGTG
3,0,3,GCCATCGTGC
4,0,4,CCATCGTGCG


### 2. Search for motifs in variable regions.  
We previously considered a naive k-mer search, but it produced too many false positives because the high-frequency k-mers arose from the constant regions (CRs). We now pursue a different approach: we look for motifs only in the variable regions (VRs), and later add potential overhangs into the CRs back in when we apply downstream secondary structure analysis to confirm active motifs.  

#### Designed constant regions of selection library: 
The following describes the template of the starting selection library:

forward primer | AAGTGCCATCGTGCGTATCC | 20 bp  
variable region 1 | (N)^22 | 22 bp  
mipomersen loading | GCGAAGCAGACTGAGGC | 17 bp  
variable region 2 | (N)^21 | 21 bp  
reverse primer | GTAGACTGGAGACACGACGA | 20 bp  

Boundary positions are approximate because of PCR mutations and sequencing errors.  For sequences of length 98, the original library size (n.b. the designed library is 100 bp, but NGS discards the first and last bases), we estimate the boundary positions but allow a tunable buffer of a few bp to account for these errors.  For some sequences of non-standard length, we perform pairwise alignment with the CRs to determine the boundary positions more accurately.

In [120]:
# Primers as designed for the 100 bp starting library.
designed_FP = "AAGTGCCATCGTGCGTATCC"
designed_RP = "GTAGACTGGAGACACGACGA"

# Constant regions as searched for in the sequences: the forward primer
# without its first base, the reverse primer without its last base, and the
# mipomersen loading region unchanged.
FP = designed_FP[1:]
RP = designed_RP[:-1]
MIP = "GCGAAGCAGACTGAGGC"

In [118]:
# PADNA-1 is the sequence in the first row of df. Select it by the 'Sequence'
# column label rather than by column position, so the lookup keeps working if
# the columns are reordered or an extra leading column is present.
PADNA_1 = df['Sequence'].iloc[0]

#### a. Use PADNA-1 to set boundary estimates for standard length sequences
PADNA_1 is an example of an ideal sequence, so we use it to get our boundary estimates.

In [142]:
# Locate each constant region (CR) in PADNA-1. str.index returns the offset of
# the first occurrence and raises ValueError if the CR is not present.
# Every *_end value is exclusive, so [start:end] slices out exactly the region.
FP_start = PADNA_1.index(FP)
FP_end = FP_start + len(FP)

MIP_start = PADNA_1.index(MIP)
MIP_end = MIP_start + len(MIP)

RP_start = PADNA_1.index(RP)
RP_end = RP_start + len(RP)

# Variable region estimates: each VR fills the gap between two adjacent CRs.
V1_start = FP_end
V1_end = MIP_start

V2_start = MIP_end
V2_end = RP_start

print(f'Boundary estimates\n----------------\nForward primer: [{FP_start}, {FP_end}]\n          VR 1: [{V1_start}, {V1_end}]\n    Mipomersen: [{MIP_start}, {MIP_end}]\n          VR 2: [{V2_start}, {V2_end}]\nReverse primer: [{RP_start}, {RP_end}]')

# Check that every estimated span reproduces its constant region. The three
# comparisons are combined into one expression so the cell's displayed value
# reflects all of them; as separate bare expressions, only the last one was
# shown and the first two results were discarded.
all([
    PADNA_1[FP_start:FP_end] == FP,
    PADNA_1[MIP_start:MIP_end] == MIP,
    PADNA_1[RP_start:RP_end] == RP,
])

Boundary estimates
----------------
Forward primer: [0, 19]
          VR 1: [19, 41]
    Mipomersen: [41, 58]
          VR 2: [58, 79]
Reverse primer: [79, 98]


True

In [134]:
# PADNA_1
print('AGTGCCATCGTGCGTATCC|TTCACTCCTTGCTCGACAAGAA|GCGAAGCAGACTGAGGC|GTCCGATGGTCTAATTCTTCA|GTAGACTGGAGACACGACG')

print('Variable Regions:\n')
print(f'V1: {PADNA_1[V1_start:V1_end]}\nV2: {PADNA_1[V2_start:V2_

AGTGCCATCGTGCGTATCC|TTCACTCCTTGCTCGACAAGAA|GCGAAGCAGACTGAGGC|GTCCGATGGTCTAATTCTTCA|GTAGACTGGAGACACGACG
Boundary estimates
----------------
VR 1: [19, 40]
VR 2: [58, 78]


In [None]:
# Collect sequences of standard 98 bp length