In [116]:
import random
import math
import numpy as np
import itertools
import re
from collections import Counter,defaultdict
from Bio import SeqIO
import matplotlib.pyplot as plt

# Visible confirmation that every import in this cell succeeded.
print("Loaded libraries!")

Loaded libraries!


In [65]:
def proportion_kmers_present(seq, biomarker_kmers):
    """Return the proportion of biomarker k-mers that occur in a sequence.

    Args:
        seq (str): Sequence to search.
        biomarker_kmers: Sized collection (e.g. set or list) of k-mer strings.

    Returns:
        float: Number of k-mers found as substrings of ``seq`` divided by the
        number of k-mers supplied; 0.0 when no k-mers are supplied.
    """
    # An empty collection previously raised ZeroDivisionError; nothing can be
    # present in that case, so report 0.0 instead.
    if len(biomarker_kmers) == 0:
        return 0.0

    present = sum(1 for kmer in biomarker_kmers if kmer in seq)
    return present / len(biomarker_kmers)

In [67]:
def find_biomarker_kmers(klen, min_freq_SILVA, fasta_path=None):
    """Find biomarker k-mers in a FASTA file of 16S rRNA sequences.

    A biomarker k-mer has length ``klen`` and occurs (at least once) in more
    than ``min_freq_SILVA`` of the sequences in the file. Sequences are
    upper-cased and converted to the RNA alphabet (T -> U) before k-mers are
    extracted. K-mers are counted as read; they are not collapsed with their
    reverse complement.

    Args:
        klen (int): K-mer length.
        min_freq_SILVA (float): Proportion of sequences a k-mer must exceed
            (strictly greater than) to be reported.
        fasta_path (str, optional): FASTA file to scan. When omitted, the
            notebook-level variable ``f`` is used, which must already be
            defined when this function is called.

    Returns:
        set[str]: The biomarker k-mers; empty if the file has no sequences.
    """
    if fasta_path is None:
        # Backward-compatible fallback: the original implementation always
        # read the path from the notebook-level global `f`.
        fasta_path = f

    kmer_counts = defaultdict(int)
    total_seqs = 0

    for record in SeqIO.parse(fasta_path, 'fasta'):
        s = str(record.seq).upper().replace('T', 'U')
        total_seqs += 1
        # A set, so each k-mer is counted at most once per sequence.
        kmers = {s[pos:pos + klen] for pos in range(len(s) + 1 - klen)}
        for kmer in kmers:
            kmer_counts[kmer] += 1
        print("Identifying biomarkers!",total_seqs,end='\r')

    return {
        kmer
        for kmer, n_seqs in kmer_counts.items()
        if n_seqs / total_seqs > min_freq_SILVA
    }

In [69]:
def mutate_dna(rna_string, num_mutations):
    """Substitute bases at randomly chosen positions of an RNA string.

    ``num_mutations`` distinct positions are drawn at random. At each drawn
    position holding A, U, C or G, the base is replaced by one of the other
    three; a drawn position holding any other character is left unchanged.

    Args:
        rna_string (str): Sequence to mutate.
        num_mutations (int): Number of positions to draw.

    Returns:
        str: The mutated sequence, same length as the input.

    Raises:
        ValueError: If ``num_mutations`` exceeds the sequence length.
    """
    mutated = list(rna_string)
    n_bases = len(mutated)
    if num_mutations > n_bases:
        raise ValueError("Number of mutations cannot exceed the length of the RNA string.")

    alphabet = ('A', 'U', 'C', 'G')
    for pos in random.sample(range(n_bases), num_mutations):
        original_base = mutated[pos]
        if original_base in alphabet:
            # Candidates exclude the current base so the substitution always changes it.
            candidates = [base for base in alphabet if base != original_base]
            mutated[pos] = random.choice(candidates)

    return "".join(mutated)

In [71]:
def revcomp(seq):
    """Return the reverse complement of an RNA string.

    A<->U and C<->G are swapped; any other character is carried over unchanged.
    """
    complement_of = {'A':'U','C':'G','G':'C','U':'A'}
    return ''.join(complement_of.get(base, base) for base in seq[::-1])

In [73]:
def find_homopolymer(seq, L, shortest_homopolymer_len=4):
    """Count A, C, G and U homopolymer runs of at least a minimum length.

    Only the first ``L`` characters of ``seq`` are examined. Each maximal run
    of one repeated base counts once, however long it is. Runs of any
    character other than A, C, G or U are ignored.

    Args:
        seq (str): RNA sequence.
        L (int): Number of leading characters of ``seq`` to scan (callers
            pass the sequence length).
        shortest_homopolymer_len (int): Minimum run length that counts as a
            homopolymer. Defaults to 4, the value that used to be hard-coded.

    Returns:
        list[int]: ``[n_A, n_C, n_G, n_U]`` homopolymer counts.

    Raises:
        ValueError: If ``shortest_homopolymer_len`` is smaller than 1.
    """
    if shortest_homopolymer_len < 1:
        raise ValueError("shortest_homopolymer_len must be at least 1.")

    counts = {'A': 0, 'C': 0, 'G': 0, 'U': 0}
    # groupby yields one (base, run) pair per maximal run of identical
    # characters, which replaces the manual index scan.
    for base, run in itertools.groupby(seq[:max(L, 0)]):
        if base in counts and sum(1 for _ in run) >= shortest_homopolymer_len:
            counts[base] += 1

    return [counts['A'], counts['C'], counts['G'], counts['U']]

In [103]:
def seq_composition(s, L):
    """Return base-composition statistics for an RNA string.

    Args:
        s (str): RNA sequence.
        L (int): Sequence length, used as the denominator of the GC content.

    Returns:
        list: ``[L, A, C, G, U, A/C, A/G, A/U, C/G, C/U, G/U, GC]`` where A, C,
        G and U are base counts and GC is ``(C + G) / L``. Any ratio whose
        denominator is zero is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # A sequence lacking a base (or an empty one) previously raised
        # ZeroDivisionError; the ratio is undefined, so report NaN instead.
        return numerator / denominator if denominator else float('nan')

    nts_counts = Counter(s)
    A = nts_counts['A']
    C = nts_counts['C']
    G = nts_counts['G']
    U = nts_counts['U']

    return [
        L, A, C, G, U,
        _ratio(A, C), _ratio(A, G), _ratio(A, U),
        _ratio(C, G), _ratio(C, U), _ratio(G, U),
        _ratio(C + G, L),
    ]

In [132]:
def shannon_entropy(kmer_counts):
    """Return the Shannon entropy, in bits, of a list of counts.

    Zero entries are discarded, the remaining counts are normalised to
    probabilities, and ``-sum(p * log2(p))`` is accumulated. An input with no
    non-zero entries yields 0.0.
    """
    nonzero = list(filter(lambda count: count != 0, kmer_counts))
    total = sum(nonzero)

    bits = 0.0
    for count in nonzero:
        p = count / total
        bits -= p * math.log2(p)

    return bits

In [148]:
def kmer_parity(seq, L):
    """Measure strand symmetry of k-mers in an RNA string for k = 1, 3, 5, 7, 9.

    For each k, every k-mer is grouped with its reverse complement under the
    lexicographically smaller of the two (the canonical form), keeping one
    tally of occurrences that equal the canonical form and one of occurrences
    that equal the other orientation.

    Args:
        seq (str): RNA sequence.
        L (int): Sequence length, used to normalise the imbalance.

    Returns:
        tuple[list, list]: Two lists with one entry per k. The first holds
        ``1 - (sum of |canonical - other| over all groups) / L``; the second
        holds the Shannon entropy of all the per-orientation tallies.
    """
    parity_by_k = []
    entropy_by_k = []

    for klen in (1, 3, 5, 7, 9):
        # canonical k-mer -> [count seen as canonical, count seen as reverse complement]
        strand_counts = {}
        for start in range(len(seq) + 1 - klen):
            kmer = seq[start:start + klen]
            canonical = min(kmer, revcomp(kmer))
            tally = strand_counts.setdefault(canonical, [0, 0])
            tally[0 if kmer == canonical else 1] += 1

        imbalance = 0
        flat_counts = []
        for canonical_n, other_n in strand_counts.values():
            imbalance += abs(canonical_n - other_n)
            flat_counts.extend((canonical_n, other_n))

        parity_by_k.append(1 - imbalance / L)
        entropy_by_k.append(shannon_entropy(flat_counts))

    return parity_by_k, entropy_by_k

In [79]:
def find_conserved_nucleotide_fingerprint(query_sequence, nt_sequence):
    """
    Searches a query sequence for an ordered series of motifs, allowing
    gaps (extra letters) between consecutive motifs.

    Motifs are searched left to right. After a motif is found, the search for
    the next one resumes one position past the end of the match, so at least
    one unmatched letter must separate consecutive matches. A motif that is
    not found is skipped and the search position is left where it was.

    Args:
        query_sequence (str): The longer string to search within.
        nt_sequence (list[str]): The motifs to find, in order.

    Returns:
        tuple[float, list[int]]: The proportion of motifs that were found and
        the start index in query_sequence of each found motif. Returns
        (0.0, []) when nt_sequence is empty.
    """
    # An empty motif list previously raised ZeroDivisionError.
    if not nt_sequence:
        return 0.0, []

    found_univ_conserved_nts = []
    start = 0

    for motif in nt_sequence:
        loci = query_sequence.find(motif, start)
        if loci == -1:
            continue
        # Record where the motif was found. The previous code appended
        # `start` (where the search began), so the reported indices were wrong.
        found_univ_conserved_nts.append(loci)
        start = loci + 1 + len(motif)

    return len(found_univ_conserved_nts)/len(nt_sequence),found_univ_conserved_nts

In [150]:
def find_universal_primers(s):
    """Count the distinct universal 16S rRNA primer sequences found in an RNA string.

    Each primer (taken from the 16S rRNA Wikipedia page) is a regular
    expression; bracketed classes stand for degenerate positions. Every match
    of every primer is collected and the number of distinct matched substrings
    is returned, so one stretch of sequence matched by two primer patterns is
    counted once.
    """
    universal_primers = ['AGAGUUUGAUCCUGGCUCAG','AGAGUUUGAUC[AC]UGGCUCAG','ACUGCUGC[GC][CU]CCCGUAGGAGUCU','GACUCCUACGGGAGGC[AU]GCAG','GUAUUACCGCGGCUGCUGG','GUGCCAGC[AC]GCCGCGGUAA','GGAUUAGAUACCCUGGUA','GGACUAC[ACG][GC]GGGUAUCUAAU','CCGUCAAUUCCUUU[AG]AGUUU','UAAAACU[CU]AAA[GU]GAAUUGACGGG','[CU]AACGAGCGCAACCC','GGGUUGCGCUCGUUG','GGUUACCUUGUUACGACUU','CGGUUACCUUGUUACGACUU']

    distinct_hits = {
        hit.group(0)
        for pattern in universal_primers
        for hit in re.finditer(pattern, s)
    }
    return len(distinct_hits)

In [83]:
def universally_conserved_nucleotide_fingerprint(ref_length=1500):
    """Return the universally conserved 16S rRNA nucleotides as a list of motifs.

    ``loci_sequence`` holds the 1-based reference positions of the conserved
    nucleotides and ``nt_sequence`` the base at each of those positions, in
    the same order. Conserved nucleotides at consecutive positions are merged
    into a single string; isolated ones form one-letter strings.

    Args:
        ref_length (int): Highest reference position considered. Conserved
            positions above it are ignored. Defaults to 1500, the cutoff that
            used to be hard-coded.
            NOTE(review): the position list runs up to 1526, so the default
            drops the last seven conserved nucleotides -- confirm whether that
            is intended.

    Returns:
        list[str]: Conserved motifs in order of position.
    """
    nt_sequence = 'UACAUAAAAUAAUAGGCAGGAUAAUAGAGGCAGCGCGAACUGUGAAAAGGCAAUAGACUAAUAAGAUACAAAAUAAAUGCGGCGUAAUCCGUGCAGGUAUUGACUGAAGGAUGUAGGCUGACCCCGUAAAGUCCGGCUGG'
    loci_sequence = [13,51,54,55,56,109,151,160,243,244,246,282,323,344,346,347,355,356,357,362,364,368,389,397,405,499,505,509,515,517,519,520,521,522,527,528,530,532,533,536,565,566,571,581,676,695,704,715,725,727,732,781,787,788,790,791,792,795,801,802,815,820,864,865,885,889,891,892,899,900,901,908,909,911,914,915,919,920,922,924,925,926,936,944,956,958,959,960,972,984,1050,1052,1053,1054,1055,1057,1058,1073,1093,1095,1199,1221,1227,1237,1315,1316,1318,1319,1337,1338,1339,1341,1347,1348,1349,1373,1379,1382,1391,1392,1394,1395,1397,1399,1403,1405,1406,1418,1492,1493,1494,1495,1496,1501,1504,1505,1509,1512,1517,1526]

    # Position -> base lookup. This replaces a dict literal keyed by base,
    # whose duplicate keys collapsed to four entries and which was never read,
    # and avoids a linear list.index() call for every reference position.
    base_at = dict(zip(loci_sequence, nt_sequence))

    # Reference-length string with 'N' at every non-conserved position.
    ref_seq_N = ''.join(base_at.get(pos, 'N') for pos in range(1, ref_length + 1))

    # Splitting on 'N' leaves the runs of consecutive conserved nucleotides.
    return [cluster for cluster in ref_seq_N.split('N') if cluster != '']

In [57]:
# Collects the biomarker k-mers and the universally conserved nucleotide fingerprint
# that the analysis cell uses for every sequence.
# Only needs to be run once for a specified set of parameters for the biomarker search.
# find_biomarker_kmers reads the FASTA path from the notebook-level variable `f`, so the
# cell that assigns `f` must have been executed before this one.

kmers_of_interest = find_biomarker_kmers(klen = 11, min_freq_SILVA = 0.5)
print(f"Identified {len(kmers_of_interest)} biomarker k-mers!")

conserved_nt_fingerprint = universally_conserved_nucleotide_fingerprint()

Identified 297 biomarker k-mers!


In [160]:
# FASTA file of 16S rRNA sequences to analyze. The path `f` is read by
# find_biomarker_kmers and by the analysis cell, which also writes its results
# table to f"{f}_results.txt".
# `state` is copied into the "State" column of that table and indicates whether the
# sequences are "Real" or mutated. If mutated, include the mutation rate,
# e.g. "Mutated_0.1".

# f = 'SILVA_138.2_SSURef_NR99_tax_silva_filtered.fasta'
# state = "Real"

f = 'SILVA_138.2_SSURef_NR99_tax_silva_filtered.mutated_0.1.fasta'
state = 'Mutated_0.1'

In [None]:
# Analyzes the input FASTA file and writes one tab-separated row of features per
# sequence to f"{f}_results.txt".
# Requires `f`, `state`, `kmers_of_interest` and `conserved_nt_fingerprint` to be
# defined by earlier cells.

with open(f"{f}_results.txt",'w') as out:
    out.write('SeqID\tState\tSeqLen\tA\tC\tG\tU\tA/C\tA/G\tA/U\tC/G\tC/U\tG/U\tGC\tPR2_1mer\tPR2_3mer\tPR2_5mer\tPR2_7mer\tPR2_9mer\tEntropy_1mer\tEntropy_3mer\tEntropy_5mer\tEntropy_7mer\tEntropy_9mer\tHomopolymerA\tHomopolymerC\tHomopolymerG\tHomopolymerU\tProp_conserved_nt_fingerprint\tNum_universal_primers\tProp_biomarkers\tSeq\n')
    for h, record in enumerate(SeqIO.parse(f,'fasta')):
        d = str(record.description)
        # Normalise to the upper-case RNA alphabet, exactly as find_biomarker_kmers
        # does, because the biomarker k-mers, primers, conserved fingerprint and
        # composition statistics are all expressed with U rather than T.
        s = str(record.seq).upper().replace('T','U')
        L = len(s)

        seq_homopolymers = find_homopolymer(s,L)
        prop_conserved_nt_fingerprint = find_conserved_nucleotide_fingerprint(s, conserved_nt_fingerprint)[0]
        num_universal_primers = find_universal_primers(s)
        prop_biomarkers_found = proportion_kmers_present(s,kmers_of_interest)
        kmer_symmetry,kmer_entropy = kmer_parity(s,L)

        # Column order must match the header written above.
        fields = (
            [d, state]
            + [str(value) for value in seq_composition(s,L)]
            + [str(value) for value in kmer_symmetry]
            + [str(value) for value in kmer_entropy]
            + [str(value) for value in seq_homopolymers]
            + [str(prop_conserved_nt_fingerprint), str(num_universal_primers), str(prop_biomarkers_found), s]
        )
        out.write("\t".join(fields) + "\n")
        print(h,end = '\r')  # progress counter, overwritten in place

print("Analysis complete!")

23326