<a href="https://colab.research.google.com/github/raoshilpa/UCLA_CM122_S23/blob/main/CM122_Proj1a_Shilpa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
import argparse
import numpy as np
import time
import zipfile
import math
import textwrap
import pprint as pprint
from collections import defaultdict, Counter, OrderedDict

# Paths to the sample reference genome and the paired reads (both FASTA files
# under /content).
ref_raw = '/content/sample_1000_reference_genome.fasta'
reads_raw = '/content/sample_1000_with_error_paired_reads.fasta'

# The same two paths again, under the names the rest of the file passes around.
refs_file = '/content/sample_1000_reference_genome.fasta'
reads_file = '/content/sample_1000_with_error_paired_reads.fasta'

def reads_format(reads_file):
    """Load a reads file as a list of comma-split lines, skipping the first line.

    Every line after the first is stripped of surrounding whitespace and split
    on ',', so each entry of the result is a list of strings (a one-element
    list when the line contains no comma).

    Returns None, after printing a message, if the file cannot be opened or
    read.
    """
    try:
        with open(reads_file, 'r') as handle:
            next(handle, None)  # drop the first line; an empty file is fine
            parsed = [raw.strip().split(',') for raw in handle]
    except IOError:
        print("error reading ", reads_file)
        return None
    return parsed

# Debug output: parse the reads file and dump every parsed line.
print(reads_format(reads_raw))

def ref_format(ref_fn):
    """Read a reference genome file into one contiguous string.

    The first line is skipped (treated as a header) and every remaining line
    is stripped of surrounding whitespace and concatenated in file order.

    Args:
        ref_fn: Path of the reference file.

    Returns:
        The concatenated sequence, or None (after printing a message) if the
        file cannot be opened or read.
    """
    try:
        with open(ref_fn, 'r') as genome:
            next(genome, None)  # skip the header line; an empty file yields ''
            # join() assembles the genome in a single pass; repeated `+=` on a
            # string can degrade to quadratic time on large references.
            return ''.join(line.strip() for line in genome)
    except IOError:
        print("error reading ", ref_fn)
        return None

# ref = formatted file
def refDict(ref, k):
    """Index every k-mer of ``ref`` by the positions where it starts.

    Args:
        ref: Reference sequence as a string.
        k: Length of the k-mers to index.

    Returns:
        An OrderedDict mapping each length-``k`` substring of ``ref`` to the
        list of 0-based start positions at which it occurs. Keys appear in
        order of first occurrence and each position list is ascending. The
        result is empty when ``k`` is larger than ``len(ref)``.
    """
    d = OrderedDict()
    # A sequence of length n has n - k + 1 windows of length k. Without the
    # "+ 1" the final k-mer (the one ending on the last base) is never indexed.
    for i in range(len(ref) - k + 1):
        d.setdefault(ref[i:i + k], []).append(i)
    return d

# print(refDict(ref_format('/content/cm122_project1a_10000_reference_genome.fasta'), 17))


[['CAGTCACCCAGTGCCGCTGATGCCCAAGCACAGAAAACGGATGTTGCTAT'], ['>read_0/2'], ['ATTACGAGACGCATTATCAGCGTATCTTGGGCTTTAACGTGTATAGGGCG'], ['>read_1/1'], ['TATCTACACATTACGAGACGCATTATCAGCGTCTCTTGGGCTTTAACGTG'], ['>read_1/2'], ['AAATGCATGGATCAGGGGCAAGATGCAGACACGGCTGTACTTCCCAAGGCA'], ['>read_2/1'], ['TAGAATCCACGGGGTAGGCGAATAATCCATTTTGTCACCCTCAACAAAC'], ['>read_2/2'], ['TTTTAGGAAAGTCAAATGCATGGATCAGGGGCAAGAGCAGACACGGCTT'], ['>read_3/1'], ['CAACTATGCCCCACATGACAAATCTGACGACGTGATTGCAGCCCACAAGG'], ['>read_3/2'], ['AGTTTGACGTATCCCAAGGATGAGATACAGTACATGAGTGCTCCTCTAC'], ['>read_4/1'], ['ATAGCCATTTAGGGCTTGATCCTATAGTTCTCCGTATCCAGTTGTGCTAA'], ['>read_4/2'], ['AGATGCTTGCTGTAACCGAGATGCCTCAGGCAGATACCTTAATGCGACGA'], ['>read_5/1'], ['CCCCAAGGATGAGATACAGTACATGAGTGCTCCTCTACTGACACGTTTCGC'], ['>read_5/2'], ['GATGCCCAAGCACAGAAAACGGATGTTGCTATAGAATCCACGGTGTAGGC'], ['>read_6/1'], ['CGGTGGTTTCCGGCATATGTAGATGCTTGTTGTAACCGAGATGCCTCAG'], ['>read_6/2'], ['GTTAGTTTCGAAGACTGCGCTACTCTGTTGAACCCATATTCAACCCTGA'], ['>read_7/1'], ['TCGGTG

In [None]:
  # create 17 dicts
  # use read length math to determine which ones we need
  # load the files & format them
k_mer_maps = {}
nucleotides = ['A', 'C', 'G', 'T']
for i in range(19):
    k_mer_maps[i] = refDict(ref_format(refs_file), i)

In [None]:
# align takes in the raw ref file, and one single read
def align(read, ref):
  """Split a read into first/middle/last segments and pick their k-mer maps.

  The first and last segments always have the same length: len(read) / 3
  rounded to the nearest integer. The middle segment takes whatever is left,
  so all three are equal when len(read) is divisible by 3.

  Args:
    read: The read sequence (a string).
    ref: Path of the raw reference file. Kept for backward compatibility; the
      lookups use the prebuilt module-level ``k_mer_maps``, so the file is
      not re-read on every call.

  Returns:
    A tuple ``(dict_mid, dict_fl, read_f, read_m, read_l)``: the k-mer map
    for the middle segment length, the k-mer map for the first/last segment
    length, and the three segments of the read in order.

  Raises:
    KeyError: if ``k_mer_maps`` has no entry for a required segment length.
  """
  n = len(read)

  # (n + 1) // 3 is round(n / 3) in pure integer arithmetic: it rounds n/3
  # to the nearest integer (e.g. 50 -> 17, 49 -> 16, 48 -> 16).
  first_last = (n + 1) // 3
  middle = n - 2 * first_last

  dict_fl = k_mer_maps[first_last]
  dict_mid = k_mer_maps[middle]

  # Cut the read at the two segment boundaries.
  middle_end = first_last + middle
  read_f = read[:first_last]
  read_m = read[first_last:middle_end]
  read_l = read[middle_end:]

  return dict_mid, dict_fl, read_f, read_m, read_l

# Parse the reads file once at module level; reads_format() returns one
# comma-split list per line after the first line of the file.
reads_formatted = reads_format(reads_file)
# pprint(reads_formatted[0::2])

'''
At this point, we have the dictionaries for the first, middle, and last
parts of reads. 
Each read will be split into 3 parts.
  3 equal parts if |read| % 3 == 0
  2 equal first/last, and 1 middle, if |read| % 3 != 0
'''

'\nAt this point, we have the dictionaries for the first, middle, and last\nparts of reads. \nEach read will be split into 3 parts.\n  3 equal parts if |read| % 3 == 0\n  2 equal first/last, and 1 middle, if |read| % 3 != 0\n'

In [None]:
# reads_format() drops the first line of the file, so the even-indexed entries
# are the sequences and the odd-indexed entries are the '>read_i/j' title
# lines that follow them.
reads_notitle = reads_formatted[0::2] #ignore names in reads file
reads_title = reads_formatted[1::2]

def get_dict_for_read(whichread):
  """Run align() on the read stored at index ``whichread`` of reads_notitle.

  ``whichread`` is the integer position of the read. The result is align()'s
  tuple, unchanged: (dict_mid, dict_fl, read_f, read_m, read_l).
  """
  # Each reads_notitle entry is a comma-split line; field 0 is the sequence.
  sequence = reads_notitle[whichread][0]
  return align(sequence, refs_file)

# Smoke test: split read 27 and show its first segment.
dict_mid, dict_fl, read_f, read_m, read_l = get_dict_for_read(27)
print(read_f)

# Print only entry 27 so the output matches its label (the whole list was
# being dumped under the "reads_notitle[27]" heading).
print("reads_notitle[27]: ", reads_notitle[27])

def hamming_distance(p, q):
  """Count the positions of ``p`` at which ``q`` holds a different symbol.

  Only the first len(p) positions are compared; ``q`` must be at least as
  long as ``p`` or indexing it raises IndexError.
  """
  mismatches = 0
  for idx, symbol in enumerate(p):
    if symbol != q[idx]:
      mismatches += 1
  return mismatches
  
def neighbors(read, d = 2):
  """Return every A/C/G/T string within Hamming distance ``d`` of ``read``.

  Args:
    read: The sequence whose d-neighborhood is generated.
    d: Maximum number of substituted positions (default 2).

  Returns:
    A set of strings, each the same length as ``read``, that differ from it
    in at most ``d`` positions. ``read`` itself is always included when it
    consists only of A/C/G/T.
  """
  # Base cases: no substitutions left to spend, or nothing left to mutate.
  # The empty-string check matters: without it an empty read with d > 0 keeps
  # recursing on read[1:] == '' until RecursionError.
  if d == 0 or not read:
    return {read}
  if len(read) == 1:
    return {'A', 'C', 'G', 'T'}

  first, suffix = read[0], read[1:]
  neighborhood = set()
  for text in neighbors(suffix, d):
    # text always has the same length as suffix, so zip covers every position.
    mismatches = sum(a != b for a, b in zip(suffix, text))
    if mismatches < d:
      # A substitution is still available: the first symbol may be anything.
      for nucleotide in ['A', 'C', 'G', 'T']:
        neighborhood.add(nucleotide + text)
    else:
      # The budget is spent in the suffix: the first symbol must match.
      neighborhood.add(first + text)
  return neighborhood

# Notebook scratch: evaluate the default (d = 2) neighborhood of one 17-mer.
neighbors('TATCTACACATTACGAG')

# NOTE(review): the loop runs len(read_f) times (the length of one read
# segment), yet i is used to index reads_notitle, and readNeighbor is
# overwritten on every pass — presumably meant to iterate over the reads
# themselves; confirm intent.
for i in range(len(read_f)):
  read = reads_notitle[i][0]
  readNeighbor = neighbors(read, 1) #compute the 1-neighborhood of a read



reads_notitle[27]:  [['CAGTCACCCAGTGCCGCTGATGCCCAAGCACAGAAAACGGATGTTGCTAT'], ['ATTACGAGACGCATTATCAGCGTATCTTGGGCTTTAACGTGTATAGGGCG'], ['TATCTACACATTACGAGACGCATTATCAGCGTCTCTTGGGCTTTAACGTG'], ['AAATGCATGGATCAGGGGCAAGATGCAGACACGGCTGTACTTCCCAAGGCA'], ['TAGAATCCACGGGGTAGGCGAATAATCCATTTTGTCACCCTCAACAAAC'], ['TTTTAGGAAAGTCAAATGCATGGATCAGGGGCAAGAGCAGACACGGCTT'], ['CAACTATGCCCCACATGACAAATCTGACGACGTGATTGCAGCCCACAAGG'], ['AGTTTGACGTATCCCAAGGATGAGATACAGTACATGAGTGCTCCTCTAC'], ['ATAGCCATTTAGGGCTTGATCCTATAGTTCTCCGTATCCAGTTGTGCTAA'], ['AGATGCTTGCTGTAACCGAGATGCCTCAGGCAGATACCTTAATGCGACGA'], ['CCCCAAGGATGAGATACAGTACATGAGTGCTCCTCTACTGACACGTTTCGC'], ['GATGCCCAAGCACAGAAAACGGATGTTGCTATAGAATCCACGGTGTAGGC'], ['CGGTGGTTTCCGGCATATGTAGATGCTTGTTGTAACCGAGATGCCTCAG'], ['GTTAGTTTCGAAGACTGCGCTACTCTGTTGAACCCATATTCAACCCTGA'], ['TCGGTGGTTTCCGGCATATGTAGATGCTTGTTGTAACCGAGATGCCTCAG'], ['GTGACTAGCGCAAAGTTTGTCTGTATATTAGTAAGCCGTTGTTTCGAAG'], ['TCTGTTGAACCCATATACGAACCCTGAAGTCGAAGACGTTCTTGCTCAGC'], ['TGCTATAGAATCCACGGTCTAGGCGAAT

In [None]:
# find keys of substitutions for a given read
def substitutions(whichread):
  """Find where a read fits the reference, tolerating substitutions.

  The read is split into first/middle/last segments by get_dict_for_read().
  For each segment, every neighbor produced by neighbors() (default distance
  2) is looked up in the k-mer map of the matching length. A placement is
  valid when a first-segment hit is immediately followed in the reference by
  a middle-segment hit, which is immediately followed by a last-segment hit.

  Args:
    whichread: Integer index of the read in ``reads_notitle``.

  Returns:
    A list of ``(matched_string, ref_position)`` tuples, where
    ``matched_string`` is the reference text covered by the three adjacent
    hits and ``ref_position`` is the 0-based reference position of its start.
  """
  dict_m, dict_fl, read_f, read_m, read_l = get_dict_for_read(whichread)

  hits_f = _segment_hits(read_f, dict_fl)
  # A reference position starts exactly one k-mer of a given length, so the
  # middle and last hits can be keyed by position. That replaces the triple
  # nested scan with two dictionary lookups per first-segment hit.
  mid_at = {pos: kmer for kmer, pos in _segment_hits(read_m, dict_m)}
  # The last segment has the same length as the first, hence dict_fl; the
  # neighbors searched must be those of read_l, not read_f.
  last_at = {pos: kmer for kmer, pos in _segment_hits(read_l, dict_fl)}

  valid_sections = []
  for kmer_f, pos_f in hits_f:
    pos_m = pos_f + len(kmer_f)  # the middle hit must start where first ends
    kmer_m = mid_at.get(pos_m)
    if kmer_m is None:
      continue
    kmer_l = last_at.get(pos_m + len(kmer_m))  # last starts where middle ends
    if kmer_l is None:
      continue
    valid_sections.append((kmer_f + kmer_m + kmer_l, pos_f))

  return valid_sections


def _segment_hits(segment, index):
  """Return (neighbor, position) for each neighbor of ``segment`` in ``index``."""
  return [(neighbor, position)
          for neighbor in neighbors(segment)
          for position in index.get(neighbor, ())]

In [None]:
''' call substitutions() on each read '''
differences = {}

for i in range(len(reads_notitle)): # for each read # len(reads_notitle)

  dict_m, dict_fl, read_f, read_m, read_l = get_dict_for_read(i)
  whole_read = read_f+read_m+read_l

  if len(whole_read) < 3: # skip reads shorter than 3 bases: the algorithm relies
                          # on splitting each read into 3 segments (done in align),
                          # and such a read would leave at least one segment empty
    continue

  whichread = i
 
  # VALID_SECTIONS FORMAT
  # for each read:
    # list of tuples [(,),(,)]: (matched reference string, reference start position)

  valid_sections = substitutions(whichread) # create valid sections for each

# NOTE(review): valid_sections is rebound on every iteration, so only the last
# read's result survives the loop, and `differences` is never filled in here.
# print(sorted([(key, differences[key]) for key in differences.keys() if differences[key] > 4])) # error check

# "design your own way of distinguishing between mutations and errors."

Where I'm at right now:
There's an issue with my valid_sections. 
It doesn't contain the right info.

Figure out why over the weekend.