<a href="https://colab.research.google.com/github/recanoy/Coursera_PythonForGenomicDataScience_FinalExam/blob/main/final_exam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## (01) Installing Biopython

In [None]:
pip install biopython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.7 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79


## (02) Common Packages

In [None]:
import requests
import Bio
from Bio import *
from Bio import SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

## (03) Common Functions

In [None]:
# Extracting the file
def get_file(url, filename):
  req = requests.get(url)
  if req.status_code != 200: # the request is successful
    raise Exception("Could not get file")
  
  with open(filename, 'w') as fn:
    fn.write(req.text)

# Processing the file
def process_file(filename, filetype, info="all"):
  if info == "identifiers":
    return [seq_record.id for seq_record in SeqIO.parse(filename, filetype)]
  elif info == "sequence":
    return [seq_record.seq for seq_record in SeqIO.parse(filename, filetype)]
  elif info == "length":
    return [len(seq_record) for seq_record in SeqIO.parse(filename, filetype)]
  else:
    identifiers = [seq_record.id for seq_record in SeqIO.parse(filename, filetype)]
    sequence = [repr(seq_record.seq) for seq_record in SeqIO.parse(filename, filetype)]
    lengths = [len(seq_record) for seq_record in SeqIO.parse(filename, filetype)]

    complete_data = {
        "identifiers" : identifiers,
        "sequence" : sequence,
        "lengths" : lengths
    }

    return complete_data

# Determining the species
def species_finder(seq, E_VALUE_THRESH=0.01):
  #E_VALUE_THRESH = 0.01
  result_handle = NCBIWWW.qblast("blastn", "nt", seq)
  blast_record = NCBIXML.read(result_handle)

  for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
      if hsp.expect < E_VALUE_THRESH:
        print("*****Alignment*****")
        print("sequence:", alignment.title)
        print("length:", alignment.length)
        print("e value:", hsp.expect)
        print(hsp.query)
        print(hsp.match)
        print(hsp.sbjct)
        print("\n")


## (04) Getting the file

In [None]:
url = "https://d396qusza40orc.cloudfront.net/genpython/data_sets/dna.example.fasta"
filename = "dna.example.fasta"
filetype = "fasta"

get_file(url, filename)

## (05) Problem 1
* How many records are in the file?
  * A FASTA file is a single-line header, followed by lines of sequence data
  * The header line is distinguised from the sequence by the a greater-than (">") symbol in the first column.

In [None]:
print("There are %d record(s) in this FASTA file" % len(process_file(filename, filetype, info="identifiers")))

There are 25 record(s) in this FASTA file


## (06) Problem 2

* What are the lengths of the sequences in the file?
* What is the longest sequence?
* What is the sortest sequence?
* Is there more than one longest sequence?
* What are their identifiers?

In [None]:
# Determining the lengths of the sequences
def seq_len(lengths):
  max_seq = max(lengths) # maximum sequence length
  min_seq = min(lengths) # minimum sequence length
  
  max_ind = [ind for ind, value in enumerate(lengths) if value == max_seq]
  min_ind = [ind for ind, value in enumerate(lengths) if value == min_seq]

  num_max_seq = len([lengths[ind] for ind in max_ind])
  num_min_seq = len([lengths[ind] for ind in min_ind])

  print(
      """
      The maximum and minimum lengths of sequences is %d and %d respectively.
      There are %d record(s) which has the maximum length.
      There are %d record(s) which has the minimum length.
      """ % (max_seq, min_seq, num_max_seq, num_min_seq)
  )

  return max_ind, min_ind

In [None]:
lengths = process_file(filename, filetype, info="length")
identifiers = process_file(filename, filetype, info="identifiers")
max_ind, min_ind = seq_len(lengths)


      The maximum and minimum lengths of sequences is 4805 and 512 respectively.
      There are 1 record(s) which has the maximum length.
      There are 1 record(s) which has the minimum length.
      


In [None]:
for ind in max_ind:
  print("Identifier of sequence with max length: %s" % (identifiers[ind]))

Identifier of sequence with max length: gi|142022655|gb|EQ086233.1|323


In [None]:
for ind in min_ind:
  print("Identifier of sequence with min length: %s" % (identifiers[ind]))

Identifier of sequence with min length: gi|142022655|gb|EQ086233.1|521


## Problem 3

* A reading frame is a way dividing the DNA sequence of nucleotides into a set of consecutive, non-overlapping triplets (or codons)
* There are six possible reading frames: three in the forward (5' to 3') direction and three in the reverse (3' to 5')
* What is the length of the longest ORF in the file?
* What is the identifier of the sequence containing the longest ORF?
* For a given sequence identifier, what is the longest ORF contained in the sequence represented by that identifier?
* What is the starting position of the longest ORF in the sequence that contains it? **The positions should indicate the character number in the sequence**

In [None]:
from Bio.Seq import Seq
my_seq = next(SeqIO.parse(filename, filetype)).seq

In [None]:
def extract_reading_frames(my_seq):
  # Extracting the reading the frames from a sequence
  reading_frame1 = []                   # Reading frame 1
  reading_frame2 = [Seq(my_seq[0])]     # Reading frame 2
  reading_frame3 = [Seq(my_seq[0:2])]   # Reading frame 3
  for frame in range(1, 4):
    if frame == 1:
      for i in range(0, len(my_seq), 3):
        reading_frame1.append(my_seq[i:i+3])
    elif frame == 2:
      for i in range(1, len(my_seq), 3):
        reading_frame2.append(my_seq[i:i+3])
    else:
      for i in range(2, len(my_seq), 3):
        reading_frame3.append(my_seq[i:i+3])

  return reading_frame1, reading_frame2, reading_frame3

In [None]:
def extract_ORF(read_frame):
  # Determining the ORFs in the reading frame
  # ORFs aresequences that begin with start codons and end with stop codons.
  # ORFs have the potential of coding for proteins.
  # start_codons: [Seq("ATG")]
  # stop_codons: [Seq("TAA"), Seq("TAG"), Seq("TGA")]
  # 
  # Argument(s):
  # read_frame: Either reading frame 1 or 2 or 3
  #
  # Output:
  # ORF_LIST: List of all the ORFs
  start_codons = [Seq("ATG")]                         # contains the list of start codons
  stop_codons = [Seq("TAA"), Seq("TAG"), Seq("TGA")]  # contains the list of stop codons

  start_ind = [ind for ind, value in enumerate(read_frame) if value == start_codons[0]]   # the indices containing the start codons
  stop_ind = [ind for ind, value in enumerate(read_frame) if (value == stop_codons[0] or value == stop_codons[1] or value == stop_codons[2])] # the indices containing the stop codons

  ORF_LIST = []                                       # empty list containing the ORFs
  ORF_IND = []                                        # empty list containing the indices of the ORF
  ORF = Seq("")                                       # empty ORF character

  if len(start_ind) > 0 and len(stop_ind) > 0:        # evaluate this section is there are start and stop codons in the sequence
    if start_ind[0] < stop_ind[0]:
      for ind in range(start_ind[0], stop_ind[0]):      # evaluating the element in the first element in the reading frame
        ORF += read_frame[ind]
      ORF_LIST.append(ORF)
      ORF_IND.append((start_ind[0], stop_ind[0]))

    for start_index in range(1, len(start_ind)):        # evaluating the second element in the start codon indices
      ORF = Seq("")                                          
      for stop_index in range(1, len(stop_ind)):        # evaluating the second element in the stop codon indices
        if ((stop_ind[stop_index] < start_ind[start_index]) or (start_ind[start_index] < stop_ind[stop_index-1])):
          continue
        
        else:
          ORF_IND.append((start_ind[start_index], stop_ind[stop_index]))
          for ind in range(start_ind[start_index], stop_ind[stop_index]+1): # 
            ORF += read_frame[ind]                     # appending the codons in the empty characters
          ORF_LIST.append(ORF)


  # Determining the lengths of the reading frames
  ORF_len = []
  if len(ORF_LIST) > 0:
    for i in range(0, len(ORF_LIST)):
      ORF_len.append(len(ORF_LIST[i]))
  
  # Determining the locations of the maximum codons
  max_ORF = []
  max_ind = []
  max_seq = []
  if len(ORF_len) > 0:
    max_ORF = [max(ORF_len)]
    max_ind = [ORF_IND[ind] for ind, value in enumerate(ORF_len) if value == max_ORF[0]]
    max_seq = [ORF_LIST[ind] for ind, value in enumerate(ORF_len) if value == max_ORF[0]]

  # Returning the results
  return ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq

In [None]:
read_frame1, read_frame2, read_frame3 = extract_reading_frames(my_seq)
print("Reading Frame 1:", read_frame1, "\n", "Reading Frame 2:", read_frame2, "\n", "Reading Frame 3:", read_frame3)

Reading Frame 1: [Seq('TCG'), Seq('GGC'), Seq('GAA'), Seq('GGC'), Seq('GGC'), Seq('AGC'), Seq('AAG'), Seq('TCG'), Seq('TCC'), Seq('ACG'), Seq('CGC'), Seq('AGC'), Seq('GCG'), Seq('GCA'), Seq('CCG'), Seq('CGG'), Seq('GCC'), Seq('TCT'), Seq('GCC'), Seq('GTG'), Seq('CGC'), Seq('TGC'), Seq('TTG'), Seq('GCC'), Seq('ATG'), Seq('GCC'), Seq('TCC'), Seq('AGC'), Seq('GCA'), Seq('CCG'), Seq('ATC'), Seq('GGA'), Seq('TCA'), Seq('AAG'), Seq('CCG'), Seq('CTG'), Seq('AAG'), Seq('CCT'), Seq('TCG'), Seq('CGC'), Seq('ATC'), Seq('AGG'), Seq('CGG'), Seq('CCA'), Seq('TAG'), Seq('TTG'), Seq('GCG'), Seq('CCA'), Seq('GTG'), Seq('ACC'), Seq('GTA'), Seq('CCA'), Seq('ACC'), Seq('GCC'), Seq('TTG'), Seq('ATG'), Seq('CGG'), Seq('CGC'), Seq('TCG'), Seq('GTC'), Seq('ATC'), Seq('GCT'), Seq('GCA'), Seq('TTG'), Seq('ATC'), Seq('GAG'), Seq('TAG'), Seq('CCA'), Seq('CCG'), Seq('CCG'), Seq('CCG'), Seq('CAA'), Seq('ATG'), Seq('CCC'), Seq('AGC'), Seq('ACG'), Seq('CCA'), Seq('ATG'), Seq('CGT'), Seq('TCT'), Seq('TCA'), Seq('TCC')

In [None]:
for frame in range(0, 3):
  if frame == 0:
    ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame1)
    print("\n Processing reading frame 1 ...")
  elif frame == 1:
    ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame2)
    print("\n Processing reading frame 2 ...")
  else:
    ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame3)
    print("\n Processing reading frame 3 ...")
  print("ORF List:", ORF_LIST, "\n", "ORF Indices", ORF_IND, "\n", "ORF lengths", ORF_len, "\n", "Maximum Length of an ORF in a sequence", max_ORF, "\n", "Index of a the maximum ORF", max_ind, "\n", "Maximum sequence:", max_seq)


 Processing reading frame 1 ...
ORF List: [Seq('ATGGCCTCCAGCGCACCGATCGGATCAAAGCCGCTGAAGCCTTCGCGCATCAGGCGGCCA'), Seq('ATGCGGCGCTCGGTCATCGCTGCATTGATCGAGTAG'), Seq('ATGCCCAGCACGCCAATGCGTTCTTCATCCACATAG'), Seq('ATGCGTTCTTCATCCACATAG'), Seq('ATGACGACGAAACCTTCCTTGGCCAGCGCCTCGCCATACACGTTCCCCGATGTT...TAA'), Seq('ATGATGGCGGGATATTTCTTGCCTTCGTCGAAGTTCGGCGGGAAGTGGATGTCG...TAA'), Seq('ATGGCGGGATATTTCTTGCCTTCGTCGAAGTTCGGCGGGAAGTGGATGTCGGCT...TAA'), Seq('ATGTCGGCTGCGATATCCCAATACACATTCTTGATCTTGACGCTTTTCATGACA...TAA'), Seq('ATGACAGCTCCGTTCAGGGGGAGGGGGTAA'), Seq('ATGGCACCTACATGGATCCCTCACTGCTTCCGTCTCTCGCGTGGTTCGCCCACG...TGA')] 
 ORF Indices [(24, 44), (55, 66), (72, 83), (77, 83), (122, 192), (151, 192), (152, 192), (167, 192), (183, 192), (230, 272)] 
 ORF lengths [60, 36, 36, 21, 213, 126, 123, 78, 30, 129] 
 Maximum Length of an ORF in a sequence [213] 
 Index of a the maximum ORF [(122, 192)] 
 Maximum sequence: [Seq('ATGACGACGAAACCTTCCTTGGCCAGCGCCTCGCCATACACGTTCCCCGATGTT...TAA')]

 Processing readi

## (04) Problem 4

* A repeat is a substring of a DNA sequence that occurs in multiple copies (more than one) somewhere in the sequence
* For example, the sequence ACACA contains two copies of the sequence ACA - once at position 1 (index 0 in Python), and onece at position 3

In [None]:
my_seq = Seq("ACACACA")
n = 3

In [None]:
def repeats_finder(my_seq, n):
  repeats = {}
  # Starts at position 0
  rep = my_seq[0:0+n]

  count = 0
  position = []

  for j in range(0, len(my_seq)-n+1):
    print('Repeat Subject', rep, 'Repeat Query', my_seq[j:j+n])
    if rep == my_seq[j:j+n]:
      position.append(j+1)
      count += 1
  print('\n')

  if count > 1:
    repeats[rep] = [(count), position]

  # Starts at position 1
  for i in range(1, len(my_seq)):
    rep = my_seq[i:i+n]
    
    count = 0
    position = []
    if not key_finder(rep, repeats):
      for j in range(i, len(my_seq)-n+1):
        print('Repeat Subject', rep, 'Repeat Query', my_seq[j:j+n])
        if rep == my_seq[j:j+n]:
          position.append(j+1)
          count += 1
      print('\n')

      if count > 1:
        repeats[rep] = [(count), position]
    else:
      continue
  return(repeats)

def key_finder(rep, repeats):
  keys = repeats.keys()
  keys_list = [key[1] for key in enumerate(keys)]

  return any([key[1] == rep for key in enumerate(keys_list)])

In [None]:
reps = repeats_finder(my_seq, n)
print(reps)

Repeat Subject ACA Repeat Query ACA
Repeat Subject ACA Repeat Query CAC
Repeat Subject ACA Repeat Query ACA
Repeat Subject ACA Repeat Query CAC
Repeat Subject ACA Repeat Query ACA


Repeat Subject CAC Repeat Query CAC
Repeat Subject CAC Repeat Query ACA
Repeat Subject CAC Repeat Query CAC
Repeat Subject CAC Repeat Query ACA






{Seq('ACA'): [3, [1, 3, 5]], Seq('CAC'): [2, [2, 4]]}


## (05) Final Exam Questions

### 1: How many records are in the multi-FASTA file

In [None]:
fileurl = "https://d396qusza40orc.cloudfront.net/genpython/data_sets/dna2.fasta"
filename = "dna2.fasta"
filetype = "fasta"

In [None]:
get_file(fileurl, filename)

In [None]:
print("There are %d records in the multi-FASTA file." % len(process_file(filename, filetype, info="identifiers")))

There are 18 records in the multi-FASTA file.


### 2-3. What is the length of the longest sequence in the file?


In [None]:
lengths = process_file(filename, filetype, info="length")
max_ind, min_ind = seq_len(lengths)

print("\n Longest Sequence:", lengths[max_ind[0]], "\n", "\n Shortest Sequence", lengths[min_ind[0]])


      The maximum and minimum lengths of sequences is 4894 and 115 respectively.
      There are 1 record(s) which has the maximum length.
      There are 1 record(s) which has the minimum length.
      

 Longest Sequence: 4894 
 
 Shortest Sequence 115


### 4. What is the length of the longest ORF appeariing in reading frame 2 of any sequence

In [None]:
sequences = process_file(filename, filetype, info="sequence")

longest_ORF_rf2 = []
for seq_ind in range(0, len(sequences)):
  read_frame1, read_frame2, read_frame3 = extract_reading_frames(sequences[seq_ind])
  
  ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame2)
  longest_ORF_rf2.append(max_ORF)

print("The length of the longest ORF appearing in reading frame 2 of any sequences is", max(longest_ORF_rf2))

The length of the longest ORF appearing in reading frame 2 of any sequences is [1458]


### 5. What is the starting position of the longest ORF in reading frame 3 in any of the sequences?

In [None]:
## Determining the longest ORF 
longest_ORF_rf3 = []
longest_ORF_rf3_indices = []
longest_ORF_rf3_sequences = []
for seq_ind in range(0, len(sequences)):
  read_frame1, read_frame2, read_frame3 = extract_reading_frames(sequences[seq_ind])

  ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame3)
  longest_ORF_rf3.append(max_ORF)
  longest_ORF_rf3_sequences.append(max_seq)

max_ORF_rf3 = max(longest_ORF_rf3)
indices = [ind for ind, value in enumerate(longest_ORF_rf3) if value == max_ORF_rf3]

## Determining the position of the longest ORF in reading frame 3
longest_seq = sequences[indices[0]]
ORF = longest_ORF_rf3_sequences[indices[0]]
n = len(ORF[0])

start_position = []
for i in range(0, len(longest_seq)):
  if longest_seq[i:i+n] == ORF[0]:
    start_position.append(i)

print("The starting position of the longest ORF in reading frame 3 in any sequences is ", start_position[0]+1)

The starting position of the longest ORF in reading frame 3 in any sequences is  636


### (6) What is the length of the longest ORF appearing in any sequence and in any forward reading frame?

In [None]:
long_ORFs = {}

for frame in range(0, 3):
  if frame == 0:
    longest_ORF = []
    for seq_ind in range(0, len(sequences)):
      read_frame1, read_frame2, read_frame3 = extract_reading_frames(sequences[seq_ind])

      ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame1)
      print(max_ORF)
      longest_ORF.append(max_ORF)
    long_ORFs["Reading Frame 1"] = max(longest_ORF)
  elif frame == 1:
    longest_ORF = []
    for seq_ind in range(0, len(sequences)):
      read_frame1, read_frame2, read_frame3 = extract_reading_frames(sequences[seq_ind])

      ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame2)
      longest_ORF.append(max_ORF)                                                                    
    long_ORFs["Reading Frame 2"] = max(longest_ORF)
  else:
    longest_ORF = []
    for seq_ind in range(0, len(sequences)):
      read_frame1, read_frame2, read_frame3 = extract_reading_frames(sequences[seq_ind])

      ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame3)
      longest_ORF.append(max_ORF)
    long_ORFs["Reading Frame 3"] = max(longest_ORF)

print(long_ORFs)

[1296]
[105]
[1440]
[2097]
[1059]
[1194]
[]
[117]
[39]
[312]
[180]
[1044]
[1509]
[27]
[249]
[204]
[]
[195]
[237]
[]
[1185]
[525]
[1281]
[552]
[153]
[135]
[]
[420]
[177]
[819]
[1458]
[27]
[]
[276]
[]
[570]
[588]
[144]
[285]
[324]
[222]
[249]
[186]
[]
[210]
[711]
[183]
[1401]
[1644]
[102]
[219]
[168]
[]
[1821]
{'Reading Frame 1': [2097], 'Reading Frame 2': [1458], 'Reading Frame 3': [1821]}


## (07) What is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|16?

In [None]:
identifiers = process_file(filename, filetype, info="identifiers")
identifier = "gi|142022655|gb|EQ086233.1|16"

ind = [identifiers.index(identifier)]
seq = sequences[ind[0]]

In [None]:
longest_ORF_myseq = {}
read_frame1, read_frame2, read_frame3 = extract_reading_frames(seq)

for frame in range(0, 3):

  if frame == 0:
    ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame1)
    
    longest_ORF_myseq["Reading Frame 1:"] = max_ORF
  elif frame == 1:
    ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame2)

    longest_ORF_myseq["Reading Frame 2:"] = max_ORF
  else:
    ORF_LIST, ORF_IND, ORF_len, max_ORF, max_ind, max_seq = extract_ORF(read_frame3)

    longest_ORF_myseq["Reading Frame 3:"] = max_ORF

print(longest_ORF_myseq)


{'Reading Frame 1:': [1509], 'Reading Frame 2:': [1458], 'Reading Frame 3:': [1644]}


### (08) Find the most frequently occuring repeat of length 6 in all sequences. How many times does it occur in all?

In [None]:
def reps_compile(reps, reps_compiled):
  keys_compiled = reps_compiled.keys()   # keys in the compiled repeats
  vals_compiled = reps_compiled.values() # values in the compiled repeats

  keys = reps.keys()            # keys in the inputted repeats
  vals = reps.values()          # values in the inputted repeats

  keys_compiled_list = [key_comp[1] for key_comp in enumerate(keys_compiled)]
  keys_list = [key[1] for key in enumerate(keys)]

  if len(keys_list) > 0:
    for ind in range(0, len(keys_list)):
      if len(reps_compiled) > 0:
        if any([keys_list[ind] == val for i, val in enumerate(keys_compiled_list)]):
          reps_compiled[keys_list[ind]] += reps[keys_list[ind]][0]
      else:
        for ind in range(0, len(keys_list)):
          reps_compiled[keys_list[ind]] = reps[keys_list[ind]][0]
  
  return reps_compiled

In [None]:
reps_compiled = {}
for seq in range(0, len(sequences)):
  reps = repeats_finder(sequences[seq], 6)
  reps_compiled = reps_compile(reps, reps_compiled)

import pickle
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_ROOT_DIR = '/content/drive/'

with open(os.path.join(PROJECT_ROOT_DIR, 'reps_compiled.pkl'), 'wb') as handle:
  pickle.dump(reps_compiled, handle, protocol=pickle.HIGHEST_PROTOCOL)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Repeat Subject ACGAGC Repeat Query CCGACC
Repeat Subject ACGAGC Repeat Query CGACCG
Repeat Subject ACGAGC Repeat Query GACCGC
Repeat Subject ACGAGC Repeat Query ACCGCG
Repeat Subject ACGAGC Repeat Query CCGCGG
Repeat Subject ACGAGC Repeat Query CGCGGT
Repeat Subject ACGAGC Repeat Query GCGGTG
Repeat Subject ACGAGC Repeat Query CGGTGA
Repeat Subject ACGAGC Repeat Query GGTGAC
Repeat Subject ACGAGC Repeat Query GTGACC
Repeat Subject ACGAGC Repeat Query TGACCA
Repeat Subject ACGAGC Repeat Query GACCAG
Repeat Subject ACGAGC Repeat Query ACCAGC
Repeat Subject ACGAGC Repeat Query CCAGCG
Repeat Subject ACGAGC Repeat Query CAGCGT
Repeat Subject ACGAGC Repeat Query AGCGTG
Repeat Subject ACGAGC Repeat Query GCGTGC
Repeat Subject ACGAGC Repeat Query CGTGCC
Repeat Subject ACGAGC Repeat Query GTGCCG
Repeat Subject ACGAGC Repeat Query TGCCGG
Repeat Subject ACGAGC Repeat Query GCCGGC
Repeat Subject ACGAGC Repeat Query CCGGCC
Repeat Subj

In [None]:
import pickle
import os
from google.colab import drive
drive.mount('/content/drive')
PROJECT_ROOT_DIR = '/content/drive/'

with open(os.path.join(PROJECT_ROOT_DIR, 'reps_compiled.pkl'), 'wb') as handle:
  pickle.dump(reps_compiled, handle, protocol=pickle.HIGHEST_PROTOCOL)