# Motif Mark Test Notebook

## Building the Script

In [3]:
import re

In [None]:
import cairo
import math

In [5]:
#!/usr/bin/env python

# motif_mark.py

import re
import argparse

### Command-line interface: three required string options (-f, -o, -m).
parser = argparse.ArgumentParser(
    description="Creates a visualization of one or more motifs across a sequence or multiple sequences provided in UCSC format. Requires sequences and a list of motifs to be visualized.",
)
parser.add_argument(
    '-f', '--file',
    type=str,
    required=True,
    help='absolute path to FASTA file of sequences to be searched.',
)
parser.add_argument(
    '-o', '--out_dir',
    type=str,
    required=True,
    help='absolute path to the output directory.',
)
parser.add_argument(
    '-m', '--motifs',
    type=str,
    required=True,
    help='absolute path to the text file containing the list of motifs to be visualized, formatted as one motif per line.',
)
### Parsed eagerly at import time; later code reads args.file, args.out_dir and args.motifs.
args = parser.parse_args()


#####################################################################################
##### Make Dictionaries #############################################################
#####################################################################################

### Lookup table: upper-case IUPAC nucleotide code -> regex character class.
### Every class lists both cases, and T/U are interchangeable, so the resulting
### patterns match DNA or RNA text regardless of capitalisation.
iupac = {
    # unambiguous bases
    "A": "[Aa]",
    "T": "[TtUu]",
    "C": "[Cc]",
    "G": "[Gg]",
    "U": "[TtUu]",
    # two-base codes
    "M": "[AaCc]",
    "R": "[AaGg]",
    "W": "[AaTtUu]",
    "S": "[CcGg]",
    "Y": "[CcTtUu]",
    "K": "[GgTtUu]",
    # three-base codes
    "V": "[AaCcGg]",
    "H": "[AaCcTtUu]",
    "D": "[AaGgTtUu]",
    "B": "[CcGgTtUu]",
    # any base
    "N": "[AaTtCcGgUu]",
}

### Motif dictionary
### Keys are the motifs listed (one per line) in the file given by -m/--motifs;
### every value starts at 0 (how that value is used later is not shown here).
motif_dict = {}
with open(args.motifs) as motif_list:     ### honour the required --motifs option instead of a hard-coded "motifs.txt"
    for motif in motif_list:
        motif = motif.strip()     ### also drops "\r" and stray spaces, not just "\n"
        if motif:     ### a blank line must not become an empty-string motif
            motif_dict[motif] = 0
print("Created motif dictionary.", flush = True)

#####################################################################################
##### Define Higher Order Functions #################################################
#####################################################################################

def fasta_twofer(filepath, out_dir=None):
    '''Collapse a multi-line FASTA file into exactly two lines per entry.

    Reads the FASTA file at ``filepath`` and writes "sequences_twofer.fasta"
    into ``out_dir``. Each entry becomes one defline followed by one line that
    holds the whole sequence.

    Parameters:
        filepath: path to the input FASTA file (opened read-only).
        out_dir: directory that receives "sequences_twofer.fasta". When left as
            None, the value of the -o/--out_dir command-line option
            (``args.out_dir``) is used.

    Returns:
        None. The result is the file written to disk.
    '''
    import os     ### local import: this file has no top-level "import os"

    if out_dir is None:
        out_dir = args.out_dir
    out_path = os.path.join(out_dir, "sequences_twofer.fasta")

    ### True while the current output line (the sequence line of an entry) is
    ### still open and has to be terminated before the next defline / at EOF.
    needs_newline = False
    with open(filepath) as fasta, open(out_path, "w") as out:
        for line in fasta:
            line = line.strip()     ### removes "\n", "\r" and stray spaces
            if not line:
                continue
            if line.startswith(">"):
                if needs_newline:     ### close the previous entry's sequence line
                    out.write("\n")
                out.write(line + "\n")
                needs_newline = True     ### a sequence line always follows a defline
            else:
                out.write(line)     ### append to the current sequence line
                needs_newline = True
        if needs_newline:
            out.write("\n")

def count_lines(infile):
    '''Return the number of lines in the file at ``infile``.

    An empty file yields 0. A final line without a trailing newline still
    counts as a line.
    '''
    with open(infile) as file:
        ### Counting with sum() avoids relying on a loop variable that is never
        ### bound when the file has no lines.
        return sum(1 for _ in file)

def motif2regex(motif):
    '''Translate an IUPAC motif into a case-insensitive regular expression.

    Each symbol of ``motif`` is looked up (upper-cased, so lower-case motifs
    work too) in the module-level ``iupac`` table and replaced by its character
    class; the classes are concatenated in order.

    Parameters:
        motif: string of IUPAC nucleotide codes, in any case.

    Returns:
        The regular-expression string for the motif.

    Raises:
        ValueError: if ``motif`` contains a symbol that is not in ``iupac``.
    '''
    regex_parts = []
    for symbol in motif:
        char_class = iupac.get(symbol.upper())
        if char_class is None:
            ### Fail with a clear message instead of letting "".join() choke on None.
            raise ValueError("Unsupported IUPAC symbol {!r} in motif {!r}".format(symbol, motif))
        regex_parts.append(char_class)
    return "".join(regex_parts)

def motif_position(motif, sequence):
    '''Locate every non-overlapping occurrence of a motif in a sequence.

    ``motif`` is converted to a regular expression with motif2regex() and
    searched for in ``sequence`` with re.finditer().

    Returns a list of [start, end] pairs, one per match. ``start`` is the
    match's 0-based start index plus 1 and ``end`` is the match's end index
    as reported by the regex engine, so the pair is a 1-based inclusive range
    (used as pixel coordinates when the image is drawn).
    '''
    pattern = motif2regex(motif)
    return [[hit.start() + 1, hit.end()] for hit in re.finditer(pattern, sequence)]

Created motif dictionary.


## In-Line Testing

In [6]:
### Scratch check: which start/end indices does re.finditer report for a
### case-sensitive first character followed by a case-insensitive second one?
motif = "[a][Aa]"
sequence = "AABBBAaA"

for match in re.finditer(motif, sequence):
    s, e = match.span()     ### (start, end) of the current match
    print(s, e)

6 8


In [7]:
### Scratch check: convert a plain motif to its regex through the iupac table,
### then collect the [start + 1, end] position pair of every match.
motif = "ATCG"
motif_list = [iupac.get(base) for base in motif]
new_motif = "".join(motif_list)
print(new_motif)

sequence = "ATCGGGGGGGATCG"

positions = [[hit.start() + 1, hit.end()] for hit in re.finditer(new_motif, sequence)]
positions

[Aa][TtUu][Cc][Gg]


[[1, 4], [11, 14]]

In [9]:
### Echo each line of the FASTA file followed by its 1-based line number.
with open("sequence.fasta", "r+") as file:
    linecount = 0     ### stays 0 when the file has no lines
    for linecount, line in enumerate(file, start=1):
        print(line)
        print(linecount)

>INSR chr19:7149896-7151209 (reverse complement)

1
aaaattctgccagacttggagaagtggctgagtcagttgtgatgtccacatgtagtcacg

2
tttgacatcccagggccacctcagcaggccgtctctggggagaattttctctgatttctt

3
ccccttcccttgctggacccctgcacctgctggggaagatgtagctcactccgtctagca

4
agtgatgggagcgagtggtccagggtcaaagccagggtgcccttactcggacacatgtgg

5
cctccaagtgtcagagcccagtggtctgtctaatgaagttccctctgtcctcaaaggcgt

6
tggttttgtttccacagAAAAACCTCTTCAGGCACTGGTGCCGAGGACCCTAGgtatgac

7
tcacctgtgcgacccctggtgcctgctccgcgcagggccggcggcgtgccaggcagatgc

8
ctcggagaacccaggggtttctgtggctttttgcatgcggcgggcagctgtgctggagag

9
cagatgcttcaccaattcagaaatccaatgccttcactctgaaatgaaatctgggcatga

10
atgtggggagaaaccttcactaacacactcttgctaaaacatagaatcatgggagtgctt

11
cctgggtaccccctccctgccttctgtttgcagccactgtttgctcactaaacatctctg

12
cacctcccgcgtgcatttgcagaggtgggtgggggtccccggagtctgagctccccgcgg

13
ctgggtgccccgacccagcagctcctacaccatgaatggaggttgatctggaaacagaat

14
attttcatgaaagggcgacagggtatgaacaaaagaacaccgtgtcgctcactgaattcc

15
acggaggagagtcaggga

16
>MBNL chr3:152432504-15

In [14]:
def fasta_twofer(filepath):
    '''Rewrite the FASTA file at ``filepath`` as "sequences_twofer.fasta" in the
    current working directory, with each entry's sequence joined onto one line.

    Deflines (lines starting with ">") are copied unchanged; every defline after
    the first is preceded by a newline that terminates the previous sequence.
    All other lines have their newline removed and are appended as they come.
    '''
    seen_header = False
    with open(filepath, "r+") as source, open("sequences_twofer.fasta", "w+") as sink:
        for record_line in source:
            if not record_line.startswith(">"):
                ### Sequence text: drop the line break so the pieces run together
                sink.write(record_line.strip("\n"))
                continue
            if seen_header:
                ### Close the previous entry's sequence line before this defline
                sink.write("\n")
            sink.write(record_line)
            seen_header = True
        sink.write("\n")     ### terminate the final sequence line

In [15]:
### Run fasta_twofer on the example FASTA file "sequence.fasta".
fasta_twofer("sequence.fasta")

## `Pycairo` Example from Leslie

In [None]:
# Pycairo demo: draws a line, a filled rectangle, a curve and an outlined
# filled circle into "example.svg".
width, height = 800, 500

#create the surface (the output target) with its size; output goes to example.svg
surface = cairo.SVGSurface("example.svg",width, height)
#create the context you will be drawing on (like a transparency) - you can create a transformation matrix
context = cairo.Context(surface)
#context.scale(width,height) #will set your drawing surface to a 0.0-1.0 scale

#Need to tell cairo where to put the brush, the color and width, and the shape you want it to draw
#draw a horizontal line from (50,25) to (450,25)
context.set_line_width(1)
context.move_to(50,25)        #(x,y)
context.line_to(450,25)
context.stroke()

#draw a filled rectangle; no source color has been set yet, so cairo's default is used
context.rectangle(100,100,150,350)        #(x, y, width, height) - NOT two corner points
context.fill()

#draw a curve starting at (25,250) with a thicker, colored stroke
context.move_to(25,250)
context.set_source_rgb(.25,.5,.5)
context.set_line_width(5)
context.curve_to(100,400,400,100,750,250)        #(x1,y1, x2,y2, x3,y3): two control points, then the end point
context.stroke()

#draw a filled circle with a differently colored outline
context.set_source_rgb(0,.6,.8)
context.arc(500, 300, 50, 0, 2*math.pi)        #(xc, yc, radius, angle1, angle2); 0 to 2*pi is a full circle
context.fill_preserve()            #draws filled circle, but preserves the path for later drawing
context.set_source_rgb(.3,.3,.3)
context.stroke()

#write out drawing
surface.finish()

In [None]:
# Pycairo demo: gradient background plus a stroked closed path, saved as "example2.png".
WIDTH, HEIGHT = 256, 256

surface = cairo.ImageSurface (cairo.FORMAT_ARGB32, WIDTH, HEIGHT)
ctx = cairo.Context (surface)

ctx.scale (WIDTH, HEIGHT) # Normalizing the canvas: coordinates below run from 0.0 to 1.0

pat = cairo.LinearGradient (0.0, 0.0, 0.0, 1.0) # Gradient line from (0, 0) to (0, 1)
# add_color_stop_rgba(offset, red, green, blue, alpha)
pat.add_color_stop_rgba (1, 0.7, 0, 0, 0.5) # Stop at offset 1 (end of the gradient line), 50% opacity
pat.add_color_stop_rgba (0, 0.9, 0.7, 0.2, 1) # Stop at offset 0 (start of the gradient line), 100% opacity

ctx.rectangle (0, 0, 1, 1) # Rectangle(x, y, width, height): covers the whole normalized canvas
ctx.set_source (pat)
ctx.fill ()

ctx.translate (0.1, 0.1) # Changing the current transformation matrix

ctx.move_to (0, 0)
# Arc(cx, cy, radius, start_angle, stop_angle)
ctx.arc (0.2, 0.1, 0.1, -math.pi/2, 0)
ctx.line_to (0.5, 0.1) # Line to (x,y)
# Curve(x1, y1, x2, y2, x3, y3)
ctx.curve_to (0.5, 0.2, 0.5, 0.4, 0.2, 0.8)
ctx.close_path ()

ctx.set_source_rgb (0.3, 0.2, 0.5) # Solid color
ctx.set_line_width (0.02)
ctx.stroke ()

surface.write_to_png ("example2.png") # Output to PNG