-
Notifications
You must be signed in to change notification settings - Fork 6
/
Exam6_10.py
169 lines (131 loc) · 3.47 KB
/
Exam6_10.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
# Path of the FASTA file to analyse. It is passed to punto6() by the script
# section at the bottom of the file and read directly, as a module-level name,
# by find_idenfitier() and find_length().
fa="dna.example.fasta"
# Sequence loader
def sequ(fa):
    """Read a FASTA file and return its sequences as a list of strings.

    Every line starting with '>' opens a new record; the header text itself
    is discarded. All other lines have spaces and newline characters removed
    and are concatenated onto the current record.

    Text that appears before the first '>' line does not belong to any record
    and is dropped.

    Args:
        fa: path of the FASTA file to read.

    Returns:
        A list with one sequence string per '>' header, in file order. The
        list is empty when the file contains no header line.
    """
    sequences = []
    chunks = []
    # The context manager guarantees the handle is closed, even on error.
    with open(fa, "r") as handle:
        for line in handle:
            if line.startswith('>'):
                # A header closes the record collected so far.
                sequences.append("".join(chunks))
                chunks = []
            else:
                chunks.append(line.replace(" ", "").replace("\n", ""))
    # Flush the record that was still open at end of file.
    sequences.append("".join(chunks))
    # Entry 0 holds whatever preceded the first header, not a real record.
    return sequences[1:]
# Locate the start and stop codons of one reading frame
def find_index(sequence, n):
    """Return the positions of in-frame start and stop codons.

    The sequence is read three characters at a time, beginning at the
    0-based offset ``n - 1``. Codons are compared against the upper-case
    literals "ATG" (start) and "TAA", "TGA", "TAG" (stop).

    Args:
        sequence: nucleotide string to scan.
        n: 1-based position of the first base of the reading frame
            (1, 2 or 3 for the three forward frames).

    Returns:
        A three-item list ``[start_position, start_indexs, stop_indexs]``
        where ``start_position`` is ``n - 1`` and the other two items are
        ascending lists of 0-based codon positions.
    """
    start_position = n - 1
    # Built once instead of on every loop iteration.
    stops = ("TAA", "TGA", "TAG")
    start_indexs = []
    stop_indexs = []
    # One pass classifies each codon; a codon cannot be both start and stop.
    for i in range(start_position, len(sequence), 3):
        codon = sequence[i:i + 3]
        if codon == "ATG":
            start_indexs.append(i)
        elif codon in stops:
            stop_indexs.append(i)
    return [start_position, start_indexs, stop_indexs]
def find_orf(sequence, n):
    """Return the open reading frames found in one forward reading frame.

    The frame is read codon by codon starting at the 0-based offset
    ``n - 1``. An ORF opens at the first "ATG" met while no ORF is open and
    closes at the next in-frame stop codon ("TAA", "TGA" or "TAG"). Scanning
    then resumes with the codon after that stop, so the reported ORFs never
    overlap. An "ATG" that is never followed by an in-frame stop codon is
    not reported.

    Args:
        sequence: nucleotide string to scan (upper-case codons are matched).
        n: 1-based position of the first base of the reading frame
            (1, 2 or 3 for the three forward frames).

    Returns:
        A list of ORF strings in order of appearance, each running from its
        start codon through its stop codon inclusive.
    """
    stops = ("TAA", "TGA", "TAG")
    orf = []
    # Index of the start codon of the ORF currently open, or None between
    # ORFs. Tracking the open/closed state explicitly (rather than comparing
    # start indexes against a position marker with a strict ">") ensures a
    # start codon at index 0, or one directly after a stop codon, is kept.
    start = None
    for i in range(n - 1, len(sequence), 3):
        codon = sequence[i:i + 3]
        if start is None:
            if codon == "ATG":
                start = i
        elif codon in stops:
            orf.append(sequence[start:i + 3])
            start = None
    return orf
############################################################
def punto6(fa):
    """Print the length of the longest ORF found in any sequence of a file.

    Every sequence returned by ``sequ(fa)`` is searched with ``find_orf`` in
    each of the three forward reading frames (1, 2 and 3).

    Args:
        fa: path of the FASTA file to analyse.

    Prints:
        The length of the longest ORF, or 0 when no sequence contains one
        (instead of failing on ``max()`` of an empty list).
    """
    longest = 0
    for sequence in sequ(fa):
        for frame in (1, 2, 3):
            for orf in find_orf(sequence, frame):
                longest = max(longest, len(orf))
    print(longest)
def find_idenfitier(num):
    """Print the length of the longest ORF in the record matching ``num``.

    The FASTA file named by the module-level ``fa`` is read. The record whose
    '>' header line contains ``num`` is selected, its sequence lines are
    joined (spaces and newlines removed), and the three forward reading
    frames are searched with ``find_orf``.

    Args:
        num: text to look for in the header lines (substring match), e.g.
            "gi|142022655|gb|EQ086233.1|97".

    Prints:
        The length of the longest ORF in the selected record, or 0 when the
        record contains no ORF.

    Raises:
        ValueError: if no header line contains ``num``. Without this check
            the first record of the file would be analysed silently.
    """
    seq_parts = None
    capturing = False
    # The context manager guarantees the handle is closed, even on error.
    with open(fa, "r") as handle:
        for line in handle:
            if line.startswith('>'):
                capturing = num in line
                if capturing:
                    # If several headers match, the last one wins.
                    seq_parts = []
            elif capturing:
                seq_parts.append(line.replace(" ", "").replace("\n", ""))
    if seq_parts is None:
        raise ValueError("no FASTA header contains %r" % (num,))
    seq = "".join(seq_parts)
    longest = 0
    for frame in (1, 2, 3):
        for orf in find_orf(seq, frame):
            longest = max(longest, len(orf))
    print(longest)
def find_length(num):
    """Print the most frequent substring of length ``num`` and its count.

    All sequences of the FASTA file named by the module-level ``fa`` are
    loaded with ``sequ``. Every full-length window of ``num`` characters
    (overlapping windows included) is counted across all sequences.

    Args:
        num: length of the repeats to count, e.g. 6 for repeats of length 6.
            The window is exactly ``num`` characters long.

    Prints:
        The most frequent repeat, then the number of times it occurs. When
        several repeats share the highest count, which one is printed is
        unspecified.

    Raises:
        ValueError: if ``num`` is smaller than 1, or if no sequence is long
            enough to contain a substring of length ``num``.
    """
    # Local import: the file has no import section of its own.
    from collections import Counter

    if num < 1:
        raise ValueError("num must be a positive integer")
    counts = Counter()
    for sequence in sequ(fa):
        # Stop at len - num so truncated tail windows are never counted.
        counts.update(
            sequence[i:i + num] for i in range(len(sequence) - num + 1)
        )
    if not counts:
        raise ValueError("no substring of length %d found in %s" % (num, fa))
    repeat, occurrences = counts.most_common(1)[0]
    print(repeat)
    print(occurrences)
###########################################
# Script section: answers the exercise questions for the file named by `fa`.
# print is written in its single-argument call form, which runs on both
# Python 2 and Python 3 and matches the print calls inside the functions.

# Question 6: longest ORF in any sequence, any forward reading frame.
print('Punto 6')
punto6(fa)
# Question 7: longest ORF of one record; change the identifier string below
# to query a different record.
print('gi|142022655|gb|EQ086233.1|97')
print('Hay que cambiar la cadena que se busca')
num="gi|142022655|gb|EQ086233.1|97"
find_idenfitier(num)
# Questions 8, 9 and 10: most frequent repeat; change the number below to the
# repeat length the question asks about.
print('Modificar el numero en la busqueda de string, dependiendo de la pregunta')
num=6
find_length(num)