/
modl_perspective.py
211 lines (152 loc) · 8.1 KB
/
modl_perspective.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
A perspective application of MODL using a supervised loss, based on the
performance of a Naive Bayes classifier.
This will select the combinations (of reference sets) that best predict the
query set using a Naive Bayes classifier.
This is currently in beta and requires some tuning of the parameters, so is not
yet available as option in the main command line, but is an interesting perspective.
This file is a fully functional example. To use it, simply replace the filepaths
at the beginning with the paths to your own files, adjust the parameters if
desired, and run the script.
"""
## Paths to BED files of regions
QUERY_PATH = "input/as_ginom/query_som_trans_lung.bed" # The query set
INCL_PATH = "input/as_ginom/mappability_human.bed" # Inclusion: the analysis will be limited to the sub-genome designated in this files. Regions outside of it will be discarded.
GENOME_PATH = "input/hg19.genome" # Chromosome sizes
# Reference sets
MORE_PATHS = [
"input/as_ginom/reference_1_hg19_Refseq_promoters_2000bp.bed",
"input/as_ginom/reference_2_hg19_Refseq_gene_bodies.bed",
"input/as_ginom/reference_3_Broad_A549_H3k04me3_Dex.bed",
"input/as_ginom/reference_4_Broad_A549_H3k36me3_Dex.bed",
"input/as_ginom/reference_5_Broad_A549_H3k79me2_Dex.bed"
]
# The meaning of the combinations returnedwill always be : query, followed by the reference sets in the order they are given
# For example, '[101010]' means "Query + Reference 2 (gene bodies) + Reference 4 (H3K36me3)"
## Parameters
# Subsampling parameters
KEEP_N_TIMES_QUERY = 100 # For each '1' line with the query, how many '0' lines without it do we keep?
QUERY_WEIGHT = 100 # In the Naive Bayes classifier, how much more do we weigh the presence of the query?
# MODL parameters
DESIRED_ITEMSETS = 7 # How many interesting combinations should MODL return?
RANDOM_SEED = 42
# ---------------------------------------------------------------------------- #
def presence_vector(row, combis):
"""
Given a list of combinations, returns a vector saying whether they are
present or absent in this row. The order in the vector is the same as given
in the combis list.
This works exactly like an inexact (transitive) count, which is OLOGRAM-MODL's default operating mode.
Example:
>>> row = [1,0,1,1,0,1]
>>> combis = [1,0,1,0,0,0],[0,1,1,0,0,0]
>>> res = presence_vector(row, combis)
>>> assert list(res) == [1,0]
"""
result = []
for combi in combis:
mask = row - combi
# If any element from the combi is missing, it's absent, so
# add a 0. Otherwise add a 1.
if min(mask) < 0 : result +=[0]
else: result += [1]
return result
import pandas as pd
import numpy as np
import sklearn
import pybedtools
from sklearn.naive_bayes import GaussianNB
from pygtftk.stats.intersect.overlap_stats_compute import compute_true_intersection
from pygtftk.stats.intersect.overlap.overlap_regions import does_combi_match_query
from pygtftk.stats.intersect.modl.dict_learning import Modl
from pygtftk import utils
from pygtftk.stats.intersect.read_bed import read_bed_as_list as read_bed
from pygtftk.utils import chrom_info_as_dict
from pygtftk import arg_formatter
np.random.seed(RANDOM_SEED) # Random seed for reproducibility
## Prepare files
bedA = pybedtools.BedTool(QUERY_PATH).sort().merge()
bedsB = [pybedtools.BedTool(bedfilepath).sort().merge() for bedfilepath in MORE_PATHS]
# Do the exclusion manually Generate a fake bed for the entire genome, using the chromsizes
bed_incl = pybedtools.BedTool(INCL_PATH)
chrom_len = chrom_info_as_dict(open(GENOME_PATH, 'r'))
full_genome_bed = [str(chrom) + '\t' + '0' + '\t' + str(chrom_len[chrom]) + '\n' for chrom in chrom_len if chrom != 'all_chrom']
full_genome_bed = pybedtools.BedTool(full_genome_bed)
bed_excl = full_genome_bed.subtract(bed_incl)
bedA = read_bed.exclude_concatenate(bedA, bed_excl)
bedsB = [read_bed.exclude_concatenate(bedB, bed_excl) for bedB in bedsB]
full_genome_bed_after_excl = read_bed.exclude_concatenate(full_genome_bed, bed_excl)
# Note that by definition, in this intersections' matrix only regions where at
# least two sets are open are given. For example {4} alone is not found.
# To fix it, use a fake full genome bed as query, so there is always one file
# open, then truncate it to get the flags_matrix.
#true_intersection = compute_true_intersection(bedA, bedsB)
true_intersection = compute_true_intersection(full_genome_bed_after_excl, [bedA] + bedsB)
flags_matrix = np.array([i[3] for i in true_intersection])
"""
# NOTE : The length of each intersection is also accessible, in case you want
# the final matrix to have one row per X base pairs instead of 1 row per intersection/event.
# For reference, the true_intersection object looks like this: [('chr1', 150, 180, np.array([1,1,0])), ('chr1', 180, 200, np.array([1,1,1])), ('chr1', 350, 380, np.array([1,1,0]))]
for i in true_intersection: # For each observed intersection...
inter_chr, inter_start, inter_stop = i[0], i[1], i[2] # Get the chromosome, start and end of the observed intersection
intersection_length = inter_stop - inter_start # Deduce the length of the intersection
intersection_flags = i[3] # Which combination was observed here?
# Now you can process it.
"""
flags_matrix = flags_matrix[:,1:] # Remove first column that represents the fake, full genome BED.
# Print unique rows with transitive (inexact) counting
uniqueValues, occurCount = np.unique(flags_matrix, return_counts=True, axis=0)
for uniqueRow in uniqueValues:
count = 0
for row in flags_matrix:
if does_combi_match_query(
tuple(row),
tuple(uniqueRow),
exact = False):
count += 1
print(uniqueRow, ' - freq = ', count)
print("------------------")
# Perform subsampling of the majority class ('0', meaning query is absent)
flags_matrix_with_query = flags_matrix[flags_matrix[:, 0] != 0]
draw_this_many = int(np.around(KEEP_N_TIMES_QUERY * len(flags_matrix_with_query)))
random_rows = flags_matrix[np.random.randint(flags_matrix.shape[0], size=draw_this_many),:]
flags_matrix = np.concatenate((flags_matrix_with_query,random_rows))
# This is the custom error function that we will use
def custom_error_function_bnb(X_true, X_rebuilt, encoded, dictionary):
"""
This is a non-pure function that only cares about the dictionary and discards X_true in
favor of the previous flags_matrix.
"""
# Always assume that the first column is the query, so we split it.
# Truncate the first column, with the query, since that is what we want to predict
Y = flags_matrix[:,0]
X = flags_matrix[:,1:]
new_dictionary = []
for atom in dictionary:
new_dictionary += [atom[1:]]
dictionary = np.array(new_dictionary)
# Convert to a binary matrix giving, for each atom in the dictionary,
# whether it is present at this row
X_bydict_bin = [presence_vector(row, dictionary) for row in X]
X_bydict_bin = np.array(X_bydict_bin)
# Use Gaussian Naive Bayes on the data and retrieve accuracy
bnb = GaussianNB()
# Potential sample weights
sample_weights = []
for y in Y:
if y == 1 : sample_weights += [QUERY_WEIGHT]
else: sample_weights += [1]
bnb.fit(X_bydict_bin, Y, sample_weight= sample_weights)
Y_pred = bnb.predict(X_bydict_bin)
# Compute F1-score
score = sklearn.metrics.f1_score(Y, Y_pred)
# print(pd.crosstab(Y,Y_pred))
return -score # We want an error, higher means worse
## Run MODL with custom error function based on error of BNB
utils.VERBOSITY = 3 # Ensure DEBUG messages are shown
# Keep only combis WITH query (first element is not 0)
flags_matrix_with_query = flags_matrix[flags_matrix[:, 0] != 0]
combi_miner = Modl(flags_matrix_with_query,
multiple_overlap_max_number_of_combinations = DESIRED_ITEMSETS, # How many words to find ?
error_function = custom_error_function_bnb) # Custom error function in step 2
interesting_combis = combi_miner.find_interesting_combinations()