/
get_coverage_per_read.m
181 lines (164 loc) · 6.33 KB
/
get_coverage_per_read.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
function [coverage reads_ok intron_list read_starts paired_reads] = get_coverage_per_read(CFG, gene, reverse_ret)
% GET_COVERAGE_PER_READ Gets the reads from the BAM file covering the gene region.
%
% [coverage reads_ok intron_list read_starts paired_reads] = get_coverage_per_read(CFG, gene, reverse_ret)
%
% -- input --
% CFG: configuration struct
% gene: struct defining a gene with start, stops, exons etc.
% reverse_ret: if true, the positions are considered in reverse
% direction on the reverse strand
%
% -- output --
% coverage: matrix of exonic positions x reads
% reads_ok: indicates success of file parsing
% intron_list: nx4 list of introns
% (intron start, intron stop, confirmation, strand)
% read_starts: vector of number of reads starting at the given exonic positions
% paired_reads: struct including starts, stops and mates for paired-end reads
%
%
% This program is free software; you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation; either version 3 of the License, or
% (at your option) any later version.
%
% Written (W) 2009-2011 Regina Bohnert, Gunnar Raetsch
% Copyright (C) 2009-2011 Max Planck Society
%
% initialisation
coverage = [];
reads_ok = 1;
intron_list = zeros(2, 0);
read_starts = zeros(0, 1);
paired_reads.starts = zeros(1, 0);
paired_reads.stops = zeros(1, 0);
paired_reads.mates = zeros(2, 0);
collapse = double(~CFG.paired & nargout<4);
if nargin<3
reverse_ret = 0;
end
if nargout==5
assert(CFG.paired==1);
end
if ~isfield(CFG, 'both_strands')
CFG.both_strands = 0;
end
if CFG.both_strands
strand = '0';
else
strand = gene.strand;
end
% parameters for get_reads
if ~isfield(CFG, 'tracks_max_intron_len')
CFG.tracks_max_intron_len = 1e9;
end
if ~isfield(CFG, 'tracks_min_exon_len')
CFG.tracks_min_exon_len = -1;
end
if ~isfield(CFG, 'tracks_max_mismatches')
CFG.tracks_max_mismatches = CFG.read_len;
end
subsample = 1000; mapped = 1; spliced = 1; maxminlen = 0;
win = CFG.read_len;
eidx = [max(gene.eidx(1)-win,1):gene.eidx(1)-1, gene.eidx, gene.eidx(end)+1:gene.eidx(end)+win];
win_size = length(max(gene.eidx(1)-win,1):gene.eidx(1)-1);
for f = 1:length(CFG.tracks_fn{gene.chr_num}),
fname = CFG.tracks_fn{gene.chr_num}{f};
if ~exist(fname, 'file'),
warning('BAM file %s does not exist', fname);
end
try
if nargout==5
[coverage_idx_tmp{f}, intron_list_tmp, cov_paired_tmp, mates_tmp] = get_reads(fname, gene.chr, eidx(1), eidx(end), strand, collapse, subsample, CFG.tracks_max_intron_len, CFG.tracks_min_exon_len, CFG.tracks_max_mismatches, mapped, spliced, maxminlen, CFG.paired);
clear cov_paired_tmp;
elseif nargout==3 || nargout==4
[coverage_idx_tmp{f}, intron_list_tmp] = get_reads(fname, gene.chr, eidx(1), eidx(end), strand, collapse, subsample, CFG.tracks_max_intron_len, CFG.tracks_min_exon_len, CFG.tracks_max_mismatches, mapped, spliced, maxminlen, CFG.paired);
else
[coverage_idx_tmp{f}] = get_reads(fname, gene.chr, eidx(1), eidx(end), strand, collapse, subsample, CFG.tracks_max_intron_len, CFG.tracks_min_exon_len, CFG.tracks_max_mismatches, mapped, spliced, maxminlen, CFG.paired);
end
catch
warning('get_reads failed');
intron_list = intron_list';
reads_ok = 0;
return;
end
if exist('intron_list_tmp', 'var')
if ~isempty(intron_list_tmp),
intron_list = [intron_list intron_list_tmp];
end
end
if exist('mates_tmp', 'var')
if ~isempty(mates_tmp),
paired_reads.mates = [paired_reads.mates mates_tmp+1];
end
end
end
% process coverage: convert to exonic position indices
coverage_idx = [coverage_idx_tmp{:}];
if ~collapse
coverage_idx = unique(coverage_idx', 'rows')'; % no overlapping reads
if ~isempty(coverage_idx)
coverage = sparse(coverage_idx(1,:), coverage_idx(2,:), 1, max(coverage_idx(1,:)), eidx(end)-eidx(1)+1)';
else
coverage = sparse([], [], 1, eidx(end)-eidx(1)+1, 0);
end
coverage = coverage(eidx(win_size+1:win_size+gene.exonic_len)-eidx(1)+1, :);
% no overlapping reads
assert(~any(any(full(coverage>1))));
else
coverage = sum(coverage_idx, 1);
coverage = sparse(coverage(eidx(win_size+1:win_size+gene.exonic_len)-eidx(1)+1)');
end
% process intron list (1: intron start, 2: intron stop, 3: confirmation, 4: strand)
if nargout>2
intron_list = [intron_list', zeros(size(intron_list,2), 1), (gene.strand=='-')*ones(size(intron_list,2), 1)];
if ~isempty(intron_list)
intron_list_unique = unique(intron_list, 'rows');
for n = 1:size(intron_list_unique,1),
intron_list_unique(n,3) = sum(intron_list_unique(n,1)==intron_list(:,1) & ...
intron_list_unique(n,2)==intron_list(:,2) & ...
intron_list_unique(n,4)==intron_list(:,4));
end
intron_list = intron_list_unique;
clear intron_list_unique;
end
end
% process read starts: count reads starting at each exonic position
if nargout>3
read_starts = zeros(gene.exonic_len, 1);
for c = 1:size(coverage, 2),
fidx = find(coverage(:,c)~=0, 1, 'first');
if ~isempty(fidx),
if fidx==1 && sum(coverage(:,c),1)<CFG.read_len, continue; end
read_starts(fidx) = read_starts(fidx) + 1;
end
end
end
% process read starts and stops: store indices (eidx based)
if nargout>4
paired_reads.starts = nan(1, size(coverage, 2));
paired_reads.stops = nan(1, size(coverage, 2));
for c = 1:size(coverage, 2),
fidx = find(coverage(:,c)~=0, 1, 'first');
lidx = find(coverage(:,c)~=0, 1, 'last');
if ~isempty(fidx) && ~isempty(lidx)
if (fidx==1 || lidx==size(coverage, 1)) && sum(coverage(:,c),1)<CFG.read_len, continue; end
paired_reads.starts(c) = gene.eidx(fidx);
paired_reads.stops(c) = gene.eidx(lidx);
end
end
idx = find(~isnan(paired_reads.starts) & ~isnan(paired_reads.stops));
[C fidx1] = intersect(paired_reads.mates(1,:), idx);
[C fidx2] = intersect(paired_reads.mates(2,:), idx);
paired_reads.mates = paired_reads.mates(:,intersect(fidx1, fidx2));
end
% collapse coverage as single reads are not required anymore
if CFG.paired || nargout>3
coverage = sum(coverage, 2);
end
% reverse for minus strand
if reverse_ret && gene.strand=='-'
rev_idx = size(coverage,1):-1:1;
coverage = coverage(rev_idx,:);
end