Skip to content

Commit

Permalink
bam_alt: parsing of optional fields
Browse files Browse the repository at this point in the history
  • Loading branch information
pveber committed Aug 19, 2014
1 parent b3d4a95 commit 0e560b5
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 58 deletions.
176 changes: 134 additions & 42 deletions src/lib/biocaml_bam_alt.ml
Expand Up @@ -20,23 +20,23 @@ let wrap f x =
| Bgzf.Parse_error err -> error_string err
| End_of_file -> error_string "Bam: premature end of file reading the header"

let input_int32_as_int iz =
Int32.to_int_exn (Bgzf.input_int32 iz)
let input_s32_as_int iz =
Int32.to_int_exn (Bgzf.input_s32 iz)

let read_header = wrap (fun iz ->
let magic = Bgzf.input_string iz 4 in
if magic <> "BAM\001" then parse_error "Incorrect magic string, not a BAM file" ;
let l_text = input_int32_as_int iz in
let l_text = input_s32_as_int iz in
let text = Bgzf.input_string iz l_text in
Biocaml_sam.parse_header text
)

let read_one_reference_information iz =
let parse iz =
let l_name = input_int32_as_int iz in
let l_name = input_s32_as_int iz in
let name = Bgzf.input_string iz (l_name - 1) in
let _ = Bgzf.input_char iz in (* name is a NULL terminated string *)
let length = input_int32_as_int iz in
let length = input_s32_as_int iz in
return (name, length)
in
wrap parse iz >>= fun (name, length) ->
Expand All @@ -50,7 +50,7 @@ let read_reference_information = wrap (fun iz ->
| Ok refseq -> loop (refseq :: accu) (n - 1)
| Error _ as e -> e
in
let n_ref = input_int32_as_int iz in
let n_ref = input_s32_as_int iz in
loop [] n_ref
>>| Array.of_list
)
Expand Down Expand Up @@ -81,7 +81,7 @@ let get_4_0 =
let mask = Int32.of_int_exn 0xf in
fun x -> Int32.bit_and mask x |> Int32.to_int_exn

let cigar_op_of_int32 x =
let cigar_op_of_s32 x =
let op_len = get_32_4 x in
match get_4_0 x with
| 0 -> `Alignment_match op_len
Expand Down Expand Up @@ -114,14 +114,121 @@ let char_of_seq_code = function
| 15 -> 'N'
| l -> failwithf "letter not in [0, 15]: %d" l ()

let parse_cstring iz max_size =
let buf = Buffer.create max_size in
let rec loop n =
if n > max_size
then error_string "Unexpectedly long string in optional field"
else (
let c = Bgzf.input_char iz in
if c = '\000' then return (Buffer.contents buf)
else (Buffer.add_char buf c ; loop (n + 1))
)
in loop 0

let int32_is_positive =
let mask = Int32.(shift_left one 31) in
fun i -> Int32.(compare (bit_and i mask) zero) = 0

let parse_cCsSiIf iz = function
| 'c' ->
let i = Int32.of_int_exn (Bgzf.input_s8 iz) in
return (Sam.optional_field_value_i i, 1)
| 'C' ->
let i = Int32.of_int_exn (Bgzf.input_u8 iz) in
return (Sam.optional_field_value_i i, 1)
| 's' ->
let i = Int32.of_int_exn (Bgzf.input_s16 iz) in
return (Sam.optional_field_value_i i, 2)
| 'S' ->
let i = Int32.of_int_exn (Bgzf.input_u16 iz) in
return (Sam.optional_field_value_i i, 2)
| 'i' ->
let i = Bgzf.input_s32 iz in
return (Sam.optional_field_value_i i, 4)
| 'I' ->
let i = Bgzf.input_s32 iz in
if int32_is_positive i then return (Sam.optional_field_value_i i, 4)
else error_string "Use of big unsigned ints in optional fields is not well-defined in the specification of SAM/BAM files"
(* There's something fishy in the SAM/BAM format
specification. It says:
<< An integer may be stored as one of `cCsSiI' in BAM, representing
int8_t, uint8_t, int16_t, uint16_t, int32_t and uint32_t,
respectively. In SAM, all single integer types are mapped
to int32_t >>
But how are we supposed to map uint32_t to int32_t? Seems
like people from picard tools also noticed this problem:
http://sourceforge.net/p/samtools/mailman/message/24239571/ *)
| 'f' ->
let i = Bgzf.input_s32 iz in
return (Sam.optional_field_value_f (Int32.float_of_bits i), 4)
| _ -> error_string "Incorrect numeric optional field type identifier"

let cCsSiIf_size = function
| 'c'
| 'C' -> return 1
| 's' | 'S' -> return 2
| 'i' | 'I'
| 'f' -> return 4
| _ -> error_string "Incorrect numeric optional field type identifier"

let parse_optional_field_value iz remaining = function
| 'A' ->
Sam.optional_field_value_A (Bgzf.input_char iz) >>= fun v ->
return (v, 1)
| 'c' | 'C' | 's' | 'S' | 'i' | 'I' | 'f' as typ ->
parse_cCsSiIf iz typ
| 'Z' ->
parse_cstring iz remaining >>= fun s ->
Sam.optional_field_value_Z s >>= fun value ->
return (value, String.length s + 1) (* to account for the NULL character *)
| 'H' ->
parse_cstring iz remaining >>= fun s ->
Sam.optional_field_value_H s >>= fun value ->
return (value, String.length s + 1) (* to account for the NULL character *)
| 'B' ->
let typ = Bgzf.input_char iz in
let n = Bgzf.input_s32 iz in
(
match Int32.to_int n with
| Some n ->
cCsSiIf_size typ >>= fun elt_size ->
let elts = List.init n ~f:(fun _ -> input_string_or_fail iz elt_size) in
let bytes_read = 5 (* array type and size *) + elt_size * n in
Sam.optional_field_value_B typ elts >>= fun value ->
return (value, bytes_read)
| None ->
error_string "Too many elements in B-type optional field"
)
| c -> error "Incorrect optional field type identifier" c <:sexp_of< char>>

let parse_optional_field iz remaining =
let tag = input_string_or_fail iz 2 in
let field_type = Bgzf.input_char iz in
parse_optional_field_value iz remaining field_type >>= fun (field_value, shift) ->
Sam.optional_field tag field_value >>= fun field ->
return (field, shift)

let parse_optional_fields iz remaining =
let rec loop remaining accu =
if remaining = 0 then return (List.rev accu)
else
parse_optional_field iz remaining >>= fun (field, used_chars) ->
loop (remaining - 3 - used_chars) (field :: accu)
in
loop remaining []

let read_alignment (refseqs : Sam.ref_seq array) iz =
(* the type annotation for the [refseqs] argument is mandatory to disambiguate
types [Sam.ref_seq] and [Sam.program]. *)
let block_size = input_int32_as_int iz in
let block_size = input_s32_as_int iz in
(* let block = Bgzf.input_string iz block_size in *)
(* if String.length block < block_size then parse_error "Premature end of file while reading alignment." ; *)

let refID = input_int32_as_int iz in
let refID = input_s32_as_int iz in
if refID < -1 || refID > Array.length refseqs
then raise (Parse_error (Error.create "Incorrect refID field while reading an alignment" refID <:sexp_of<int>>)) ;
let rname =
Expand All @@ -130,7 +237,7 @@ let read_alignment (refseqs : Sam.ref_seq array) iz =
| i -> Some refseqs.(i).Sam.name
in

let pos = match Bgzf.input_int32 iz |> Int32.to_int with
let pos = match Bgzf.input_s32 iz |> Int32.to_int with
| Some (- 1) -> None
| Some 2147483647
| None -> parse_error "A read has a position greater than 2^31"
Expand All @@ -139,11 +246,11 @@ let read_alignment (refseqs : Sam.ref_seq array) iz =
else Some (n + 1)
in

let bin_mq_nl = Bgzf.input_int32 iz in
let bin_mq_nl = Bgzf.input_s32 iz in
let l_read_name = get_8_0 bin_mq_nl in (* cannot fail since by construction get_8_0 returns integers less than 256 *)
let mapq = get_16_8 bin_mq_nl in

let flag_nc = Bgzf.input_int32 iz in
let flag_nc = Bgzf.input_s32 iz in
let flags =
match Int32.shift_right flag_nc 16 |> Int32.to_int_exn |> Sam.Flags.of_int with
| Ok flags -> flags
Expand All @@ -152,9 +259,9 @@ let read_alignment (refseqs : Sam.ref_seq array) iz =
in
let n_cigar_op = get_16_0 flag_nc in

let l_seq = input_int32_as_int iz in
let l_seq = input_s32_as_int iz in

let next_refID = input_int32_as_int iz in
let next_refID = input_s32_as_int iz in
if next_refID < -1 || next_refID > Array.length refseqs
then raise (Parse_error (Error.create "Incorrect next_refID field while reading an alignment" next_refID <:sexp_of<int>>)) ;
let rnext =
Expand All @@ -163,14 +270,14 @@ let read_alignment (refseqs : Sam.ref_seq array) iz =
| i -> Some (`Value refseqs.(i).Sam.name)
in

let pnext = match Bgzf.input_int32 iz |> Int32.to_int with
let pnext = match Bgzf.input_s32 iz |> Int32.to_int with
| Some (- 1) -> None
| Some 2147483647
| None -> parse_error "A read has a position > than 2^31"
| Some n -> if n > 0 then Some (n + 1) else parse_error "A read has a negative next position"
in

let tlen = input_int32_as_int iz in
let tlen = input_s32_as_int iz in

let read_name =
let r = input_string_or_fail iz (l_read_name - 1) in
Expand All @@ -180,13 +287,13 @@ let read_alignment (refseqs : Sam.ref_seq array) iz =

let cigar =
List.init n_cigar_op ~f:(fun _ ->
cigar_op_of_int32 (Bgzf.input_int32 iz)
cigar_op_of_s32 (Bgzf.input_s32 iz)
) in

let seq =
let r = String.make l_seq ' ' in
for i = 0 to (l_seq + 1) / 2 - 1 do
let c = Bgzf.input_byte iz in
let c = Bgzf.input_u8 iz in
r.[2 * i] <- char_of_seq_code (c lsr 4) ;
if 2 * i + 1 < l_seq then r.[2 * i + 1] <- char_of_seq_code (c land 0xf) ;
done ;
Expand All @@ -199,30 +306,15 @@ let read_alignment (refseqs : Sam.ref_seq array) iz =

let remaining = block_size - 32 - l_read_name - 4 * n_cigar_op - ((l_seq + 1) / 2) - l_seq in

(* UNDER CONSTRUCTION *)
(* let optional_fields = *)
(* let rec loop remaining accu = *)
(* if remaining = 0 then List.rev accu *)
(* else ( *)
(* let tag = input_string_or_fail iz 2 in *)
(* let field_type = Bgzf.input_char iz in *)
(* let field_value, shift = match field_type with *)
(* | 'C' -> ( *)
(* match Int32.of_int (Bgzf.input_byte iz) with *)
(* | Some i -> `i i, 1 *)
(* | None -> parse_error "Expected a 32-bit integers in optional argument" *)
(* ) in *)
(* let field = Sam.optional_field ~tag ~field_type ~field_value () in *)
(* loop (remaining - 3 - shift) (field :: accu) *)
(* | _ -> assert false *)
(* ) *)
(* in *)
(* loop remaining [] in *)


let _ = input_string_or_fail iz remaining in

Sam.alignment ~qname:read_name ~flags ?rname ?pos ~mapq ~cigar ?rnext ?pnext ~tlen ~seq ~qual ()
let optional_fields =
match parse_optional_fields iz remaining with
| Ok optf -> optf
| Error e -> raise (Parse_error e)
in

Sam.alignment
~qname:read_name ~flags ?rname ?pos ~mapq ~cigar ?rnext ?pnext
~tlen ~seq ~qual ~optional_fields ()

let read_alignment_stream refseqs iz =
Stream.from (fun _ ->
Expand Down
2 changes: 1 addition & 1 deletion src/lib/biocaml_bam_alt.mli
Expand Up @@ -10,4 +10,4 @@ val read : in_channel -> (header * alignment Or_error.t Stream.t) Or_error.t

val with_file : string -> f:(header -> alignment Or_error.t Stream.t -> 'a) -> 'a Or_error.t

val write : header -> alignment Stream.t -> out_channel -> unit
(* val write : header -> alignment Stream.t -> out_channel -> unit *)
71 changes: 58 additions & 13 deletions src/lib/biocaml_bgzf.ml
Expand Up @@ -78,13 +78,6 @@ let input_u16 ic =
let b2 = input_byte ic in
b1 + b2 lsl 8

let input_u32 ic =
let b1 = input_byte ic in
let b2 = input_byte ic in
let b3 = input_byte ic in
let b4 = input_byte ic in
b1 + b2 lsl 8 + b3 lsl 16 + b4 lsl 24

let input_int32 ic =
let b1 = input_byte ic in
let b2 = input_byte ic in
Expand Down Expand Up @@ -188,14 +181,30 @@ let input_char =
if input iz buf 0 1 = 0 then raise End_of_file
else buf.[0]

let input_byte iz =
let input_u8 iz =
Char.code (input_char iz)

let input_int32 ic =
let b1 = input_byte ic in
let b2 = input_byte ic in
let b3 = input_byte ic in
let b4 = input_byte ic in
(* input_s* functions adapted from Batteries BatIO module *)
let input_s8 iz =
let b = input_u8 iz in
if b land 128 <> 0 then b - 256
else b

let input_u16 iz =
let b1 = input_u8 iz in
let b2 = input_u8 iz in
b1 lor (b2 lsl 8)

let input_s16 iz =
let i = input_u16 iz in
if i land 32768 <> 0 then i - 65536
else i

let input_s32 iz =
let b1 = input_u8 iz in
let b2 = input_u8 iz in
let b3 = input_u8 iz in
let b4 = input_u8 iz in
Int32.logor (Int32.of_int b1)
(Int32.logor (Int32.shift_left (Int32.of_int b2) 8)
(Int32.logor (Int32.shift_left (Int32.of_int b3) 16)
Expand Down Expand Up @@ -297,6 +306,42 @@ let rec output oz buf pos len =
let remaining = len - ncopy in
if remaining > 0 then output oz buf (pos + ncopy) remaining

let output_char =
let buf = String.make 1 ' ' in
fun oz c -> output oz buf 0 1

(* output_* functions adapted from Batteries BatIO module *)
let output_u8 oz n =
if n < 0 || n > 0xFF then raise (Invalid_argument "Bgzf.write_u8") ;
output_char oz (Char.unsafe_chr (n land 0xFF))

let output_s8 oz n =
if n < -0x80 || n > 0x7F then raise (Invalid_argument "Bgzf.write_s8") ;
if n < 0 then
output_u8 oz (n + 256)
else
output_u8 oz n

let output_u16 oz n =
if n < 0 || n > 0xFFFF then raise (Invalid_argument "Bgzf.write_u16") ;
output_u8 oz (n lsr 8);
output_u8 oz n

let output_s16 oz n =
if n < -0x8000 || n > 0x7FFF then raise (Invalid_argument "Bgzf.write_s16") ;
if n < 0 then
output_u16 oz (65536 + n)
else
output_u16 oz n

let output_s32 oz n =
let base = Int32.to_int n in
let big = Int32.to_int (Int32.shift_right_logical n 24) in
output_u8 oz big;
output_u8 oz (base lsr 16);
output_u8 oz (base lsr 8);
output_u8 oz base

let bgzf_eof = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"

let dispose_out oz =
Expand Down

0 comments on commit 0e560b5

Please sign in to comment.