In [32]:
import sys
import os
import logging
import argparse
import time
import xml.etree.ElementTree as ET

# Notebook inputs/outputs for a single sample.
# Mapped-count XML that is parsed further down in the notebook.
xml_file = "/nobackup/vickers_lab/projects/20221122_9074_ES_ARMseq_human_byMars/intermediate_data/bowtie1_genome_1mm_NTA_smallRNA_count/result/CAC_10_007DE/CAC_10_007DE.count.mapped.xml"
# Sample label attached to every read record.
sample_name = "CAC_10_007DE"
# Tab-separated output file written by the last cell.
trna_read_file = "/nobackup/vickers_lab/projects/20221122_9074_ES_ARMseq_human_byMars_tRNA_pos/CAC_10_007DE.tRNA.txt"

def accept_func(feature_name:str) -> bool:
  """Tell whether a feature should be kept.

  Args:
    feature_name: Feature name as read from the XML ``name`` attribute.

  Returns:
    True when the name starts with the ``tRNA:`` prefix, False otherwise.
  """
  is_trna = feature_name.startswith("tRNA:")
  return is_trna

# Sanity checks: only names carrying the "tRNA:" prefix are accepted.
assert not accept_func("miRNA:hsa-let-7a-5p")
assert accept_func("tRNA:hsa-let-7a-5p")


# Get query sequence

In [33]:
# Configure the root handler (timestamp - logger name - level - message),
# then grab the notebook's named logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)-8s - %(message)s')
logger = logging.getLogger('tRNA_position')

logger.info(f"Reading {xml_file} ...")

# Parse the whole XML document; the following cells navigate from `root`.
tree = ET.parse(xml_file)
root = tree.getroot()

2024-04-15 14:05:23,507 - tRNA_position - INFO     - Reading /nobackup/vickers_lab/projects/20221122_9074_ES_ARMseq_human_byMars/intermediate_data/bowtie1_genome_1mm_NTA_smallRNA_count/result/CAC_10_007DE/CAC_10_007DE.count.mapped.xml ...


In [34]:
# Map every query name (trailing whitespace stripped) to its sequence.
query_seq_map = {}

queries_node = root.find('queries')
for query_node in queries_node.findall('query'):
  # The sequence is stored under "sequence" when that attribute exists,
  # otherwise it is read from "seq".
  seq_attr = "sequence" if "sequence" in query_node.attrib else "seq"
  query_seq_map[query_node.get("name").rstrip()] = query_node.get(seq_attr)

print(query_seq_map)

{'A00252:306:HLH3MDSX5:1:1101:12626:2002:CLIP_': 'GGCTGGTCCGATGGTAGTGGGTTATCAGAACT', 'A00252:306:HLH3MDSX5:1:1102:18909:8531:CLIP_': 'CCCCCCACTGCTAAATTTGACTGGCTT', 'A00252:306:HLH3MDSX5:1:1101:24316:17487:CLIP_': 'TGAGGTAGTAGGTTGTATAGTT', 'A00252:306:HLH3MDSX5:1:1102:30906:1752:CLIP_CCA': 'CCGGGGTTCGATTCCCCGACGGGGAG', 'A00252:306:HLH3MDSX5:1:1104:27190:17769:CLIP_': 'GCATTGGTGGTTCAGTGGTAGAATTCTCGCCT', 'A00252:306:HLH3MDSX5:1:1103:18367:20118:CLIP_': 'GGCTGGTCCGATGGTAGTGGGTTATCAGAACTT', 'A00252:306:HLH3MDSX5:1:1101:26386:24987:CLIP_': 'GGCTGGTCCGATGGTAGTGGGTTATCAGAAC', 'A00252:306:HLH3MDSX5:1:1101:31060:17738:CLIP_': 'TGAGGTAGTAGATTGTATAGTT', 'A00252:306:HLH3MDSX5:1:1106:15257:25880:CLIP_CCA': 'GGGGTTCGATTCCCCGACGGGGAG', 'A00252:306:HLH3MDSX5:1:1104:8043:15922:CLIP_CCA': 'CGGGGTTCGATTCCCCGACGGGGAG', 'A00252:306:HLH3MDSX5:1:1110:32786:31062:CLIP_': 'CCCCCCACTGCTAAATTTGACTGGCTTT', 'A00252:306:HLH3MDSX5:1:1101:9896:26209:CLIP_': 'CGGGCCGCCGGTGAAATACCACTACT', 'A00252:306:HLH3MDSX5:1:1106:29

In [36]:
# Build feature_map: for each accepted feature, the sequence of its
# 'Homo_sapiens_' region and the queries mapped to that region.
#   feature_map[feature_name] = {"sequence": <region sequence>,
#                                "queries": {qname: {sample, offset, query_count, sequence}}}
result_node = root.find('subjectResult')
feature_map = {}

for feature_group_node in result_node.findall('subjectGroup'):
  feature_nodes = feature_group_node.findall('subject')
  for feature_node in feature_nodes:
    feature_name = feature_node.get("name")
    # Element.get returns None for a missing attribute; skip unnamed subjects
    # instead of failing inside accept_func.
    if feature_name is None or not accept_func(feature_name):
      continue

    region_nodes = feature_node.findall('region')
    for region_node in region_nodes:
      seqname = region_node.get("seqname")
      # A region without a seqname cannot carry the expected prefix.
      if seqname is None or not seqname.startswith('Homo_sapiens_'):
        continue

      sequence = region_node.get("sequence")
      cur_queries = {}

      query_nodes = region_node.findall('query')
      for query_node in query_nodes:
        # query_seq_map keys were stored with trailing whitespace stripped,
        # so normalise qname the same way before the lookup.
        qname = query_node.get('qname').rstrip()
        qseq = query_seq_map[qname]
        cur_queries[qname] = {
          "sample": sample_name,
          "offset": query_node.get('offset'),            # kept as the raw attribute string
          "query_count": query_node.get('query_count'),  # kept as the raw attribute string
          "sequence": qseq
        }

      # NOTE(review): if a feature has more than one accepted region, the last
      # one overwrites the earlier ones — confirm this is intended.
      feature_map[feature_name] = {
        "sequence": sequence,
        "queries": cur_queries
      }

# Spot-check one feature; .get prints None rather than raising KeyError when
# this sample has no entry for that tRNA.
print(feature_map.get('tRNA:tRNA-Ser-GCT-3-1'))

{'sequence': 'GACGAGGTGGCCGAGTGGTtAAGGCGATGGACTGCTAATCCATTGTGCTTTGCACGCGTGGGTTCGAATCCCATCCTCGTCG', 'queries': {'A00252:306:HLH3MDSX5:1:1177:22101:33066:CLIP_CCA': {'sample': 'CAC_10_007DE', 'offset': '66', 'query_count': '101', 'sequence': 'AATCCCATCCTCGTCG'}, 'A00252:306:HLH3MDSX5:1:1127:16532:17691:CLIP_': {'sample': 'CAC_10_007DE', 'offset': '0', 'query_count': '30', 'sequence': 'GACGACGTGGCCGAGTGG'}, 'A00252:306:HLH3MDSX5:1:2152:12274:31266:CLIP_': {'sample': 'CAC_10_007DE', 'offset': '0', 'query_count': '6', 'sequence': 'GACGACGTGGCCGAGTGGTTAAG'}, 'A00252:306:HLH3MDSX5:1:1118:16025:12336:CLIP_': {'sample': 'CAC_10_007DE', 'offset': '0', 'query_count': '3', 'sequence': 'GACGACGTGGCCGAGTGGTTAAGGC'}, 'A00252:306:HLH3MDSX5:2:2262:12608:1470:CLIP_CCA': {'sample': 'CAC_10_007DE', 'offset': '66', 'query_count': '2', 'sequence': 'AAACCCATCCTCGTCG'}, 'A00252:306:HLH3MDSX5:1:1121:2519:34914:CLIP_CCA': {'sample': 'CAC_10_007DE', 'offset': '66', 'query_count': '1', 'sequence': 'AATCACATCCTCGT

In [46]:
# Write one "parent" row per feature (its own sequence, offset 0, no sample,
# count 0) followed by one "read" row per query mapped to that feature.
with open(trna_read_file, "wt") as fout:
  fout.write("category\tfname\tsequence\tmap_offset\tsample\tquery_count\tquery_length\n")
  # Feature names are unique dict keys, so sorting the items orders by name only.
  for fname, fmap in sorted(feature_map.items()):
    fout.write(f"parent\t{fname}\t{fmap['sequence']}\t0\t\t0\t{len(fmap['sequence'])}\n")
    for qmap in fmap['queries'].values():
      fout.write(f"read\t{fname}\t{qmap['sequence']}\t{qmap['offset']}\t{qmap['sample']}\t{qmap['query_count']}\t{len(qmap['sequence'])}\n")