In [2]:
import pysam

data_dir = "/home/cadel/replication"
data_files = [
    "missing_bosTau2.sam",
    "missing_canFam2.sam",
    "missing_hg18.sam",
    "missing_mm8.sam",
    "missing_rn4.sam"
]

In [5]:
from collections import defaultdict
import os
BAM_CMATCH = 0
BAM_CINS = 1
BAM_CDEL = 2
BAM_CREF_SKIP = 3
BAM_CSOFT_CLIP = 4
BAM_CHARD_CLIP = 4
BAM_CPAD = 6
BAM_CEQUAL = 7
BAM_CDIFF = 8
BAM_CBACK = 9

def count_mismatches(
    ops,
    sequence_length,
):
    matches = 0

    for op, count in ops:
        if op == BAM_CMATCH or op == BAM_CEQUAL:
            matches += count
        
        if op == BAM_CDEL:
            matches -= count

    return sequence_length - matches

exact_uces_found = defaultdict(set)

for data_file in data_files:
    for aln in pysam.AlignmentFile(os.path.join(data_dir, data_file), 'r'):
        mm = count_mismatches(aln.cigartuples, aln.query_length)
        if mm == 0:
            exact_uces_found[data_file].add(aln.query_name)

In [8]:
from collections import Counter

all_uces = [v for _, vals in exact_uces_found.items() for v in vals]

In [38]:
exact_uces_found["missing_mm8.sam"]["chr1:36131776-36132003"]

TypeError: 'set' object is not subscriptable

In [20]:
with open("/home/cadel/replication/valid_but_missing.txt", "w") as f:
    for uce in [x for x,y in Counter(all_uces).items() if y >= 3]:
        f.write(uce + "\n")

In [46]:
hg18 = "caggccagggctcctccgtgcccagGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAG"
mm8 = "CTCACCATCAATGTTGAGCATCATCTTCCACATGGCAGGGCGCACAGACTGGTGAAAGCCGAACCAGACCTCGCGCCCACCCCCCAGCGGGTGGTAGTAGCCCTCAGGCGGTGAGAAGAAGGAGCGGCCCACAGGGGTGTACCTGGCAGGGACAGGCAGAGAACTGGTCCCTCGGGCACAGGCCCCCACCCTGTGAGGCATCctgggcacggaggagccctggcctg"
mm8_rc = "CAGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAG"

print(hg18.upper())
print(mm8_rc)

hg18.upper() == mm8_rc

found = "CAGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAGTGGGGAGAGCTATGGAGCCAGGGGCACCC"
found_3_of_5 = "AGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAGTGGGGAGAGCT"
print(found)
len(found_3_of_5)
found_3_of_5

CAGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAG
CAGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAG
CAGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAGTGGGGAGAGCTATGGAGCCAGGGGCACCC


'AGGGACCAGTTCTCTGCCTGTCCCTGCCAGGTACACCCCTGTGGGCCGCTCCTTCTTCTCACCGCCTGAGGGCTACTACCACCCGCTGGGGGGTGGGCGCGAGGTCTGGTTCGGCTTTCACCAGTCTGTGCGCCCTGCCATGTGGAAGATGATGCTCAACATTGATGGTGAGTGGGGAGAGCT'

In [31]:
def reverse_complement(seq: str) -> str:                                                                                   
    m = {"A": "T", "T": "A", "G": "C", "C": "G"}                                                                           
    return seq.translate(seq.maketrans(m))[::-1]

for i in range(len(hg18) - 49):
    print(hg18[i:i+50].upper())

#hg18[:50].upper()

CAGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGT
AGGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTG
GGCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGC
GCCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCC
CCAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCC
CAGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCG
AGGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGA
GGGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAG
GGCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGG
GCTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGG
CTCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGA
TCCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGAC
CCTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACC
CTCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCA
TCCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAG
CCGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGT
CGTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTT
GTGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTC
TGCCCAGGATGCCTCACAGGGTGGGGGCCTGTGCCCGAGGGACCAGTTCT
GCCCAGGATGCCTCACAGGGTGGGGGCCTGT

In [35]:
mapped = ["36131758",
"36131759",
"36131760",
"36131761",
"36131762",
"36131763",
"36131764",
"36131765",
"36131766",
"36131767",
"36131768",
"36131769",
"36131770",
"36131771",
"36131772",
"36131773",
"36131774",
"36131775",
"36131776",
"36131777",
"36131778",
"36131779",
"36131780",
"36131781",
"36131782",
"36131783",
"36131784",
"36131785",
"36131786",
"36131787",
"36131788",
"36131789",
"36131790",
"36131791",
"36131792",
"36131793",
"36131794",
"36131795",
"36131796",
"36131797",
"36131798",
"36131799",
"36131800",
"36131801",
"36131802",
"36131803",
"36131804",
"36131805",
"36131806",
"36131807",
"36131808",
"36131809",
"36131810",
"36131811",
"36131812",
"36131813",
"36131814",
"36131815",
"36131816",
"36131817",
"36131818",
"36131819",
"36131820",
"36131821",
"36131822",
"36131823",
"36131824",
"36131825",
"36131826",
"36131827",
"36131828",
"36131829",
"36131830",
"36131831",
"36131832",
"36131833",
"36131834",
"36131835",
"36131836",
"36131837",
"36131838",
"36131839",
"36131840",
"36131841",
"36131842",
"36131843",
"36131844",
"36131845",
"36131846",
"36131847",
"36131848",
"36131849",
"36131850",
"36131851",
"36131852",
"36131853",
"36131854",
"36131855",
"36131856",
"36131857",
"36131858",
"36131859",
"36131860",
"36131861",
"36131862",
"36131863",
"36131864",
"36131865",
"36131866",
"36131867",
"36131868",
"36131869",
"36131870",
"36131871",
"36131872",
"36131873",
"36131874",
"36131875",
"36131876",
"36131877",
"36131878",
"36131879",
"36131880",
"36131881",
"36131882",
"36131883",
"36131884",
"36131885",
"36131886",
"36131887",
"36131888",
"36131889",
"36131890",
"36131891",
"36131892",
"36131893",
"36131894",
"36131895",
"36131896",
"36131897",
"36131898",
"36131899",
"36131900",
"36131901",
"36131902",
"36131903",
"36131904",
"36131905",
"36131906",
"36131907",
"36131908",
"36131909",
"36131910",
"36131911",
"36131912",
"36131913",
"36131914",
"36131915",
"36131916",
"36131917",
"36131918",
"36131919",
"36131920",
"36131921",
"36131922",
"36131923",
"36131924",
"36131925",
"36131926",
"36131927",
"36131928",
"36131929",
"36131930",
"36131931",
"36131932",
"36131933",
"36131934",
"36131935",
"36131936",
"36131937",
"36131938",
"36131939",
"36131940",
"36131941",
"36131942",
"36131943",
"36131944",
"36131945",
"36131946",
"36131947",
"36131948",
"36131949",
"36131950",
"36131951",
"36131952",
"36131953",
"36131954",
"36131955",
"36131956",
"36131957",
"36131958",
"36131959",
"36131960",
"36131961",
"36131962",
"36131963",
"36131964",
"36131965",
"36131966",
"36131967",
"36131968",
"36131969",
"36131970",
"36131971",
"36131972",
"36131973",
"36131974",
"36131975",
"36131976",
"36131977",
"36131978",
"36131979",
"36131980",
"36131981",
"36131982",
"36131983",
"36131984"]

for i in range(36131776, 36132003):
    if str(i) not in mapped:
        print(i)


36131985
36131986
36131987
36131988
36131989
36131990
36131991
36131992
36131993
36131994
36131995
36131996
36131997
36131998
36131999
36132000
36132001
36132002
