#### Step 1. Find all assemblies

In [1]:
from datetime import datetime
from glob import glob
from os import chdir, listdir, path, stat, system
from pandas import Series
from pickle import dump, load
from sys import path as spath
spath.append("scripts/")
from CRISPRtools import CasOperon
chdir("data")
def baseAssembly(fname): return fname[:fname.rfind(".")], fname[fname.rfind("."):]

In [2]:
theDate = datetime.now()
allAssemblies = {}
refDatabases = ["NCBI/refseq/archaea","NCBI/refseq/bacteria","NCBI/genbank/bacteria","NCBI/genbank/archaea","PATRIC2/fastas"]
baseDbsDir = "/mnt/research/germs/shane/databases/assemblies/"
validExts = set([".fasta",".fna",".fa"])
for db in refDatabases:
    print("Checking",db)
    for assembly in listdir(baseDbsDir+db):
        asmID, ext = baseAssembly(assembly)
        if ext not in validExts: continue
        allAssemblies[asmID] = baseDbsDir + db + "/" + assembly
print("Ready to dump")
dump(allAssemblies,open("pickles/allAssemblies.p","wb"))     
print("Number of assemblies on %s: %i" % (theDate, len(allAssemblies)))

Checking NCBI/refseq/archaea
Checking NCBI/refseq/bacteria
Checking NCBI/genbank/bacteria
Checking NCBI/genbank/archaea
Checking PATRIC2/fastas
Ready to dump
Number of assemblies on 2019-06-12 19:46:48.197122: 674734


In [2]:
tStamp = path.getmtime("pickles/allAssemblies.p")
theDate = datetime.fromtimestamp(tStamp).strftime('%H:%M:%S %m-%d-%Y')
allAssemblies = load(open("pickles/allAssemblies.p","rb"))
print("Number of assemblies from all sources as of %s: %i assemblies" % (theDate, len(allAssemblies)))

Number of assemblies from all sources as of 19:47:00 06-12-2019: 674734 assemblies


#### Step 2. Process the CRISPR results
1. Check to see if the assembly related to file has a crispr array
2. Add any assembly with a crispr array to a master list

In [4]:
from CRISPRtools import CRISPRs
crisprDir = "/mnt/research/germs/shane/databases/crisprs/"
tools = ["pilerCR/", "minCED/"]
dbs = ['pat2','genbank','refseq']
crisprFiles = CRISPRs()
for tool in tools:
    for db in dbs: crisprFiles.hasCrispr(listdir(crisprDir+tool+db+"/"), tool == "pilerCR/",crisprDir+tool+db+"/",allAssemblies)
dump(crisprFiles, open("pickles/CRISPRs.p","wb")) 
print("\nDone\n")

Working on checking 216391 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/pat2/
	14% of the way through with 18004 CRISPRs found
	29% of the way through with 36012 CRISPRs found
	44% of the way through with 54110 CRISPRs found
	59% of the way through with 72150 CRISPRs found
	74% of the way through with 90306 CRISPRs found
	89% of the way through with 108513 CRISPRs found
Working on checking 515475 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/genbank/
	14% of the way through with 147680 CRISPRs found
	29% of the way through with 174833 CRISPRs found
	44% of the way through with 202155 CRISPRs found
	59% of the way through with 229280 CRISPRs found
	74% of the way through with 256379 CRISPRs found
	89% of the way through with 283459 CRISPRs found
Working on checking 163324 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/refseq/
	14% of the way through with 316243 CRISPRs found
	29% of the way through with 330763 CRISPRs found
	44% o

In [None]:
# Below this is experimental



























In [7]:
from sys import path as spath
spath.append("scripts/")
from pickle import load
from Bio.SeqIO import write, parse
from collections import Counter
from CRISPRtools import CasOperon
gene='Cas9'
#casOperons = load(open("data/pickles/casOperonDataStructureW%s.p" % gene,"rb"))
protID = "NZ_LT906463.1_ORF95365"
protID = "FNMX01000011_ORF6660"
protID = "CP027232_ORF135234"
genomicAsmName = casOperons.revMap[protID] 
protOperon = casOperons.casOperons[genomicAsmName]
protCRISPR = protOperon.crisprs[protID]

In [8]:
protOperon.assembly, protCRISPR

('/mnt/research/germs/shane/databases/assemblies/PATRIC2/fastas/1316593.3.fna',
 {'minCED': <CRISPRtools.CRISPR at 0x2ad9526890b8>})

In [9]:
protCRISPR['minCED'].repeats

{1: {'CTATCTGCCTGTGCGG': 3},
 2: {'CTTCGTCATTTGTTCGTCATTTGTTCGTC': 1,
  'GTTCGTCATTTCTTCGTTATTTGTTTATC': 1,
  'GTTCGTCATTTGTTCGTCATTTGTTCGTC': 1,
  'GTTCGTCATTTGTTCGTCTTTTCTTCGTC': 1,
  'GTTCGTCATTCGTTCGTCTTTTGTTCGTC': 1}}

In [8]:
assembliesWCrisprs = allAssemblies # copy.deepcopy(allAssemblies)
seqHashes,notInDB,nf = set(),set(),[]
sizeDist = []
samples = []
for db in dbs:
    for fileName in pilerCRFiles[db]:
        fsize = stat(pilerCRDir+db+"/"+fileName).st_size
        assemblyID = fileName[:fileName.rfind(".")] 
        if (fsize <= 200): 
            del assembliesWCrisprs[assemblyID]
        else: 
            sizeDist.append(fsize)
            samples.append(assemblyID)
print("Number of CRISPRs detected:",len(assembliesWCrisprs))
print("Not in db:", len(notInDB))
print(len(sizeDist))
assembliesWCrisprs = dump(open("pickles/allAssemblyW_CRISPRs.p","rb")) 

dump(assembliesWCrisprs,open("data/pickles/allAssemblyCRISPRs.p","wb"))     

Number of CRISPRs detected: 353602
Not in db: 0
353602


In [10]:
sizeDist= Series(sizeDist,index=samples)
sizeDist.describe()

count    353602.000000
mean       6349.021241
std        6234.788022
min        1419.000000
25%        1964.000000
50%        4302.000000
75%        8160.000000
max      166672.000000
dtype: float64

In [18]:
# from os import system,path
# from pickle import load
# count = 0
# assembliesWCrisprs = load(open("data/pickles/allAssemblyW_CRISPRs.p","rb"))
# asmLinkPath = "/mnt/research/germs/shane/transActRNA/data/assemblies/assemblies_W_CRISPRs"
# for asmName, asmPath in assembliesWCrisprs.items():
#     cmd = "ln -s %s %s/%s.fasta" % (asmPath,asmLinkPath,asmName)
#     if not path.exists("%s/%s.fasta" % (asmLinkPath,asmName)): 
#         count+=1
# #         system(cmd)    
# count

6940

In [None]:
pilerCR_results[protID] = PilerCRReader(pilerCRFileName)

In [None]:
for fileName in files:
    if (".pcrout" not in fileName):continue
    system("mv %s /mnt/research/germs/shane/databases/crisprs/pilerCR/pat2/" % (fileName))

In [None]:
import hashlib
def get_md5(filePath):
    with open(filePath, 'rb') as fh:
        m = hashlib.md5()
        while True:
            data = fh.read(8192)
            if not data:
                break
            m.update(data)
        return m.hexdigest()

In [17]:
count =0
linkpaths = listdir(asmLinkPath)
for fname in linkpaths:
    if fname.replace(".fasta","") not in allAssemblies:
        system("unlink %s/%s" %(asmLinkPath,fname))
        count +=1
count,len(linkpaths)

(97286, 440288)

In [None]:
%%bash
ls