#### Step 1. Find all assemblies

In [1]:
from datetime import datetime
from glob import glob
from os import chdir, listdir, path, stat, system
from pandas import Series
from pickle import dump, load
from sys import path as spath
spath.append("scripts/")
from CRISPRtools import CasOperon
chdir("data")
def baseAssembly(fname): return fname[:fname.rfind(".")], fname[fname.rfind("."):]

In [2]:
theDate = datetime.now()
allAssemblies = {}
refDatabases = ["NCBI/refseq/archaea","NCBI/refseq/bacteria","NCBI/genbank/bacteria","NCBI/genbank/archaea","PATRIC2/fastas"]
baseDbsDir = "/mnt/research/germs/shane/databases/assemblies/"
validExts = set([".fasta",".fna",".fa"])
for db in refDatabases:
    print("Checking",db)
    for assembly in listdir(baseDbsDir+db):
        asmID, ext = baseAssembly(assembly)
        if ext not in validExts: continue
        allAssemblies[asmID] = baseDbsDir + db + "/" + assembly
print("Ready to dump")
dump(allAssemblies,open("pickles/allAssemblies.p","wb"))     
print("Number of assemblies on %s: %i" % (theDate, len(allAssemblies)))

Checking NCBI/refseq/archaea
Checking NCBI/refseq/bacteria
Checking NCBI/genbank/bacteria
Checking NCBI/genbank/archaea
Checking PATRIC2/fastas
Ready to dump
Number of assemblies on 2019-05-14 11:55:14.287848: 619349


In [2]:
tStamp = path.getmtime("pickles/allAssemblies.p")
theDate = datetime.fromtimestamp(tStamp).strftime('%H:%M:%S %m-%d-%Y')
allAssemblies = load(open("pickles/allAssemblies.p","rb"))
print("Number of assemblies from all sources as of %s: %i assemblies" % (theDate, len(allAssemblies)))

Number of assemblies from all sources as of 17:10:53 05-13-2019: 619349 assemblies


#### Step 2. Process the CRISPR results
1. Check to see if the assembly related to file has a crispr array
2. Add any assembly with a crispr array to a master list

In [3]:
from CRISPRtools import CasOperons
crisprDir = "/mnt/research/germs/shane/databases/crisprs/"
tools = ["pilerCR/", "minCED/"]
dbs = ['pat2','genbank','refseq']
crisprFiles = CasOperons()
for tool in tools:
    for db in dbs:
        filePath = crisprDir+tool+db+"/"
        crisprFiles.hasCrispr(listdir(filePath), tool == "pilerCR/",filePath,allAssemblies)
crisprFiles.saveProgress()
dump(crisprFiles, open("pickles/casOperonDataStructure.p","wb")) 

Working on checking 216439 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/pat2/
	14% of the way through with 18011 CRISPRs found
	29% of the way through with 36017 CRISPRs found
	44% of the way through with 54121 CRISPRs found
	59% of the way through with 72164 CRISPRs found
	74% of the way through with 90322 CRISPRs found
	89% of the way through with 108534 CRISPRs found
Working on checking 462727 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/genbank/
	14% of the way through with 141872 CRISPRs found
	29% of the way through with 163164 CRISPRs found
	44% of the way through with 184595 CRISPRs found
	59% of the way through with 205849 CRISPRs found
	74% of the way through with 227070 CRISPRs found
	89% of the way through with 248362 CRISPRs found
Working on checking 160687 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/refseq/
	14% of the way through with 277003 CRISPRs found
	29% of the way through with 291255 CRISPRs found
	44% o

In [9]:
filePath = crisprDir+tools[0]+dbs[1]+"/"
filePath

'/mnt/research/germs/shane/databases/crisprs/pilerCR/pat2/'

In [16]:
casop.hasCrispr(listdir(filePath), True, filePath,allAssemblies)


Working on checking 216439 CRISPRs from /mnt/research/germs/shane/databases/crisprs/pilerCR/pat2/
	14% of the way through with 18011 CRISPRs found
	29% of the way through with 36017 CRISPRs found
	44% of the way through with 54121 CRISPRs found
	59% of the way through with 72164 CRISPRs found
	74% of the way through with 90322 CRISPRs found
	89% of the way through with 108534 CRISPRs found


In [4]:
dump(crisprFiles, open("pickles/casOperonDataStructure.p","wb")) 

In [14]:
def baseFile(fname): return fname[:fname.rfind(".")], fname[fname.rfind("."):]
class CasOperons:
    def __init__(self):
        self.casOperon = {}
        self.crisprs = {}
        self.numCrisprFiles = 0
        self.revMap = {}
        self.uniqNukSeqs = {}
        self.uniqNukSeqMap = {}
    
    def hasCrispr(self,crisprFiles,toolType,crisprPath,assemblyPath):
        nCrisprs = len(crisprFiles)
        print("Working on checking %i CRISPRs from %s" % (nCrisprs,crisprPath))
        validExts = set([".pcrout",".mnout"])
        percCutoff = int(nCrisprs*.15)
        counter = 0
        self.numCrisprFiles += nCrisprs
        if toolType: minSize = 200 #PilerCR File
        else: minSize = 0 #MinCED File
        for fileName in crisprFiles:
            counter += 1
            if counter % percCutoff == 0: print("\t%i%% of the way through with %i CRISPRs found" % (int((counter/float(nCrisprs))*100),len(self.crisprs)))
            fsize = stat(crisprPath+"/"+fileName).st_size
            baseAsmName, ext = baseFile(fileName)
            if fsize <= minSize or ext not in validExts:continue
            try: self.crisprs[baseAsmName].addCRISPR(crisprPath + fileName)
            except: self.crisprs[baseAsmName] = CasOperon(assemblyPath[baseAsmName],crisprPath + fileName)

In [15]:
casop = CasOperons()

In [8]:
assembliesWCrisprs = allAssemblies # copy.deepcopy(allAssemblies)
seqHashes,notInDB,nf = set(),set(),[]
sizeDist = []
samples = []
for db in dbs:
    for fileName in pilerCRFiles[db]:
        fsize = stat(pilerCRDir+db+"/"+fileName).st_size
        assemblyID = fileName[:fileName.rfind(".")] 
        if (fsize <= 200): 
            del assembliesWCrisprs[assemblyID]
        else: 
            sizeDist.append(fsize)
            samples.append(assemblyID)
print("Number of CRISPRs detected:",len(assembliesWCrisprs))
print("Not in db:", len(notInDB))
print(len(sizeDist))
assembliesWCrisprs = dump(open("pickles/allAssemblyW_CRISPRs.p","rb")) 

dump(assembliesWCrisprs,open("data/pickles/allAssemblyCRISPRs.p","wb"))     

Number of CRISPRs detected: 353602
Not in db: 0
353602


In [10]:
sizeDist= Series(sizeDist,index=samples)
sizeDist.describe()

count    353602.000000
mean       6349.021241
std        6234.788022
min        1419.000000
25%        1964.000000
50%        4302.000000
75%        8160.000000
max      166672.000000
dtype: float64

In [18]:
# from os import system,path
# from pickle import load
# count = 0
# assembliesWCrisprs = load(open("data/pickles/allAssemblyW_CRISPRs.p","rb"))
# asmLinkPath = "/mnt/research/germs/shane/transActRNA/data/assemblies/assemblies_W_CRISPRs"
# for asmName, asmPath in assembliesWCrisprs.items():
#     cmd = "ln -s %s %s/%s.fasta" % (asmPath,asmLinkPath,asmName)
#     if not path.exists("%s/%s.fasta" % (asmLinkPath,asmName)): 
#         count+=1
# #         system(cmd)    
# count

6940

In [None]:
pilerCR_results[protID] = PilerCRReader(pilerCRFileName)

In [None]:
for fileName in files:
    if (".pcrout" not in fileName):continue
    system("mv %s /mnt/research/germs/shane/databases/crisprs/pilerCR/pat2/" % (fileName))

In [None]:
import hashlib
def get_md5(filePath):
    with open(filePath, 'rb') as fh:
        m = hashlib.md5()
        while True:
            data = fh.read(8192)
            if not data:
                break
            m.update(data)
        return m.hexdigest()

In [17]:
count =0
linkpaths = listdir(asmLinkPath)
for fname in linkpaths:
    if fname.replace(".fasta","") not in allAssemblies:
        system("unlink %s/%s" %(asmLinkPath,fname))
        count +=1
count,len(linkpaths)

(97286, 440288)

In [None]:
%%bash
ls