In [3]:
from Bio.SeqIO import parse
import pandas

In [4]:
from collections import defaultdict
import pandas as pd
from Bio import SearchIO
from colorama import Fore, Back, Style

# Attributes copied from every parsed hit object into the `hits` table.
attribs = [ 'accession', 'bias', 'bitscore', 'description',
            'cluster_num', 'domain_exp_num', 'domain_included_num',
            'domain_obs_num', 'domain_reported_num', 'env_num',
            'evalue', 'id', 'overlap_num', 'region_num']

# Column-oriented table ( attribute name -> list of values ) of every hit.
# Created once, outside the file loop, so that it accumulates hits from
# ALL input files instead of being reset for each file.
hits = defaultdict(list)

# hit id -> { 'hmm' : id of the query that produced the hit,
#             'bitscore' : bitscore of that hit }
hitlist = {}

for hmm_hits in snakemake.input.hits :
    with open( hmm_hits ) as handle :
        for queryresult in SearchIO.parse( handle, 'hmmer3-tab' ) :
            for hit in queryresult.hits :
                for attrib in attribs :
                    hits[attrib].append( getattr(hit, attrib) )
                # The same hit id can be reported by more than one query
                # (or input file). Keep the highest-scoring match rather
                # than whichever one happened to be parsed last.
                best = hitlist.get( hit.id )
                if best is None or hit.bitscore > best['bitscore'] :
                    hitlist[hit.id] = { 'hmm'      : queryresult.id,
                                        'bitscore' : hit.bitscore }

# Note : proteins are counted from 1 instead of 0 because that's how
# prodigal numbers them. All other counts start from 0.

# Write a per-genome report of contigs and their proteins to
# statistics/search_results.txt, echoing every line to stdout as well.
# Proteins found in `hitlist` are followed by a line giving the matching
# hmm and bitscore, and are printed in red on the terminal.
# NOTE(review): the 'protien' spelling in the report lines is kept as-is
# in case anything downstream depends on the current format -- confirm
# before correcting it.
with open( 'statistics/search_results.txt', 'w' ) as f :
    for fna, faa in zip( snakemake.input.genomes, snakemake.input.proteins ) :
        print( fna )
        f.write( fna + '\n' )

        # Read the record ids inside `with` blocks so the FASTA handles
        # are closed instead of being left open for every genome.
        with open( fna ) as genome_handle :
            contigs = [ rec.id for rec in parse( genome_handle, 'fasta' ) ]
        with open( faa ) as protein_handle :
            proteins = [ rec.id for rec in parse( protein_handle, 'fasta' ) ]

        # Group protein ids by contig. The contig name is the protein id
        # with its final '_<suffix>' removed.
        annot = defaultdict( list )
        for protein in proteins :
            annot[ protein.rsplit( '_', 1 )[0] ].append( protein )

        for n, contig in enumerate( contigs ) :
            contig_line = '   contig {n} : {c} '.format( n=n, c=contig )
            print( contig_line )
            f.write( contig_line + '\n' )

            # .get() : a contig with no proteins in the protein file has
            # no entry in `annot`; list it with no proteins instead of
            # failing with a KeyError. Proteins are numbered from 1,
            # contigs from 0.
            for m, protein in enumerate( annot.get( contig, [] ), start=1 ) :
                protein_line = '      protien {m} : {p}'.format( m=m, p=protein )
                hit = hitlist.get( protein )

                if hit is None :
                    print( protein_line )
                    f.write( protein_line + '\n' )
                    continue

                hmm_line = '         hmm : {h} bitscore {b}'.format( h=hit['hmm'],
                                                                     b=hit['bitscore'] )
                # Colour codes go to the terminal only, never to the file.
                print( Fore.RED + protein_line )
                print( Fore.RED + hmm_line + Style.RESET_ALL )
                f.write( protein_line + '\n' )
                f.write( hmm_line + '\n' )

In [None]:
# Disabled code : everything below is wrapped in a triple-quoted string,
# so none of it is executed. The snippet builds one SuchTree per Newick
# file in snakemake.input.trees and, for every pair of trees, computes
# the pairwise distances between the leaf names the two trees share.
# NOTE(review): `distances` is assigned into but never initialised
# within this snippet, and `seaborn` is imported but not used in it;
# both need attention before this cell is re-enabled.
'''
from SuchTree import SuchTree
from itertools import combinations
import seaborn

forest = {}
for nwk in snakemake.input.trees :
    forest[nwk] = SuchTree( nwk )

for a,b in combinations( forest, 2 ) :
    T1 = forest[a]
    T2 = forest[b]
    leafs = set( T1.leafs.keys() ) & set( T2.leafs.keys() )
    X = T1.distances_by_name( list( combinations( leafs, 2 ) ) )
    Y = T2.distances_by_name( list( combinations( leafs, 2 ) ) )
    distances[ frozenset( ( a, b ) ) ] = { a : X, b : Y }
'''

In [6]:
# Write the literal text 'done' (no trailing newline) to
# statistics/summary.txt -- presumably a completion marker for the
# workflow; confirm against the rule that runs this notebook.
with open( 'statistics/summary.txt', 'w' ) as summary_file :
    print( 'done', end='', file=summary_file )