In [1]:
#Author: Sophia Bałdysz 
#This class compares the positive pHMMs to all other pHMMs. These comparisons are done based on pickled database files 

import os 
import pathlib
import pandas as pd
import pickle


class compareDatabases():
    """
    Compares the positive pHMMs to all other pHMMs found in pickled database files.

    Attributes set by __init__:
        keyslist     - list of positive pHMM names read from the positive list file
        positivedict - positive pHMMs that have more than MIN_POSITIVE_SIZE proteins
        totaldict    - every pHMM found in the databases loaded so far
        edgefile     - edge file path given to the constructor
        nodefile     - node file path given to the constructor
    """
    # A positive pHMM is only kept when it has MORE than this many proteins.
    MIN_POSITIVE_SIZE = 5

    def __init__(self, positivelistfile, edgefile, nodefile):
        """
        requires a list of positive pHMMs (read with pandas.read_csv, needs a 'Keys' column)
        the databases themselves are pickle files with pHMMs: proteins (see loadDictionary)
        edgefile can be imported into Cytoscape to draw a network
        nodefile gives counts of the proteins of the pHMMs and can be used as an attribute in Cytoscape
        """
        self.keyslist = self.getPositivefromlist(positivelistfile)
        # The output paths are kept for reference; runComparisonOfDicts still
        # receives its output paths explicitly.
        self.edgefile = edgefile
        self.nodefile = nodefile
        self.positivedict = {}
        self.totaldict = {}

    def loadDictionary(self, filename):
        """
        this function unpacks one dictionary stored in binary (pickle) form

        Every entry is added to totaldict (a later file overwrites an earlier
        one for the same key). Entries whose key is in keyslist and that have
        more than MIN_POSITIVE_SIZE proteins are also added to positivedict.
        """
        # NOTE(review): pickle.load can execute arbitrary code - only load
        # database files that come from a trusted source.
        with open(filename, 'rb') as pkl_file:
            dictionary = pickle.load(pkl_file)

        # Set lookup instead of scanning the whole list for every key.
        positive_keys = set(self.keyslist)
        for k, v in dictionary.items():
            if k in positive_keys and len(v) > self.MIN_POSITIVE_SIZE:
                self.positivedict[k] = v
            self.totaldict[k] = v

    def getPositivefromlist(self, listfile):
        """
        this function gets the list of positive pHMMs from a csv file

        The file needs a header row with a 'Keys' column. For a file without
        a header, pass header=None (and column names) to pd.read_csv.
        Returns the values of the 'Keys' column as a list.
        """
        df = pd.read_csv(listfile)
        Keys_list = df['Keys'].tolist()
        return Keys_list

    def CompareValOfSets(self, a, b):
        """
        compares protein overlaps for pHMMs. a - positive pHMM, b - other pHMM

        Returns the size of the intersection divided by the size of the union,
        as a percentage rounded to 4 decimals. Two empty sets share nothing,
        so they give 0.0 instead of a ZeroDivisionError.
        """
        overlap = a.intersection(b)
        total = a.union(b)
        if not total:
            return 0.0
        results = round(float(len(overlap)) / len(total) * 100, 4)
        return results

    def runComparisonOfDicts(self, dict1, dict2, edgefile, nodefile):
        """
        this function takes the pHMMs from the positive dictionary and compares them to all other pHMMs
        The output files are tab-delimited files that can be imported into Cytoscape

        dict1    - positive pHMMs: proteins (None falls back to self.positivedict)
        dict2    - all pHMMs: proteins (None falls back to self.totaldict)
        edgefile - receives 'pHMM <tab> pHMM <tab> overlap percentage' lines
        nodefile - receives 'pHMM <tab> number of proteins' lines

        Both files are opened in append mode, so existing content is kept.
        Within one call every node is written to nodefile only once.
        """
        positive = self.positivedict if dict1 is None else dict1
        total = self.totaldict if dict2 is None else dict2
        written_nodes = set()

        # 'with' closes both files even when a comparison raises.
        with open(edgefile, 'a+') as efile, open(nodefile, 'a+') as nfile:
            for key, proteins in positive.items():
                if key not in written_nodes:
                    nfile.write(key+'\t'+str(len(proteins))+'\n')
                    written_nodes.add(key)
                # self-edge so that pHMMs without any neighbour still show up in the network
                efile.write(key+'\t'+key+'\t'+'100.0'+'\n')
                for key2, other in total.items():
                    if key == key2:
                        continue
                    result = self.CompareValOfSets(proteins, other)
                    if result != 0.0:
                        efile.write(key+'\t'+key2+'\t'+str(result)+'\n')
                        if key2 not in written_nodes:
                            nfile.write(key2+'\t'+str(len(other))+'\n')
                            written_nodes.add(key2)

        
# Replace the placeholder paths below with real locations before running.
mycomparison = compareDatabases(
    ".../positivelistfile.txt",
    ".../edgefile.txt",
    ".../nodefile.txt",
)

# Folder that holds the pickled files of all the databases (one .pkl per database).
pklfilefolder = pathlib.Path(".../pathtofolder for pickled files of all the databases")

# iterdir() yields entries in arbitrary order; sorting makes the load order
# (and therefore which database wins for a duplicated pHMM key) reproducible.
databasefiles = sorted(f.as_posix() for f in pklfilefolder.iterdir() if f.suffix == '.pkl')

for dbfile in databasefiles:
    mycomparison.loadDictionary(dbfile)

In [2]:
# runComparisonOfDicts takes (dict1, dict2, edgefile, nodefile); the positive
# list file was already consumed by the constructor and is not passed here.
# Replace the placeholder paths with real locations before running.
mycomparison.runComparisonOfDicts(
    mycomparison.positivedict,
    mycomparison.totaldict,
    ".../edgefile.txt",
    ".../nodefile.txt",
)