In [1]:
import requests 
import pandas as pd
from random import sample 
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns                                                             
import sys 
from networkx.algorithms import bipartite
from networkx.algorithms.community import greedy_modularity_communities

In [2]:
# Per-phage metadata table (one row per phage); preview the first rows.
# NOTE(review): the saved preview shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index included — confirm whether it is needed.
df_phages = pd.read_csv(filepath_or_buffer="data/phage_metadata.csv")
df_phages.head(n=5)

Unnamed: 0,phage,temperate,cluster,subcluster,morphotype,host genus,host species,genome length,is annotated,is phamerated,gcpercent
0,20ES,True,A,A2,SIPHO,Mycobacterium,smegmatis,53124,False,True,63.4
1,244,True,E,,SIPHO,Mycobacterium,smegmatis,74483,True,True,63.4
2,32HC,True,Z,,SIPHO,Mycobacterium,smegmatis,50781,False,True,65.7
3,39HC,False,B,B6,SIPHO,Mycobacterium,smegmatis,71565,False,True,70.0
4,40AC,True,A,A17,SIPHO,Mycobacterium,smegmatis,53396,False,True,63.3


In [3]:
# Gene table: one row per gene, carrying the phage it belongs to, its position
# ("gene number") and its functional annotation ("function"), as shown in the
# preview below.
df_genes = pd.read_csv(filepath_or_buffer="data/cleaned_gene_list.csv")
df_genes.head(n=5)

Unnamed: 0,gene ID,pham,function,translation,orientation,phage,gene number,uncleaned function
0,20ES_CDS_1,36676,NKF,MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPEN...,F,20ES,1,
1,20ES_CDS_10,39578,lysin b,MSLQVGSSGELVNRWIRVMKARFASYAGKLKEDGYFGLDDKAVQQE...,F,20ES,10,lysin b
2,20ES_CDS_11,34196,terminase,MSLENHHPELAPSPPHIIGPSWQRTVDGSWHLPDPKMTLGWGVLKW...,F,20ES,11,terminase
3,20ES_CDS_12,39511,portal protein,MTAPLPGQEEIPDPAIARDEMISAFDDAVKNLKINTSYYEAERRPE...,F,20ES,12,portal protein
4,20ES_CDS_13,21454,capsid maturation protease,MITAAVAAYVQRFASMFTGPALSLGEWARFLQTLFPEVQRRYAQAA...,F,20ES,13,capsid maturation protease


In [4]:
# Genomes that contribute to the transition counts. To restrict the analysis to
# a single cluster, swap in the commented-out selection below.
# phages = sorted(list(df_phages[df_phages["cluster"]=="A"]["phage"].unique()))
phages = df_phages["phage"].unique()
gene_identifier = "function"

# One Markov state per distinct value of the identifier column, bracketed by two
# sentinel states that mark the beginning and the end of a genome.
labels = ["5' start"] + list(np.sort(df_genes[gene_identifier].unique())) + ["3' end"]
label_to_index = {label: idx for idx, label in enumerate(labels)}

start_index = label_to_index["5' start"]
end_index = label_to_index["3' end"]

# transition_counts[a, b] = number of times state b immediately follows state a
# when walking each genome in gene-number order.
transition_counts = np.zeros((len(labels), len(labels)))

# Split the gene table by phage in a single pass instead of re-scanning the
# whole table once per phage. Each entry is that phage's sequence of state
# indices in gene-number order.
states_by_phage = {
    name: [
        label_to_index[ident]
        for ident in genes.sort_values(by=['gene number'])[gene_identifier]
    ]
    for name, genes in df_genes.groupby("phage", sort=False)
}

for phage in phages:
    # A phage with no rows in df_genes still contributes a single
    # "5' start" -> "3' end" transition, exactly as a row-by-row walk would.
    path = [start_index, *states_by_phage.get(phage, []), end_index]
    for src, dst in zip(path, path[1:]):
        transition_counts[src, dst] += 1

In [5]:
# Row-normalise the counts: row a of transition_matrix holds the empirical
# probability of each next state given the current state a.
# Rows that sum to zero (states never seen as the source of a transition) are
# divided by 1 instead, so they stay all-zero rather than becoming NaN.
row_totals = transition_counts.sum(axis=1, keepdims=True)
safe_totals = np.where(row_totals > 0, row_totals, 1)
transition_matrix = transition_counts / safe_totals

In [6]:
# Transitions observed with probability exactly 1, i.e. source states that were
# always followed by the same next state. The comparison with 1 is exact here
# because such an entry is a count divided by an identical row total.
# NOTE(review): this cell previously scanned the matrix with a `pass` body and
# produced nothing; collecting the matches is the presumed intent — confirm.
certain_src, certain_dst = np.nonzero(transition_matrix == 1)
deterministic_transitions = [
    (labels[src], labels[dst]) for src, dst in zip(certain_src, certain_dst)
]
deterministic_transitions[:10]

In [7]:
# Build a graph over the states: nodes are integer indices into `labels`, and
# two states are joined wherever the transition matrix has a non-zero entry.
# `nx.from_numpy_matrix` was removed in NetworkX 3.0; `nx.from_numpy_array` is
# its replacement and accepts the same ndarray input.
# NOTE(review): with the default `create_using` this is an undirected Graph, so
# the direction of each transition is not kept — confirm that is intended.
G_markov = nx.from_numpy_array(transition_matrix)

In [8]:
# Partition the state graph into communities by greedy modularity maximisation.
# weight=None means edge weights (transition probabilities) are ignored and only
# the presence of an edge matters.
groupings = greedy_modularity_communities(G_markov, weight=None)

# Print at most this many communities. Slicing (rather than indexing a fixed
# range) avoids an IndexError when fewer communities are found.
max_groups_shown = 30
for g, members in enumerate(groupings[:max_groups_shown]):
    print("GROUP ",g)
    for i in members:
        print(labels[i])
    print()
    print()

GROUP  0
serine homologous recombinase
NKF
adp-ribosyl glycohydrolase
purple acid phosphatase
arsenate reductase
dpda-like trna-guanine transglycosylase
dsdna break-binding protein, adda-like
lipase, lipc-like
pentapeptide repeat protein
pe/ppe family protein
trna-methyltransferase
tyrosine homologous recombinase


GROUP  1
holin
lysin a, glycosyl hydrolase domain
lysin a, l-ala-d-glu peptidase domain
lysin a, n-acetylmuramoyl-l-alanine amidase domain
lysin a, protease c39 domain
lysin a, protease domain
lysin a, protease m15 domain
lysin a, protease m23 domain


GROUP  2
major tail protein
tail assembly chaperone
tail fiber
capsid decoration protein
glycoside hydrolase
head fiber protein


GROUP  3
adenylate kinase
terminase
lipoyl synthase
glucosaminyl deacetylase
rnasee
gtp cyclohydrolase i


GROUP  4
5' nucleotidase
phosphatase
oxidoreductase
deoxynucleoside monophosphate kinase
dna binding, hu-like domain


GROUP  5
pnuc-like nicotinamide riboside transporter
histidine triad nucle