In [70]:
import requests
import pandas as pd
import re

import os


# Hard-coded input files of the Campylobacter coli pipeline run; later cells read these four names directly.
# NOTE(review): the cluster file is "cdhit_test.fatsa.clstr", which does not follow the
# "{species}.fasta.clstr" naming built by generate_files_paths() - confirm this is the intended file.
clstr_path="../pangenome-repo/Pangenome-Analysis-Workflow/codes/Campylobacter_coli/cdhit_test.fatsa.clstr"
freq_path="../pangenome-repo/Pangenome-Analysis-Workflow/codes/Campylobacter_coli/Campylobacter_coli_cluster_frequencies.csv"
pan_annot_path='../pangenome-repo/Pangenome-Analysis-Workflow/codes/Campylobacter_coli/Campylobacter_coli_pangenome.csv'
clstr_fasta_path='../pangenome-repo/Pangenome-Analysis-Workflow/codes/Campylobacter_coli/Campylobacter_coli.fasta'

## essentials

In [9]:
import requests
import pandas as pd
import re
import os
import networkx as nx

def generate_files_paths(pipeline_output_path):
    '''
    Build the four input-file paths expected inside a species' pipeline output directory.

    Every file name is derived from the directory's own base name (trailing '/' is ignored),
    e.g. "out/Campylobacter_coli/" -> "out/Campylobacter_coli/Campylobacter_coli.fasta.clstr".
    The files are not opened or checked for existence here.

    Parameters:
    -----------
    pipeline_output_path: str, path to the pipeline output directory

    Returns:
    --------
    clstr_path: str, path to the cluster file (CD-HIT output)
    freq_path: str, path to the cluster frequency file
    pan_annot_path: str, path to the pangenome annotation file
    clstr_fasta_path: str, path to the cluster fasta file (also CD-HIT output)
    '''
    root = pipeline_output_path.rstrip('/')
    # common "<dir>/<species>" stem shared by all four files
    stem = f"{root}/{os.path.basename(root)}"

    suffixes = ('.fasta.clstr', '_cluster_frequencies.csv', '_pangenome.csv', '.fasta')
    return tuple(stem + suffix for suffix in suffixes)

# clstr_path, freq_path, pan_annot_path, clstr_fasta_path= generate_files_paths("../pangenome-repo/Pangenome-Analysis-Workflow/codes/Campylobacter_coli/")

def get_cluster_frequency(freq_path):
    '''
    Load the per-cluster frequency table as a DataFrame indexed by cluster name.

    The first CSV column is consumed as a throw-away row index. Exactly two further
    columns are expected; they are renamed to 'Cluster' and 'frequency'.

    Parameters:
    freq_path: str, path to the cluster frequency CSV

    Returns:
    freq_df: pandas.DataFrame, index 'Cluster', single column 'frequency'
    '''
    table = pd.read_csv(freq_path, index_col=0)
    table.columns = ['Cluster', 'frequency']

    # promote the cluster name to the index; the column itself is dropped by set_index
    return table.set_index('Cluster')

def get_cluster_representatives(clstr_path:str, save_csv:bool=False)->pd.DataFrame:
    '''
    Parse a CD-HIT .clstr file and map every cluster to its representative sequence ID.

    parameters:
    ----------
    clstr_path: str, path to CD-HIT clstr file
    save_csv: bool, currently unused - the function never reads it, so nothing is written to disk

    return:
    -------
    df: pd.DataFrame, index 'Cluster' (e.g. "Cluster 490"), one column 'gene_representative'
        (e.g. "195.2045.peg.1513")

    description:
    ------------
    A .clstr file is a sequence of cluster headers, each followed by its member lines:

        >Cluster 490
        0   551aa, >fig|195.2045.peg.1513... *
        1   530aa, >fig|195.2053.peg.1024... at 84.34%
        >Cluster 491
        0   501aa, >fig|195.2049.peg.1588... at 99.80%
        1   551aa, >fig|195.2166.peg.1287... *

    The member line containing an asterisk (*) is the representative of its cluster.
    Only the first such line under each header is used; the text between "|" and "... *"
    is stored as the representative ID.
    '''
    representatives = {}   # cluster header text -> representative ID
    current_cluster = ''   # header of the cluster currently being read
    rep_found = False      # True once the current cluster's representative is stored

    with open(clstr_path, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith(">"):
                # new cluster header: remember it and look for its representative again
                current_cluster = raw_line[1:].strip()
                rep_found = False
                continue

            if rep_found or "*" not in raw_line:
                continue

            rep_found = True
            # second comma-separated field looks like " >fig|195.2045.peg.1513... *"
            member_field = raw_line.split(",")[1]
            fig_id = member_field.replace('... *', '').strip()[1:]
            representatives[current_cluster] = fig_id.split("|")[1]

    df = pd.DataFrame(representatives.items(), columns=['cluster', 'gene_representative'])
    df = df.set_index('cluster')
    df.index.name = 'Cluster'

    return df

def get_representative_products(clstr_fasta_path:str)->pd.DataFrame:
    '''
    Takes a CD-HIT fasta output and returns a dataframe that matches each CDS PATRIC ID with its product name

    the fasta output is of this form:

    >fig|195.2024.peg.83 Putative oxidoreductase ferredoxin-type protein, clusters with CPO
    MNFSQISDACVKCGKCIPVCTIHEVNRDETTSPRGFLDLLAAYKEEKLELDKEAKKIFES
    L
    >fig|195.2024.peg.542 Septum-associated rare lipoprotein A
    MKPYTINGKTYYPTVVSVGETADGIASWYGPGFHGKKTSNGETYNQNGLTAAHKTLPMNT

    The aim is to match 195.2024.peg.542 with "Septum-associated rare lipoprotein A" in the dataframe.
    Sequence lines are ignored; only header lines (starting with ">") are parsed.

    param:
    ------
    - clstr_fasta_path: str, path of the fasta output

    return:
    -------
    - df: pd.DataFrame, df of columns gene_representative and product_name
          (product_name is '' for a header that carries no description)

    raises:
    -------
    - ValueError: if a header line is not of the form ">fig|GENOME.peg.N [product]"
    '''
    # Raw string so the backslash escapes reach `re` unchanged. The genome ID is matched
    # as "digits.digits" of any length so that species other than taxon 195 parse too,
    # and the product description is optional.
    header_pattern = re.compile(r">fig\|(\d+\.\d+\.peg\.\d+)(?:\s+(.*))?")

    products = {}

    with open(clstr_fasta_path, 'r') as fasta_file:
        # iterate lazily instead of loading the whole fasta into memory
        for line_number, line in enumerate(fasta_file, start=1):
            if not line.startswith(">"):
                continue

            header = header_pattern.fullmatch(line.rstrip())
            if header is None:
                raise ValueError(
                    f"{clstr_fasta_path}:{line_number}: unrecognised fasta header {line.rstrip()!r}"
                )
            products[header.group(1)] = header.group(2) or ''

    df = pd.DataFrame(list(products.items()), columns=['gene_representative', 'product_name'])

    return df

def combine_cluster_product(clstr_rep_df:pd.DataFrame, rep_prod_df:pd.DataFrame)->pd.DataFrame:
    '''
    Combines the cluster representative dataframe with the product name dataframe

    The two frames are left-joined on 'gene_representative', so every cluster is kept
    even when no product is known for its representative (product_name is then NaN).
    Neither input frame is modified.

    param:
    ------
    - clstr_rep_df: pd.DataFrame, dataframe with cluster number and gene representative (get_cluster_representatives() output)
    - rep_prod_df: pd.DataFrame, dataframe with gene representative and product name (get_representative_products() output)

    return:
    -------
    - df: pd.DataFrame, indexed by 'Cluster', with the columns of both inputs except 'gene_representative'
    '''
    # The merge discards the left index, so the cluster name must travel as a column.
    # Work on a copy (assign) rather than adding the column to the caller's frame.
    if 'Cluster' in clstr_rep_df.columns:
        left = clstr_rep_df
    else:
        left = clstr_rep_df.assign(Cluster=clstr_rep_df.index)

    merged = pd.merge(left, rep_prod_df, on='gene_representative', how='left')

    return merged.drop(columns='gene_representative').set_index('Cluster')

def get_cluster_pan_gene_class(pan_annot_path:str)->pd.DataFrame:
    '''
    Load the pangenome annotation CSV and keep only the cluster name and its gene class.

    The first CSV column is consumed as a row index. Of the remaining columns, the one at
    position 1 is dropped; exactly two columns must then be left, and they are renamed to
    'Cluster' and 'pan_gene_class'.

    param:
    ------
    - pan_annot_path: str, path to the pangenome annotation file ({species}_pangenome.csv from the pipeline)

    return:
    -------
    - pan_df: pd.DataFrame, index 'Cluster', single column 'pan_gene_class'
    '''
    with open(pan_annot_path, 'r') as handle:
        annotations = pd.read_csv(handle, index_col=0)

    # discard the middle column, then give the two survivors their canonical names
    unwanted = annotations.columns[[1]]
    trimmed = annotations.drop(columns=unwanted)
    trimmed.columns = ['Cluster', 'pan_gene_class']

    return trimmed.set_index('Cluster')

def get_cluster_representatives(clstr_path:str, save_csv:bool=False)->pd.DataFrame:
    '''
    Parse a CD-HIT .clstr file and map every cluster to its representative sequence ID.

    parameters:
    ----------
    clstr_path: str, path to CD-HIT clstr file
    save_csv: bool, currently unused - the function never reads it, so nothing is written to disk

    return:
    -------
    df: pd.DataFrame, index 'Cluster' (e.g. "Cluster 490"), one column 'gene_representative'
        (e.g. "195.2045.peg.1513")

    description:
    ------------
    A .clstr file is a sequence of cluster headers, each followed by its member lines:

        >Cluster 490
        0   551aa, >fig|195.2045.peg.1513... *
        1   530aa, >fig|195.2053.peg.1024... at 84.34%
        >Cluster 491
        0   501aa, >fig|195.2049.peg.1588... at 99.80%
        1   551aa, >fig|195.2166.peg.1287... *

    The member line containing an asterisk (*) is the representative of its cluster.
    Only the first such line under each header is used; the text between "|" and "... *"
    is stored as the representative ID.
    '''
    representatives = {}   # cluster header text -> representative ID
    current_cluster = ''   # header of the cluster currently being read
    rep_found = False      # True once the current cluster's representative is stored

    with open(clstr_path, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith(">"):
                # new cluster header: remember it and look for its representative again
                current_cluster = raw_line[1:].strip()
                rep_found = False
                continue

            if rep_found or "*" not in raw_line:
                continue

            rep_found = True
            # second comma-separated field looks like " >fig|195.2045.peg.1513... *"
            member_field = raw_line.split(",")[1]
            fig_id = member_field.replace('... *', '').strip()[1:]
            representatives[current_cluster] = fig_id.split("|")[1]

    df = pd.DataFrame(representatives.items(), columns=['cluster', 'gene_representative'])
    df = df.set_index('cluster')
    df.index.name = 'Cluster'

    return df

def get_cluster_attributes(clstr_path, freq_path, pan_annot_path, clstr_fasta_path):
    '''
    Assemble one per-cluster attribute table from the four pipeline output files.

    param:
    ------
    - clstr_path: str, CD-HIT .clstr file (read by get_cluster_representatives)
    - freq_path: str, cluster frequency CSV (read by get_cluster_frequency)
    - pan_annot_path: str, pangenome annotation CSV (read by get_cluster_pan_gene_class)
    - clstr_fasta_path: str, CD-HIT fasta file (read by get_representative_products)

    return:
    -------
    - df: pd.DataFrame, column-wise concatenation of the representative, gene-class,
          product and frequency frames, aligned on their index; any 'Cluster' column is removed
    '''
    representatives = get_cluster_representatives(clstr_path)
    gene_classes = get_cluster_pan_gene_class(pan_annot_path)
    products = combine_cluster_product(representatives, get_representative_products(clstr_fasta_path))
    frequencies = get_cluster_frequency(freq_path)

    combined = pd.concat([representatives, gene_classes, products, frequencies], axis=1)

    # the cluster name already lives in the index; drop a redundant 'Cluster' column if one came along
    if 'Cluster' not in combined.columns:
        return combined
    return combined.drop('Cluster', axis=1)

def create_dicts(df:pd.DataFrame):
    '''
    Turn every column of a data frame into its own {index label: value} dictionary.

    param:
    -----
    - df: pd.DataFrame, index=Cluster, cols=gene_representative, pan_gene_class, product_name, frequency

    returns:
    --------
    - dicts: dict, one entry per column - key is the column name, value is that column as a dictionary
    '''
    dicts = {}
    for column_name in df.columns:
        dicts[column_name] = df[column_name].to_dict()
    return dicts

def set_node_attributes_by_cluster(df:pd.DataFrame, G:nx.Graph):
    '''
    takes a dataframe and a graph and sets one node attribute per dataframe column

    Each column becomes a node attribute named after the column; the value for a node
    is looked up by the dataframe index label, so the node names of G must equal the
    index labels (e.g. "Cluster 12") for an attribute to be attached.

    param:
    -----
    - df: pd.DataFrame, index=Cluster, cols=gene_representative, pan_gene_class, product_name, frequency
    - G: nx.Graph, graph to set the node attributes

    returns:
    --------
    - G: nx.Graph, the same graph object that was passed in, with node attributes set
    '''
    for attribute_name, values_by_cluster in create_dicts(df).items():
        nx.set_node_attributes(G, values_by_cluster, attribute_name)

    return G

_output_path='../pangenome-repo/Pangenome-Analysis-Workflow/codes/Campylobacter_coli/'

def set_output_path(path:str):
    '''
    Rebind the module-level `_output_path` to a new pipeline output directory.

    param:
    ------
    - path: str, path to the output directory

    return:
    -------
    - str, the path that was just stored
    '''
    global _output_path
    _output_path = path
    return path

def set_cluster_attributes(G:nx.Graph, pipeline_output=None)->nx.Graph:
    '''
    takes a Graph and the pipeline's output path and performs all of the following:

    - get all the files paths required for the attributes
    - generate df for each and concatenate them all in one of index Cluster
    - create dictionaries out of it and perform nx.set_node_attributes using each on G

    param:
    ------
    G: nx.Graph, network of nodes Clusters
    pipeline_output: str or None, pipeline output directory of the species. When None
        (the default) the current module-level `_output_path` is used; it is looked up at
        call time so that a preceding set_output_path() call takes effect.

    return:
    -------
    G: nx.Graph, network of nodes Clusters, attributes: gene_representative, pan_gene_class, product_name, frequency
    '''
    # A default of `_output_path` in the signature would be frozen when the function is
    # defined; resolve it here instead.
    if pipeline_output is None:
        pipeline_output = _output_path

    clstr_path, freq_path, pan_annot_path, clstr_fasta_path = generate_files_paths(pipeline_output)
    attributes_df = get_cluster_attributes(clstr_path, freq_path, pan_annot_path, clstr_fasta_path)
    G = set_node_attributes_by_cluster(attributes_df, G)

    return G

In [10]:
# Smoke test of the helpers above on the Campylobacter coli run.
# Uses the notebook-relative `_output_path` instead of an absolute, user-specific home directory
# so the cell also runs on other machines.
p1, p2, p3, p4 = generate_files_paths(pipeline_output_path=_output_path)
test_df = get_cluster_attributes(p1, p2, p3, p4)
dicts= create_dicts(test_df)

test_G=nx.read_graphml('../data/graphs-5/FDR_500randomized_SVM_0.6signed_corr_Campylobacter_coli_ciprofloxacin_significant.graphml')

test_G_2=set_cluster_attributes(test_G)

works :)

## cluster to PATRIC ID

In [23]:
def get_cluster_frequency(freq_path):
    '''
    Load the per-cluster frequency table as a DataFrame indexed by cluster name.

    The first CSV column is consumed as a throw-away row index. Exactly two further
    columns are expected; they are renamed to 'Cluster' and 'frequency'.

    Parameters:
    freq_path: str, path to the cluster frequency CSV

    Returns:
    freq_df: pandas.DataFrame, index 'Cluster', single column 'frequency'
    '''
    table = pd.read_csv(freq_path, index_col=0)
    table.columns = ['Cluster', 'frequency']

    # promote the cluster name to the index; the column itself is dropped by set_index
    return table.set_index('Cluster')

# Load the frequency table from `freq_path` (assigned in the path-constants cell) and display it.
f_df=get_cluster_frequency(freq_path)

f_df

Unnamed: 0_level_0,Frequency
Cluster,Unnamed: 1_level_1
Cluster 0,103
Cluster 1,239
Cluster 2,3
Cluster 3,1
Cluster 4,268
...,...
Cluster 8016,1
Cluster 8017,1
Cluster 8018,1
Cluster 8019,1


In [69]:
def get_cluster_representatives(clstr_path:str, save_csv:bool=False)->pd.DataFrame:
    '''
    Parse a CD-HIT .clstr file and map every cluster to its representative sequence ID.

    parameters:
    ----------
    clstr_path: str, path to CD-HIT clstr file
    save_csv: bool, currently unused - the function never reads it, so nothing is written to disk

    return:
    -------
    df: pd.DataFrame, index 'Cluster' (e.g. "Cluster 490"), one column 'gene_representative'
        (e.g. "195.2045.peg.1513")

    description:
    ------------
    A .clstr file is a sequence of cluster headers, each followed by its member lines:

        >Cluster 490
        0   551aa, >fig|195.2045.peg.1513... *
        1   530aa, >fig|195.2053.peg.1024... at 84.34%
        >Cluster 491
        0   501aa, >fig|195.2049.peg.1588... at 99.80%
        1   551aa, >fig|195.2166.peg.1287... *

    The member line containing an asterisk (*) is the representative of its cluster.
    Only the first such line under each header is used; the text between "|" and "... *"
    is stored as the representative ID.
    '''
    representatives = {}   # cluster header text -> representative ID
    current_cluster = ''   # header of the cluster currently being read
    rep_found = False      # True once the current cluster's representative is stored

    with open(clstr_path, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith(">"):
                # new cluster header: remember it and look for its representative again
                current_cluster = raw_line[1:].strip()
                rep_found = False
                continue

            if rep_found or "*" not in raw_line:
                continue

            rep_found = True
            # second comma-separated field looks like " >fig|195.2045.peg.1513... *"
            member_field = raw_line.split(",")[1]
            fig_id = member_field.replace('... *', '').strip()[1:]
            representatives[current_cluster] = fig_id.split("|")[1]

    df = pd.DataFrame(representatives.items(), columns=['cluster', 'gene_representative'])
    df = df.set_index('cluster')
    df.index.name = 'Cluster'

    return df

def get_cluster_attributes(clstr_path, freq_path, pan_annot_path, clstr_fasta_path):
    '''
    Assemble one per-cluster attribute table from the four pipeline output files.

    param:
    ------
    - clstr_path: str, CD-HIT .clstr file (read by get_cluster_representatives)
    - freq_path: str, cluster frequency CSV (read by get_cluster_frequency)
    - pan_annot_path: str, pangenome annotation CSV (read by get_cluster_pan_gene_class)
    - clstr_fasta_path: str, CD-HIT fasta file (read by get_representative_products)

    return:
    -------
    - df: pd.DataFrame, column-wise concatenation of the representative, gene-class,
          product and frequency frames, aligned on their index; any 'Cluster' column is removed
    '''
    representatives = get_cluster_representatives(clstr_path)
    gene_classes = get_cluster_pan_gene_class(pan_annot_path)
    products = combine_cluster_product(representatives, get_representative_products(clstr_fasta_path))
    frequencies = get_cluster_frequency(freq_path)

    combined = pd.concat([representatives, gene_classes, products, frequencies], axis=1)

    # the cluster name already lives in the index; drop a redundant 'Cluster' column if one came along
    if 'Cluster' not in combined.columns:
        return combined
    return combined.drop('Cluster', axis=1)

# Build and display the combined per-cluster attribute table for the four input paths.
get_cluster_attributes(clstr_path, freq_path, pan_annot_path, clstr_fasta_path)

Unnamed: 0_level_0,gene_representative,pan_gene_class,product_name,Frequency
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cluster 0,195.2029.peg.1780,accessory,"helicase, Snf2 family",103
Cluster 1,195.2024.peg.1611,accessory,hypothetical protein,239
Cluster 2,195.2136.peg.1150,unique,hypothetical protein,3
Cluster 3,195.2145.peg.407,unique,hypothetical protein,1
Cluster 4,195.2024.peg.685,core,DNA-directed RNA polymerase beta' subunit (EC ...,268
...,...,...,...,...
Cluster 8016,195.2201.peg.1720,unique,Endonuclease I precursor (EC 3.1.21.1),1
Cluster 8017,195.2201.peg.1791,unique,hypothetical protein,1
Cluster 8018,195.2210.peg.964,unique,conserved hypothetical protein,1
Cluster 8019,195.2230.peg.1815,unique,hypothetical protein,1


In [30]:
# Cluster -> representative PATRIC ID table.
# NOTE(review): save_csv=True is passed, but no definition of get_cluster_representatives in
# this notebook reads that flag, so no CSV is written by this call.
clstr_rep_df=get_cluster_representatives(clstr_path, save_csv=True)
clstr_rep_df

Unnamed: 0_level_0,gene_representative
Cluster,Unnamed: 1_level_1
Cluster 0,195.2029.peg.1780
Cluster 1,195.2024.peg.1611
Cluster 2,195.2136.peg.1150
Cluster 3,195.2145.peg.407
Cluster 4,195.2024.peg.685
...,...
Cluster 8016,195.2201.peg.1720
Cluster 8017,195.2201.peg.1791
Cluster 8018,195.2210.peg.964
Cluster 8019,195.2230.peg.1815


## PATRIC ID to gene name

In [6]:
# now getting the gene names out of PATRIC (BV-BRC)

genome_id="195.2069"
AMR_db="CARD"
drug="ciprofloxacin"
link=f"https://www.bv-brc.org/view/Genome/{genome_id}#view_tab=specialtyGenes&filter=and(eq(property,%22Antibiotic%20Resistance%22),eq(source,%22{AMR_db}%22),eq(antibiotics,%22{drug}%22))"

# NOTE(review): everything after '#' is a URL fragment, which HTTP clients do not send to the
# server. The response is therefore only the generic page shell, not the filtered gene table;
# a data API endpoint is presumably needed instead - TODO confirm.
# A timeout keeps the cell from hanging indefinitely; raise_for_status surfaces HTTP errors.
response=requests.get(link, timeout=30)
response.raise_for_status()
print(response.text)

<!DOCTYPE html>
<html lang="en">
	<head>
		
	

	
	  
		<link rel='stylesheet' href="/js/3.35.5/p3/resources/p3.css" />
	  
		<link rel='stylesheet' href="/js/3.35.5/molstar/mol-bvbrc/molstar.css" />
	  
		<link rel='stylesheet' href="/js/3.35.5/mauve_viewer/dist/mauve-viewer.css" />
	  
		<link rel='stylesheet' href="/js/3.35.5/heatmap/dist/hotmap.css" />
	  
	


	<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
	<meta name="Keywords" content="PATRIC, Pathosystems Resource Integration Center"/>
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<meta name="google-site-verification" content="TkjceNtwo7upCQl_gx8Zmhhp9tCmc08LitZGuvQ1Eb8" />
	<link rel="manifest" href="/public/manifest.json">
	<link rel="icon" sizes="16x16 32x32" href="/patric/favicon.ico">
	<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400&family=Roboto:wght@300;400;700&display=swap" rel="stylesheet">

	</head>
	<body onunload="" class="claro patric ">
		

## PATRIC ID to product name

In [35]:
def get_representative_products(clstr_fasta_path:str)->pd.DataFrame:
    '''
    Takes a CD-HIT fasta output and returns a dataframe that matches each CDS PATRIC ID with its product name

    the fasta output is of this form:

    >fig|195.2024.peg.83 Putative oxidoreductase ferredoxin-type protein, clusters with CPO
    MNFSQISDACVKCGKCIPVCTIHEVNRDETTSPRGFLDLLAAYKEEKLELDKEAKKIFES
    L
    >fig|195.2024.peg.542 Septum-associated rare lipoprotein A
    MKPYTINGKTYYPTVVSVGETADGIASWYGPGFHGKKTSNGETYNQNGLTAAHKTLPMNT

    The aim is to match 195.2024.peg.542 with "Septum-associated rare lipoprotein A" in the dataframe.
    Sequence lines are ignored; only header lines (starting with ">") are parsed.

    param:
    ------
    - clstr_fasta_path: str, path of the fasta output

    return:
    -------
    - df: pd.DataFrame, df of columns gene_representative and product_name
          (product_name is '' for a header that carries no description)

    raises:
    -------
    - ValueError: if a header line is not of the form ">fig|GENOME.peg.N [product]"
    '''
    # Raw string so the backslash escapes reach `re` unchanged. The genome ID is matched
    # as "digits.digits" of any length so that species other than taxon 195 parse too,
    # and the product description is optional.
    header_pattern = re.compile(r">fig\|(\d+\.\d+\.peg\.\d+)(?:\s+(.*))?")

    products = {}

    with open(clstr_fasta_path, 'r') as fasta_file:
        # iterate lazily instead of loading the whole fasta into memory
        for line_number, line in enumerate(fasta_file, start=1):
            if not line.startswith(">"):
                continue

            header = header_pattern.fullmatch(line.rstrip())
            if header is None:
                raise ValueError(
                    f"{clstr_fasta_path}:{line_number}: unrecognised fasta header {line.rstrip()!r}"
                )
            products[header.group(1)] = header.group(2) or ''

    df = pd.DataFrame(list(products.items()), columns=['gene_representative', 'product_name'])

    return df

In [36]:
# PATRIC ID -> product name table parsed from the CD-HIT representative fasta; displayed below.
rep_prod_df=get_representative_products(clstr_fasta_path)
rep_prod_df

Unnamed: 0,gene_representative,product_name
0,195.2024.peg.575,Putative periplasmic protein
1,195.2024.peg.1455,"Fe2+ ABC transporter, permease protein 2"
2,195.2024.peg.1693,4-hydroxybenzoyl-CoA thioesterase family activ...
3,195.2024.peg.935,Cinnamyl alcohol dehydrogenase/reductase (EC 1...
4,195.2024.peg.24,Two-component transcriptional response regulat...
...,...,...
8016,195.2317.peg.807,"Beta-1,3-glucosyltransferase"
8017,195.2317.peg.1426,hypothetical protein
8018,195.2317.peg.1246,ATP synthase gamma chain (EC 3.6.3.14)
8019,195.2317.peg.1018,hypothetical protein


In [42]:
def combine_cluster_product(clstr_rep_df:pd.DataFrame, rep_prod_df:pd.DataFrame)->pd.DataFrame:
    '''
    Combines the cluster representative dataframe with the product name dataframe

    The two frames are left-joined on 'gene_representative', so every cluster is kept
    even when no product is known for its representative (product_name is then NaN).
    Neither input frame is modified.

    param:
    ------
    - clstr_rep_df: pd.DataFrame, dataframe with cluster number and gene representative (get_cluster_representatives() output)
    - rep_prod_df: pd.DataFrame, dataframe with gene representative and product name (get_representative_products() output)

    return:
    -------
    - df: pd.DataFrame, indexed by 'Cluster', with the columns of both inputs except 'gene_representative'
    '''
    # The merge discards the left index, so the cluster name must travel as a column.
    # Work on a copy (assign) rather than adding the column to the caller's frame.
    if 'Cluster' in clstr_rep_df.columns:
        left = clstr_rep_df
    else:
        left = clstr_rep_df.assign(Cluster=clstr_rep_df.index)

    merged = pd.merge(left, rep_prod_df, on='gene_representative', how='left')

    return merged.drop(columns='gene_representative').set_index('Cluster')

In [43]:
# Cluster -> product name, obtained by joining the representative IDs with the fasta annotations.
clstr_prod_df=combine_cluster_product(get_cluster_representatives(clstr_path), get_representative_products(clstr_fasta_path))
clstr_prod_df

Unnamed: 0_level_0,product_name
Cluster,Unnamed: 1_level_1
Cluster 0,"helicase, Snf2 family"
Cluster 1,hypothetical protein
Cluster 2,hypothetical protein
Cluster 3,hypothetical protein
Cluster 4,DNA-directed RNA polymerase beta' subunit (EC ...
...,...
Cluster 8016,Endonuclease I precursor (EC 3.1.21.1)
Cluster 8017,hypothetical protein
Cluster 8018,conserved hypothetical protein
Cluster 8019,hypothetical protein


In [45]:
# Persist the cluster -> product_name table.
# The cluster name lives in the index, so the index must be written out; with index=False the
# file would contain only product names and could no longer be joined back to clusters.
df=combine_cluster_product(get_cluster_representatives(clstr_path), get_representative_products(clstr_fasta_path))
df.to_csv('../data/cluster_descriptions/cluster_product.csv')

## cluster pan annotations

In [59]:
def get_cluster_pan_gene_class(pan_annot_path:str)->pd.DataFrame:
    '''
    Load the pangenome annotation CSV and keep only the cluster name and its gene class.

    The first CSV column is consumed as a row index. Of the remaining columns, the one at
    position 1 is dropped; exactly two columns must then be left, and they are renamed to
    'Cluster' and 'pan_gene_class'.

    param:
    ------
    - pan_annot_path: str, path to the pangenome annotation file ({species}_pangenome.csv from the pipeline)

    return:
    -------
    - pan_df: pd.DataFrame, index 'Cluster', single column 'pan_gene_class'
    '''
    with open(pan_annot_path, 'r') as handle:
        annotations = pd.read_csv(handle, index_col=0)

    # discard the middle column, then give the two survivors their canonical names
    unwanted = annotations.columns[[1]]
    trimmed = annotations.drop(columns=unwanted)
    trimmed.columns = ['Cluster', 'pan_gene_class']

    return trimmed.set_index('Cluster')

In [60]:
# Cluster -> pangenome gene class table read from `pan_annot_path`; displayed below.
clstr_gene_class_df=get_cluster_pan_gene_class(pan_annot_path)
clstr_gene_class_df

Unnamed: 0_level_0,pan_gene_class
Cluster,Unnamed: 1_level_1
Cluster 0,accessory
Cluster 1,accessory
Cluster 2,unique
Cluster 3,unique
Cluster 4,core
...,...
Cluster 8016,unique
Cluster 8017,unique
Cluster 8018,unique
Cluster 8019,unique


In [42]:
# Persist the cluster -> pan_gene_class table.
# The cluster name lives in the index, so the index must be written out; with index=False the
# file would contain only the class labels with no cluster key.
df=get_cluster_pan_gene_class(pan_annot_path)
df.to_csv('../data/cluster_descriptions/cluster_pan_gene_class.csv')

In [67]:
def get_cluster_attributes(clstr_path, freq_path, pan_annot_path, clstr_fasta_path):
    '''
    Assemble one per-cluster attribute table from the four pipeline output files.

    param:
    ------
    - clstr_path: str, CD-HIT .clstr file (read by get_cluster_representatives)
    - freq_path: str, cluster frequency CSV (read by get_cluster_frequency)
    - pan_annot_path: str, pangenome annotation CSV (read by get_cluster_pan_gene_class)
    - clstr_fasta_path: str, CD-HIT fasta file (read by get_representative_products)

    return:
    -------
    - df: pd.DataFrame, column-wise concatenation of the representative, gene-class,
          product and frequency frames, aligned on their index; any 'Cluster' column is removed
    '''
    representatives = get_cluster_representatives(clstr_path)
    gene_classes = get_cluster_pan_gene_class(pan_annot_path)
    products = combine_cluster_product(representatives, get_representative_products(clstr_fasta_path))
    frequencies = get_cluster_frequency(freq_path)

    combined = pd.concat([representatives, gene_classes, products, frequencies], axis=1)

    # the cluster name already lives in the index; drop a redundant 'Cluster' column if one came along
    if 'Cluster' not in combined.columns:
        return combined
    return combined.drop('Cluster', axis=1)



In [68]:
# Build and display the combined per-cluster attribute table for the four input paths.
get_cluster_attributes(clstr_path, freq_path, pan_annot_path, clstr_fasta_path)

Unnamed: 0_level_0,gene_representative,pan_gene_class,product_name,Frequency
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cluster 0,195.2029.peg.1780,accessory,"helicase, Snf2 family",103
Cluster 1,195.2024.peg.1611,accessory,hypothetical protein,239
Cluster 2,195.2136.peg.1150,unique,hypothetical protein,3
Cluster 3,195.2145.peg.407,unique,hypothetical protein,1
Cluster 4,195.2024.peg.685,core,DNA-directed RNA polymerase beta' subunit (EC ...,268
...,...,...,...,...
Cluster 8016,195.2201.peg.1720,unique,Endonuclease I precursor (EC 3.1.21.1),1
Cluster 8017,195.2201.peg.1791,unique,hypothetical protein,1
Cluster 8018,195.2210.peg.964,unique,conserved hypothetical protein,1
Cluster 8019,195.2230.peg.1815,unique,hypothetical protein,1


In [1]:
import pandas as pd
# Reload the saved cluster/product table from disk; the cells below read its 'product_name' column.
df=pd.read_csv("../data/cluster_descriptions/cluster_product.csv")

In [10]:
# number of rows in the product table
len(df["product_name"])

8021

In [4]:
# Tally how many clusters carry each product name.
# The tally is kept under its own name: a module-level variable called `dict` would shadow the
# builtin `dict` for every cell executed afterwards in this kernel.
product_counts={}
for product in df["product_name"]:
    product_counts[product]=product_counts.get(product, 0)+1

In [6]:
# how many product annotations mention 'hypothetical'
count = sum(1 for item in df['product_name'] if 'hypothetical' in item)
count

2860

In [21]:
# Presence/absence matrix; the first CSV column becomes the row index.
# presumably the rows are clusters (the next cell renames the index by cluster) - TODO confirm
presence_df=pd.read_csv('../data/presence_matrices/Campylobacter_coli_presence_absence_T.csv',index_col=0)

In [22]:
# Replace the cluster labels in presence_df's index by their matching product_name from df.
# NOTE(review): this requires `df` to have a column named 'cluster' (lower case); the frames
# built elsewhere in this notebook name the cluster index 'Cluster' - confirm the CSV schema.
my_dict=df.set_index('cluster').to_dict()['product_name']
# a bare expression in the middle of a cell is not displayed; only a cell's last expression is
my_dict

# rename mutates presence_df in place; index labels missing from my_dict are left unchanged
presence_df.rename(index=my_dict, inplace=True)

In [None]:
def set_node_attributes(G:nx.Graph, clstr_attribute_df):
    '''
    Set one node attribute on a networkx graph from a dataframe

    The attribute is taken from the column at position 1 of the dataframe and is named
    after that column. Node names of G must equal the values of the 'cluster' column
    for an attribute to be attached.

    param:
    ------
    - G: nx.Graph, networkx graph
    - clstr_attribute_df: pd.DataFrame, dataframe with a 'cluster' column and the attribute column at position 1

    return:
    -------
    - G: nx.Graph, the same graph object that was passed in, with the attribute set
    '''
    attribute=clstr_attribute_df.columns[1]

    # nx.set_node_attributes(G, values, name=...) expects `values` keyed by node.
    # DataFrame.to_dict() is keyed by column name, so select the single attribute column
    # first to obtain a {cluster: value} mapping.
    values_by_cluster=clstr_attribute_df.set_index('cluster')[attribute].to_dict()

    nx.set_node_attributes(G, values_by_cluster, name=attribute)
    return G