# Association

Computes association measures.  
Adds them as node attributes`

In [13]:
import os,sys
os.chdir(os.path.expanduser('~/capstone-project'))
sys.path.append('src')

import pandas as pd
import networkx as nx

from sklearn.metrics import mutual_info_score
from scipy.stats import chi2_contingency
# from scipy.stats import fisher_exact
from scipy.stats import f_oneway
from statsmodels.stats.multitest import multipletests 

def apply_FDR_correction(p_val_dict):
    '''
    takes a dict of pval as vals and returns a dict of corrected p vals based on FDR (benjamini-hochberg)

    param:
    -------
    - p_val_dict: dict

    return:
    -------
    - corrected_p_val_dict: dict
    '''
    
    p_values_list = list(p_val_dict.values())

    _, p_values_corrected, _, _ = multipletests(p_values_list, method='fdr_bh')
    corrected_p_val_dict = dict(zip(p_val_dict.keys(), p_values_corrected))

    return corrected_p_val_dict

def get_labeled_matrix(species,drug):
    '''
    Takes a species and a drug name, returns the labeled matrix of the species-drug combination having samples as rows and features as columns (last col is the labels)

    param:
    -------
    - species: str
    - drug: str

    return:
    -------
    - labeled_matrix: pd.DataFrame

    '''
    # ----------------- features -----------------
    X_df = pd.read_csv(f'data/presence_matrices/{species}_filtered_SxG.csv', index_col=0) 

    # ----------------- phenotypes -----------------
    pheno_df= pd.read_csv(f'data/processed_phenotypes/{species}_{drug}.csv', index_col=0)
    y_df=pheno_df
    y_df.index = y_df.index.astype('float')

    y_df = y_df.sort_index()

    y_indices=list(y_df.index)

    # ----------------- intersection to created a labeled matrix -----------------

    X_df = X_df.sort_index()
    y_df = y_df.sort_index()

    y_indices=list(y_df.index)
    X_indices=list(X_df.index)

    intersection = [i for i in y_indices if i in X_indices]
    y_df = y_df.loc[intersection]
    X_df = X_df.loc[intersection]

    X_df = X_df.sort_index()
    y_df = y_df.sort_index() # -- just making sure bcs im paranoid :)

    # X = X_df.values
    # y = y_df.values


    labeled_matrix = pd.concat([X_df, y_df], axis=1)
    return labeled_matrix

def compute_associations(labeled_matrix, gene_list=None):
    '''
    Takes a labeled matrix and returns the association of each feature with the label using:  
    * Mutual Information
    * Chi2
    * One way ANOVA

    It will only computes it for a specific set of featurs if gene_list is provided (useful for a network nodes)

    The output consists of 3 dictionaries, each containing the feature names as keys.
        - MI will have the MI score as values
        - Chi2 will have a 3 item tuple as values: (chi2 score, pval, corrected pval)
        - Anova will have a 3 item tuple as values: (f_statistic, pval, corrected pval)

    param:
    -------
    - labeled_matrix: pd.DataFrame
    - gene_list: list of str (optional)

    return:
    -------
    - mi_scores: dict
    - chi2_scores: dict
    - anova_scores: dict
    '''
    mi_dict={}
    chi2_scores={}; chi2_pvals={};  chi2_pvals_corrected={}; chi2_dict={}
    anova_scores={}; anova_pvals={}; anova_pvals_corrected={}; anova_dict={}

    # -- Mutual Information
    for col in labeled_matrix.columns[:-1]:
        mi_dict[col]=mutual_info_score(labeled_matrix[col], labeled_matrix['SIR'])

    # -- Chi2
    for col in labeled_matrix.columns[:-1]:

        contingency_table = pd.crosstab(labeled_matrix[col], labeled_matrix['SIR'])
        chi2, p, dof, expected = chi2_contingency(contingency_table)

        chi2_scores[col] = chi2
        chi2_pvals[col] = p
    chi2_pvals_corrected = apply_FDR_correction(chi2_pvals)

    for col in labeled_matrix.columns[:-1]:
        chi2_dict[col]=(chi2_scores[col], chi2_pvals[col], chi2_pvals_corrected[col])


    # -- one way ANOVA
    for col in labeled_matrix.columns[:-1]:
        f_statistic, p_value = f_oneway(labeled_matrix[col], labeled_matrix['SIR'])

        anova_scores[col] = f_statistic
        anova_pvals[col] = p_value
    anova_pvals_corrected = apply_FDR_correction(anova_pvals)
    
    for col in labeled_matrix.columns[:-1]:
        anova_dict[col]=(anova_scores[col], anova_pvals[col], anova_pvals_corrected[col])

    return mi_dict, chi2_dict, anova_dict


## _E. coli_ 


Drugs used are:
* streptomycin
* sulfamethoxazole
* tetracycline
* cefalothin
* trimethoprim
* amoxicillin
* ampicillin
* levofloxacin
* ciprofloxacin

In [18]:
ampicillin_anova_cor_pvals

{'Cluster 0': 1.5228569931452879e-77,
 'Cluster 2': 3.3699383522929477e-124,
 'Cluster 4': 6.852419559990417e-123,
 'Cluster 6': 4.1034143279091424e-128,
 'Cluster 9': 2.029025189561793e-130,
 'Cluster 10': 2.029025189561793e-130,
 'Cluster 15': 2.3962595296848923e-129,
 'Cluster 17': 2.3962595296848923e-129,
 'Cluster 18': 2.3962595296848923e-129,
 'Cluster 19': 0.0022541014165764063,
 'Cluster 20': 4.1034143279091424e-128,
 'Cluster 21': 1.3615097686259157e-121,
 'Cluster 23': 1.6620992703704172e-100,
 'Cluster 24': 8.19647949223771e-105,
 'Cluster 26': 2.3962595296848923e-129,
 'Cluster 27': 2.3962595296848923e-129,
 'Cluster 28': 1.646158055195459e-125,
 'Cluster 29': 1.646158055195459e-125,
 'Cluster 32': 1.92157554048705e-29,
 'Cluster 36': 2.3962595296848923e-129,
 'Cluster 37': 4.1034143279091424e-128,
 'Cluster 42': 2.029025189561793e-130,
 'Cluster 43': 2.029025189561793e-130,
 'Cluster 46': 6.852419559990417e-123,
 'Cluster 48': 1.3615097686259157e-121,
 'Cluster 49': 2.6500

In [16]:
labeled_matrix_amoxicillin = get_labeled_matrix('Escherichia_coli', 'amoxicillin')
amoxicillin_mi_dict, amoxicillin_chi2_dict, amoxicillin_anova_dict = compute_associations(labeled_matrix_amoxicillin)
amoxicillin_chi2_cor_pvals = {k:v[2] for k,v in amoxicillin_chi2_dict.items()}; amoxicillin_anova_cor_pvals = {k:v[2] for k,v in amoxicillin_anova_dict.items()}
G_amoxicillin = nx.read_graphml('results/Escherichia_coli_amoxicillin/Escherichia_coli_amoxicillin_LOR_annotated.graphml')
nx.set_node_attributes(G_amoxicillin, amoxicillin_mi_dict, 'MI')
nx.set_node_attributes(G_amoxicillin,amoxicillin_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_amoxicillin,amoxicillin_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_amoxicillin, 'results/Escherichia_coli_amoxicillin/Escherichia_coli_amoxicillin_LOR_assoc.graphml')

labeled_matrix_streptomycin = get_labeled_matrix('Escherichia_coli', 'streptomycin')
streptomycin_mi_dict, streptomycin_chi2_dict, streptomycin_anova_dict = compute_associations(labeled_matrix_streptomycin)
G_streptomycin = nx.read_graphml('results/Escherichia_coli_streptomycin/Escherichia_coli_streptomycin_LOR_annotated.graphml')
streptomycin_chi2_cor_pvals = {k:v[2] for k,v in streptomycin_chi2_dict.items()}; streptomycin_anova_cor_pvals = {k:v[2] for k,v in streptomycin_anova_dict.items()}
nx.set_node_attributes(G_streptomycin, streptomycin_mi_dict, 'MI')
nx.set_node_attributes(G_streptomycin,streptomycin_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_streptomycin,streptomycin_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_streptomycin, 'results/Escherichia_coli_streptomycin/Escherichia_coli_streptomycin_LOR_assoc.graphml')


labeled_matrix_sulfamethoxazole = get_labeled_matrix('Escherichia_coli', 'sulfamethoxazole')
sulfamethoxazole_mi_dict, sulfamethoxazole_chi2_dict, sulfamethoxazole_anova_dict = compute_associations(labeled_matrix_sulfamethoxazole)
sulfamethoxazole_chi2_cor_pvals = {k:v[2] for k,v in sulfamethoxazole_chi2_dict.items()}; sulfamethoxazole_anova_cor_pvals = {k:v[2] for k,v in sulfamethoxazole_anova_dict.items()}
G_sulfamethoxazole = nx.read_graphml('results/Escherichia_coli_sulfamethoxazole/Escherichia_coli_sulfamethoxazole_LOR_annotated.graphml')
nx.set_node_attributes(G_sulfamethoxazole, sulfamethoxazole_mi_dict, 'MI')
nx.set_node_attributes(G_sulfamethoxazole,sulfamethoxazole_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_sulfamethoxazole,sulfamethoxazole_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_sulfamethoxazole, 'results/Escherichia_coli_sulfamethoxazole/Escherichia_coli_sulfamethoxazole_LOR_assoc.graphml')

labeled_matrix_tetracycline = get_labeled_matrix('Escherichia_coli', 'tetracycline')
tetracycline_mi_dict, tetracycline_chi2_dict, tetracycline_anova_dict = compute_associations(labeled_matrix_tetracycline)
tetracycline_chi2_cor_pvals = {k:v[2] for k,v in tetracycline_chi2_dict.items()}; tetracycline_anova_cor_pvals = {k:v[2] for k,v in tetracycline_anova_dict.items()}
G_tetracycline = nx.read_graphml('results/Escherichia_coli_tetracycline/Escherichia_coli_tetracycline_LOR_annotated.graphml')
nx.set_node_attributes(G_tetracycline, tetracycline_mi_dict, 'MI')
nx.set_node_attributes(G_tetracycline,tetracycline_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_tetracycline,tetracycline_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_tetracycline, 'results/Escherichia_coli_tetracycline/Escherichia_coli_tetracycline_LOR_assoc.graphml')

labeled_matrix_cefalothin = get_labeled_matrix('Escherichia_coli', 'cefalothin')
cefalothin_mi_dict, cefalothin_chi2_dict, cefalothin_anova_dict = compute_associations(labeled_matrix_cefalothin)
cefalothin_chi2_cor_pvals = {k:v[2] for k,v in cefalothin_chi2_dict.items()}; cefalothin_anova_cor_pvals = {k:v[2] for k,v in cefalothin_anova_dict.items()}
G_cefalothin = nx.read_graphml('results/Escherichia_coli_cefalothin/Escherichia_coli_cefalothin_LOR_annotated.graphml')
nx.set_node_attributes(G_cefalothin, cefalothin_mi_dict, 'MI')
nx.set_node_attributes(G_cefalothin,cefalothin_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_cefalothin,cefalothin_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_cefalothin, 'results/Escherichia_coli_cefalothin/Escherichia_coli_cefalothin_LOR_assoc.graphml')

labeled_matrix_trimethoprim = get_labeled_matrix('Escherichia_coli', 'trimethoprim')
trimethoprim_mi_dict, trimethoprim_chi2_dict, trimethoprim_anova_dict = compute_associations(labeled_matrix_trimethoprim)
trimethoprim_chi2_cor_pvals = {k:v[2] for k,v in trimethoprim_chi2_dict.items()}; trimethoprim_anova_cor_pvals = {k:v[2] for k,v in trimethoprim_anova_dict.items()}
G_trimethoprim = nx.read_graphml('results/Escherichia_coli_trimethoprim/Escherichia_coli_trimethoprim_LOR_annotated.graphml')
nx.set_node_attributes(G_trimethoprim, trimethoprim_mi_dict, 'MI')
nx.set_node_attributes(G_trimethoprim,trimethoprim_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_trimethoprim,trimethoprim_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_trimethoprim, 'results/Escherichia_coli_trimethoprim/Escherichia_coli_trimethoprim_LOR_assoc.graphml')

labeled_matrix_ampicillin = get_labeled_matrix('Escherichia_coli', 'ampicillin')
ampicillin_mi_dict, ampicillin_chi2_dict, ampicillin_anova_dict = compute_associations(labeled_matrix_ampicillin)
ampicillin_chi2_cor_pvals = {k:v[2] for k,v in ampicillin_chi2_dict.items()}; ampicillin_anova_cor_pvals = {k:v[2] for k,v in ampicillin_anova_dict.items()}
G_ampicillin = nx.read_graphml('results/Escherichia_coli_ampicillin/Escherichia_coli_ampicillin_LOR_annotated.graphml')
nx.set_node_attributes(G_ampicillin, ampicillin_mi_dict, 'MI')
nx.set_node_attributes(G_ampicillin,ampicillin_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_ampicillin,ampicillin_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_ampicillin, 'results/Escherichia_coli_ampicillin/Escherichia_coli_ampicillin_LOR_assoc.graphml')

labeled_matrix_levofloxacin = get_labeled_matrix('Escherichia_coli', 'levofloxacin')
levofloxacin_mi_dict, levofloxacin_chi2_dict, levofloxacin_anova_dict = compute_associations(labeled_matrix_levofloxacin)
levofloxacin_chi2_cor_pvals = {k:v[2] for k,v in levofloxacin_chi2_dict.items()}; levofloxacin_anova_cor_pvals = {k:v[2] for k,v in levofloxacin_anova_dict.items()}
G_levofloxacin = nx.read_graphml('results/Escherichia_coli_levofloxacin/Escherichia_coli_levofloxacin_LOR_annotated.graphml')
nx.set_node_attributes(G_levofloxacin, levofloxacin_mi_dict, 'MI')
nx.set_node_attributes(G_levofloxacin,levofloxacin_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_levofloxacin,levofloxacin_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_levofloxacin, 'results/Escherichia_coli_levofloxacin/Escherichia_coli_levofloxacin_LOR_assoc.graphml')


labeled_matrix_ciprofloxacin = get_labeled_matrix('Escherichia_coli', 'ciprofloxacin')
ciprofloxacin_mi_dict, ciprofloxacin_chi2_dict, ciprofloxacin_anova_dict = compute_associations(labeled_matrix_ciprofloxacin)
ciprofloxacin_chi2_cor_pvals = {k:v[2] for k,v in ciprofloxacin_chi2_dict.items()}; ciprofloxacin_anova_cor_pvals = {k:v[2] for k,v in ciprofloxacin_anova_dict.items()}
G_ciprofloxacin = nx.read_graphml('results/Escherichia_coli_ciprofloxacin/Escherichia_coli_ciprofloxacin_LOR_annotated.graphml')
nx.set_node_attributes(G_ciprofloxacin, ciprofloxacin_mi_dict, 'MI')
nx.set_node_attributes(G_ciprofloxacin,ciprofloxacin_chi2_cor_pvals , 'Chi2_corrected_pval')
nx.set_node_attributes(G_ciprofloxacin,ciprofloxacin_anova_cor_pvals , 'Anova_corrected_pval')
nx.write_graphml(G_ciprofloxacin, 'results/Escherichia_coli_ciprofloxacin/Escherichia_coli_ciprofloxacin_LOR_assoc.graphml')


In [15]:
drugs = ['streptomycin', 
         'sulfamethoxazole', 
         'tetracycline', 
         'cefalothin', 
         'trimethoprim', 
         'amoxicillin', 
         'ampicillin', 
         'levofloxacin', 
         'ciprofloxacin']

species = 'Escherichia_coli'

for drug in drugs:
    labeled_matrix=get_labeled_matrix(species, drug)
    mi_dict, chi2_dict, anova_dict = compute_associations(labeled_matrix)

    mi_df = pd.DataFrame.from_dict(mi_dict, orient='index', columns=['MI'])
    chi2_df = pd.DataFrame.from_dict(chi2_dict, orient='index', columns=['chi2', 'chi2_pval', 'corrected_pval'])
    anova_df = pd.DataFrame.from_dict(anova_dict, orient='index', columns=['f_statistic', 'f_pval', 'f_corrected_pval'])

    df=pd.concat([mi_df, chi2_df, anova_df], axis=1)
    df.to_csv(f'results/{species}_{drug}/{species}_{drug}_associations.csv')
