# _E. coli_ ampicillin analysis

This notebook aims to analyze the ampicillin resistance in _E. coli_ bacteria.

* import modules for cluster, data, and network analysis
* load the data
* perform pairwise associations
* perform ML approaches
* construct the network 
* analyze the network

## Essentials

In [1]:
import ast # --for string to list conversion
import os
import sys
from IPython.display import display
import networkx as nx
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.utils import resample
from sklearn.metrics import mutual_info_score

from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact
from scipy.stats import f_oneway
from statsmodels.stats.multitest import multipletests 

os.chdir(os.path.expanduser('~/capstone-project'))
sys.path.append('src')

import cluster_analysis
import gene_associations
import network_construction
import network_analysis
import cluster_analysis

# -- organism / antibiotic pair used to build every data and result path below
species = 'Escherichia_coli'
drug = 'ampicillin'


def apply_FDR_correction(p_val_dict):
    '''
    takes a dict of pval as vals and returns a dict of corrected p vals based on FDR (benjamini-hochberg)

    param:
    -------
    - p_val_dict: dict mapping any hashable key to its raw p value

    return:
    -------
    - corrected_p_val_dict: dict with the same keys (same order) mapped to the corrected p values;
      an empty dict is returned for an empty input
    '''

    # nothing to correct: return early instead of handing an empty array to multipletests
    if not p_val_dict:
        return {}

    # snapshot keys once so keys and values are guaranteed to stay paired
    keys = list(p_val_dict)
    p_values_list = [p_val_dict[key] for key in keys]

    # only the second returned item (the corrected p values) is needed here
    _, p_values_corrected, _, _ = multipletests(p_values_list, method='fdr_bh')

    return dict(zip(keys, p_values_corrected))

def flip_dict(d):
    '''
    swaps the keys and the values of a dict

    when several keys share one value, the key met last while iterating is the one kept
    '''
    return dict(zip(d.values(), d.keys()))

## Loading data

- presence absence data (filtered SxG)
- processed phenotypic readings
- ARGs for the drug

In [2]:
# ----------------- paths -----------------
pheno_path = f'data/processed_phenotypes/{species}_{drug}.csv'
presence_path = f'data/presence_matrices/{species}_filtered_GxS.csv' # GxS matrix, needed later on for the resistance LORs
presence_df = pd.read_csv(presence_path, index_col=0)

# ----------------- cluster info -----------------
cluster_info_path = f'data/clusters/{species}_cluster_info_filtered.csv'
cluster_info_df = pd.read_csv(cluster_info_path, index_col=0)

# ----------------- LOR of resistance -----------------
lor_resistance_scores = cluster_analysis.get_cluster_resistance_LOR(presence_df, pheno_path)
# -- only the 'log_odds' mapping is kept; it is attached to the graph nodes as an attribute later on
lor_resistance_scores_dict = lor_resistance_scores.to_dict()['log_odds']

# ----------------- ARGs -----------------
ARG_path = f'data/ARG/{species}_ARG_sp_drugs_products.csv'
ARG_df = pd.read_csv(ARG_path, index_col=0)
# the drug's row holds the product list saved as a "['a','b']" string, wrapped by tolist() in a one item list
drug_ARG_row = ARG_df.loc[drug].tolist()
ARG_products_list = ast.literal_eval(drug_ARG_row[0])

# ----------------- SxG presence -----------------
SxG_path = f'data/presence_matrices/{species}_filtered_SxG.csv'
X_df = pd.read_csv(SxG_path, index_col=0)

features = X_df.columns.tolist()
products_list = list(map(cluster_analysis.transform_cluster_to_product, features))
# -- a product is kept when its name minus the last two characters is one of the drug's ARG products
ARG_products_list_in_pan = [product for product in products_list if product[:-2] in ARG_products_list]

# ----------------- phenotypes -----------------
pheno_df = pd.read_csv(pheno_path, index_col=0) # same file as the one handed to the LOR computation
y_df = pheno_df # NOTE: alias, not a copy -- the index cast below is applied to pheno_df as well
y_df.index = y_df.index.astype('float')

# ----------------- intersection to create a labeled matrix -----------------

X_df = X_df.sort_index()
y_df = y_df.sort_index()

y_indices = list(y_df.index)
X_indices = list(X_df.index)

# set lookup instead of scanning the X_indices list once per phenotype row
X_index_lookup = set(X_indices)
intersection = [i for i in y_indices if i in X_index_lookup]
y_df = y_df.loc[intersection]
X_df = X_df.loc[intersection]

X_df = X_df.sort_index()
y_df = y_df.sort_index() # -- both frames re-sorted so their rows line up

X = X_df.values
y = y_df.values

# -- features first, phenotype column(s) last
labeled_matrix = pd.concat([X_df, y_df], axis=1)
labeled_matrix.shape

  result = getattr(ufunc, method)(*inputs, **kwargs)


(506, 18876)

## Pairwise associations

### Stats
- Mutual information
- $\chi^2$ test
- one-way ANOVA

### ML
- ensemble of 200 linear SVMs with an L1 (lasso) penalty

In [3]:
fisher_scores = {} # -- never filled in this cell; kept so the name stays defined

feature_cols = labeled_matrix.columns[:-1] # -- every column except the last one
sir = labeled_matrix['SIR']

# -- Mutual Information (ranked from the highest score down)
mi_scores = {col: mutual_info_score(labeled_matrix[col], sir) for col in feature_cols}
mi_ranked = sorted(mi_scores, key=mi_scores.get, reverse=True)

# -- Chi2
chi2_scores = {}; chi2_pvals = {}
for col in feature_cols:
    contingency_table = pd.crosstab(labeled_matrix[col], sir)
    chi2, p, _, _ = chi2_contingency(contingency_table)

    chi2_scores[col] = chi2
    chi2_pvals[col] = p
chi2_pvals_corrected = apply_FDR_correction(chi2_pvals)
# ranked on the smallest corrected pval first: the sort key has to be the p values themselves,
# sorting on the chi2 statistic in ascending order would put the weakest associations first
chi2_ranked = sorted(chi2_pvals_corrected, key=chi2_pvals_corrected.get)

# -- one way ANOVA
# f_oneway expects one sample per group, so each feature is split by SIR class
# (passing the feature and the label as the two samples would only compare their means)
class_masks = [(sir == label).to_numpy() for label in sir.dropna().unique()]
anova_scores = {}; anova_pvals = {}
for col in feature_cols:
    values = labeled_matrix[col].to_numpy()
    f_statistic, p_value = f_oneway(*(values[mask] for mask in class_masks))
    # a NaN p value (e.g. a feature with the same value in every strain) is not a usable input
    # for the FDR correction; it is treated as "no evidence of association"
    if np.isnan(p_value):
        p_value = 1.0

    anova_scores[col] = f_statistic
    anova_pvals[col] = p_value
anova_pvals_corrected = apply_FDR_correction(anova_pvals)
anova_ranked = sorted(anova_pvals_corrected, key=anova_pvals_corrected.get) # pval ranking as well

In [4]:
# -- model training
n_models = 200
n_samples = int(0.8 * len(X)) # bootstrap on 80% of the data

models = []
for i in range(n_models):
    # the model index seeds both the bootstrap draw and the SGD shuffling,
    # so re-running the cell gives the same 200 models
    X_boot, y_boot = resample(X, y, n_samples=n_samples, random_state=i)
    model = SGDClassifier(loss='hinge', penalty='l1', max_iter=1000, tol=1e-3, random_state=i)
    model.fit(X_boot, y_boot.ravel())
    models.append(model)

# -- features as rows, one column of coefficients per model
weights = np.column_stack([m.coef_[0] for m in models])

avg_coef = np.mean(weights, axis=1)
avg_of_avg = np.mean(np.abs(avg_coef)) # -- used later as the node selection threshold
features = X_df.columns.tolist()

avg_coef_dict = dict(zip(features, avg_coef))

weights_df = pd.DataFrame(weights, index=features)
weights_df.columns = [f'model_{i+1}' for i in range(n_models)]

print(f'-- avg of the absolute value of the feature s coef avgs is {avg_of_avg}')

# ------------- computing the SVM features scores -------------

# -- same object as avg_coef_dict at this point (the (signed) average coefficient of each feature)
SVM_scores = avg_coef_dict

#### test

In [None]:
# -- MI of every feature column (all but the last one) against the SIR column
sir_labels = labeled_matrix['SIR']
mi_scores = {
    cluster: mutual_info_score(labeled_matrix[cluster], sir_labels)
    for cluster in labeled_matrix.columns[:-1]
}

mi_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(mi_scores, ARG_products_list, n=1000)
mi_ranked_ARGS

In [None]:
# -- chi2 test of every feature column (all but the last one) against the SIR column
chi2_scores = {}
p_values = {}
sir_labels = labeled_matrix['SIR']
for cluster in labeled_matrix.columns[:-1]:
    table = pd.crosstab(labeled_matrix[cluster], sir_labels)
    statistic, pval, _, _ = chi2_contingency(table)
    chi2_scores[cluster] = statistic
    p_values[cluster] = pval

# -- benjamini hochberg correction
p_values_corrected_dict = apply_FDR_correction(p_values)

# -- p values are passed with sort_reverse=False
chi2_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(p_values_corrected_dict, ARG_products_list, n=1000, sort_reverse=False)
chi2_ranked_ARGS

In [None]:
anova_scores = {}; p_values = {}

# f_oneway expects one sample per group, so each feature is split by SIR class
# (passing the feature and the label as the two samples would only compare their means)
sir = labeled_matrix['SIR']
class_masks = [(sir == label).to_numpy() for label in sir.dropna().unique()]

for col in labeled_matrix.columns[:-1]:
    values = labeled_matrix[col].to_numpy()
    f_statistic, p_value = f_oneway(*(values[mask] for mask in class_masks))
    # a NaN p value (e.g. a feature with the same value in every strain) is not a usable input
    # for the FDR correction; it is treated as "no evidence of association"
    if np.isnan(p_value):
        p_value = 1.0

    anova_scores[col] = f_statistic
    p_values[col] = p_value

# -- benjamini hochberg correction
p_values_corrected_dict = apply_FDR_correction(p_values)

anova_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(p_values_corrected_dict, ARG_products_list, n=1000, sort_reverse=False)
anova_ranked_ARGS

In [None]:
# -- recompute the resistance LOR from the GxS presence matrix and the phenotype file, keep its 'log_odds' mapping
lor_resistance_scores = cluster_analysis.get_cluster_resistance_LOR(presence_df, pheno_path)
lor_resistance_scores_dict =lor_resistance_scores.to_dict()['log_odds']

# -- query the ARGs against the LOR scores
lor_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(lor_resistance_scores_dict, ARG_products_list, n =1000)
lor_ranked_ARGS

## Network construction

In [None]:
# -- keep the clusters whose |avg coef| is strictly above the mean of the |avg coef| values
strong_coefs = {}
for cluster, coef in avg_coef_dict.items():
    if abs(coef) > avg_of_avg:
        strong_coefs[cluster] = coef
avg_coef_dict = strong_coefs # -- rebinds the name: from here on it only holds the selected nodes
print(f'-- number of genes having abs avg coef higher than {avg_of_avg} is {len(avg_coef_dict)} (will be taken as nodes)')

print('-- querying for ARGs amongs these nodes:')
n_selected = len(avg_coef_dict)
temp_ARG = gene_associations.get_ranked_ARGs_from_association(avg_coef_dict, ARG_products_list, n=n_selected)
temp_ARG

In [None]:
# -- keeping important features (those with avg coef higher than 1)

# avg_coef_dict_filtered = {k: v for k, v in avg_coef_dict.items() if abs(v) > 1}
# print(f'-- number of genes having abs avg coef higher than 1 is {len(avg_coef_dict_filtered)} (will be taken as nodes)')
# print('-- querying for ARGs amongs these nodes:')
# temp_ARG = gene_associations.get_ranked_ARGs_from_association(avg_coef_dict_filtered, ARG_products_list, n=len(avg_coef_dict_filtered))
# temp_ARG

In [None]:
# NOTE: this cell rebinds weights_df (filtered + transposed), so it cannot be re-run
# without re-running the model training cell first
node_clusters = list(avg_coef_dict.keys())
weights_df = weights_df.loc[node_clusters]
weights_df = weights_df.T #-- genes as cols

# -- gene x gene correlation of the coefficients across the models
corr_SVM = weights_df.corr()

t = 0.5 # -- an edge needs |correlation| strictly above this
SVM_G = nx.Graph()

corr_values = corr_SVM.to_numpy()
genes = corr_SVM.index

# strict upper triangle: every unordered gene pair once, no self pairs;
# a NaN correlation never passes the comparison, so it gives no edge
above_threshold = np.triu(np.abs(corr_values) > t, k=1)
rows, cols = np.nonzero(above_threshold)

# -- (gene_a, gene_b, correlation) triples, in row-major order
gene_pairs_corr = [(genes[i], genes[j], corr_values[i, j]) for i, j in zip(rows, cols)]
SVM_G.add_weighted_edges_from(gene_pairs_corr) # -- stored under the 'weight' edge attribute

nx.write_graphml(SVM_G,f'data/temp/{species}_{drug}_SVM_{t}.graphml')
del corr_SVM, corr_values # (to save some space)

In [None]:
# -- run the project's degree distribution and log-log helpers on the SVM correlation graph
network_analysis.plot_degree_distribution(SVM_G)
network_analysis.plot_log_log(SVM_G)

In [None]:
# -- breakpoint, run this if wanna proceed construction from middle
t=0.5
SVM_G = nx.read_graphml(f'data/temp/{species}_{drug}_SVM_{t}.graphml')
# keep the (gene_a, gene_b, correlation) triple layout built by the construction cell,
# so the downstream cells get the same structure whichever way they are reached
gene_pairs_corr = list(SVM_G.edges(data='weight'))

In [None]:
# -- hand the phenotype path to network_construction, then run its co-occurrence LOR helper on the correlated gene pairs
network_construction.set_pheno_path(pheno_path)
LOR_coocc=network_construction.compute_cooccurence_LOR(gene_pairs_corr)

In [None]:
# -- build the network from the co-occurrence LOR and save it
G = network_construction.construct_network(LOR_coocc)
# results folder derived from species/drug, like the csv outputs written further down
nx.write_graphml(G,f'results/{species}_{drug}/{species}_{drug}_LOR.graphml')

## Analysis

In [None]:
# G = nx.read_graphml(f'results/{species}_{drug}/{species}_{drug}_LOR_0.5.graphml')

# -- attach the cluster info and the resistance LOR to the nodes, then save the annotated graph
G = cluster_analysis.set_node_attributes_by_cluster(cluster_info_df, G)
nx.set_node_attributes(G, lor_resistance_scores_dict, f'log_odds_{drug}_resistance')
# results folder derived from species/drug, like the csv outputs written further down
nx.write_graphml(G,f'results/{species}_{drug}/{species}_{drug}_LOR_annotated.graphml')
nodes = list(G.nodes) # -- reused by the cells below (n=len(nodes))

network_analysis.plot_degree_distribution(G)
network_analysis.plot_log_log(G)
stats = network_analysis.compute_network_stats(G, 'LOR co-occurence for nodes above avg_avg_coef')

stats

### Scores

In [None]:
# -- ARG ranking under each pairwise score (the corrected p values are passed with sort_reverse=False);
# -- every ranking is flipped right away to get the ARGs as keys
ranked_mi = flip_dict(gene_associations.get_ranked_ARGs_from_association(mi_scores, ARG_products_list, n=1000))
ranked_chi2 = flip_dict(gene_associations.get_ranked_ARGs_from_association(chi2_pvals_corrected, ARG_products_list, n=1000, sort_reverse=False))
ranked_anova = flip_dict(gene_associations.get_ranked_ARGs_from_association(anova_pvals_corrected, ARG_products_list, n=1000, sort_reverse=False))
ranked_svm = flip_dict(gene_associations.get_ranked_ARGs_from_association(SVM_scores, ARG_products_list, n=1000))

# -- one row per ARG product found in the pangenome, one column per score
index = ARG_products_list_in_pan
pairwise_df = pd.DataFrame(index=index)
pairwise_columns = [
    ('mi', ranked_mi),
    ('chi2', ranked_chi2),
    ('anova', ranked_anova),
    ('svm', ranked_svm),
]
for column, ranking in pairwise_columns:
    pairwise_df[column] = pairwise_df.index.map(ranking)

pairwise_df.to_csv(f'results/{species}_{drug}/{species}_{drug}_pairwise_ARGs.csv')
pairwise_df

In [None]:
# -- node level measures on the LOR network
# NOTE(review): the weighted betweenness / closeness use 'weight' as a path length -- confirm that
# the LOR edge weights are suitable for that (e.g. non-negative)
bet_cen = nx.betweenness_centrality(G)
weighted_bet_cen = nx.betweenness_centrality(G, weight='weight')
close_cen = nx.closeness_centrality(G)
weighted_close_cen = nx.closeness_centrality(G, distance='weight')
eigen_cen = nx.eigenvector_centrality(G)
page_rank = nx.pagerank(G)
degree_cen = nx.degree_centrality(G)
# weighted degree = sum of the weights of the edges of a node
# (nx.degree_centrality takes no weight, it would just repeat degree_cen)
weighted_degree_cen = dict(G.degree(weight='weight'))

clustering_coef = nx.clustering(G)


def _rank_ARGs(scores):
    # query the ARG ranking for the given node scores, flipped to get the ARGs as keys
    return flip_dict(gene_associations.get_ranked_ARGs_from_association(scores, ARG_products_list, n=1000))


ranked_bet_cen = _rank_ARGs(bet_cen)
ranked_weighted_bet_cen = _rank_ARGs(weighted_bet_cen)
ranked_close_cen = _rank_ARGs(close_cen)
ranked_weighted_close_cen = _rank_ARGs(weighted_close_cen)
ranked_eigen_cen = _rank_ARGs(eigen_cen)
ranked_page_rank = _rank_ARGs(page_rank)
ranked_degree_cen = _rank_ARGs(degree_cen)
ranked_weighted_degree_cen = _rank_ARGs(weighted_degree_cen)
ranked_clustering_coef = _rank_ARGs(clustering_coef)

# -- one row per ARG product found in the pangenome, one column per measure
index = ARG_products_list_in_pan
net_df = pd.DataFrame(index=index)

net_columns = [
    ('betweenness', ranked_bet_cen),
    ('weighted_betweenness', ranked_weighted_bet_cen),
    ('closeness', ranked_close_cen),
    ('weighted_closeness', ranked_weighted_close_cen),
    ('eigenvector', ranked_eigen_cen),
    ('page_rank', ranked_page_rank),
    ('degree', ranked_degree_cen),
    ('weighted_degree', ranked_weighted_degree_cen),
    ('clustering_coef', ranked_clustering_coef),
]
for column, ranking in net_columns:
    net_df[column] = net_df.index.map(ranking)

net_df.to_csv(f'results/{species}_{drug}/{species}_{drug}_network_centralities_ARGs.csv')
net_df

In [None]:
# -- dump every flipped ranking, pairwise scores first, then the network measures
all_rankings = (
    ranked_mi,
    ranked_chi2,
    ranked_anova,
    ranked_svm,
    ranked_bet_cen,
    ranked_weighted_bet_cen,
    ranked_close_cen,
    ranked_weighted_close_cen,
    ranked_eigen_cen,
    ranked_page_rank,
    ranked_degree_cen,
    ranked_weighted_degree_cen,
    ranked_clustering_coef,
)
for ranking in all_rankings:
    print(ranking)

In [None]:
# -- pairwise ranks and network ranks side by side (column-wise concat), saved as one table
df = pd.concat([pairwise_df, net_df], axis=1)
df.to_csv(f'results/{species}_{drug}/{species}_{drug}_ARGs_all.csv')
df

#### test

In [None]:
# --betweenness (with the 'weight' edge attribute), ARG query over all the nodes
bet = nx.betweenness_centrality(G, weight='weight')
gene_associations.get_ranked_ARGs_from_association(bet, ARG_products_list, n=len(nodes))

In [None]:
# --closeness (the 'weight' edge attribute is used as the distance), ARG query over all the nodes
clo = nx.closeness_centrality(G, distance='weight')
gene_associations.get_ranked_ARGs_from_association(clo, ARG_products_list, n=len(nodes))

In [None]:
# --weighted degree (degree computed with the 'weight' edge attribute), ARG query over all the nodes
weightd_deg = dict(G.degree(weight='weight'))
gene_associations.get_ranked_ARGs_from_association(weightd_deg, ARG_products_list, n=len(nodes))

In [None]:
# --eigenvector centrality (with the 'weight' edge attribute), ARG query over all the nodes
eigen = nx.eigenvector_centrality(G, weight='weight')
gene_associations.get_ranked_ARGs_from_association(eigen, ARG_products_list, n=len(nodes))

In [None]:
# -- store the resistance log odds on the nodes under the 'log_odds_<drug>_resistance' attribute
nx.set_node_attributes(G, lor_resistance_scores_dict, f'log_odds_{drug}_resistance')

In [None]:
# -- show every node together with its attributes
G.nodes(data=True)

In [None]:
# -- check which class G is
type(G)

In [None]:
# -- reload the cluster info table and attach it to the nodes through the cluster_analysis helper
import cluster_analysis
cluster_info_df = pd.read_csv(f'data/clusters/{species}_cluster_info_filtered.csv', index_col=0)
G = cluster_analysis.set_node_attributes_by_cluster(cluster_info_df, G)

In [None]:
# -- show the nodes and their attributes again, after the cluster info was attached
G.nodes(data=True)

In [None]:
# -- save the annotated graph; results folder derived from species/drug, like the csv outputs
nx.write_graphml(G,f'results/{species}_{drug}/{species}_{drug}_LOR_annotated.graphml')

### Clustering

In [None]:
# -- Louvain communities of G, using the 'weight' edge attribute
# NOTE(review): no seed is passed -- the partition may change between runs; confirm whether that matters
louvain_com = nx.algorithms.community.louvain_communities(G, weight='weight')


In [None]:
# check log_odds distribution of the communities
# louvain_com is a list of node sets: the sets are iterated directly and numbered with enumerate
# (a community cannot be used as an index into the list)
lor_attribute = f'log_odds_{drug}_resistance'
for community_id, community in enumerate(louvain_com):
    # nodes that carry no log odds attribute are left out instead of raising a KeyError
    com_nodes_lor = [G.nodes[node][lor_attribute] for node in community if lor_attribute in G.nodes[node]]
    print(f'community {community_id} log_odds distribution:')
    # summary of the collected values (count, mean, std, quartiles, min / max)
    print(pd.Series(com_nodes_lor, dtype=float).describe())

