# _E. coli_ terracycline analysis

This notebook aims to analyze the terracycline resistance in _E. coli_ bacteria.

* import modules for cluster, data, and network analysis
* load the data
* perform pairwise associations
* perform ML approaches
* construct the network 
* analyze the network

## Essentials

In [1]:
import ast # --for string to list conversion
import os
import sys
from IPython.display import display
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.utils import resample
from sklearn.metrics import mutual_info_score


os.chdir(os.path.expanduser('~/capstone-project'))
sys.path.append('src')

import cluster_analysis
import gene_associations
import network_construction
import network_analysis

species='Escherichia_coli'; drug = 'tetracycline'

## Loading data

- presence absence data (filtered SxG)
- processed phenotypic readings
- ARGs for the dfrug

In [2]:
presence_path = f'data/presence_matrices/{species}_filtered_GxS.csv' #needed later on for resistance lors
presence_df = pd.read_csv(presence_path, index_col=0)
pheno_path = f'data/processed_phenotypes/{species}_{drug}.csv' 

# ----------------- ARGs -----------------
ARG_df = pd.read_csv(f'data/ARG/{species}_ARG_sp_drugs_products.csv', index_col=0)
ARG_products_list = ast.literal_eval(ARG_df.loc[drug].tolist()[0]) #bcs its saved as a '['a','b']' str instead of a list, and tolist() puts this str in a one item list

# ----------------- SxG presence -----------------
X_df = pd.read_csv(f'data/presence_matrices/{species}_filtered_SxG.csv', index_col=0) 

# ----------------- phenotypes -----------------
pheno_df= pd.read_csv(f'data/processed_phenotypes/{species}_{drug}.csv', index_col=0)
y_df=pheno_df
y_df.index = y_df.index.astype('float')

y_df = y_df.sort_index()

y_indices=list(y_df.index)

# ----------------- intersection to created a labeled matrix -----------------

X_df = X_df.sort_index()
y_df = y_df.sort_index()

y_indices=list(y_df.index)
X_indices=list(X_df.index)

intersection = [i for i in y_indices if i in X_indices]
y_df = y_df.loc[intersection]
X_df = X_df.loc[intersection]

X_df = X_df.sort_index()
y_df = y_df.sort_index() # -- just making sure bcs im paranoid :)

X = X_df.values
y = y_df.values


labeled_matrix = pd.concat([X_df, y_df], axis=1)
labeled_matrix.shape

(225, 18876)

## Pairwise associations

In [3]:
mi_scores = {}
for col in labeled_matrix.columns[:-1]:
    mi_scores[col]=mutual_info_score(labeled_matrix[col], labeled_matrix['SIR'])

mi_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(mi_scores, ARG_products_list, n=1000)
mi_ranked_ARGS

{1: 'Tetracycline resistance, MFS efflux pump => Tet(A)_6',
 7: 'Tetracycline resistance, MFS efflux pump => Tet(B)_1',
 175: 'Transcriptional regulator of acrAB operon, AcrR_13'}

In [10]:
lor_resistance_scores = cluster_analysis.get_cluster_resistance_LOR(presence_df, pheno_path)
lor_resistance_scores_dict =lor_resistance_scores.to_dict()['log_odds']

lor_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(lor_resistance_scores_dict, ARG_products_list, n =1000)
lor_ranked_ARGS

  result = getattr(ufunc, method)(*inputs, **kwargs)


{407: 'iron aquisition outermembrane yersiniabactin receptor (FyuA,Psn,pesticin receptor) @ Outer membrane receptor for ferric siderophore_2',
 998: 'Multidrug efflux system AcrAB-TolC, inner-membrane proton/drug antiporter AcrB (RND type)_1'}

## Feature selection

### SVM

In [4]:
# -- model training
n_models = 200
n_samples = int(0.8 * len(X)) # boostrap on 80% of the data

models=[]
for i in range(n_models): # this should give the same 200 models when retrained 
    X_boot, y_boot = resample(X, y, n_samples=n_samples, random_state=i)
    model=SGDClassifier(loss='hinge', penalty= 'l1', max_iter=1000, tol=1e-3)
    model.fit(X_boot, y_boot.ravel())
    models.append(model)



In [24]:
weights = np.zeros((X.shape[1], n_models))
for i, m in enumerate(models):
    weights[:,i] = m.coef_[0]

avg_coef = np.mean(weights, axis=1)
avg_of_avg = np.mean(np.abs(avg_coef))
features = X_df.columns.tolist()

avg_coef_dict = dict(zip(features, avg_coef))

weights_df = pd.DataFrame(weights, index=features)
weights_df.columns = [f'model_{i+1}' for i in range(n_models)]

print(f'-- avg of the absolute value of the feature s coef avgs is {avg_of_avg}')

avg_coef_dict = {k: v for k, v in avg_coef_dict.items() if abs(v) > avg_of_avg}
print(f'-- number of genes having abs avg coef higher than {avg_of_avg} is {len(avg_coef_dict)} (will be taken as nodes)')

print('-- querying for ARGs amongs these nodes:')
temp_ARG = gene_associations.get_ranked_ARGs_from_association(avg_coef_dict, ARG_products_list, n=len(avg_coef_dict))
temp_ARG

-- avg of the absolute value of the feature s coef avgs is 11.88752422394227
-- number of genes having abs avg coef higher than 11.88752422394227 is 5847 (will be taken as nodes)
-- querying for ARGs amongs these nodes:


{1: 'Tetracycline resistance, MFS efflux pump => Tet(A)_6',
 13: 'Tetracycline resistance, MFS efflux pump => Tet(B)_1',
 110: 'Transcriptional regulator of acrAB operon, AcrR_12',
 874: 'Transcriptional regulator of acrAB operon, AcrR_18',
 2234: 'Multiple antibiotic resistance protein MarB_2',
 2313: 'Multidrug efflux system EmrKY-TolC, inner-membrane proton/drug antiporter EmrY (MFS type)_13',
 2334: 'Multidrug efflux pump MdfA/Cmr (of MFS type), broad spectrum_1',
 2639: 'Right origin-binding protein_3',
 3160: 'iron aquisition outermembrane yersiniabactin receptor (FyuA,Psn,pesticin receptor) @ Outer membrane receptor for ferric siderophore_2',
 3377: 'Multidrug efflux system AcrAB-TolC, inner-membrane proton/drug antiporter AcrB (RND type)_1',
 3744: 'DNA-binding protein H-NS_2',
 4860: 'Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of MexAB-OprM_3',
 5428: 'Multidrug efflux system, inner membrane proton/drug antiporter (RND type) => MexB of Me

In [10]:
node_clusters = list(avg_coef_dict.keys())
weights_df = weights_df.loc[node_clusters]
weights_df = weights_df.T #-- genes as cols

corr_SVM = weights_df.corr()

gene_pairs_corr = []; t=0.5; SVM_G = nx.Graph()

for i in range(corr_SVM.shape[0]):
    for j in range(i+1, corr_SVM.shape[0]):
        if abs(corr_SVM.iloc[i,j]) > t:
            gene_pairs_corr.append((corr_SVM.index[i], corr_SVM.index[j], corr_SVM.iloc[i,j]))
            SVM_G.add_edge(corr_SVM.index[i], corr_SVM.index[j], weight=corr_SVM.iloc[i,j])

nx.write_graphml(SVM_G,f'data/temp/{species}_{drug}_SVM_{t}.graphml')

In [15]:
network_analysis.plot_degree_distribution(SVM_G)
network_analysis.plot_log_log(SVM_G)

R squared: 0.6443243130840258


0.6443243130840258

In [14]:
network_construction.set_pheno_path(pheno_path)
LOR_coocc=network_construction.compute_cooccurence_LOR(gene_pairs_corr)


invalid value encountered in log



In [30]:
G = network_construction.construct_network(LOR_coocc)
nx.write_graphml(G,f'results/Escherichia_coli_tetracycline/{species}_{drug}_LOR_0.5.graphml')

In [29]:
network_analysis.plot_degree_distribution(G,f'LOR network for {species} {drug}')
network_analysis.plot_log_log(G,f'LOR network for {species} {drug}')

network_analysis.compute_network_stats(G)

R squared: 0.8273620907856172
R squared: 0.0401765261279996


Unnamed: 0_level_0,|V|,|E|,density,k,k weighted,|components|,cc,s_path,d,|communities|,Q,r_squared
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
G,2780,10639,0.002754,7.653957,8.104563,105,0.262378,7.237377,21,135,0.884911,0.040177


In [None]:
len(gene_pairs_corr)

19900