# _E. coli_ tetracycline analysis

This notebook analyzes tetracycline resistance in _E. coli_ bacteria.

* import modules for cluster, data, and network analysis
* load the data
* perform pairwise associations
* perform ML approaches
* construct the network 
* analyze the network

## Essentials

In [1]:
import ast # --for string to list conversion
import os
import sys
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.utils import resample
from sklearn.metrics import mutual_info_score


# Work from the project root: the data paths used in this notebook and the
# 'src' entry added to sys.path are both relative to it.
project_dir = os.path.expanduser('~/capstone-project')
os.chdir(project_dir)
sys.path.append('src')

import cluster_analysis
import gene_associations
import network_construction
import network_analysis

# Analysis target; both values are interpolated into the data file paths below.
species = 'Escherichia_coli'
drug = 'tetracycline'

## Loading data

- presence/absence data (filtered SxG)
- processed phenotypic readings
- ARGs for the drug

In [2]:
# ----------------- ARGs -----------------
ARG_df = pd.read_csv(f'data/ARG/{species}_ARG_sp_drugs_products.csv', index_col=0)
# The product list is stored in the CSV as the string form of a list ("['a','b']").
# .tolist() wraps that string in a one-item list, so take item 0 and parse it.
ARG_products_list = ast.literal_eval(ARG_df.loc[drug].tolist()[0])

# ----------------- SxG presence -----------------
X_df = pd.read_csv(f'data/presence_matrices/{species}_filtered_SxG.csv', index_col=0).sort_index()

# ----------------- phenotypes -----------------
pheno_df = pd.read_csv(f'data/processed_phenotypes/{species}_{drug}.csv', index_col=0)
# Cast the phenotype index to float on pheno_df itself (done explicitly here rather
# than through an alias), then derive the sorted working copy.
# NOTE(review): the X_df index is not cast; matching relies on the labels comparing
# equal by value -- confirm both files use the same identifiers.
pheno_df.index = pheno_df.index.astype('float')
y_df = pheno_df.sort_index()

# ----------------- intersection to create a labeled matrix -----------------
y_indices = list(y_df.index)
X_indices = list(X_df.index)

# Set membership keeps the filter linear in the number of samples.
X_index_lookup = set(X_indices)
intersection = [i for i in y_indices if i in X_index_lookup]

# Keep only samples present in both tables, in the same (sorted) order.
y_df = y_df.loc[intersection].sort_index()
X_df = X_df.loc[intersection].sort_index()

# Fail early with a clear message if duplicated labels made the two tables diverge.
assert len(X_df) == len(y_df), (
    f'X and y row counts differ after intersection: {len(X_df)} vs {len(y_df)}'
)

X = X_df.values
y = y_df.values  # 2-D, because y_df is a DataFrame


labeled_matrix = pd.concat([X_df, y_df], axis=1)
labeled_matrix.shape

(225, 18876)

## Pairwise associations

In [4]:
# Mutual information between every column except the last and the 'SIR' column.
# NOTE(review): the last column is assumed to be the 'SIR' label appended by the
# concat in the loading cell -- confirm.
feature_columns = labeled_matrix.columns[:-1]
mi_scores = {
    feature: mutual_info_score(labeled_matrix[feature], labeled_matrix['SIR'])
    for feature in feature_columns
}

mi_ranked_ARGS = gene_associations.get_ranked_ARGs_from_association(mi_scores, ARG_products_list)

## Feature selection

### SVM

In [6]:
# -- model training
# Ensemble of L1-penalised linear classifiers (hinge loss, fitted with SGD),
# each trained on its own bootstrap sample of the data.
n_models = 200
n_samples = int(0.8 * len(X))  # bootstrap on 80% of the data

models = []
for i in range(n_models):
    # Seed both the bootstrap draw and the SGD optimiser with the iteration index.
    # SGDClassifier shuffles the training data by default, so without its own
    # random_state the fitted coefficients would change on every retraining even
    # though the bootstrap samples are identical.
    X_boot, y_boot = resample(X, y, n_samples=n_samples, random_state=i)
    model = SGDClassifier(
        loss='hinge', penalty='l1', max_iter=1000, tol=1e-3, random_state=i
    )
    # y comes from DataFrame.values (2-D); flatten it to the 1-D label vector fit() expects.
    model.fit(X_boot, y_boot.ravel())
    models.append(model)



In [None]:
# Hold the fitted estimators in a NumPy object array (one estimator per element).
# This stores the estimator objects themselves, not their coefficient vectors.
models = np.asarray(models, dtype=object)

In [None]:
# Collect the distinct coef_ shapes across all fitted models;
# a single entry in the set means every model has the same shape.
shapes = {fitted.coef_.shape for fitted in models}
shapes

In [None]:
# Display the first element of `models` (the estimator fitted in iteration 0 of the training loop).
models[0]

In [None]:
# Disabled attempt to force `models` into shape (n_models, 18875):
# models = models.reshape(models.shape[0], 18875)
models.shape
# TODO: enforce a second dimension of 18875.
# NOTE(review): 18875 is presumably the number of feature columns (labeled_matrix has
# 18876 columns including the label) -- confirm.

In [None]:
# -- feature selection
feature_importances = pd.DataFrame()  # empty placeholder; nothing is written to it in this cell

# Shape of the coefficient array of the last trained model. The estimator is read
# from `models` explicitly instead of relying on the `model` loop variable left
# over from the training cell, so this cell does not depend on leaked loop state.
models[-1].coef_.shape

In [None]:
# -- correlation of weights
# Stack each model's coefficients into one row of an (n_models x n_features) matrix,
# label the columns with the feature names, then correlate the columns across models.
n_features = X.shape[1]
weights = np.zeros((n_models, n_features))
for row in range(n_models):
    weights[row, :] = models[row].coef_
weights_df = pd.DataFrame(data=weights, columns=X_df.columns)

corr_SVM = weights_df.corr()