## Essentials

In [1]:
# Antibiotic under study; interpolated into the phenotype CSV path when the data is loaded.
drug = 'ciprofloxacin'

In [3]:
import numpy as np
import pandas as pd
import networkx as nx


from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection
from statsmodels.formula.api import ols

import sys
sys.path.append("..")
sys.path.append("../..")
from cluster_analysis import *
# from LOR_calculation import *

import warnings

species='Escherichia_coli'

# Feature matrix; the first CSV column becomes the row index.
X_df = pd.read_csv(f'../../data/temp/X_{species}_SxG_filtered_hypo_unknowns_freq5.csv', index_col=0) # filtering

drugs = ['streptomycin',
 'sulfamethoxazole',
 'tetracycline',
 'cefalothin',
 'trimethoprim_sulphamethoxazole',
 'amoxicillin_clavulanic_acid',
 'trimethoprim',
 'amoxicillin',
 'ampicillin',
 'doripenem',
 'levofloxacin',
 'ciprofloxacin']


# NOTE(review): this network file is named for trimethoprim and hard-codes the
# species directory, while the phenotype file below is selected by `drug` and
# `species` -- confirm that reusing this graph for other drugs is intended.
G=nx.read_graphml('../../data/nets/Escherichia_coli/200_randomized_SVM_0.2unsigned_corr_trimethoprim.graphml')
edge_list = list(G.edges)


# Phenotype table for the selected drug; the first CSV column becomes the row index.
pheno_df= pd.read_csv(f'../../metadata/{species}/{species}_{drug}.csv', index_col=0)
# The index is cast to float on pheno_df itself (the original code did the same
# through an alias), so pheno_df keeps the float index afterwards.
pheno_df.index = pheno_df.index.astype('float')

# Sort both tables once so the shared labels come out in ascending order.
y_df = pheno_df.sort_index()
X_df = X_df.sort_index()

y_indices=list(y_df.index)
X_indices=list(X_df.index)

# Set lookup instead of scanning the X_indices list once per phenotype row.
X_index_lookup = set(X_indices)
intersection = [i for i in y_indices if i in X_index_lookup]

# Selecting both frames with the same label list puts their rows in the same order.
y_df = y_df.loc[intersection]
X_df = X_df.loc[intersection]

# X and y are used positionally later on, so a row-count mismatch (e.g. from
# duplicated index labels) must fail loudly here rather than misalign silently.
assert len(X_df) == len(y_df), (
    f'feature/phenotype row mismatch after alignment: {len(X_df)} vs {len(y_df)}'
)

X = X_df.values
y = y_df.values



labeled_matrix = pd.concat([X_df, y_df], axis=1)
labeled_matrix.shape

(1672, 18876)

## Models

### formula

In [30]:
import statsmodels.formula.api as smf
import statsmodels
import warnings

# -- to memoize the following:
#       num_pairs: number of pairs given as input
#       num_colinear: number of errors that occurred during the process (colinearity & ML convergence)
#       num_sig: number of sig interactions (rejected)
#       weights: strengths of interactions


# Spaces are replaced by underscores in both the edge endpoints and the column
# names so that the names can be interpolated directly into the model formula.
X_pairs = [(i.replace(' ', '_'), j.replace(' ', '_')) for i, j in edge_list]
interaction_labeled_matrix = labeled_matrix.rename(columns={col: col.replace(' ', '_') for col in labeled_matrix.columns})

# formula = f'SIR ~ {a} + {b} + {a}:{b}'

test_pairs = X_pairs[:1]
count_errors=0

non_col_interaction = {}  #saving interaction p value of each pair that is run in a logit
failed_pairs = {}         # pair -> reason the fit was discarded (kept for inspection instead of being swallowed)

# Warnings are silenced only for the duration of the loop rather than for the
# whole kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for pair in X_pairs:
        a, b = pair
        # print(" --- looking into the edge between", a, "and", b)

        logit_model = smf.logit(formula=f'SIR ~ {a} + {b} + {a}:{b}', data=interaction_labeled_matrix)
        try:
            results = logit_model.fit(disp=0)

            # A fit that fails to converge only emits a (silenced) warning, so it
            # has to be rejected explicitly; otherwise its meaningless p-value
            # would enter the FDR correction as if it were valid.
            if not results.mle_retvals.get('converged', True):
                raise RuntimeError('maximum-likelihood fit did not converge')

            # Parameter order is Intercept, a, b, a:b, so position 3 is the
            # interaction term.
            inter_p = results.pvalues.iloc[3]
            if not np.isfinite(inter_p):
                raise ValueError('interaction p-value is not finite')

            non_col_interaction[pair]=inter_p
        except Exception as e:
            # print(f"An error occurred: {e}")
            failed_pairs[pair] = f'{type(e).__name__}: {e}'
            count_errors+=1



In [None]:
# performing FDR correction

# Keep only pairs with a finite p-value: a single NaN would propagate through the
# cumulative minimum used by the Benjamini-Hochberg adjustment and turn every
# corrected p-value into NaN. Pairs and p-values are filtered together so that
# valid_pairs[k], p_values[k], reject[k] and pvals_corrected[k] stay aligned.
finite_items = [(pair_key, p) for pair_key, p in non_col_interaction.items() if np.isfinite(p)]

valid_pairs = [pair_key for pair_key, _ in finite_items]
p_values = [p for _, p in finite_items]

if p_values:
    reject, pvals_corrected = fdrcorrection(p_values, alpha=0.05, method='indep')
else:
    # Nothing to correct; produce empty results of the same kinds as fdrcorrection returns.
    reject, pvals_corrected = np.array([], dtype=bool), np.array([], dtype=float)

In [28]:
# Fourth entry (position 3) of the p-value Series of whichever fit `results` currently holds.
inter_p = results.pvalues.iloc[3]

0.9992495597164684

In [None]:
# Verbose pass over every edge: fits the same interaction logit for each pair and
# prints progress and any exception, leaving the last successful fit in `results`.
X_pairs = [
    (u.replace(' ', '_'), v.replace(' ', '_'))
    for u, v in edge_list
]
interaction_labeled_matrix = labeled_matrix.rename(
    columns=lambda col: col.replace(' ', '_')
)

# formula = f'SIR ~ {a} + {b} + {a}:{b}'

test_pairs = X_pairs[:1]
count_errors = 0

warnings.filterwarnings('ignore')

for pair in X_pairs:
    a, b = pair
    print(" --- looking into the edge between", a, "and", b)

    logit_model = smf.logit(
        formula=f'SIR ~ {a} + {b} + {a}:{b}',
        data=interaction_labeled_matrix,
    )
    try:
        results = logit_model.fit(disp=0)
    except Exception as e:
        # Report the failure and keep going with the next pair.
        print(f"An error occurred: {e}")
        count_errors = count_errors + 1

In [12]:
# Show the statsmodels summary of whichever fit `results` currently holds.
# NOTE(review): `results` is overwritten by every fitting loop above, so what is
# displayed depends on which cell ran last -- confirm after a Restart & Run All.
results.summary()

0,1,2,3
Dep. Variable:,SIR,No. Observations:,1672.0
Model:,Logit,Df Residuals:,1668.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 01 May 2024",Pseudo R-squ.:,0.02221
Time:,15:37:47,Log-Likelihood:,-946.23
converged:,False,LL-Null:,-967.72
Covariance Type:,nonrobust,LLR p-value:,2.467e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.0587,0.059,-17.852,0.000,-1.175,-0.942
Cluster_0,-0.0736,0.193,-0.380,0.704,-0.453,0.306
Cluster_20,15.8568,1634.466,0.010,0.992,-3187.639,3219.352
Cluster_0:Cluster_20,-12.7787,1634.467,-0.008,0.994,-3216.274,3190.717


In [6]:
# Show the statsmodels summary of whichever fit `results` currently holds.
# NOTE(review): this repeats an earlier summary cell and depends on kernel state
# (`results` is reassigned by every fitting loop) -- confirm it is still needed.
results.summary()

0,1,2,3
Dep. Variable:,SIR,No. Observations:,1672.0
Model:,Logit,Df Residuals:,1668.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 01 May 2024",Pseudo R-squ.:,0.02221
Time:,15:36:25,Log-Likelihood:,-946.23
converged:,False,LL-Null:,-967.72
Covariance Type:,nonrobust,LLR p-value:,2.467e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.0587,0.059,-17.852,0.000,-1.175,-0.942
Cluster_0,-0.0736,0.193,-0.380,0.704,-0.453,0.306
Cluster_20,15.8568,1634.466,0.010,0.992,-3187.639,3219.352
Cluster_0:Cluster_20,-12.7787,1634.467,-0.008,0.994,-3216.274,3190.717


### vanilla

In [None]:
# NOTE(review): this rebinds X_pairs to the raw edge list (names with spaces),
# replacing the underscore-normalised pairs built in the formula cells.
X_pairs=edge_list
error_test=[("Cluster 0", "Cluster 20")]

# Interaction p-values of the pairs whose model could be fitted.
p_values_list = []

for pair in error_test:
    a, b = pair
    print(" --- looking into the edge between", a, "and", b)

    X_selected = X_df.loc[:, [a, b]]

    print(X_selected.shape)

    X_selected = sm.add_constant(X_selected)

    # a and b are column labels, not positions, so select by label; the previous
    # .iloc[:, a] raised on the string labels before the try block was reached.
    X_selected['interaction'] = X_selected[a] * X_selected[b]

    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            logit_model = sm.Logit(y, X_selected)
            result = logit_model.fit()
            p_values_list.append(result.pvalues['interaction'])

    except np.linalg.LinAlgError:
        print('singular matrix; deleted')
    except Exception as e:
        # Any other failure is reported with its type instead of being
        # mislabelled as a singular matrix; KeyboardInterrupt is no longer swallowed.
        print(f'fit failed; deleted ({type(e).__name__}: {e})')