In [1]:
import os
import re
import sys
import warnings

import numpy as np
import pandas as pd
import networkx as nx


from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

import statsmodels.api as sm
from statsmodels.stats.multitest import fdrcorrection
from statsmodels.formula.api import ols

sys.path.append("..")
sys.path.append("../..")
from cluster_analysis import *
# from LOR_calculation import *

# Mtb

## Presence matrix

In [None]:
# Build the cluster-by-sample count matrix for one species from the .clstr
# file and write it to ../../data/presence_matrices/<species>_GxS.csv.
species='Mycobacterium_tuberculosis'

genome_ids_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/genome_ids/{species}_genome_ids.csv"
clstr_freq_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}_cluster_frequencies.csv"
clstr_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}.fasta.clstr"
clstr_fasta_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}.fasta"  # not read in this cell

# Every column is read as text (dtype=str), so genome ids keep their exact spelling.
samples_df = pd.read_csv(genome_ids_file, dtype=str)
samples_list=samples_df['genome.genome_ids'].tolist()

# The second CSV column becomes the index (cluster names); the first column is discarded.
clstr_freq_df=pd.read_csv(clstr_freq_file, index_col=1)
clstr_freq_df=clstr_freq_df.drop(clstr_freq_df.columns[0], axis=1)
clstr_list=clstr_freq_df.index.tolist()

# Accumulate counts in an integer array and look positions up through dicts:
# far cheaper than one `df.loc[...] += 1` per line, and the result has a real
# integer dtype instead of an object frame filled with 0.
# NOTE(review): assumes genome ids and cluster names are unique; a repeated
# label would resolve to its last position — confirm against the input files.
sample_pos = {genome: row for row, genome in enumerate(samples_list)}
cluster_pos = {cluster: col for col, cluster in enumerate(clstr_list)}
counts = np.zeros((len(samples_list), len(clstr_list)), dtype=np.int64)

# Member lines look like "<n>\t<len>aa, >fig|<genome id>.<rest>"; group 1
# captures the genome id including one trailing dot.
member_re = re.compile(r"\d+\t\d+aa, >fig\|([\d\.\d]+).+")

cluster_id = None
with open(clstr_file) as f:
    for line_no, line in enumerate(f, start=1):
        if line.startswith(">Cluster"):
            # Header line: remember the cluster the following members belong to.
            cluster_id = line[1:].strip()
            continue
        if not line.strip():
            # Blank lines carry no membership information.
            continue

        member = member_re.match(line)
        if member is None or cluster_id is None:
            # Fail with the offending line instead of an AttributeError/NameError.
            raise ValueError(f"{clstr_file}:{line_no}: unexpected line {line!r}")

        genome_id = member.group(1)[:-1]  # removing the trailing .
        # Unknown genome ids or cluster names still raise KeyError here.
        counts[sample_pos[genome_id], cluster_pos[cluster_id]] += 1

df = pd.DataFrame(counts, index=samples_list, columns=clstr_list)

# Stored with clusters as rows and samples as columns.
df = df.T
df.to_csv(f'../../data/presence_matrices/{species}_GxS.csv')

## SVM

In [None]:
# Load the presence matrix and the phenotype table for one species/drug pair
# and align them on their shared sample ids.
species='Mycobacterium_tuberculosis'; drug=''
# NOTE(review): `drug` is empty, so the phenotype path below ends in
# '<species>_.csv' — set it to an actual drug name before running.

presence_df = pd.read_csv(f'../../data/presence_matrices/{species}_GxS.csv', index_col=0)
X_df = presence_df.T
# NOTE(review): ids that differ only by trailing zeros (e.g. '562.1' and
# '562.10') become equal after the float cast — this may be the source of the
# duplicate rows handled below; confirm.
X_df.index = X_df.index.astype('float')
# X_df.index = X_df.index.astype('object')

pheno_df = pd.read_csv(f'../../data/phenotypes/{species}_{drug}.csv', index_col=0)
y_df = pheno_df
y_df.index = y_df.index.astype('float')

# Drop constant columns in one vectorised step (columns whose std is NaN are
# kept, as before). The variance is measured on all samples, before the
# restriction to phenotyped samples below.
X_df = X_df.loc[:, X_df.std() != 0]

X_df = X_df.sort_index()
y_df = y_df.sort_index()

# Remove repeated sample ids from BOTH tables. De-duplicating only X left X
# and y with different numbers of rows whenever an id was repeated.
X_df = X_df.loc[~X_df.index.duplicated(keep='first')]
y_df = y_df.loc[~y_df.index.duplicated(keep='first')]

y_indices = list(y_df.index)
X_indices = list(X_df.index)

# Ids present in both tables, in sorted phenotype order.
intersection = y_df.index[y_df.index.isin(X_df.index)].tolist()
y_df = y_df.loc[intersection]
X_df = X_df.loc[intersection]

# Features and labels must be row-aligned before they are turned into arrays.
assert X_df.index.equals(y_df.index), "X_df and y_df are not aligned on sample id"

X = X_df.values
y = y_df.values

labeled_matrix = pd.concat([X_df, y_df], axis=1)

# E coli

## presence

In [None]:
# species='Escherichia_coli'

# genome_ids_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/genome_ids/{species}_genome_ids.csv"
# clstr_freq_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}_cluster_frequencies.csv"
# clstr_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}.fasta.clstr"
# clstr_fasta_file=f"../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}.fasta"

# samples_df = pd.read_csv(genome_ids_file, dtype=str)
# samples_list=samples_df['genome.genome_ids'].tolist()

# clstr_freq_df=pd.read_csv(clstr_freq_file, index_col=1)
# clstr_freq_df=clstr_freq_df.drop(clstr_freq_df.columns[0], axis=1)
# clstr_list=clstr_freq_df.index.tolist()

# df = pd.DataFrame(index=samples_list, columns=clstr_list)
# df = df.fillna(0)

# with open(clstr_file) as f:
#     for line in f:
#         if line.startswith(">Cluster"):
#             #if its a > line get the cluster id and start counting its occurences in which samples by saving it to the local var
#             cluster_id=line[1:].strip()
#         else:
#             #else its a line designating the sample genome, one that the last cluster_id is present in
#             # get the 1st group matching here \d\t\d+aa, >fig\|([\d\.\d]+).+

#             genome_id=re.match(r"\d+\t\d+aa, >fig\|([\d\.\d]+).+", line).group(1)
#             genome_id=genome_id[:-1]  #removing the trailing .
#             df.loc[genome_id, cluster_id]+=1

# df = df.T
# df.to_csv(f'../../data/presence_matrices/{species}_GxS.csv')

## filtering

In [2]:
species = 'Escherichia_coli'

# Join each cluster's representative with its product annotation, using the
# helpers star-imported from cluster_analysis.
clstr_df = get_cluster_representatives(
    f'../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}.fasta.clstr'
)
_product_df = get_representative_products(
    f'../../pangenome-repo/Pangenome-Analysis-Workflow/codes/{species}/{species}.fasta'
)
product_df = combine_cluster_product(clstr_df, _product_df)

# Flag clusters whose product name contains "hypothetical" (case-insensitive).
# The count is computed but not shown, since it is not the cell's last expression.
is_hypothetical = product_df['product_name'].str.contains('hypothetical', case=False)
product_df['hypothetical'] = is_hypothetical
product_df['hypothetical'].sum()

# Same flag for product names containing "unknown".
is_unknown = product_df['product_name'].str.contains('unknown', case=False)
product_df['unknown'] = is_unknown
product_df['unknown'].sum()

# Index labels of the flagged clusters, used later to filter feature columns.
hypothetical_clusters = product_df.loc[product_df['hypothetical'] == True].index.tolist()
unknown_clusters = product_df.loc[product_df['unknown'] == True].index.tolist()

In [None]:
# -- per-drug counts of SIR == 1 (column R) and SIR == 0 (column S) over all
# -- phenotype files in the E. coli metadata directory

# One directory constant for both listing and reading: the original read from
# a misspelled path ('Escherchia_coli') and so could never open a file.
metadata_dir = '../metadata/Escherichia_coli/'

# The dot before "csv" is escaped and the pattern anchored at the end, so only
# real '<species>_<drug>.csv' names match.
drug_file_re = re.compile(r'Escherichia_coli_(.*)\.csv$')

records = []
for file in sorted(os.listdir(metadata_dir)):  # sorted for a reproducible row order
    name_match = drug_file_re.match(file)
    if name_match is None:
        # Not a per-drug phenotype file; skip it rather than crash on .group().
        continue
    drug = name_match.group(1)
    data = pd.read_csv(os.path.join(metadata_dir, file))

    # value_counts() leaves out absent labels, hence the 0 defaults.
    count = data['SIR'].value_counts()
    records.append({'drug': drug, 'R': count.get(1, 0), 'S': count.get(0, 0)})

# Build the frame once; DataFrame.append was removed in pandas 2.0.
df = pd.DataFrame(records, columns=['drug', 'R', 'S']).set_index('drug')
# df

In [3]:
# Load the E. coli presence matrix as a samples-by-clusters feature table and
# remove uninformative columns.
species='Escherichia_coli'; drug='amikacin'

presence_df = pd.read_csv(f'../../data/presence_matrices/{species}_GxS.csv', index_col=0)
X_df = presence_df.T
# NOTE(review): ids that differ only by trailing zeros (e.g. '562.1' and
# '562.10') become equal after the float cast and would show up as duplicate
# rows — confirm the ids stay unique.
X_df.index = X_df.index.astype('float')
# X_df.index = X_df.index.astype('object')

# -- removing hypothetical and unknown clusters
# `hypothetical_clusters` / `unknown_clusters` are defined in the filtering cell.
X_df = X_df.drop(hypothetical_clusters, axis=1)
# X_df = X_df.drop(unknown_clusters, axis=1)

#  -- removing genes with zero variance
# One vectorised mask instead of an in-place drop per column; columns whose
# std is NaN are kept, as before.
X_df = X_df.loc[:, X_df.std() != 0]

# The phenotype-alignment cell further down reads `X_indices`; define it here
# so that cell works on a fresh kernel instead of raising NameError or reusing
# a value left over from another species.
X_indices = list(X_df.index)

# Duplicate sample ids are not removed here.
# X_df = X_df.loc[~X_df.index.duplicated(keep='first')]



: 

In [None]:
# Display the (rows, columns) dimensions of X_df.
X_df.shape

In [None]:
# Load the phenotype table for `species`/`drug` and restrict both it and X_df
# to the sample ids they share.
pheno_df = pd.read_csv(f'../../data/phenotypes/{species}_{drug}.csv', index_col=0)
y_df = pheno_df
y_df.index = y_df.index.astype('float')

y_df = y_df.sort_index()

y_indices = list(y_df.index)

# Take membership straight from X_df.index. The original relied on a separate
# `X_indices` list that is commented out in the E. coli feature cell, so it
# was either undefined or left over from a different species.
intersection = y_df.index[y_df.index.isin(X_df.index)].tolist()
y_df = y_df.loc[intersection]
X_df = X_df.loc[intersection]

X_df = X_df.sort_index()
y_df = y_df.sort_index()

X = X_df.values
# NOTE(review): y_df is a DataFrame, so `y` is 2-D (one column per phenotype
# column) — confirm downstream estimators expect that shape.
y = y_df.values

labeled_matrix = pd.concat([X_df, y_df], axis=1)

## SVM