In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

presence_path="../data/presence_matrices/Campylobacter_coli_presence_absence.csv"
presence_df= pd.read_csv(presence_path, index_col=0)
presence_df.index = presence_df.index.astype(str)

pheno_path="../data/phenotypes/Campylobacter_coli_ciprofloxacin.csv"
pheno_df = pd.read_csv(pheno_path, index_col=0).astype(int)

In [18]:
pheno_df.index

Float64Index([195.2047, 195.2058, 195.2057, 195.2056, 195.2055, 195.2052,
              195.2053, 195.2051, 195.2049, 195.2059,
              ...
              195.2164, 195.2165, 195.2166, 195.2169,  195.217, 195.2171,
              195.2172, 195.2159, 195.2178, 195.2298],
             dtype='float64', name='genome_id', length=283)

## Helpers:


By convention the presence_matrix will always deignate a GxS table where rows represent genes and columns represent samples

- split_matrix_by_phenotype(presence_matrix, phenotype_matrix): splits the presence matrix into two matrices, one R and one for S

In [12]:
def split_matrix_by_phenotype(unlabeled_presence_df:pd.DataFrame, pheno_df: pd.DataFrame)->(pd.DataFrame, pd.DataFrame):
    '''
    takes a gene absence presence dataframe (n x m) with the samples classification and splits it to 2 dataframes:
        - one that represents all R samples (samples found in R_list)
        - one that represents all S samples (found in S_list)
    and returns them in this order: R then S

    param:
        - unlabeled_presence_df: (pd.DataFrame) the data frame output of data_utils.get_gene_presence_matrix
        - R_list: (list) samples genome_id that have a R (1) phenotype
        - S_list: (list) samples having the 0 phenotype
    
    return:
        - R_df: (pd.DataFrame) data frame where columns are only for R samples
        - S_df: (pd.DataFrame) df where cols are only for S samples

    _this is a helper function to create a contigency table for each gene_
    '''

    # get the list of R and S samples:
    R=[];S=[];U=[]
    for sample in pheno_df.index:

        if pheno_df.loc[sample].values[0]==1:
            sample=str(sample)
            if len(sample)==7:
                sample+='0'
            elif len(sample)==6:
                sample+='00'
            elif len(sample)==5:
                sample+='000'
            elif len(sample)==4:
                sample+='0000'
            R.append(str(sample))
        elif pheno_df.loc[sample].values[0]==0:
            sample=str(sample)
            sample=str(sample)
            if len(sample)==7:
                sample+='0'
            elif len(sample)==6:
                sample+='00'
            elif len(sample)==5:
                sample+='000'
            elif len(sample)==4:
                sample+='0000'
            S.append(str(sample))
        else:
            sample=str(sample)
            if len(sample)==7:
                sample+='0'
            elif len(sample)==6:
                sample+='00'
            elif len(sample)==5:
                sample+='000'
            elif len(sample)==4:
                sample+='0000'
            U.append(str(sample))

    print(R in list(unlabeled_presence_df.index))
            
    # All we care for are R and S which designate the list of resistant and susceptible samples respectively

    R_df= unlabeled_presence_df[R]
    S_df= unlabeled_presence_df[S]

    return R_df, S_df

split_matrix_by_phenotype(presence_df, pheno_df)

False


KeyError: "None of [Index(['195.2047', '195.2058', '195.2052', '195.2054', '195.2070', '195.2071',\n       '195.2074', '195.2043', '195.2678', '195.2676', '195.2673', '195.2672',\n       '195.2671', '195.2670', '195.2669', '195.2668', '195.2667', '195.2024',\n       '195.2027', '195.2028', '195.2030', '195.2032', '195.2035', '195.2037',\n       '195.2040', '195.2041', '195.2666', '195.2246', '195.2248', '195.2250',\n       '195.2251', '195.2252', '195.2256', '195.2257', '195.2258', '195.2261',\n       '195.2212', '195.2213', '195.2217', '195.2221', '195.2263', '195.2229',\n       '195.2234', '195.2266', '195.2300', '195.2302', '195.2304', '195.2305',\n       '195.2306', '195.2307', '195.2310', '195.2311', '195.2663', '195.2664',\n       '195.2665', '195.2268', '195.2278', '195.2282', '195.2283', '195.2289',\n       '195.2281', '195.2208', '195.2119', '195.2124', '195.2126', '195.2127',\n       '195.2131', '195.2136', '195.2137', '195.2139', '195.2116', '195.2115',\n       '195.2090', '195.2094', '195.2099', '195.2100', '195.2142', '195.2101',\n       '195.2105', '195.2107', '195.2110', '195.2112', '195.2191', '195.2193',\n       '195.2195', '195.2146', '195.2147', '195.2152', '195.2158', '195.2160',\n       '195.2163', '195.2164', '195.2169', '195.2172', '195.2159', '195.2298'],\n      dtype='object')] are in the [columns]"