In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

## Helpers:


By convention, the presence_matrix always designates a G x S table where rows represent genes and columns represent samples.

- _split_matrix_by_phenotype(unlabeled_presence_df, pheno_df): splits the presence matrix into two matrices, one for the R samples and one for the S samples

In [None]:
def _pad_sample_id(sample) -> str:
    '''
    converts a sample id to a string and right-pads it with '0' up to 8 characters
    when it is between 4 and 7 characters long; ids of any other length are returned unchanged.

    NOTE(review): presumably this restores trailing zeros that were dropped when the
    genome ids were parsed as numbers -- confirm against the presence matrix column labels.
    '''
    sample = str(sample)
    if 4 <= len(sample) <= 7:
        sample = sample.ljust(8, '0')
    return sample


def _split_matrix_by_phenotype(unlabeled_presence_df: pd.DataFrame, pheno_df: pd.DataFrame):
    '''
    takes a gene absence presence dataframe (n x m) with the samples classification and splits it to 2 dataframes:
        - one that represents all R samples (phenotype 1 in pheno_df)
        - one that represents all S samples (phenotype 0 in pheno_df)
    and returns them in this order: R then S

    samples whose phenotype is neither 1 nor 0 are left out of both dataframes.

    param:
        - unlabeled_presence_df: (pd.DataFrame) the data frame output of data_utils.get_gene_presence_matrix
        - pheno_df: (pd.DataFrame) indexed by sample genome_id; the first column holds the phenotype (1 for R, 0 for S)

    return:
        - R_df: (pd.DataFrame) data frame where columns are only for R samples
        - S_df: (pd.DataFrame) df where cols are only for S samples

    _this is a helper function to create a contingency table for each gene_
    '''

    # get the list of R and S samples:
    R = []
    S = []
    # walk the rows positionally so that duplicated index labels cannot break the lookup
    for sample, row in zip(pheno_df.index, pheno_df.values):
        label = row[0]
        if label == 1:
            R.append(_pad_sample_id(sample))
        elif label == 0:
            S.append(_pad_sample_id(sample))
        # any other label (e.g. missing phenotype) is not assigned to either group

    # _get_subdf_cols is defined elsewhere in this file
    R_df = _get_subdf_cols(unlabeled_presence_df, R)
    S_df = _get_subdf_cols(unlabeled_presence_df, S)

    return R_df, S_df