In [1]:
import pandas as pd
import sys
import os
import numpy as np
import itertools
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
)
from PIL import Image
import io
import pyBigWig

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import issparse

sys.path.append("../")
import utils as ut
import plotting as plt2
reload(ut)
reload(plt2)

<module 'plotting' from '/home/cstansbu/git_repositories/epi2me-pore-c-snakemake/notebooks/hyperedge_prediction/../plotting.py'>

In [2]:
dpath = "/scratch/indikar_root/indikar1/shared_data/population_pore_c/align_table/"

df = []
for f in os.listdir(dpath):
    if not 'parquet' in f:
        continue

    fpath = f"{dpath}{f}"
    tmp = pd.read_parquet(fpath)
    print(f, tmp.shape)
    df.append(tmp)

df = pd.concat(df)
print(f"{df.shape=}")
df.head()

batch04.GRCm39.align_table.parquet (8434829, 16)
batch02.GRCm39.align_table.parquet (907344, 16)
batch03.GRCm39.align_table.parquet (4579554, 16)
batch01.GRCm39.align_table.parquet (7508851, 16)
df.shape=(21430578, 16)


Unnamed: 0,read_name,read_start,read_end,monomer_length,chrom,ref_start,ref_end,fragment_id,fragment_start,fragment_end,is_duplicate,is_mapped,is_close,read_unique,read_group,mapping_quality
0,00000202-49cf-47b2-83bf-5eb3f6d98373,0,460,460,10.0,79553913,79554361.0,10:369876,79553685.0,79554367.0,False,True,False,True,1025705.0,60
1,00000202-49cf-47b2-83bf-5eb3f6d98373,460,687,227,10.0,79553679,79553895.0,10:369876,79553685.0,79554367.0,False,True,True,True,1025705.0,35
2,00000202-49cf-47b2-83bf-5eb3f6d98373,687,853,166,10.0,79553496,79553677.0,10:369875,79553502.0,79553685.0,False,True,True,True,1025705.0,26
3,00000202-49cf-47b2-83bf-5eb3f6d98373,853,1490,637,10.0,79552809,79553496.0,10:369874,79552815.0,79553502.0,False,True,True,True,1025705.0,60
4,00000202-49cf-47b2-83bf-5eb3f6d98373,1490,1714,224,,-1,,,,,False,False,False,True,1025705.0,0


In [3]:
resolution = 1000000
resolution_str = ut.human_readable_bp(resolution)

mapq = 60

pdf = df[df['mapping_quality'] == mapq] # copied from raw
pdf = pdf[pdf['fragment_id'].notna()]
pdf['bin'] = pdf['ref_start'].apply(lambda x: ut.bin_loci(x, resolution))
print(f"{pdf.shape=}")
pdf.head()

pdf.shape=(9934765, 17)


Unnamed: 0,read_name,read_start,read_end,monomer_length,chrom,ref_start,ref_end,fragment_id,fragment_start,fragment_end,is_duplicate,is_mapped,is_close,read_unique,read_group,mapping_quality,bin
0,00000202-49cf-47b2-83bf-5eb3f6d98373,0,460,460,10,79553913,79554361.0,10:369876,79553685.0,79554367.0,False,True,False,True,1025705.0,60,80.0
3,00000202-49cf-47b2-83bf-5eb3f6d98373,853,1490,637,10,79552809,79553496.0,10:369874,79552815.0,79553502.0,False,True,True,True,1025705.0,60,80.0
6,0000131a-4f27-4dc5-839d-09720b024db9,0,236,236,4,45374671,45374875.0,4:195912,45374673.0,45374877.0,False,True,False,True,432517.0,60,46.0
7,0000131a-4f27-4dc5-839d-09720b024db9,236,799,563,4,45374875,45375437.0,4:195913,45374877.0,45375439.0,True,True,True,True,432517.0,60,46.0
9,0000131a-4f27-4dc5-839d-09720b024db9,830,1171,341,4,45375469,45375808.0,4:195914,45375439.0,45375810.0,False,True,True,True,432517.0,60,46.0


In [4]:
sample_size = 10000

chroms = [
    '1',
]

for chrom in chroms:
    group = pdf[pdf['chrom'] == chrom].reset_index(drop=True)
    group['order'] = group.groupby('read_name')['bin'].transform('nunique')
    group = group[group['order'] > 2].reset_index(drop=True)
    group['read_code'] = group['read_name'].astype('category').cat.codes

    group['min_start'] = group.groupby('read_code')['bin'].transform('min')
    group = group.sort_values(by=['min_start', 'read_code'])

    sorted_read_codes = group['read_code'].unique()
    sample_ind = np.random.choice(list(range(len(sorted_read_codes))), sample_size, replace=False)
    sample_ind = sorted(sample_ind)

    val = np.ones(len(group))
    I = ut.incidence_by_pivot(group, 'read_code', 'bin', val)
    I = I[sorted_read_codes[sample_ind]]

I.shape

(193, 10000)

In [5]:
def tuples_to_one_hot(tuples_list):
    """Converts a list of tuples to a one-hot encoded pandas DataFrame.

    Args:
        tuples_list (list): A list of tuples, where each tuple represents a set of labels.

    Returns:
        pandas.DataFrame: A one-hot encoded DataFrame.
    """

    mlb = MultiLabelBinarizer()
    one_hot_data = mlb.fit_transform(tuples_list)
    df = pd.DataFrame(one_hot_data, columns=mlb.classes_)
    return df
    
def generate_random_unobserved_hyperedges(df, num_hyperedges):
    """Generates random hyperedges that were not observed in the DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame representing hyperedges.
        num_hyperedges (int): The number of random hyperedges to generate.

    Returns:
        list: A list of tuples, where each tuple represents a random unobserved hyperedge.
    """

    all_elements = np.array(df.index)
    num_elements = len(all_elements)

    generated_hyperedges = []
    while len(generated_hyperedges) < num_hyperedges:
        # Sample a random hyperedge size based on observed distributions
        hyperedge_size = np.random.choice(df.sum(axis=0))  

        # Sample elements without replacement
        random_hyperedge = tuple(np.random.choice(all_elements, size=hyperedge_size, replace=False))

        temp_df = pd.DataFrame(index=random_hyperedge, columns=['temp'])
        if not temp_df.index.isin(df.columns).any():
            generated_hyperedges.append(random_hyperedge)

    df = tuples_to_one_hot(generated_hyperedges)
    return df.T

Ihat = generate_random_unobserved_hyperedges(I, sample_size)
Ihat.shape

(126, 10000)

In [6]:
data = pd.concat([I, Ihat], axis=1).T
data = data.fillna(0)
labels = np.concatenate([np.ones(sample_size), -1*np.ones(sample_size)])

print(f"{data.shape=} {labels.shape=}")

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33)
print(f"{X_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_train.shape=}")
print(f"{y_test.shape=}")

data.shape=(20000, 193) labels.shape=(20000,)
X_train.shape=(13400, 193)
X_test.shape=(6600, 193)
y_train.shape=(13400,)
y_test.shape=(6600,)


In [7]:
def train_evaluate_random_forest(X_train, X_test, y_train, y_test):
    """Trains a Random Forest classifier and evaluates its performance on test data.
    
    Args:
      X_train (array-like): The training features.
      X_test (array-like): The testing features.
      y_train (array-like): The training labels.
      y_test (array-like): The testing labels.
    
    Returns:
      tuple: A tuple containing the following metrics:
          * accuracy (float): Accuracy score.
          * f1 (float): F1 score (macro average).
          * precision (float): Precision score (macro average).
          * recall (float): Recall score (macro average).
    """
    # Hyperparameters (adjust as needed)
    n_estimators = 1000  # Number of trees in the forest
    max_depth = 1000 # Maximum depth of individual trees
    
    # Train the Random Forest model
    model = RandomForestClassifier(n_estimators=n_estimators, 
                                   max_depth=max_depth)
    model.fit(X_train, y_train)
    
    # Make predictions on test data
    y_pred = model.predict(X_test)
    
    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    auc_roc = roc_auc_score(y_test, y_pred)  # Calculate AUC-ROC
    
    return accuracy, f1, precision, recall, auc_roc

train_evaluate_random_forest(X_train, X_test, y_train, y_test)

(0.9174242424242425,
 0.9168058997916076,
 0.9238316279818659,
 0.9156525735294118,
 0.9156525735294118)

In [8]:
Ihat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
5.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
