In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pyranges as pr
import pyBigWig
import glob
import time
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc

# Load chrom sizes table

In [2]:
# Chromosome sizes reference table (CSV), previewed via its first rows.
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/chrom_sizes.csv"
cdf = pd.read_csv(filepath_or_buffer=fpath)
cdf.head(n=5)

Unnamed: 0,chrom,size,bp_start
0,1,195154279,0
1,2,181755017,195154279
2,3,159745316,376909296
3,4,156860686,536654612
4,5,151758149,693515298


# load restriction fragments

In [3]:
# Restriction fragment table; the shape is reported BEFORE filtering.
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/fragments.parquet"
df = pd.read_parquet(fpath)
print(f"{df.shape=}")

# Keep only fragments whose chromosome appears in the chrom sizes table.
df = df.loc[df['Chromosome'].isin(cdf['chrom'])]
df.head()

df.shape=(12876595, 5)


Unnamed: 0,Chromosome,Start,End,fragment_id,fragment_length
0,1,0,3050051,0,3050051
1,1,3050051,3050056,1,5
2,1,3050056,3050844,2,788
3,1,3050844,3050907,3,63
4,1,3050907,3050947,4,40


# load gene annotations

In [4]:
# Gene annotation table, restricted to protein-coding genes.
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"
gdf = pd.read_parquet(fpath)
print(f"Raw {gdf.shape=}")

# Rows whose biotype is anything other than 'protein_coding' are dropped.
gdf = gdf.loc[gdf['gene_biotype'].eq('protein_coding')]
print(f"Protein Coding {gdf.shape=}")
gdf.head()

Raw gdf.shape=(56655, 10)
Protein Coding gdf.shape=(21608, 10)


Unnamed: 0,gene_id,gene_name,gene_source,gene_biotype,Chromosome,Start,End,length,midpoint,is_tf
11,ENSMUSG00000051285,Pcmtd1,ensembl_havana,protein_coding,1,7159143,7243852,84709,7201497,False
21,ENSMUSG00000026312,Cdh7,ensembl_havana,protein_coding,1,109910160,110067887,157727,109989023,False
24,ENSMUSG00000039748,Exo1,ensembl_havana,protein_coding,1,175708146,175741055,32909,175724600,False
26,ENSMUSG00000104158,Becn2,havana,protein_coding,1,175747894,175749791,1897,175748842,False
40,ENSMUSG00000033007,Asic4,ensembl_havana,protein_coding,1,75427079,75450987,23908,75439033,False


# load expression

In [5]:
# load the gene annotations
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/expression_table/rna_table.parquet"
edf = pd.read_parquet(fpath, columns=['ens_gene_id', 'TPM'])
edf.columns = ['gene_id', 'TPM']
print(f"Raw {edf.shape=}")

gdf = pd.merge(gdf, edf, how='left')
print(f"{gdf.shape=}")
gdf.head()

Raw edf.shape=(51883, 2)
gdf.shape=(21608, 11)


Unnamed: 0,gene_id,gene_name,gene_source,gene_biotype,Chromosome,Start,End,length,midpoint,is_tf,TPM
0,ENSMUSG00000051285,Pcmtd1,ensembl_havana,protein_coding,1,7159143,7243852,84709,7201497,False,18.45
1,ENSMUSG00000026312,Cdh7,ensembl_havana,protein_coding,1,109910160,110067887,157727,109989023,False,0.0
2,ENSMUSG00000039748,Exo1,ensembl_havana,protein_coding,1,175708146,175741055,32909,175724600,False,43.11
3,ENSMUSG00000104158,Becn2,havana,protein_coding,1,175747894,175749791,1897,175748842,False,0.0
4,ENSMUSG00000033007,Asic4,ensembl_havana,protein_coding,1,75427079,75450987,23908,75439033,False,0.1


# Merge genes on Restriction Fragments

In [6]:
# Wrap both tables as PyRanges so they can be joined by genomic interval.
pr_df = pr.PyRanges(df)
pr_gdf = pr.PyRanges(gdf)

# Left join keeps all rows from df; gene columns that collide with fragment
# columns get the "_gene" suffix and the overlap size is reported per pair.
merged_pr = pr_df.join(pr_gdf, how="left", suffix="_gene", report_overlap=True, preserve_order=True)

# Back to a plain pandas DataFrame, with a size report for each table.
merged_df = merged_pr.df
print(f"{df.shape=}")
print(f"{gdf.shape=}")
print(f"{merged_df.shape=}")

# A fragment may be joined to several genes: order each fragment's candidate
# rows by descending Overlap and keep only the first row per fragment_id.
merged_df = (
    merged_df
    .sort_values(by=['Chromosome', 'Start', 'Overlap'], ascending=[True, True, False])
    .drop_duplicates(subset=['fragment_id'], keep='first')
)
print(f"{merged_df.shape=}")
merged_df.head()

df.shape=(12472365, 5)
gdf.shape=(21608, 11)
merged_df.shape=(12614520, 16)
merged_df.shape=(12472365, 16)


Unnamed: 0,Chromosome,Start,End,fragment_id,fragment_length,gene_id,gene_name,gene_source,gene_biotype,Start_gene,End_gene,length,midpoint,is_tf,TPM,Overlap
0,1,0,3050051,0,3050051,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-1
1,1,3050051,3050056,1,5,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-3050052
2,1,3050056,3050844,2,788,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-3050057
3,1,3050844,3050907,3,63,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-3050845
4,1,3050907,3050947,4,40,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-3050908


In [7]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always
# fails. Presumably a deliberate stop marker so "Run All" halts here —
# confirm, then remove the cell or replace it with an explicit guard.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# NOTE(review): bare `break` outside a loop (SyntaxError). Looks like a
# second "stop here" marker for Run All — confirm whether it is still needed.
break

# load feature paths

In [8]:
# Table of linear feature tracks; the `file_id` and `file_path` columns are read below.
feature_paths = pd.read_csv("../config/linear_features.txt")

# One bigWig handle per track, keyed by file_id. The handles are not closed
# in this cell; they stay open in `wigs`.
wigs = {}
for file_id, file_path in zip(feature_paths['file_id'], feature_paths['file_path']):
    wigs[file_id] = pyBigWig.open(file_path)

wigs

{'ATACSeq_1': <pyBigWig.bigWigFile at 0x1505e33b6b50>,
 'ATACSeq_2': <pyBigWig.bigWigFile at 0x1505e33ae220>,
 'ATACSeq_3': <pyBigWig.bigWigFile at 0x1505e33b1590>,
 'CTCF': <pyBigWig.bigWigFile at 0x1505e33d3e10>,
 'H3K27ac': <pyBigWig.bigWigFile at 0x1505e33bd950>,
 'H3K27me3': <pyBigWig.bigWigFile at 0x1505e33b7ae0>,
 'RNA_1': <pyBigWig.bigWigFile at 0x1505e34f4bd0>,
 'RNA_2': <pyBigWig.bigWigFile at 0x1505e34b9080>,
 'RNA_3': <pyBigWig.bigWigFile at 0x1506cd427420>,
 'RNA_4': <pyBigWig.bigWigFile at 0x1505e338e8e0>,
 'RNA_5': <pyBigWig.bigWigFile at 0x1505e34ea580>,
 'RNA_6': <pyBigWig.bigWigFile at 0x1506fd890300>,
 'PolII': <pyBigWig.bigWigFile at 0x1505e34ac120>}

# merge features

In [11]:
# Mean bigWig signal per restriction fragment, one column per track in `wigs`.

# How many fragments (from the top of `merged_df`) to annotate. The cell
# previously stopped after the first row via a stray `break`; that default is
# kept here but made explicit. Set to None to process every fragment.
MAX_FEATURE_ROWS = 1

fragments = merged_df if MAX_FEATURE_ROWS is None else merged_df.head(MAX_FEATURE_ROWS)

feature_rows = []

# Iterate only the three columns that are needed instead of building a full
# row object per fragment with iterrows().
for idx, chrom, start, end in zip(
    fragments.index,
    fragments['Chromosome'],
    fragments['Start'],
    fragments['End'],
):
    new_row = {'index': idx}

    for k, bw in wigs.items():
        try:
            # bigWig chromosome names are queried with a "chr" prefix.
            value = bw.stats(f"chr{chrom}", start, end, type='mean')[0]
        except Exception:
            # Best-effort: a failing query yields a missing value. Unlike a
            # bare `except:`, this does not swallow KeyboardInterrupt, so a
            # long run can still be interrupted.
            value = np.nan

        # Use a single missing-value marker (NaN) so the columns stay numeric
        # whether the query failed or returned None.
        if value is None:
            value = np.nan

        new_row[k] = value
    feature_rows.append(new_row)

# Fixing the columns up front keeps set_index() valid even when no rows were
# collected (e.g. an empty `merged_df`).
features = pd.DataFrame(feature_rows, columns=['index', *wigs])
features = features.set_index('index')
features.head()

Unnamed: 0_level_0,ATACSeq_1,ATACSeq_2,ATACSeq_3,CTCF,H3K27ac,H3K27me3,RNA_1,RNA_2,RNA_3,RNA_4,RNA_5,RNA_6,PolII
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0.057623,0.017762,0.018069,0.009718,0.024774,0.015503,0.239711,,0.13278,0.271721,,0.104429,0.008725


In [None]:
# NOTE(review): `break` at cell top level raises SyntaxError. Presumably marks
# the end of the runnable part of the notebook — confirm before removing.
break

# Load genes

In [None]:
# Reload the gene annotation table WITHOUT any biotype filter.
# Note: this rebinds `gdf`, replacing whatever it held before this cell.
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"
gdf = pd.read_parquet(fpath)
# The former second print labelled this same unfiltered shape as
# "Protein Coding", which was misleading; only the raw shape is reported.
print(f"Raw {gdf.shape=}")
gdf.head()

In [None]:
# NOTE(review): intentional-looking stop marker; `break` outside a loop is a
# SyntaxError. Confirm it is still wanted.
break

In [None]:
# Inspect the chromosome listing reported by the bigWig handle `bw`.
# NOTE(review): `bw` is not defined in this cell — presumably the loop
# variable left over from iterating `wigs.items()` in an earlier cell;
# this fails on a fresh kernel unless that cell has run. Verify.
bw.chroms()

In [None]:
# NOTE(review): bare `break` (SyntaxError outside a loop), apparently used to
# halt sequential execution at this point — confirm.
break

In [None]:
# Number of rows of `df` per chromosome.
df.loc[:, 'Chromosome'].value_counts()

In [None]:
# NOTE(review): `break` outside a loop always raises SyntaxError; looks like a
# manual stop marker between scratch cells — confirm.
break

In [None]:
# Read-level Pore-C table. Note: this rebinds `df` to the read table.
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/pore_c/population_mESC.read_level.parquet"

# Only these columns are read from the parquet file.
columns = ['read_name', 'chrom', 'ref_start', 'ref_end', 'mapping_quality', 'basename']

df = pd.read_parquet(fpath, columns=columns)
print(f"{df.shape=}")

# Switch the coordinate columns to the Chromosome/Start/End naming.
df = df.rename(columns={'chrom' : 'Chromosome', 'ref_start' : 'Start', 'ref_end' : 'End'})
df.head()

In [None]:
# Shape of `df` after keeping the first row of every distinct (Start, End) pair.
df.loc[~df.duplicated(subset=['Start', 'End'])].shape

In [None]:
# NOTE(review): hard-coded ratio of two magic numbers. Presumably
# (rows with a distinct Start/End pair) / (total rows), copied by hand from
# earlier outputs — TODO confirm and compute it from `df` instead.
12155831 / 14877807

In [None]:
# Print the read name of the first record only (nothing is printed when
# `df` has no rows).
for idx, row in df.head(1).iterrows():
    print(row['read_name'])

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably a stop
# marker separating scratch cells — confirm, then remove.
break

In [None]:
# Re-read the gene annotations and keep protein-coding genes only.
# Note: this rebinds `gdf` from the file, so any columns added to a previous
# `gdf` in the kernel are not present afterwards.
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"
gdf = pd.read_parquet(fpath)
print(f"Raw {gdf.shape=}")

is_protein_coding = gdf['gene_biotype'].eq('protein_coding')
gdf = gdf.loc[is_protein_coding]
print(f"Protein Coding {gdf.shape=}")
gdf.head()