In [1]:
import numpy as np
import pandas as pd

from scipy.stats import pearsonr
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns

## Read in data from Google Cloud Storage (GCS)

- `enfo_data`: Enformer skew predictions generated using `enformer-usage.ipynb`
- `mpra_data`: MPRA measured skew and BODA model predicted skews
- `enfo_metadata`: enformer output column metadata

In [None]:
# Source locations of the three input tables.
enfo_preds_path = 'gs://korvaz/mpra_model_manuscript/data/enformer__mpra_test_set_preds.csv'
mpra_pairs_path = 'gs://korvaz/mpra_model_manuscript/data/boda2__single_variant_pairs__raw.txt'
enfo_targets_path = 'gs://basenji_barnyard/data/human/targets.txt'

# The Enformer predictions file is tab-separated despite its `.csv` extension;
# the MPRA table is space-separated. The first row of each file is the header.
enfo_data = pd.read_csv(enfo_preds_path, sep='\t', header=0)
mpra_data = pd.read_csv(mpra_pairs_path, sep=' ', header=0)

# Tab-separated table of Enformer output-track metadata.
enfo_metadata = pd.read_csv(enfo_targets_path, sep='\t', header=0)

## Filter MPRA measurements

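Remove systematically noisy measurements: keep a variant pair only if at least one allele (ref or alt) has a mean control count of at least 100, and at least one allele has an absolute K562 activity greater than 1. This drops pairs whose oligo counts are low for both alleles or whose effective activities are near 0 for both alleles.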

In [None]:
# Thresholds for the two noise filters.
MIN_CTRL_COUNT = 100    # minimum mean control count for an allele
MIN_ABS_ACTIVITY = 1    # minimum absolute K562 activity for an allele

# Build 1-D boolean row masks from Series rather than single-column DataFrames.
# Applying a NumPy ufunc to two DataFrames whose column names differ makes
# pandas >= 2.0 align them first, producing a two-column NaN-padded result
# instead of a per-row mask.

# Keep a pair when at least one allele has enough control counts.
plasmid_filter = (
    (mpra_data['ctrl_mean_k562_ref'] >= MIN_CTRL_COUNT)
    | (mpra_data['ctrl_mean_k562_alt'] >= MIN_CTRL_COUNT)
).to_numpy()

# Keep a pair when at least one allele has a clearly non-zero activity.
activity_filter = (
    (mpra_data['K562_mean_alt'].abs() > MIN_ABS_ACTIVITY)
    | (mpra_data['K562_mean_ref'].abs() > MIN_ABS_ACTIVITY)
).to_numpy()

# Rows must pass both filters. Missing values compare as False and are dropped.
mpra_data = mpra_data.loc[plasmid_filter & activity_filter]

In [None]:
# Display the column labels of the Enformer prediction table.
enfo_data.columns

## Build filters for Enformer predictions
We want the K562-related predictions.

In [None]:
# Track indices (the `index` column of the metadata table) whose description
# mentions K562. `na=False` treats a missing description as "not K562" rather
# than raising, and `regex=False` keeps this a plain substring match.
is_k562_track = enfo_metadata['description'].str.contains('K562', regex=False, na=False)
k562_idxs = enfo_metadata.loc[is_k562_track, 'index'].tolist()
k562_idx_set = set(k562_idxs)  # constant-time membership for the scan below

# Columns from position 5 onward are parsed as '<track index>_<rest>'.
# NOTE(review): presumably the first five columns are identifier/metadata
# columns rather than predictions -- confirm against `enfo_data.columns`.
prediction_columns = enfo_data.columns[5:]

# One flag per prediction column; the tag list is derived from the same pass so
# each column label is parsed only once.
k562_flags = [int(tag.split('_')[0]) in k562_idx_set for tag in prediction_columns]
k562_tags = [tag for tag, is_k562 in zip(prediction_columns, k562_flags) if is_k562]

# Any column (from the whole table) whose label contains 'CAGE'.
cage_tags = [tag for tag in enfo_data.columns if 'CAGE' in tag]

In [None]:
# Display the prediction columns that were matched to K562 tracks.
k562_tags

In [None]:
# Display the columns whose label contains 'CAGE'.
cage_tags

## Refine K562 features
Based on the paper, use the DNase and CAGE predictions for variant effect prediction (VEP), and summarize them using the first principal component (PC) from a PCA.

In [None]:
# Hand-picked subset of the K562 columns: entries at positions 3-6 plus the last
# two entries of `k562_tags`.
# NOTE(review): presumably these are the K562 DNase and CAGE tracks -- the
# selection is purely positional, so re-check it against the displayed
# `k562_tags` whenever the column order of the predictions file changes.
strict_k562_filter = k562_tags[3:7] + k562_tags[-2:]

In [None]:
# Summarise the selected K562 feature columns with their first two principal
# components, stored on `enfo_data` as `PC_0` and `PC_1`.
# `random_state` is pinned for reproducibility: it only matters when
# scikit-learn's 'auto' solver policy selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=0)
enfo_pca = pca.fit_transform(enfo_data.loc[:, strict_k562_filter])
for dim in range(enfo_pca.shape[1]):
    enfo_data[f'PC_{dim}'] = enfo_pca[:, dim]

# The sign of a principal component is arbitrary. Orient PC_0 so that it is
# positively correlated with the plain mean of the selected features.
feature_mean = enfo_data.loc[:, strict_k562_filter].mean(axis=1)
if pearsonr(feature_mean, enfo_data['PC_0'])[0] < 0:
    enfo_data['PC_0'] = -enfo_data['PC_0']

In [None]:
# Columns carried forward: the first five columns of the prediction table, the
# selected K562 feature columns, and the principal-component columns.
leading_columns = enfo_data.columns[:5].tolist()
pc_columns = [f'PC_{dim}' for dim in range(enfo_pca.shape[1])]
full_filter = leading_columns + list(strict_k562_filter) + pc_columns

# One boolean per column of `enfo_data`, True where the column is kept.
k562_dnase_flags = [column in full_filter for column in enfo_data.columns]

In [None]:
# Display the list of column names that will be kept from `enfo_data`.
full_filter

## Merge Enformer predictions with MPRA measurements / BODA predictions

In [None]:
# Enformer side: the kept columns, indexed by `id`.
enfo_selected = enfo_data.loc[:, k562_dnase_flags].set_index('id')

# MPRA / BODA side: measured and predicted K562 activities, indexed by `IDs_ref`.
mpra_columns = [
    'IDs_ref',
    'K562_mean_alt',
    'K562_mean_ref',
    'K562_pred_aggreg_alt',
    'K562_pred_aggreg_ref',
]
mpra_selected = mpra_data.loc[:, mpra_columns].set_index('IDs_ref')

# Column-wise inner join: only index values present in both tables are kept.
merged_data = pd.concat([enfo_selected, mpra_selected], axis=1, join='inner')

In [None]:
# Display the merged table.
merged_data

## Plot skew: predictions (y axis) vs MPRA measured (x axis)

In [None]:
# Predicted vs. measured skew (alt - ref) in K562.
# Left panel: Enformer, summarised as the mean of the selected K562 feature columns.
# Right panel: BODA predictions.
measured_skew = merged_data['K562_mean_alt'] - merged_data['K562_mean_ref']
enfo_skew = merged_data.loc[:, strict_k562_filter].mean(axis=1)
boda_skew = merged_data['K562_pred_aggreg_alt'] - merged_data['K562_pred_aggreg_ref']

# Pearson correlation of each prediction with the measured skew.
enfo_corr = pearsonr(measured_skew, enfo_skew)
mpra_corr = pearsonr(measured_skew, boda_skew)

fig, axes = plt.subplots(1, 2, figsize=[12, 5])

# (prediction, y-axis label, title) for each panel, left to right.
panels = [
    (
        enfo_skew,
        'Enformer predicted skew (mean of selected K562 features)',
        f'Enformer mean K562 features Pearsons R: {enfo_corr[0]:.4f}',
    ),
    (
        boda_skew,
        'BODA predicted skew (alt - ref)',
        f'BODA K562 predictions Pearsons R: {mpra_corr[0]:.4f}',
    ),
]

# The derived Series are unnamed, so seaborn cannot infer axis labels; set them
# explicitly so each panel can be read on its own.
for ax, (predicted_skew, ylabel, title) in zip(axes, panels):
    sns.scatterplot(x=measured_skew, y=predicted_skew, ax=ax)
    ax.set(xlabel='MPRA measured skew (alt - ref)', ylabel=ylabel, title=title)


In [None]:
# Predicted vs. measured skew (alt - ref) in K562.
# Left panel: Enformer, summarised as the first principal component (`PC_0`) of
# the selected K562 feature columns.
# Right panel: BODA predictions.
measured_skew = merged_data['K562_mean_alt'] - merged_data['K562_mean_ref']
enfo_pc_skew = merged_data['PC_0']
boda_skew = merged_data['K562_pred_aggreg_alt'] - merged_data['K562_pred_aggreg_ref']

# Pearson correlation of each prediction with the measured skew.
enfo_corr = pearsonr(measured_skew, enfo_pc_skew)
mpra_corr = pearsonr(measured_skew, boda_skew)

fig, axes = plt.subplots(1, 2, figsize=[12, 5])

# (prediction, y-axis label, title) for each panel, left to right.
panels = [
    (
        enfo_pc_skew,
        'Enformer predicted skew (PC_0 of selected K562 features)',
        f'Enformer K562 features, first PC Pearsons R: {enfo_corr[0]:.4f}',
    ),
    (
        boda_skew,
        'BODA predicted skew (alt - ref)',
        f'BODA K562 predictions Pearsons R: {mpra_corr[0]:.4f}',
    ),
]

# The difference Series are unnamed, so seaborn cannot infer their axis labels;
# set all labels explicitly so each panel can be read on its own.
for ax, (predicted_skew, ylabel, title) in zip(axes, panels):
    sns.scatterplot(x=measured_skew, y=predicted_skew, ax=ax)
    ax.set(xlabel='MPRA measured skew (alt - ref)', ylabel=ylabel, title=title)