In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rnacappredictor.predict_cap import generate_fingerprint_mixes

In [None]:
# Every isoform has its own fingerprints.csv in a sub-directory of the run's
# fastq_pass folder; build the five paths from one shared prefix.
fastq_pass_dir = "../data/FM205BIS/no_sample_id/20250705_1815_MD-101425_FBC20638_41e6243c/fastq_pass"
isoforms = ["U1-1", "U1-11", "U1-138P", "U1-148P", "U6"]

# ignore_index=True gives the combined table a single unique 0..n-1 index;
# without it each file contributes its own index labels and they repeat.
df = pd.concat(
    [pd.read_csv(f"{fastq_pass_dir}/{isoform}/fingerprints.csv") for isoform in isoforms],
    ignore_index=True,
)

# Barcode labels carry a 'barcode' prefix; strip it and keep the number as an integer.
df['barcode'] = df['barcode'].str.replace('barcode', '', regex=False).astype('int64')

# (barcode number, isoform) -> RT label. The isoform is part of the key because
# a barcode number alone does not identify a sample here.
# NOTE(review): barcode 1 is listed for both U1-1 and U6, while the other U6
# barcodes follow the 5, 10, 15, 20 series - confirm this is a deliberate
# barcode reuse and not a typo for 25.
barcode_isoform_to_rt = {
    (1, "U1-1"): "INDURO",
    (6, "U1-1"): "ProtoScript",
    (11, "U1-1"): "Marathon",
    (16, "U1-1"): "GoScript",
    (21, "U1-1"): "EpiScript",
    (4, "U1-11"): "INDURO",
    (9, "U1-11"): "ProtoScript",
    (14, "U1-11"): "Marathon",
    (19, "U1-11"): "GoScript",
    (24, "U1-11"): "EpiScript",
    (2, "U1-138P"): "INDURO",
    (7, "U1-138P"): "ProtoScript",
    (12, "U1-138P"): "Marathon",
    (17, "U1-138P"): "GoScript",
    (22, "U1-138P"): "EpiScript",
    (3, "U1-148P"): "INDURO",
    (8, "U1-148P"): "ProtoScript",
    (13, "U1-148P"): "Marathon",
    (18, "U1-148P"): "GoScript",
    (23, "U1-148P"): "EpiScript",
    (5, "U6"): "INDURO",
    (10, "U6"): "ProtoScript",
    (15, "U6"): "Marathon",
    (20, "U6"): "GoScript",
    (1, "U6"): "EpiScript",
}

# Look up the RT label of every row; a (barcode, isoform) pair that is missing
# from the table raises KeyError instead of being silently skipped.
df['RT'] = [
    barcode_isoform_to_rt[(barcode, isoform)]
    for barcode, isoform in zip(df['barcode'], df['isoform'])
]
df

In [None]:
# Save the annotated fingerprint table. The output folder is created first so
# the cell also runs on a fresh checkout where out_data/ does not exist yet.
Path('out_data').mkdir(parents=True, exist_ok=True)
df.to_csv('out_data/FM205_fingerprints.csv', index=False)

In [None]:
# Pivot the data into an isoform x RT matrix of read counts
# (rows: isoform, columns: RT, cells: num_reads_ACGT).
pivot_df = df.pivot(index='isoform', columns='RT', values='num_reads_ACGT')

# Draw the heatmap on an explicit Axes so every label is attached to this figure.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    pivot_df,
    cmap='viridis',
    annot=True,
    fmt='.0f',
    cbar_kws={'label': 'Number of reads'},
    ax=ax,
)
ax.set_title('Number of aligned reads per isoform and RT in FM205')
# The columns of pivot_df are RT labels, not barcodes.
ax.set_xlabel('RT')
ax.set_ylabel('Isoform')
plt.show()

In [None]:
from rnacappredictor.predict_cap import predict_cap

# Test set: a copy of the FM205 fingerprints with a placeholder cap label and
# one experiment name per isoform ('FM205_<isoform>').
df_test = df.assign(
    cap='Unknown',
    experiment='FM205_' + df['isoform'],
)

# Training fingerprints come from the FM179-FM181 table.
df_train = pd.read_csv('../data/FM179-FM181_fingerprints.csv')

df_res = predict_cap(
    df_train,
    df_test,
    show_true_cap=True,
)
df_res

In [None]:
# Save the prediction table. The output folder is created first so the cell
# also runs when out_data/ does not exist yet.
Path('out_data').mkdir(parents=True, exist_ok=True)
df_res.to_csv('out_data/FM205_res.csv', index=False)

In [None]:
nucleotide_cols = ['A%', 'C%', 'G%', 'T%']
colors = {'A%': 'green', 'C%': 'blue', 'G%': 'orange', 'T%': 'red'}

# One panel per experiment, laid out in a single row
experiments = df_test['experiment'].unique()
n_rows = 1
n_cols = len(experiments)

# squeeze=False always returns a 2-D array of Axes; with the default, a single
# experiment would yield one bare Axes object and indexing it would fail.
fig, axes_grid = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.3), squeeze=False)
axes = axes_grid.ravel()
fig.tight_layout(pad=5.0)

for i, experiment in enumerate(experiments):
    ax = axes[i]
    is_last_panel = i == n_cols - 1

    # Nucleotide columns of this experiment, one bar per RT in alphabetical order
    subset = (
        df_test.loc[df_test['experiment'] == experiment, ['RT'] + nucleotide_cols]
        .set_index('RT')
        .sort_index()
    )

    # Only the rightmost panel carries the legend, placed outside the axes
    subset.plot(kind='bar', stacked=True, ax=ax, color=colors, legend=is_last_panel)
    if is_last_panel:
        ax.legend(title='Nucleotide', bbox_to_anchor=(1.15, 1))

    ax.set_title(f"{experiment}")
    ax.set_ylabel('%')
    ax.set_ylim(0, 1.0)

    # Rotate x-axis labels; the RT names on the ticks make an axis label redundant
    ax.tick_params(axis='x', rotation=45)
    ax.set_xlabel('')

plt.show()

## Deconvolution

In [None]:
# Keep only the training rows of the 'FM180 + FM181' experiment and hand them
# to generate_fingerprint_mixes.
from_fm180_fm181 = df_train['experiment'].eq('FM180 + FM181')
df_train = df_train.loc[from_fm180_fm181]

df_train_mixes = generate_fingerprint_mixes(df_train)
df_train_mixes

In [None]:
# Predict the FM205 test fingerprints against the mixed training fingerprints.
df_pred_mixes = predict_cap(
    df_train_mixes,
    df_test,
    include_insdel=False,
    print_top_k=10,
)
df_pred_mixes

In [None]:
# Number of ranked predictions = number of columns whose name starts with 'prediction_'
num_preds = sum(col.startswith('prediction_') for col in df_pred_mixes.columns)

import re

# Function to extract percentages from prediction string
def extract_cap_percentages(pred_str):
    """Read the four cap percentages out of one prediction string.

    Parameters
    ----------
    pred_str : str
        Text containing a "<cap> (<number>%)" entry for each of NAD-U1,
        Ap₄A-U1, m⁷Gp₃A-U1 and TMG-U1.

    Returns
    -------
    tuple of float
        The percentages in the order (NAD-U1, Ap₄A-U1, m⁷Gp₃A-U1, TMG-U1).

    Raises
    ------
    ValueError
        If the entry of one of the four caps is missing from ``pred_str``, or
        if its number cannot be converted to float.
    """
    cap_patterns = (
        ('NAD-U1', r'NAD-U1 \(([\d.]+)%\)'),
        ('Ap₄A-U1', r'Ap₄A-U1 \(([\d.]+)%\)'),
        ('m⁷Gp₃A-U1', r'm⁷Gp₃A-U1 \(([\d.]+)%\)'),
        ('TMG-U1', r'TMG-U1 \(([\d.]+)%\)'),
    )
    percentages = []
    for cap, pattern in cap_patterns:
        match = re.search(pattern, pred_str)
        if match is None:
            # Name the missing cap instead of failing on None.group().
            raise ValueError(
                f"no percentage for {cap!r} found in prediction {pred_str!r}"
            )
        percentages.append(float(match.group(1)))
    return tuple(percentages)

# Process each row: expand every row of df_pred_mixes into one table row per
# ranked prediction.
prediction_cols = [f'prediction_{k}' for k in range(1, num_preds + 1)]
similarity_cols = [f'similarity_{k}' for k in range(1, num_preds + 1)]

# Walk the rows by position. Feeding index labels to .iloc is only correct when
# the index happens to be 0..n-1, so the index is not used at all here.
rows = zip(
    df_pred_mixes['experiment'],
    df_pred_mixes[prediction_cols].itertuples(index=False, name=None),
    df_pred_mixes[similarity_cols].itertuples(index=False, name=None),
)

all_results = []
for experiment, predictions, similarities in rows:
    # Extract percentages for each cap
    cap_percentages = [extract_cap_percentages(pred) for pred in predictions]

    # Create dataframe for this row, best match first
    df_row = pd.DataFrame({
        'experiment': experiment,
        'NAD-U1 (%)': [p[0] for p in cap_percentages],
        'Ap₄A-U1 (%)': [p[1] for p in cap_percentages],
        'm⁷Gp₃A-U1 (%)': [p[2] for p in cap_percentages],
        'TMG-U1 (%)': [p[3] for p in cap_percentages],
        'similarity': list(similarities)
    }).sort_values('similarity', ascending=False)

    all_results.append(df_row)

# Combine all results
df_top = pd.concat(all_results, axis=0)

# Show one table per experiment.
# NOTE(review): rows of df_pred_mixes that share an experiment name end up in
# the same table with nothing to tell them apart - confirm this is intended.
for experiment in df_top['experiment'].unique():
    print(experiment)
    display(df_top[df_top['experiment'] == experiment].drop(columns=['experiment']).style.hide(axis='index'))

In [None]:
# Save the deconvolution predictions. df_res holds the earlier, non-mixture
# predictions, which are already written to FM205_res.csv.
# NOTE(review): df_pred_mixes is assumed to be the intended table; switch to
# df_top if the per-cap percentage table is the one to keep.
Path('out_data').mkdir(parents=True, exist_ok=True)
df_pred_mixes.to_csv('out_data/FM205_res_deconvolution.csv', index=False)