In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Modality name -> path of that modality's profile parquet file.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere without editing them.
profiles = dict(
    ORF="c:\\Users\\ssivagur\\Documents\\GitHub\\ssivagur\\FeatureSpaceIntegration\\ORFValidation\\profiles_ORF.parquet",
    CRISPR="c:\\Users\\ssivagur\\Documents\\GitHub\\ssivagur\\FeatureSpaceIntegration\\ORFValidation\\profiles_crispr.parquet",
)

# Modality name -> plot colour, handed to plotly as `color_discrete_map`.
color_discrete_map = dict(
    ORF="rgb(0, 158, 115)",
    CRISPR="rgb(213, 94, 0)",
)

# Display name of a gene-label set -> stem of its "output/<stem>_retrieval.parquet" file.
gene_labels = {"CORUM Complex": "corum_complex"}

# Display name of a gene-label set -> column of the retrieval table holding the label.
label_columns = {"CORUM Complex": "Metadata_corum_complex_list"}

### ORF

Read the phenotypic activity data for each modality (ORF and CRISPR).

In [4]:
# Load the phenotypic-activity table of every modality listed in `profiles`
# and stack them into one long frame; each row is tagged with its modality in
# the "profile_type" column.
phenotypic_activity_frames = []

for profile in profiles:
    df = pd.read_csv(f'c:\\Users\\ssivagur\\Documents\\GitHub\\ssivagur\\FeatureSpaceIntegration\\ORFValidation\\PhenotypicActivity_{profile}.csv.gz')
    df["profile_type"] = profile
    phenotypic_activity_frames.append(df)

# Concatenate once instead of growing a frame inside the loop: this avoids
# repeated copying and avoids concatenating onto an empty DataFrame.
# `pd.concat` raises on an empty list, so fall back to an empty frame.
phenotypic_activity_df = (
    pd.concat(phenotypic_activity_frames, ignore_index=True, axis=0)
    if phenotypic_activity_frames
    else pd.DataFrame()
)

In [12]:
# Box plot of `mean_average_precision` grouped and coloured by `profile_type`.
fig = px.box(
    phenotypic_activity_df,
    x="profile_type",
    y="mean_average_precision",
    color="profile_type",
    color_discrete_map=color_discrete_map,
)

# Titles, axis styling and figure size are applied in a single layout update.
fig.update_layout(
    title="Phenotypic Activity",
    xaxis={"title": "Modality", "linecolor": "black"},
    yaxis={"title": "Mean average precision", "linecolor": "black"},
    plot_bgcolor="white",
    showlegend=False,
    height=480,
    width=640,
)

fig.show()
# Optional static export (currently disabled):
#fig.write_image("figures/phenotypic-activity.png", height=480, width=640, scale=2)
#fig.write_image("figures/phenotypic-activity.svg", height=480, width=640, scale=2)

In [8]:
# Fraction of rows per modality whose `below_corrected_p` flag is True.
# The mean of a boolean mask is the count of True values divided by the group
# size, which is what is reported here.
is_retrieved = phenotypic_activity_df["below_corrected_p"].eq(True)

fraction_retrieved = (
    is_retrieved.groupby(phenotypic_activity_df["profile_type"])
    .mean()
    .rename("fraction_retrieved")
    .reset_index()
)

print(fraction_retrieved.to_markdown(index=False))

| profile_type   |   fraction_retrieved |
|:---------------|---------------------:|
| CRISPR         |             0.984193 |
| ORF            |             0.92736  |


In [13]:
# Number of rows per modality whose `below_corrected_p` flag is True.
is_retrieved = phenotypic_activity_df["below_corrected_p"].eq(True)

n_retrieved = (
    is_retrieved.groupby(phenotypic_activity_df["profile_type"])
    .sum()
    .rename("n_retrieved")
    .reset_index()
)

print(n_retrieved.to_markdown(index=False))

| profile_type   |   n_retrieved |
|:---------------|--------------:|
| CRISPR         |          7845 |
| ORF            |         12741 |


In [14]:
# Load the retrieval results of every gene-label set in `gene_labels` and
# stack them into one long frame; the "gene_label" column records which set
# each row came from.
gene_label_retrieval_frames = []

for gene_label, file_stem in gene_labels.items():
    df = (
        pd.read_parquet(f"output/{file_stem}_retrieval.parquet")[
            ["mean_average_precision", "below_corrected_p", "Modality"]
        ]
        .assign(gene_label=gene_label)
    )
    gene_label_retrieval_frames.append(df)

# Concatenate once instead of growing a frame inside the loop: this avoids
# repeated copying and avoids concatenating onto an empty DataFrame.
# `pd.concat` raises on an empty list, so fall back to an empty frame.
gene_label_retrieval_df = (
    pd.concat(gene_label_retrieval_frames, ignore_index=True, axis=0)
    if gene_label_retrieval_frames
    else pd.DataFrame()
)

In [17]:
# Box plots of `mean_average_precision` per modality, one facet per gene-label
# set (facets wrap after two columns).
fig = px.box(
    gene_label_retrieval_df,
    x="Modality",
    y="mean_average_precision",
    color="Modality",
    color_discrete_map=color_discrete_map,
    facet_col="gene_label",
    facet_col_wrap=2,
)

# Shared pieces of the per-axis styling below.
axis_line_color = "black"
map_axis_title = "Mean average<br>precision"

# NOTE(review): axes 3-6 are styled explicitly while axis 2 is not; with the
# single gene-label set configured in this notebook only one facet is drawn,
# so the higher-numbered axis settings presumably have no visible effect —
# confirm before pruning.
fig.update_layout(
    title="Phenotypic consistency",
    xaxis1={"title": "Modality", "linecolor": axis_line_color},
    xaxis3={"linecolor": axis_line_color},
    xaxis4={"title": "Modality", "linecolor": axis_line_color, "showticklabels": True},
    xaxis5={"linecolor": axis_line_color},
    xaxis6={"linecolor": axis_line_color},
    yaxis1={"title": map_axis_title, "linecolor": axis_line_color},
    yaxis3={"title": map_axis_title, "linecolor": axis_line_color},
    yaxis4={"linecolor": axis_line_color},
    yaxis5={"title": map_axis_title, "linecolor": axis_line_color},
    yaxis6={"linecolor": axis_line_color},
    plot_bgcolor="white",
    height=480,
    width=640,
)

fig.show()
# Optional static export (currently disabled):
#fig.write_image("figures/phenotypic-consistency-gene-labels.png", height=480, width=640, scale=2)
#fig.write_image("figures/phenotypic-consistency-gene-labels.svg", height=480, width=640, scale=2)

In [18]:
# Count, per modality and gene-label set, the rows flagged `below_corrected_p`
# (non-null `mean_average_precision` values are what gets counted).
significant_counts = (
    gene_label_retrieval_df.query("below_corrected_p == True")
    .groupby(["Modality", "gene_label"])["mean_average_precision"]
    .count()
    .reset_index()
)

# Reshape to one row per modality and one column per gene-label set, with the
# columns in the order the sets are declared in `gene_labels`; combinations
# with no flagged rows show as 0.
significant_counts_wide = (
    significant_counts.pivot(
        index="Modality", columns="gene_label", values="mean_average_precision"
    )
    .sort_index()
    .loc[:, list(gene_labels)]
    .fillna(0)
)

print(significant_counts_wide.to_markdown())

| Modality   |   CORUM Complex |
|:-----------|----------------:|
| CRISPR     |             244 |
| ORF        |              84 |


In [19]:
# For every gene-label set, count the rows flagged `below_corrected_p` in each
# modality and the rows produced by inner-joining the two modalities on the
# label column.
common_label_counts = []

for gene_label, file_stem in gene_labels.items():
    label_column = label_columns[gene_label]
    df = pd.read_parquet(f"output/{file_stem}_retrieval.parquet")[
        [label_column, "below_corrected_p", "Modality"]
    ]
    orf_df = df.query("Modality == 'ORF' and below_corrected_p == True")
    crispr_df = df.query("Modality == 'CRISPR' and below_corrected_p == True")
    # NOTE(review): "Common" is the number of merged rows; it equals the number
    # of shared labels only if each label occurs at most once per modality —
    # confirm against the retrieval table.
    merged_df = orf_df.merge(crispr_df, on=label_column, how="inner")

    common_label_counts.append(
        {
            "ORF": len(orf_df),
            "CRISPR": len(crispr_df),
            "Common": len(merged_df),
        }
    )

# Build the summary frame once (one row per gene-label set) instead of
# concatenating a one-row frame onto an empty DataFrame on every iteration.
common_labels_df = pd.DataFrame(
    common_label_counts,
    index=list(gene_labels),
    columns=["ORF", "CRISPR", "Common"],
)

print(common_labels_df.to_markdown())

|               |   ORF |   CRISPR |   Common |
|:--------------|------:|---------:|---------:|
| CORUM Complex |    84 |      244 |       62 |
