In [1]:
import pandas as pd
import utils
import numpy as np
import warnings
from tqdm import tqdm
from copairs.map import average_precision, mean_average_precision
import logging

logging.basicConfig(format="%(levelname)s:%(asctime)s:%(name)s:%(message)s")
logging.getLogger("copairs").setLevel(logging.INFO)

warnings.simplefilter(action="ignore", category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the preprocessing pipeline variant; it matches the suffix of the
# profiles parquet file that this notebook reads.
operations = "wellpos_cc_var_mad_outlier_featselect_sphering_harmony"
# Forwarded as `batch_size` to copairs' average_precision.
batch_size = 20000
# Forwarded as `null_size` to copairs' mean_average_precision.
null_size = 20000
# Forwarded as `threshold` to mean_average_precision
# (presumably a false-discovery-rate cut-off, going by the name).
fdr = 0.1

### Prepare the data

#### Read the ORF parquet file

In [3]:
# Read the output of the ORF pipeline. The file name is derived from
# `operations` (set in the configuration cell) so that changing the pipeline
# variant there selects the matching profiles file instead of being ignored.
# NOTE(review): the directory is an absolute, machine-specific path; point it
# at your local copy of the profiles before running.
raw_orf_df = pd.read_parquet(
    "c:\\Users\\ssivagur\\Documents\\GitHub\\ssivagur\\FeatureSpaceIntegration\\ORFValidation\\"
    f"profiles_{operations}.parquet"
)

In [4]:
# Keep only the rows whose plate type is "ORF".
orf_df = raw_orf_df.loc[raw_orf_df["Metadata_PlateType"].eq("ORF")]

In [5]:
# (rows, columns) of the ORF-only profiles.
orf_df.shape

(82483, 636)

In [6]:
# List the names of all metadata columns (those prefixed with "Metadata_");
# nothing is dropped here.
orf_df_cols = orf_df.columns[orf_df.columns.str.startswith("Metadata_")].tolist()

In [7]:
# Show the metadata column names to spot duplicated `_x` / `_y` columns.
orf_df_cols

['Metadata_Source',
 'Metadata_Plate',
 'Metadata_Well',
 'Metadata_JCP2022',
 'Metadata_broad_sample',
 'Metadata_Name',
 'Metadata_Vector',
 'Metadata_Transcript',
 'Metadata_Symbol_x',
 'Metadata_NCBI_Gene_ID_x',
 'Metadata_Taxon_ID',
 'Metadata_Gene_Description',
 'Metadata_Prot_Match',
 'Metadata_Insert_Length',
 'Metadata_pert_type',
 'Metadata_NCBI_Gene_ID_y',
 'Metadata_Symbol_y',
 'Metadata_Batch',
 'Metadata_PlateType',
 'Metadata_Row',
 'Metadata_Column',
 'Metadata_Microscope']

In [8]:
# The frame carries duplicated `_x` / `_y` versions of the NCBI gene ID and
# gene symbol columns. Inspect the `_y` gene-ID column before dropping it.
orf_df['Metadata_NCBI_Gene_ID_y']

334537   NaN
334538   NaN
334539   NaN
334540   NaN
334541   NaN
          ..
417015   NaN
417016   NaN
417017   NaN
417018   NaN
417019   NaN
Name: Metadata_NCBI_Gene_ID_y, Length: 82483, dtype: float64

In [9]:
# Confirm that every value in the `_y` gene-ID column is missing.
orf_df['Metadata_NCBI_Gene_ID_y'].isnull().all()

True

In [9]:
# Discard the duplicated `_y` gene-ID column.
orf_df = orf_df.drop(columns="Metadata_NCBI_Gene_ID_y")

In [10]:
# Inspect the `_y` gene-symbol column before dropping it.
orf_df['Metadata_Symbol_y']

334537    NaN
334538    NaN
334539    NaN
334540    NaN
334541    NaN
         ... 
417015    NaN
417016    NaN
417017    NaN
417018    NaN
417019    NaN
Name: Metadata_Symbol_y, Length: 82483, dtype: category
Categories (7977, object): ['A2M', 'A3GALT2', 'A4GALT', 'A4GNT', ..., 'ZSCAN9', 'ZSWIM2', 'no-guide', 'non-targeting']

In [11]:
# Confirm that every value in the `_y` gene-symbol column is missing.
orf_df['Metadata_Symbol_y'].isnull().all()

True

In [11]:
# Discard the duplicated `_y` gene-symbol column.
orf_df = orf_df.drop(columns="Metadata_Symbol_y")

In [12]:
# (rows, columns) after the two `_y` columns have been dropped.
orf_df.shape

(82483, 634)

In [13]:
# Strip the `_x` suffix from the remaining gene-ID and symbol columns so they
# carry the plain names `Metadata_NCBI_Gene_ID` and `Metadata_Symbol`.
orf_df = orf_df.rename(columns={'Metadata_NCBI_Gene_ID_x':'Metadata_NCBI_Gene_ID', 'Metadata_Symbol_x':'Metadata_Symbol'})

In [14]:
# Renaming columns must leave the shape unchanged.
orf_df.shape

(82483, 634)

In [15]:
# Save the current ORF-only profiles to a parquet file.
# NOTE(review): absolute, machine-specific output path.
orf_df.to_parquet('C:\\Users\\ssivagur\\Documents\\GitHub\\ssivagur\\FeatureSpaceIntegration\\ORFValidation\\profiles_ORF.parquet')

#### Add annotations - not run, since the profiles loaded above already contain the annotations

In [4]:
# NOTE: every line of this cell is commented out, so it has no effect. The
# shape shown beneath it appears to be left over from an earlier run; it does
# not match the 634 columns reported for the current `orf_df`.
#orf_metdata_df = pd.read_csv(
    #"../00.download-and-process-annotations/output/orf_metadata.tsv.gz", sep="\t"
#)
#compound_metadata_df = pd.read_csv(
    #"../datasets/metadata/compound.csv.gz", usecols=["Metadata_JCP2022"]
#).#assign(
    #Metadata_pert_type=lambda x: np.where(
     #   x["Metadata_JCP2022"] == "JCP2022_999999", "empty", "poscon"
 #   )
#)

#metadata_df = pd.concat(
 #   [
        #orf_metdata_df,
        #compound_metadata_df
  #  ],
    #join="outer",
    #ignore_index=True,
#)

#orf_df = orf_df.merge(metadata_df, on="Metadata_JCP2022", how="inner")
#orf_df.shape

(81660, 750)

#### Remove empty wells

In [47]:
# Filter the profiles with the project helper `utils.remove_empty_wells`
# (defined outside this notebook), then display the resulting shape.
orf_df = utils.remove_empty_wells(orf_df)
orf_df.shape

(81493, 634)

#### Remove `poscon` wells.

In [48]:
# Keep every profile whose perturbation type is not "poscon", renumber the
# rows from zero, and display the resulting shape.
orf_df = orf_df.loc[orf_df["Metadata_pert_type"].ne("poscon")].reset_index(drop=True)
orf_df.shape

(79563, 634)

#### Remove `BAD CONSTRUCT` profiles

In [49]:
# Keep every profile whose broad sample is not labelled "BAD CONSTRUCT",
# renumber the rows from zero, and display the resulting shape.
orf_df = orf_df.loc[orf_df["Metadata_broad_sample"].ne("BAD CONSTRUCT")].reset_index(drop=True)
orf_df.shape

(79563, 634)

#### Remove features with `nan` values.
These need to be removed as the `nan` values will cause the mean average precision calculation to fail.

In [50]:
# Filter the features with the project helper `utils.remove_nan_features`
# (defined outside this notebook); it prints the list of removed features.
orf_df = utils.remove_nan_features(orf_df)

Removed nan features: []


#### Remove low infection efficiency wells

In [51]:
# Add platemap name

# Map each assay plate barcode to the name of its plate map.
platemap_df = (
    pd.read_csv(
        "../00.download-and-process-annotations/input/experiment-metadata.tsv",
        sep="\t",
        usecols=["Plate_Map_Name", "Assay_Plate_Barcode"],
    )
    .rename(
        columns={
            "Plate_Map_Name": "Metadata_plate_map_name",
            "Assay_Plate_Barcode": "Metadata_Plate",
        }
    )
    # A row without a barcode cannot identify a plate.
    .dropna(subset=["Metadata_Plate"])
    # Exact repeats add no information but would duplicate profiles in the
    # left merge below.
    .drop_duplicates()
)

# `validate="many_to_one"` makes pandas raise a MergeError if a barcode still
# maps to more than one plate-map name, instead of silently duplicating the
# affected profiles.
orf_df = orf_df.merge(
    platemap_df, on="Metadata_Plate", how="left", validate="many_to_one"
)

# Filter with the project helper (defined outside this notebook), then
# display the resulting shape.
orf_df = utils.remove_low_infection_efficiency_wells(orf_df)
orf_df.shape

(72345, 635)

### Calculate mAP for each ORF perturbation

In [52]:
# Flag negative-control wells: 1 where the perturbation type is "negcon",
# 0 everywhere else.
orf_df["Metadata_negcon"] = orf_df["Metadata_pert_type"].eq("negcon").astype(int)

In [53]:
# Column lists handed to copairs' average_precision to define profile pairs.
# Going by the copairs `sameby` / `diffby` naming (confirm against its docs):
# positive pairs share the same perturbation ID, negative pairs sit on the
# same plate and differ in the negative-control flag.
pos_sameby = ["Metadata_JCP2022"]
pos_diffby = []
neg_sameby = ["Metadata_Plate"]
neg_diffby = ["Metadata_negcon"]

In [54]:
# Separate the profiles with the project helpers (presumably into metadata
# columns and feature columns), then take the features as a plain array.
metadata_df, feature_df = utils.get_metadata(orf_df), utils.get_featuredata(orf_df)
feature_values = feature_df.to_numpy()

In [55]:
# Compute the average precision with copairs, using the pair definitions and
# the batch size configured earlier in the notebook.
result = average_precision(
    metadata_df, feature_values, pos_sameby, pos_diffby, neg_sameby, neg_diffby, batch_size=batch_size
)

INFO:2024-08-15 15:14:00,659:copairs:Indexing metadata...
INFO:2024-08-15 15:14:00,781:copairs:Finding positive pairs...
INFO:2024-08-15 15:14:01,619:copairs:Finding negative pairs...
INFO:2024-08-15 15:14:02,718:copairs:Computing positive similarities...
INFO:2024-08-15 15:14:08,619:copairs:Computing negative similarities...
INFO:2024-08-15 15:14:11,835:copairs:Building rank lists...
INFO:2024-08-15 15:14:13,746:copairs:Computing average precision...
INFO:2024-08-15 15:14:13,850:copairs:Creating result DataFrame...
INFO:2024-08-15 15:14:13,856:copairs:Finished.


In [56]:
# Drop the negative-control rows from the results and renumber from zero.
result = result.loc[result["Metadata_pert_type"].ne("negcon")].reset_index(drop=True)

In [57]:
# Aggregate the average-precision values per `pos_sameby` group with copairs.
# The seed is fixed so that repeated runs use the same random state, and the
# `average_precision` column is renamed to reflect that it now holds the mean.
agg_result = (
    mean_average_precision(result, pos_sameby, null_size=null_size, threshold=fdr, seed=12527)
    .rename(columns={'average_precision': 'mean_average_precision'})
)

INFO:2024-08-15 15:14:23,559:copairs:Computing null_dist...
INFO:2024-08-15 15:14:23,625:copairs:Computing p-values...
                                                       

In [58]:
# Write the aggregated mAP table without the index column.
# The literal has no placeholders, so it is a plain string rather than an f-string.
# NOTE(review): absolute, machine-specific output path.
agg_result.to_csv("C:\\Users\\ssivagur\\Documents\\GitHub\\ssivagur\\FeatureSpaceIntegration\\ORFValidation\\PhenotypicActivity_ORF.csv.gz", index=False)