In [1]:
def filter_by_completeness(prot_data, prot_info, completeness_threshold=0.8, metadata_indices=None):
    """
    Keep only the protein columns that are sufficiently complete.

    A column's completeness is the fraction of its rows that are not NaN.

    Args:
        prot_data: DataFrame holding metadata and protein expression columns
        prot_info: DataFrame of protein annotations; must have a 'proteins' column
        completeness_threshold: Minimum fraction of non-NaN values a column needs
            in order to be kept (default: 0.8)
        metadata_indices: Column indices of the metadata, passed to separate_metadata

    Returns:
        tuple: (filtered_data, filtered_info) where filtered_data is the metadata
        recombined with the surviving expression columns, and filtered_info is
        the subset of prot_info whose 'proteins' value is a surviving column
    """
    meta, expr = separate_metadata(prot_data, metadata_indices)

    # Fraction of present (non-NaN) values per expression column
    fraction_present = expr.notna().mean()

    # Names of the columns that reach the threshold, in their original order
    complete_proteins = [
        name
        for name, fraction in fraction_present.items()
        if fraction >= completeness_threshold
    ]

    filtered_expr = expr[complete_proteins]
    filtered_info = prot_info[prot_info['proteins'].isin(complete_proteins)]
    filtered_data = recombine_data(meta, filtered_expr)

    print(f"Filtered from {len(expr.columns)} to {len(filtered_expr.columns)} proteins")
    print(f"Completeness threshold: {completeness_threshold}")

    return filtered_data, filtered_info


In [2]:
def filter_by_control_values(prot_data, control_id='control', metadata_indices=None):
    """
    Drop expression columns for which the control sample has no measurement.

    Only the first row whose 'pert_id' equals control_id is consulted.

    Args:
        prot_data: DataFrame holding metadata (including 'pert_id') and expression columns
        control_id: Value of 'pert_id' that marks the control sample (default: 'control')
        metadata_indices: Column indices of the metadata, passed to separate_metadata

    Returns:
        DataFrame with the metadata recombined with the expression columns
        that are non-NaN in the control row
    """
    # Look the control up on the full frame, while 'pert_id' is still available
    control_rows = prot_data.loc[prot_data['pert_id'] == control_id]

    meta, expr = separate_metadata(prot_data, metadata_indices)

    # Keep a column only if the first control row holds a value for it
    columns_with_control = [
        name
        for name in expr.columns
        if pd.notna(control_rows[name].iloc[0])
    ]

    return recombine_data(meta, expr[columns_with_control])


In [3]:
def convert_to_log_ratios(prot_data, control_id='control',metadata_indices=None):
    """
    Express every numeric expression column as log10(value / control mean).

    The control mean of a column is its mean over the rows whose 'pert_id'
    equals control_id. Control rows themselves are converted too.
    Non-numeric expression columns are passed through unchanged.

    Args:
        prot_data: DataFrame holding metadata and expression columns
        control_id: Value of 'pert_id' that marks control samples (default: 'control')
        metadata_indices: Column indices of the metadata, passed to separate_metadata

    Returns:
        DataFrame of log10 ratios recombined with the metadata
    """
    meta, expr = separate_metadata(prot_data, metadata_indices)

    numeric_cols = expr.select_dtypes(include=[np.number]).columns

    # 'pert_id' lives in the metadata half, so build the row mask from there
    is_control = meta['pert_id'] == control_id
    control_means = expr.loc[is_control, numeric_cols].mean()

    log_ratios = expr.copy()
    for col, baseline in control_means.items():
        log_ratios[col] = np.log10(expr[col] / baseline)

    print(f"Converted {len(expr)} samples to log ratios")

    return recombine_data(meta, log_ratios)


In [4]:
def get_targeted_proteins(prot_data, prot_info, id_key='Uniprot.ID'):
    """
    Identifies proteins that are targeted by drugs in the dataset.

    Every non-null string in the id_key column is split on ',' or ';'; each
    piece is stripped and upper-cased. Targets are returned in order of first
    appearance, so repeated runs give the same column order downstream.

    Args:
        prot_data: DataFrame containing protein expression data and the id_key column
        prot_info: DataFrame containing protein metadata (currently unused; kept
            so existing callers keep working)
        id_key: Column name containing target IDs (default: 'Uniprot.ID')

    Returns:
        List of target IDs that are also column names of prot_data
    """
    all_targets = prot_data[id_key].dropna().unique()

    # A dict acts as an insertion-ordered set: de-duplicates without the
    # run-to-run ordering differences of set().
    ordered_targets = {}
    for targets in all_targets:
        if not isinstance(targets, str):
            continue
        for piece in re.split(r'[;,]', targets):
            target = piece.strip().upper()
            # A trailing or doubled separator yields an empty piece; it is not a target
            if target:
                ordered_targets.setdefault(target, None)
    unique_targets = list(ordered_targets)

    # Filter to only include targets that exist in our protein measurements
    measured = set(prot_data.columns)
    valid_targets = [t for t in unique_targets if t in measured]

    print(f"Found {len(valid_targets)} unique targeted proteins out of {len(unique_targets)} total targets")
    return valid_targets


In [5]:
def sort_proteins_by_variability(log_ratios, protein_cols):
    """
    Rank proteins from most to least variable across samples.

    Args:
        log_ratios: DataFrame containing log ratio values
        protein_cols: List of protein column names to rank

    Returns:
        DataFrame with columns 'protein' and 'std_dev', ordered by
        descending standard deviation
    """
    # Per-column standard deviation, largest first
    ranked = log_ratios[protein_cols].std().sort_values(ascending=False)

    variability_df = pd.DataFrame({'protein': ranked.index, 'std_dev': ranked.values})

    print(f"Sorted {len(protein_cols)} proteins by variability")
    return variability_df

# Example usage:
# variability_df = sort_proteins_by_variability(log_ratios, protein_cols)


In [None]:
def reorganize_data(df_raw, meta_data_cols,
                    viability_col='Cell_viability%_(cck8Drug-blk)/(control-blk)*100'):
    """
    Reorder the raw table as: protein columns, cell viability, metadata.

    Parameters:
    -----------
    df_raw : pandas.DataFrame
        Raw input dataframe containing all data
    meta_data_cols : list
        List of column indices for metadata columns
    viability_col : str, optional
        Name of the cell-viability column. Defaults to the name used in the
        original dataset; pass a different name for other datasets.

    Returns:
    --------
    pandas.DataFrame
        Reorganized dataframe with protein data first, then cell viability, then metadata.
        The protein columns are whatever is left after removing the metadata and
        viability columns; Index.difference returns them sorted by name, not in
        their original order.

    Raises:
    -------
    KeyError
        If viability_col is not a column of df_raw.
    """
    if viability_col not in df_raw.columns:
        raise KeyError(f"Cell viability column {viability_col!r} not found in df_raw")

    metadata_cols = df_raw.columns[meta_data_cols]

    # Everything that is neither metadata nor viability is treated as protein data
    protein_cols = df_raw.columns.difference(metadata_cols).difference([viability_col])

    # Metadata goes last so that later steps can address it with negative indices
    df_meta_data_at_end = pd.concat([
        df_raw[protein_cols],   # Protein expression data
        df_raw[viability_col],  # Cell viability
        df_raw[metadata_cols]   # Metadata columns
    ], axis=1)

    return df_meta_data_at_end

def separate_metadata(data_df, metadata_indices=None):
    """
    Split a combined table into its metadata part and its expression part.

    With metadata_indices given, those column positions are the metadata and
    every column whose name is not among them is expression data. Without
    indices the split falls back to dtypes: 'object'/'bool' columns are
    metadata and 'float64'/'int64' columns are expression data; columns of
    any other dtype end up in neither part.

    Args:
        data_df: DataFrame containing both metadata and protein expression data
        metadata_indices: List of column indices for metadata (optional)

    Returns:
        tuple: (metadata_df, expression_df), each an independent copy
    """
    all_columns = data_df.columns

    if metadata_indices is not None:
        # Positional split: the named positions are metadata, the rest is expression
        meta_names = all_columns[metadata_indices]
        expr_names = all_columns[~all_columns.isin(meta_names)]
    else:
        # dtype-based fallback when the caller gives no positions
        meta_names = data_df.select_dtypes(include=['object', 'bool']).columns
        expr_names = data_df.select_dtypes(include=['float64', 'int64']).columns

    return data_df[meta_names].copy(), data_df[expr_names].copy()

def recombine_data(metadata_df, expression_df):
    """
    Join expression and metadata columns back into one table.

    Args:
        metadata_df: DataFrame containing metadata
        expression_df: DataFrame containing protein expression data

    Returns:
        DataFrame with the expression columns first and the metadata columns last

    Raises:
        ValueError: If the two frames do not share an identical index
    """
    if metadata_df.index.equals(expression_df.index):
        # Expression first, so the metadata always sits at the end
        return pd.concat([expression_df, metadata_df], axis=1)

    raise ValueError("Metadata and expression data must have matching indices")


In [7]:
def make_pert_id_to_targets_dict(prot_log):
    """
    Map each perturbation ID to the list of target IDs named for it.

    Args:
        prot_log: DataFrame with a 'pert_id' column and a 'Uniprot.ID' column

    Returns:
        dict: pert_id -> list of upper-cased, stripped target IDs. The list is
        empty when 'Uniprot.ID' is missing or contains the text 'not'
        (case-insensitive). If a pert_id occurs on several rows, the last row wins.
    """
    pert_id_to_targets_dict = {}
    for pert_id, uniprot_id in zip(prot_log['pert_id'], prot_log['Uniprot.ID']):
        id_list = []
        # pd.isna catches None and every NaN; an identity test against np.nan
        # misses NaN values that are not that exact object.
        if not pd.isna(uniprot_id):
            raw_ids = str(uniprot_id)
            # Entries containing 'not' are skipped -- presumably placeholders
            # such as "Not available"; confirm against the dataset.
            if 'not' not in raw_ids.lower():
                for piece in re.split(r'[;,]', raw_ids):
                    target = piece.strip().upper()
                    # Ignore empty pieces left by trailing/doubled separators
                    if target:
                        id_list.append(target)
        pert_id_to_targets_dict[pert_id] = id_list
    return pert_id_to_targets_dict


In [8]:
def make_activity_nodes(targeted_proteins_with_metadata,pert_id_to_targets_dict,n_metadata_cols=12):
    """
    Build the activity-node table: keep a value only where the row's drug targets that protein.

    For every row whose 'pert_id' is a key of pert_id_to_targets_dict, each
    column that is not listed among that pert_id's targets is set to 0.
    Rows whose 'pert_id' is not in the dictionary are left untouched.

    Args:
        targeted_proteins_with_metadata: DataFrame with protein columns first and
            the metadata columns (including 'pert_id') last
        pert_id_to_targets_dict: dict mapping pert_id -> list of targeted column names
        n_metadata_cols: Number of trailing metadata columns to drop (default: 12,
            the value previously hard-coded)

    Returns:
        DataFrame of the leading (non-metadata) columns with non-targeted entries zeroed
    """
    # Drop the trailing metadata columns; anything zeroed there would be discarded anyway
    n_keep = max(targeted_proteins_with_metadata.shape[1] - n_metadata_cols, 0)
    activity_nodes = targeted_proteins_with_metadata.iloc[:, :n_keep].copy()
    protein_cols = activity_nodes.columns

    # keep[r, c] is True where the original value survives. Built by row position,
    # so repeated index labels cannot cause the wrong rows to be zeroed.
    keep = np.ones(activity_nodes.shape, dtype=bool)
    for row_pos, pert_id in enumerate(targeted_proteins_with_metadata['pert_id']):
        if pert_id in pert_id_to_targets_dict:
            keep[row_pos, :] = protein_cols.isin(pert_id_to_targets_dict[pert_id])

    return activity_nodes.where(keep, 0)

In [9]:
def get_targeted_indices(targeted_proteins_with_metadata):
    """
    Find the rows whose drug targets at least one protein present as a column.

    Args:
        targeted_proteins_with_metadata: DataFrame with protein columns and a
            'Uniprot.ID' column of ','/';'-separated target IDs

    Returns:
        List of index labels of the matching rows, suitable for `.loc`.
        For a default 0..n-1 index these equal the row positions.
    """
    measured = set(targeted_proteins_with_metadata.columns)
    targeted_indices = []
    for label, uniprots in targeted_proteins_with_metadata['Uniprot.ID'].items():
        # pd.isna catches None and every NaN, not only the np.nan singleton
        if pd.isna(uniprots):
            continue
        raw_ids = str(uniprots)
        # Entries containing 'not' are skipped -- presumably placeholders such
        # as "Not available"; confirm against the dataset.
        if 'not' in raw_ids.lower():
            continue
        candidates = (piece.strip().upper() for piece in re.split(r'[;,]', raw_ids))
        # One measured target is enough to keep the row
        if any(candidate in measured for candidate in candidates):
            targeted_indices.append(label)
    return targeted_indices

In [10]:
# CellBox takes three input files: expr.csv, pert.csv and node_Index.csv.
# expr.csv is the expression data, of size (drug trials) x (proteins + phenotypes + activity nodes).
# pert.csv is the perturbation data, of the same size; the protein and phenotype columns are zeroed out.
# The activity nodes indicate the activity of each protein in each drug trial, so there should be
# one activity node per targeted protein; these are the nodes that are activated.

def make_cellbox_files(prot_log, acti_df, file_prefix, file_path):
    """
    Assemble the three CellBox input tables and write them to disk.

    Output file names are built by plain string concatenation,
    file_path + file_prefix + "<name>.csv", so file_path must already end
    with a path separator if it is meant to be a directory.

    Args:
        prot_log: DataFrame containing log ratios
        acti_df: DataFrame containing activity nodes
        file_prefix: Prefix for output files
        file_path: Path to save output files

    Returns:
        tuple: (expr_csv, pert_csv, node_index_csv) -- the expression table
        (prot_log joined with acti_df on the index), the perturbation table
        (zeros in place of prot_log joined with arctanh of acti_df), and a
        one-column table listing the perturbation table's column names
    """
    # Expression table: measurements followed by activity nodes
    expr_csv = prot_log.merge(acti_df, left_index=True, right_index=True)

    # Perturbation table: measured nodes are zero, activity nodes are arctanh-transformed
    zero_block = pd.DataFrame(
        np.zeros_like(prot_log), index=prot_log.index, columns=prot_log.columns
    )
    arctanh_values = np.arctanh(acti_df.to_numpy().astype(float))
    arctanh_block = pd.DataFrame(
        arctanh_values, index=acti_df.index, columns=acti_df.columns
    )
    pert_csv = zero_block.merge(arctanh_block, left_index=True, right_index=True)

    # Node index: one node name per line, in perturbation-table column order
    node_index_csv = pd.DataFrame({"A": pert_csv.columns.tolist()})

    # All three files are written without header and without index
    out_stem = file_path + file_prefix
    outputs = (
        (expr_csv, "expr.csv", {}),
        (pert_csv, "pert.csv", {}),
        (node_index_csv, "node_Index.csv", {"sep": " "}),
    )
    for frame, file_name, extra_options in outputs:
        frame.to_csv(out_stem + file_name, header=False, index=False, **extra_options)

    return expr_csv, pert_csv, node_index_csv

In [11]:
### This file contains the code that preprocesses the data.
# It has been split into functions to allow for more flexible data handling.
# Derived from Elastic_net.ipynb
# Key processing steps:
# - Filter by completeness (count the absent entries per protein and drop proteins below a certain threshold)
# - Apply a signal-to-noise filter to remove proteins that have a high signal-to-noise ratio
# - Change expression values into log ratios relative to the control test
# - Fill in the missing values using various methods (the first method fills in with the mean of the column)
# - Pull out proteins that are targeted by drugs
# - Sort proteins according to how much they vary
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Main workflow starts here: load the raw tables and move the metadata columns to the end.
# NOTE: everything in this cell is dataset-specific and must be adapted for other datasets.
# NOTE(review): the two paths below are absolute paths on one machine; the cell only runs where they exist.
data_path=r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\preprocessing\data.csv'
prot_info_path=r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\preprocessing\prots_info.csv'
df_prot_info=pd.read_csv(prot_info_path)
df_raw=pd.read_csv(data_path)
df_control_row = df_raw[df_raw['pert_id']=='control'] # rows whose pert_id is 'control'; not referenced again in the visible cells
meta_data_cols = [0] + list(range(-12, -2))+[-1] # metadata positions: first column, columns -12..-3 and the last column (12 in total); position -2 is skipped -- presumably the cell-viability column, confirm
# NOTE: reorganize_data pulls the cell-viability column out by its hard-coded name, which is also dataset-specific.
df_raw=reorganize_data(df_raw, meta_data_cols)
# reorganize_data places the metadata columns last, so from here on the
# metadata is addressed as the final 12 columns.
meta_data_cols = range(-12,0)
# Done reading in and reshaping the data.

In [None]:
# Filtering, and building the starting data structures used by the later cells.
# Step 1: drop protein columns with fewer than 95% non-NaN values.
filtered_by_completeness, filtered_by_completeness_info=filter_by_completeness(df_raw, df_prot_info,completeness_threshold=0.95,metadata_indices=meta_data_cols) 
# Step 2: drop protein columns that have no value in the control row.
filtered_by_control_values=filter_by_control_values(filtered_by_completeness, control_id='control',metadata_indices=meta_data_cols)
# Step 3: convert the remaining expression values to log10 ratios against the control mean.
log_ratios=convert_to_log_ratios(filtered_by_control_values, control_id='control',metadata_indices=meta_data_cols) 

# Step 4: list the drug targets that are also measured protein columns.
targeted_proteins=get_targeted_proteins(log_ratios, df_prot_info)
# Get the targeted proteins from the log_ratios dataframe
targeted_proteins_df = log_ratios[targeted_proteins]

# Append the metadata columns (the last 12 columns of log_ratios) to the targeted proteins
targeted_proteins_with_metadata = pd.concat([
    targeted_proteins_df,
    log_ratios.iloc[:, meta_data_cols]
], axis=1)

# Display the shape of the resulting dataframe
print(f"Shape of targeted proteins dataframe: {targeted_proteins_with_metadata.shape}")
# targeted_proteins_with_metadata.to_csv('targeted_proteins_with_metadata.csv', index=False)

# Step 5: map each pert_id to the list of target IDs named in its 'Uniprot.ID' entry.
pert_id_to_targets_dict=make_pert_id_to_targets_dict(targeted_proteins_with_metadata)

# Step 6: find the rows whose drug targets at least one measured protein.
tgt_indices=get_targeted_indices(targeted_proteins_with_metadata)
# tgt_indices is used with .loc, i.e. as index labels, here and in the cells below.
targeted_proteins_with_metadata=targeted_proteins_with_metadata.loc[tgt_indices]


Filtered from 8544 to 5733 proteins
Completeness threshold: 0.95
Converted 94 samples to log ratios
Found 61 unique targeted proteins out of 277 total targets
Shape of targeted proteins dataframe: (94, 73)


In [25]:
# Expression table for the targeted subset: targeted rows only, the targeted
# protein columns plus cell viability, with missing values replaced by 0.
tgtd_log_ratios = log_ratios.loc[
    tgt_indices,
    targeted_proteins + ['Cell_viability%_(cck8Drug-blk)/(control-blk)*100'],
].fillna(0)

# Activity nodes for the same rows, with missing values replaced by 0.
activity_nodes = (
    make_activity_nodes(targeted_proteins_with_metadata, pert_id_to_targets_dict)
    .loc[tgt_indices]
    .fillna(0)
)

In [None]:
# Write the CellBox input files for the directly targeted proteins.
# make_cellbox_files builds each file name as file_path + file_prefix + "<name>.csv"
# with no separator in between. file_path below has no trailing separator, so the
# files are created in the ...\Data folder with names beginning
# "run1directly_targeted_proteins".
# NOTE(review): confirm whether a sub-folder "run1" was intended instead.
tgtd_prots_cellbox=make_cellbox_files(tgtd_log_ratios,activity_nodes,
                   file_prefix='directly_targeted_proteins',
                   file_path=r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Data\run1'
)

In [17]:
# Restrict the expression table to the targeted rows and to the second-order
# STRING neighbours that were actually measured.
df_of_neighbors = pd.read_csv(
    r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Testing_encodings\String_adjacency\protein_neighbors_and_degrees.csv'
)
second_deg_neighbors = df_of_neighbors['second_order'].tolist()

# Row selection does not change the columns, so membership can be tested on log_ratios directly
second_deg_neighbors_in_prots = [
    candidate for candidate in second_deg_neighbors if candidate in log_ratios.columns
]

# Crude hole patching (missing values become 0); sufficient for now
filtered_log_ratios = log_ratios.loc[
    tgt_indices,
    second_deg_neighbors_in_prots + ['Cell_viability%_(cck8Drug-blk)/(control-blk)*100'],
].fillna(0)
activity_nodes = activity_nodes.fillna(0)

# Inputs are now complete; write the CellBox files for this encoding
x = make_cellbox_files(
    filtered_log_ratios,
    activity_nodes,
    file_prefix='_test_',
    file_path=r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Data\STRINGDB_encodings',
)