# Process the sqY2H data

This is the main PPI profiling assay. This script loads the raw scores and processes them into consensus scores. Throughout the script, "ad" means "activation domain" and "db" means "DNA binding domain" of a transcription factor. A protein is fused to each of these; if the two proteins interact, then the TF is able to up-regulate the expression of a gene that allows the yeast to survive on the media. Thus, the yeast growth score is correlated to the protein-protein interaction strength.

In [121]:
import ast
import re

import numpy as np
import pandas as pd
import polars as pl
from striprtf.striprtf import rtf_to_text

In [122]:
# Directories used by this notebook (relative paths).
# sqY2H-specific inputs and the shared outputs folder:
inputs_dir, outputs_dir = "../1_inputs/sqY2H", "../3_outputs"
# Inputs/outputs of the allele-collection step, used for metadata:
meta_inputs, meta_outputs = (
    "../../../1_allele_collection/1_inputs",
    "../../../1_allele_collection/3_outputs",
)

In [123]:
# Load the raw Y2H score table (tab-separated) from the sqY2H inputs folder
raw_scores = pd.read_csv(
    f"{inputs_dir}/VarChAMP_1percent_Y2H_scores_v2.tsv",
    sep="\t",
)

In [124]:
# Remember every large-plate name before that column is dropped; the cell that
# flags media controls derives its plate numbers from these names.
large_plates = raw_scores["large_plate_name"].unique()

# Keep only the columns needed downstream. The explicit .copy() makes this an
# independent frame, so the column assignments below neither trigger pandas'
# SettingWithCopyWarning nor risk writing to a temporary object.
raw_scores = raw_scores[[
    "growth_score_id", "scoring_pla", "scoring_pos", "manual_score_growth"
]].copy()

# Split the scoring plate name into three parts:
#   retest_batch - leading run of upper-case letters / digits
#   condition    - "r" followed by digits
#   retest_pla   - the number after "_", with leading zeros dropped
raw_scores[['retest_batch', 'condition', 'retest_pla']] = raw_scores['scoring_pla'].str.extract(r"([A-Z0-9]+)(r\d+)_0*(\d+)")

# Plate names that do not match the pattern yield NaN and would make the
# integer cast below fail with an unhelpful message; report them explicitly.
unparsed = raw_scores.loc[raw_scores['retest_pla'].isna(), 'scoring_pla'].unique()
if len(unparsed) > 0:
    raise ValueError(
        f"Could not parse batch/condition/plate from scoring_pla values: {list(unparsed)}"
    )
raw_scores['retest_pla'] = raw_scores['retest_pla'].astype(int)

In [125]:
# Control positions: columns 01 and 02 of rows A through L
ctrl_wells = [f"{row}{col}" for row in "ABCDEFGHIJKL" for col in ("01", "02")]

# Get first plates: the second "_"-separated field of every large-plate name,
# de-duplicated and converted to plain Python ints
first_plates = [
    int(plate)
    for plate in np.unique([name.split("_")[1] for name in large_plates])
]

# Add column indicating if it's a media control well: the well must sit on one
# of those plates AND in one of the control positions
on_first_plate = raw_scores['retest_pla'].isin(first_plates)
in_ctrl_well = raw_scores['scoring_pos'].isin(ctrl_wells)
raw_scores['media_control'] = on_first_plate & in_ctrl_well

In [126]:
# Make the scores match an allele to the 96 wells of the source plate.
# The symbol columns are dropped here; only the remaining columns are merged in.
allele_map = pd.read_csv(f"{inputs_dir}/cp_files/mapping_y2h.csv").drop(columns=['ad_symbol', 'db_symbol'])

# The well-position mapping is stored as a Python dict literal
# ("mapping_384_to_96_d = {...}") inside an RTF document. Its keys become
# `scoring_pos` and its values become `retest_pos` below.
with open(f"{inputs_dir}/mapping_384_to_96.rtf", "r") as f:
    rtf_content = f.read()

plain_text = rtf_to_text(rtf_content)
match = re.search(r"mapping_384_to_96_d\s*=\s*({.*})", plain_text, re.DOTALL)
if match is None:
    # Fail with a readable message instead of AttributeError on None.group
    raise ValueError(
        "Could not find a 'mapping_384_to_96_d = {...}' assignment in mapping_384_to_96.rtf"
    )
dict_str = match.group(1)
# Strip line breaks that the RTF-to-text conversion may leave inside the literal
dict_str = dict_str.replace("\n", "").replace("\r", "")
# literal_eval only accepts Python literals, so text read from the file can
# never execute arbitrary code (unlike eval).
map_384_96 = ast.literal_eval(dict_str)
map_df = pd.DataFrame(list(map_384_96.items()), columns=['scoring_pos', 'retest_pos'])

# Attach the scoring position to every allele via its retest position
allele_map = allele_map.merge(map_df, on="retest_pos")
allele_map['retest_pla'] = allele_map['retest_pla'].astype(int)

In [127]:
# Lookup table from condition code to media name.
# Codes r07-r12 and r13-r18 map onto the same six media names, in the same order.
condition_pairs = [
    ("r07", "LW"),
    ("r08", "LWA"),
    ("r09", "LWH1"),
    ("r10", "LWH10"),
    ("r11", "LWH25"),
    ("r12", "LWAH1"),
    ("r13", "LW"),
    ("r14", "LWA"),
    ("r15", "LWH1"),
    ("r16", "LWH10"),
    ("r17", "LWH25"),
    ("r18", "LWAH1"),
]
condition_df = pd.DataFrame(condition_pairs, columns=["condition", "condition_name"])

In [128]:
# Attach the allele annotations to each score. A left join keeps every score
# row, including wells that have no entry in the allele map.
scores = raw_scores.merge(
    allele_map,
    how="left",
    on=["scoring_pos", "retest_batch", "retest_pla"],
)

# Store all four identifier columns as strings
for id_col in ('db_orf_id', 'db_mut_id', 'ad_orf_id', 'ad_mut_id'):
    scores[id_col] = scores[id_col].astype(str)


In [129]:

# Media controls get a synthetic "<retest_pos>_<retest_pla>" identifier, written
# to both the DB and the AD ORF columns.
mask = scores['media_control'].eq(True)
control_ids = (
    scores.loc[mask, 'retest_pos'].astype(str) + '_' + scores.loc[mask, 'retest_pla'].astype(str)
)
scores.loc[mask, 'db_orf_id'] = control_ids
scores.loc[mask, 'ad_orf_id'] = control_ids

# Keep a row only if (a) its condition is not one of r01-r06 and
# (b) it comes from one of the relevant batches.
exclude_list = ["r01", "r02", "r03", "r04", "r05", "r06"]
keep_condition = ~scores['condition'].isin(exclude_list)
keep_batch = scores['standard_batch'].isin(["VUSAPWT1B1","VUSAPWT1B2","VUSAPWT2B1","VUSAPWT6B1"])
scores = scores[keep_condition & keep_batch]


In [130]:
# A well counts as empty when either ORF id is one of the missing-value
# strings below; only wells with both ids present are kept.
empty_markers = ['nan', 'None']
db_missing = scores['db_orf_id'].isin(empty_markers)
ad_missing = scores['ad_orf_id'].isin(empty_markers)
scores = scores[~(db_missing | ad_missing)]

# Translate the condition code into its media name
scores = scores.merge(condition_df, on="condition")

# Save the per-replicate scores
scores.to_csv(f"{outputs_dir}/sqY2H/1_raw_individual_scores.csv", index=False)

In [131]:
# Consensus scoring: for every group, collect the most frequent growth
# score(s), then resolve ties according to the rules documented below.
scores = pl.DataFrame(scores)

group_keys = ['ad_orf_id', 'ad_mut_id', 'db_orf_id', 'db_mut_id', 'condition_name', 'retest_batch', 'media_control']
mode_list = pl.col("consensus_score_list")   # list of tied most-frequent scores
n_modes = pl.col("num_most_freq")            # how many scores tied for most frequent

consensus_scores = (
    scores
    .group_by(group_keys)
    .agg(pl.col("manual_score_growth").mode().alias("consensus_score_list"))
    # more than one entry in the list means there was a tie
    .with_columns(mode_list.list.len().alias("num_most_freq"))
)

# Easy cases: a single winner is used as-is; three or more tied scores are a
# clear disagreement and get the failure code 99999.
consensus_easy = consensus_scores.filter(n_modes != 2).with_columns(
    pl.when(n_modes == 1)
      .then(mode_list.list.first())
      .when(n_modes > 2)
      .then(pl.lit(99999))
      .otherwise(None)
      .alias("consensus_score")
)

# Hard cases: exactly two scores tied.
#   - more than one apart          -> failure code 99999
#   - one of the two is missing    -> use the other one
#   - exactly one apart            -> use the larger one
# list.get(1) is only evaluated here, on lists known to hold two entries.
score_gap = (mode_list.list.get(0) - mode_list.list.get(1)).abs()
consensus_hard = (
    consensus_scores
    .filter(n_modes == 2)
    .with_columns(score_gap.alias("score_diff"))
    .with_columns(
        pl.when(pl.col("score_diff") > 1)
          .then(pl.lit(99999))
          .when(pl.col("score_diff").is_null())
          .then(
              mode_list
                .list.eval(pl.element().drop_nulls())
                .list.first()
          )
          .when(pl.col("score_diff") == 1)
          .then(
              mode_list
                .list.eval(pl.element().max())
                .list.first()
          )
          .otherwise(None)
          .alias("consensus_score")
    )
    .drop("score_diff")
)

# Stack both subsets and discard the helper columns
consensus_scores = pl.concat([consensus_easy, consensus_hard]).drop([
    "consensus_score_list", "num_most_freq"
])

In [132]:
# Build an ad_orf_id -> ad_symbol lookup from the mapping file, skipping rows
# that have no ad_orf_id.
mapping_file = pd.read_csv(f"{inputs_dir}/cp_files/mapping_y2h.csv")
mapping_file = mapping_file.dropna(subset=['ad_orf_id'])
mapping_dict = mapping_file.set_index('ad_orf_id')['ad_symbol'].to_dict()

# Add the symbol column; ids that are not in the lookup get a null symbol
ad_symbol_expr = pl.col("ad_orf_id").replace_strict(mapping_dict, default=None)
consensus_scores = consensus_scores.with_columns(ad_symbol_expr.alias("ad_symbol"))

In [133]:
# Load the allele metadata and rename its id columns to the DB-side names used
# in the consensus table.
id_renames = {
    "orf_id": "db_orf_id",
    "mut_id": "db_mut_id",
}
metadata = pl.read_csv(f"{meta_outputs}/slim_metadata.csv").rename(id_renames)

# Cast both join keys to Float64 and then to String.
# NOTE(review): presumably this matches the float-formatted id strings in
# consensus_scores - confirm against the data.
metadata = metadata.with_columns(
    [
        pl.col(key).cast(pl.Float64).cast(pl.String).alias(key)
        for key in ("db_orf_id", "db_mut_id")
    ]
)

# Annotate every consensus row (left join) and write the table out
consensus_scores = consensus_scores.join(
    metadata,
    on=["db_orf_id", "db_mut_id"],
    how="left",
)
consensus_scores.write_csv(f"{outputs_dir}/sqY2H/2_raw_consensus_scores.csv")

In [134]:
### Media conditions

# 1 - Cells should either be all there or all not. Control media - are the two proteins present
# 3 - 5, 2: strength of the interaction from low to high. 2 condition has a different media where it's harder for yeast to grow.
# 6 - combination of 1 & 2 medias, but very complicated to logically process so currently not used but retained in case we use it in the future

# 16 possible highest score: 4/4 for each of 3, 4, 5, 2, then divide by 16

# dbX, empty abY. If yeast can still grow, it's called autoactivation
# DB-X+0AD, DB-X+AD-Y. We get score for both. If both equal 4, then we discard. If 0 and 4, worked well. 
# If somewhere in between (i.e. 2 and 4), we sometimes filter out (if the same value), sometimes keep with a score adjustment (check George's code)

# Positions with both empty ad and db orf ids are media controls. 
# 6*96 well plate = 6*4 positions on 384, 6*384 on mega plate, but ctrls only spotted once per mega plate