In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
import os
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns

In [5]:
# >> User Inputs <<
# Excel workbook holding the WD-score summary (read into `full_sheet` below).
import_file = "20250625_WD_counts1_zscore_20250625_PBv6_BaitPreyInfo_20240529_combineIsoforms_20240521_summary_baitCre_LWcorrected_expGelNums.xlsx"
# Excel workbook holding the flagged version of the summary (read into `flag_full_sheet` below).
import_flagged = "20250625_WDflagged_counts1_zscore_20250625_PBv6_BaitPreyInfo_20240531_flagged_20240529_combineIsoforms_20240521_summary_baitCre_LWcorrected_expGelNums.xlsx"

# Output workbook name: the flagged input's file name behind a fixed prefix.
out_name = 'optimized_flag_matrix_' + import_flagged

In [6]:
# plot formatting: a single font family and size shared by all text elements
fontName = 'Arial'
fontSize = 20

plt.rcParams.update({
    'font.family': fontName,
    'font.size': fontSize,
    'axes.titlesize': fontSize,
    'xtick.labelsize': fontSize,
    'ytick.labelsize': fontSize,
    'axes.formatter.use_mathtext': False,
})

# reusable keyword sets for line/marker plots and for grid lines
plotStyle = dict(markersize=8, markeredgewidth=3, linewidth=3, markerfacecolor='w')
gridStyle = dict(linewidth=0.9, color='silver')

# colorblind friendly color scheme
color_spec = [
    '#377eb8', '#ff7f00', '#4daf4a',
    '#f781bf', '#a65628', '#984ea3',
    '#999999', '#e41a1c', '#dede00',
]


In [7]:
# import data -- summary file with experiment numbers (takes a few mins)
# Reads the first worksheet of `import_file`, skipping no leading rows and
# using the workbook's first column as the DataFrame index.
full_sheet = pd.read_excel(import_file, sheet_name=0, index_col=0, skiprows=0)
# note: first column is old row IDs (before removal of decoy preys)
# row indexes are current row IDs

In [8]:
# import flagged data -- summary file with experiment numbers (takes a few mins)
# Reads the first worksheet of `import_flagged`, skipping no leading rows and
# using the workbook's first column as the DataFrame index.
flag_full_sheet = pd.read_excel(import_flagged, sheet_name=0, index_col=0, skiprows=0)
# note: first column is old row IDs (before removal of decoy preys)
# row indexes are current row IDs

### Create a matrix of whether the WD score is nonzero, for both sheets
--> if a WD score changed from nonzero to zero in the flagged sheet, consider that interaction "flagged"

In [9]:
# process full_sheet -- pull out the bait / prey ID and localization labels
# baits are read from labelled rows of full_sheet, preys from labelled columns

# positional offsets: the label series below keep rows from start_row on
# and columns from start_col on
start_row, start_col = 13, 14

# bait labels: one entry per column at position >= start_col
bait_creID = full_sheet.loc['Bait CreID'].iloc[start_col:]
bait_loc = full_sheet.loc['Bait Experimental Localization'].iloc[start_col:]
bait_pred_loc = full_sheet.loc['Bait PB-Chlamy Predicted Localization'].iloc[start_col:]

# prey labels: one entry per row at position >= start_row
prey_creID = full_sheet['Prey CreID'].iloc[start_row:]
prey_pred_loc = full_sheet['Prey PB-Chlamy Predicted Localization'].iloc[start_row:]

# re-key every localization series by the matching CreID
bait_loc.index = bait_creID.to_numpy()
bait_loc = bait_loc.replace(np.nan, 'N/A')  # change NaNs to N/A
bait_pred_loc.index = bait_creID.to_numpy()
prey_pred_loc.index = prey_creID.to_numpy()

In [10]:
# create WD dataframe: the block of full_sheet from (start_row, start_col) onward,
# with columns named by bait CreID and rows named by prey CreID
WD_sheet = full_sheet.iloc[start_row:, start_col:].copy()
WD_sheet.columns = bait_creID.to_numpy()
WD_sheet.index = prey_creID.to_numpy()

# create WD matrix: the same block as a plain 2D float array
WD_mat = WD_sheet.to_numpy().astype('float')
# 1D array holding only the strictly positive entries
WD_nonzero = WD_mat[WD_mat > 0]

# total number of entries in the matrix (rows x columns)
num_interactions = WD_mat.size

In [11]:
# display the unflagged WD score array
WD_mat

array([[ 8.58095166,  5.92141921,  6.48658974, ...,  8.1621131 ,
         6.08825239,  5.00451264],
       [ 4.94974747, 10.36822068,  5.04975247, ...,  4.69041576,
         5.33853913,  7.51664819],
       [ 5.        ,  6.20483682,  5.19615242, ...,  6.74536878,
         4.63680925,  4.35889894],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [12]:
# display the unflagged WD dataframe (columns: bait CreID, rows: prey CreID)
WD_sheet

Unnamed: 0,Cre01.g002150,Cre01.g002300,Cre01.g002500,Cre01.g003200,Cre01.g004300,Cre01.g004400,Cre01.g005001,Cre01.g005050,Cre01.g007050,Cre01.g007850,...,Cre17.g734500,Cre17.g738000,Cre17.g740950,Cre17.g741000,Cre17.g741050,Cre17.g743747,Cre17.g745847,Cre17.g745997,Cre24.g755197,Venus-3xFLAG
Cre12.g551050,8.580952,5.921419,6.48659,4.531779,9.335797,6.97048,5.878971,6.97048,6.250634,4.852055,...,6.751457,4.954215,3.394226,5.66196,6.250634,5.706022,7.785195,8.162113,6.088252,5.004513
Venus-FLAG,4.949747,10.368221,5.049752,5.656854,3.605551,5.612486,5.656854,4.527693,9,7.449832,...,9.192388,5.147815,9.486833,4.358899,2.345208,3.316625,5.522681,4.690416,5.338539,7.516648
Cre06.g258800,5,6.204837,5.196152,3.464102,6.204837,4.84768,4.690416,7.071068,5.700877,4.898979,...,5.338539,4,3.872983,3.162278,4,4.1833,4.795832,6.745369,4.636809,4.358899
Cre13.g592500,5.622626,6.296257,6.256281,5.798375,7.72756,5.884282,6.093752,7.759961,5.667074,5.754941,...,5.754941,6.492449,3.47036,5.157116,5.157116,5.711177,3.879981,4.480216,4.069358,4.590855
Cre15.g635650,6.741201,4.180679,4.180679,0,3.552925,6.454208,0.891324,7.016464,6.355665,0,...,7.944581,2.956187,0,0.891324,0,0,3.565295,3.213714,5.388277,1.543818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cre03.g156600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cre04.g216300,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cre16.g653650,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cre08.g377000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# dimensions of the WD score array (rows, columns)
WD_mat.shape

(13402, 1109)

In [15]:
# display the unflagged sheet as loaded from import_file
full_sheet

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait Gene Symbol,,,,,,,,,,,...,ATPVE1,AGG2,LHL4,Not Found,COV1,Not Found,Not Found,Not Found,RBD3,Not Found
Bait Experimental Localization,,,,,,,,,,,...,secretory pathway/contractile vacuole,secretory pathway/Golgi; secretory pathway/pla...,chloroplast/homogeneous/not pyrenoid depleted;...,shapes in cytoplasm/uncategorized shapes in cy...,shapes in cytoplasm/uncategorized shapes in cy...,cytosol/cytosol (strong signal),chloroplast/homogeneous/pyrenoid depleted,shapes in cytoplasm/uncategorized shapes in cy...,chloroplast/chloroplast punctate dots,Not Found
Bait PB-Chlamy Predicted Localization,,,,,,,,,,,...,Secretory,Other,Chloroplast,Secretory,Other,Secretory,Chloroplast,Other,Chloroplast,Not Found
Replicate 1 Sheet Number,,,,,,,,,,,...,50,70,34,57,57,67,50,72,45,34
Replicate 2 Sheet Number,,,,,,,,,,,...,49,77,74,57,38,53,10,10,10,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,pacid=52510722 transcript=Cre03.g156600_4532.1...,Cre03.g156600_4532.1.p,Cre03.g156600,32 kDa,0,Cre03.g156600,GTRBP1,chloroplast/not homogeneous/not pyrenoid depleted,Chloroplast,Chloroplast,...,0,0,0,0,0,0,0,0,0,0
13399,pacid=52540355 transcript=Cre04.g216300_4532.1...,Cre04.g216300_4532.1.p,Cre04.g216300,18 kDa,0,Cre04.g216300,Not Found,Not Found,Other,Chloroplast,...,0,0,0,0,0,0,0,0,0,0
13400,pacid=52517051 transcript=Cre16.g653650_4532.1...,Cre16.g653650_4532.1.p,Cre16.g653650,22 kDa,0,Cre16.g653650,Not Found,Not Found,Chloroplast,Other,...,0,0,0,0,0,0,0,0,0,0
13401,pacid=52534323 transcript=Cre08.g377000_4532.1...,Cre08.g377000_4532.1.p,Cre08.g377000,10 kDa,0,Cre08.g377000,Not Found,secretory pathway/contractile vacuole; shapes ...,Secretory,Other,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# display the flagged sheet as loaded from import_flagged
flag_full_sheet

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait Gene Symbol,,,,,,,,,,,...,ATPVE1,AGG2,LHL4,Not Found,COV1,Not Found,Not Found,Not Found,RBD3,Not Found
Bait Experimental Localization,,,,,,,,,,,...,secretory pathway/contractile vacuole,secretory pathway/Golgi; secretory pathway/pla...,chloroplast/homogeneous/not pyrenoid depleted;...,shapes in cytoplasm/uncategorized shapes in cy...,shapes in cytoplasm/uncategorized shapes in cy...,cytosol/cytosol (strong signal),chloroplast/homogeneous/pyrenoid depleted,shapes in cytoplasm/uncategorized shapes in cy...,chloroplast/chloroplast punctate dots,Not Found
Bait PB-Chlamy Predicted Localization,,,,,,,,,,,...,Secretory,Other,Chloroplast,Secretory,Other,Secretory,Chloroplast,Other,Chloroplast,Not Found
Replicate 1 Sheet Number,,,,,,,,,,,...,50,70,34,57,57,67,50,72,45,34
Replicate 2 Sheet Number,,,,,,,,,,,...,49,77,74,57,38,53,10,10,10,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,pacid=52510722 transcript=Cre03.g156600_4532.1...,Cre03.g156600_4532.1.p,Cre03.g156600,32 kDa,0,Cre03.g156600,GTRBP1,chloroplast/not homogeneous/not pyrenoid depleted,Chloroplast,Chloroplast,...,0,0,0,0,0,0,0,0,0,0
13399,pacid=52540355 transcript=Cre04.g216300_4532.1...,Cre04.g216300_4532.1.p,Cre04.g216300,18 kDa,0,Cre04.g216300,Not Found,Not Found,Other,Chloroplast,...,0,0,0,0,0,0,0,0,0,0
13400,pacid=52517051 transcript=Cre16.g653650_4532.1...,Cre16.g653650_4532.1.p,Cre16.g653650,22 kDa,0,Cre16.g653650,Not Found,Not Found,Chloroplast,Other,...,0,0,0,0,0,0,0,0,0,0
13401,pacid=52534323 transcript=Cre08.g377000_4532.1...,Cre08.g377000_4532.1.p,Cre08.g377000,10 kDa,0,Cre08.g377000,Not Found,secretory pathway/contractile vacuole; shapes ...,Secretory,Other,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# create flagged WD dataframe: the block of flag_full_sheet from (start_row, start_col) onward
# note: rows/columns are named with the prey/bait CreIDs taken from full_sheet,
# so this relies on both sheets listing baits and preys in the same order
flag_WD_sheet = flag_full_sheet.iloc[start_row:, start_col:].copy()
flag_WD_sheet.columns = bait_creID.to_numpy()
flag_WD_sheet.index = prey_creID.to_numpy()

# create flagged WD matrix: the same block as a plain 2D float array
flag_WD_mat = flag_WD_sheet.to_numpy().astype('float')
# 1D array holding only the strictly positive entries
flag_WD_nonzero = flag_WD_mat[flag_WD_mat > 0]

# total number of entries in the flagged matrix (rows x columns)
flag_num_interactions = flag_WD_mat.size

In [18]:
# display the flagged WD dataframe (columns: bait CreID, rows: prey CreID)
flag_WD_sheet

Unnamed: 0,Cre01.g002150,Cre01.g002300,Cre01.g002500,Cre01.g003200,Cre01.g004300,Cre01.g004400,Cre01.g005001,Cre01.g005050,Cre01.g007050,Cre01.g007850,...,Cre17.g734500,Cre17.g738000,Cre17.g740950,Cre17.g741000,Cre17.g741050,Cre17.g743747,Cre17.g745847,Cre17.g745997,Cre24.g755197,Venus-3xFLAG
Cre12.g551050,8.690763,5.997196,6.569599,4.589772,9.455268,7.059681,5.954205,7.059681,6.330624,4.914147,...,6.837856,5.017614,0,5.734416,6.330624,5.779043,7.884822,8.266564,6.166164,5.068556
Venus-FLAG,5.889775,12.337293,3.699191,6.731171,2.781086,6.678377,6.731171,5.387566,10.709227,8.864661,...,10.938153,6.125458,11.288517,5.186716,0,2.314004,6.571516,2.987366,6.352403,8.944166
Cre06.g258800,5.022645,6.232938,5.219686,3.47979,6.232938,4.869635,3.945909,7.103093,5.726696,4.921167,...,3.613707,4.018116,2.350513,3.1766,4.018116,4.202246,4.817552,6.775918,4.657809,4.37864
Cre13.g592500,5.736633,6.423923,6.383136,5.915946,7.884248,6.003595,6.217312,7.917306,5.004192,5.871631,...,5.871631,6.624093,2.260663,5.261684,5.261684,5.826979,3.958653,2.144654,4.15187,4.683941
Cre15.g635650,9.234683,4.893158,4.893158,0,4.867105,8.841536,1.043225,9.611763,8.706543,0,...,10.883179,3.459985,0,1.043225,0,0,4.172899,0,2.332721,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cre03.g156600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cre04.g216300,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cre16.g653650,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cre08.g377000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# difference between the flagged and unflagged positive-score indicators
# -1: score is positive in WD_mat but not in flag_WD_mat (flagged interaction)
# +1: score is positive in flag_WD_mat but not in WD_mat
#  0: both agree
(flag_WD_mat > 0).astype(int) - (WD_mat > 0).astype(int)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
# build the flag matrix from the unflagged and flagged WD matrices
# flagged interactions (positive WD score in WD_mat, not positive in flag_WD_mat)
# will have 1, rest will have 0

# The comparison below is purely positional, so both matrices must describe the
# same prey x bait grid; otherwise the flags would be silently wrong.
if WD_mat.shape != flag_WD_mat.shape:
    raise ValueError('WD matrices differ in shape: %s (unflagged) vs %s (flagged)'
                     % (WD_mat.shape, flag_WD_mat.shape))

_prey_order_matches = np.array_equal(
    full_sheet['Prey CreID'].iloc[start_row:].astype(str).to_numpy(),
    flag_full_sheet['Prey CreID'].iloc[start_row:].astype(str).to_numpy())
if not _prey_order_matches:
    raise ValueError('Prey CreID order differs between the unflagged and flagged sheets')

# the bait check is skipped when the flagged sheet carries no 'Bait CreID' row
if 'Bait CreID' in flag_full_sheet.index:
    _bait_order_matches = np.array_equal(
        full_sheet.loc['Bait CreID'].iloc[start_col:].astype(str).to_numpy(),
        flag_full_sheet.loc['Bait CreID'].iloc[start_col:].astype(str).to_numpy())
    if not _bait_order_matches:
        raise ValueError('Bait CreID order differs between the unflagged and flagged sheets')

# Boolean "was positive AND is no longer positive" keeps the result strictly in
# {0, 1}; a plain subtraction of the two indicators would emit -1 wherever a
# score is positive only in the flagged matrix.
flag_mat = ((WD_mat > 0) & ~(flag_WD_mat > 0)).astype(int)

# copy full_sheet formatting, then overwrite the score block with the flags
flagged_sheet = full_sheet.copy()
flagged_sheet.iloc[start_row:, start_col:] = flag_mat

In [21]:
# display the first 15 rows of the flag sheet
flagged_sheet.head(15)

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait Gene Symbol,,,,,,,,,,,...,ATPVE1,AGG2,LHL4,Not Found,COV1,Not Found,Not Found,Not Found,RBD3,Not Found
Bait Experimental Localization,,,,,,,,,,,...,secretory pathway/contractile vacuole,secretory pathway/Golgi; secretory pathway/pla...,chloroplast/homogeneous/not pyrenoid depleted;...,shapes in cytoplasm/uncategorized shapes in cy...,shapes in cytoplasm/uncategorized shapes in cy...,cytosol/cytosol (strong signal),chloroplast/homogeneous/pyrenoid depleted,shapes in cytoplasm/uncategorized shapes in cy...,chloroplast/chloroplast punctate dots,Not Found
Bait PB-Chlamy Predicted Localization,,,,,,,,,,,...,Secretory,Other,Chloroplast,Secretory,Other,Secretory,Chloroplast,Other,Chloroplast,Not Found
Replicate 1 Sheet Number,,,,,,,,,,,...,50,70,34,57,57,67,50,72,45,34
Replicate 2 Sheet Number,,,,,,,,,,,...,49,77,74,57,38,53,10,10,10,11
Replicate 1 Plate ID,,,,,,,,,,,...,LW_1969_08_FE7,LW_1582_0701_67_11E11,LW_1937_04_FCB7,LW_1708_1216_64_11G12,LW_1708_1216_93_11E10,LW_1562_0603_81_12D11,LW_1969_41_3A2,LW_1893_59_6H1,LW_1312_0811_18_13A4,LW_1937_18_FD2
Replicate 2 Plate ID,,,,,,,,,,,...,LW_1869_0426_07_FE7,LW_1703_1202_70_11E11,LW_1683_1027_08_FCB7,LW_1708_1216_79_11G12,LW_1708_1207_03_11E10,LW_1687_1111_96_12D11,LW_1312_0813_65_3A2,LW_1312_0813_59_6H1,LW_1312_0813_49_13A4,LW_1582_0608_40_FD2
Bait PredAlgo Predicted Localization,,,,,,,,,,,...,Other,Other,Chloroplast,Secretory,Mitochondrial,Secretory,Chloroplast,Chloroplast,Chloroplast,Not Found
Bait Defline,,,,,,,,,,,...,Vacuolar ATP synthase subunit E,Aggregation 2,High intensity light-inducible Lhc-like gene,(1 of 1) K13249 - translocon-associated protei...,COV1-like protein,Not Found,Not Found,(1 of 546) 2.7.11.1 - Non-specific serine/thre...,Putative rubredoxin-like protein,Not Found
Bait Greencut,,,,,,,,,,,...,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found,Not Found


In [None]:
# export flag matrix

# write the flag sheet to the workbook named by out_name, on a sheet called 'optimized'
# (an existing file of that name is overwritten)
flagged_sheet.to_excel(out_name, sheet_name='optimized')

In [23]:
# 1 where the unflagged WD score is positive, 0 elsewhere
(WD_mat > 0).astype(int)

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
# 1 where the flagged WD score is positive, 0 elsewhere
(flag_WD_mat > 0).astype(int)

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])