## WD Score Calculator

In [1]:
import numpy as np
import pandas as pd
import os
from scipy.stats import zscore

### for mix of one and two replicates, all in one Excel sheet -- new data

#### formatting requirements for input spreadsheet:
- Run spreadsheet combiner script (combines and assigns CreIDs)
- Manually add CreIDs and remove Uncategorized Samples specified by Lianyong
- Identify and remove exact duplicates (have same plate ID and matching values)


In [54]:
# >> User Inputs <<
# original
# import_file = "20240709_PBv6_BaitPreyInfo_20240529_combineIsoforms_20240521_summary_baitCre_LWcorrected_expGelNums.xlsx"

# contamination flagged
import_file = "20240709_PBv6_BaitPreyInfo_20240531_flagged_20240529_combineIsoforms_20240521_summary_baitCre_LWcorrected_expGelNums.xlsx"

spectral_threshold = 1 # put 1 to keep data as is, put n > 1 to count values < n as 0
WD_out_name = 'WD_counts%d_zscore_%s' %(spectral_threshold, import_file)

In [55]:
# import data (takes a few mins)

full_sheet = pd.read_excel(import_file,
                           index_col = 0,
                           skiprows = 0)

In [56]:
full_sheet.iloc[0:15,0:15]

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Taxonomy,geneName,exp,PB-Chlamy,PredAlgo,Info,Greencut,Conservation,ConservationDefline,8
Sheet Number,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Gel Number,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Bait CreID,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Cre01.g028350
Bait Plate ID,,,,,,,,,,,,,,,LW_1312_0830_113_13E6
Bait Gene Symbol,,,,,,,,,,,,,,,DEG8
Bait Experimental Localization,,,,,,,,,,,,,,,chloroplast/pyrenoid/pyrenoid traversing enric...
Bait PB-Chlamy Predicted Localization,,,,,,,,,,,,,,,Chloroplast
Bait PredAlgo Predicted Localization,,,,,,,,,,,,,,,Chloroplast
Bait Defline,,,,,,,,,,,,,,,Deg protease
Bait Greencut,,,,,,,,,,,,,,,Not Found


In [57]:
# set NaN (blank cells) to 0
sheet = full_sheet.replace(np.nan, 0)
print('Replaced NaNs with 0')

# apply spectral threshold (values below threshold are set to 0)
if spectral_threshold > 1:
    sheet = sheet.replace(range(spectral_threshold), 0)
    print('Replaced values below %d with 0' %spectral_threshold)

Replaced NaNs with 0


In [58]:
sheet.iloc[2,14:]

8       Cre01.g028350
9       Cre06.g278195
10      Cre01.g050950
11      Cre10.g466850
13      Cre08.g385200
            ...      
2727    Cre11.g467722
2728    Cre17.g723650
2730    Cre10.g419800
2731    Cre03.g169400
2733    Cre09.g396100
Name: Bait CreID, Length: 2218, dtype: object

In [59]:
# CreID (bait) analysis

CreID = sheet.iloc[2,14:].to_numpy()

(unique_ID, ID_counts) = np.unique(CreID, return_counts=True)

if np.any(unique_ID == 'Uncategorized Sample'):
    num_uncategorized = ID_counts[unique_ID == 'Uncategorized Sample']
    uncat_ID = np.argwhere(unique_ID == 'Uncategorized Sample')
    unique_ID2 = np.delete(unique_ID, [uncat_ID])
    ID_counts2 = np.delete(ID_counts, [uncat_ID])
else:
    num_uncategorized = 0

poly_replicates = unique_ID[ID_counts > 2]
poly_rep_counts = ID_counts[ID_counts > 2]
    
print('%d baits have more than 2 replicates' %(unique_ID[ID_counts > 2].shape[0]))
print('%d baits are Uncategorized Samples' %num_uncategorized)

0 baits have more than 2 replicates
0 baits are Uncategorized Samples


In [60]:
sheet.index

Index([                         'Sheet Number',
                                  'Gel Number',
                                  'Bait CreID',
                               'Bait Plate ID',
                            'Bait Gene Symbol',
              'Bait Experimental Localization',
       'Bait PB-Chlamy Predicted Localization',
        'Bait PredAlgo Predicted Localization',
                                'Bait Defline',
                               'Bait Greencut',
       ...
                                         13393,
                                         13394,
                                         13395,
                                         13396,
                                         13397,
                                         13398,
                                         13399,
                                         13400,
                                         13401,
                                         13402],
      dtype='object', length

In [61]:
# format sheet for WD score calculation
# label bait CreID row in dataframe index
sheet_2_reps = sheet.copy()

# sort columns by CreID and remove header rows
sorted_sheet = sheet_2_reps.drop(index=[0, 'Sheet Number', 'Gel Number', 'Bait Plate ID', 'Bait Gene Symbol',
                                        'Bait Experimental Localization', 'Bait PB-Chlamy Predicted Localization',
                                        'Bait PredAlgo Predicted Localization', 'Bait Defline', 'Bait Greencut', 
                                        'Bait Arabidopsis Conservation', 'Bait Arabidopsis Conservation Defline']).iloc[:,14:].sort_values(by = 'Bait CreID', axis = 1)


In [62]:
sorted_sheet

Unnamed: 0,2263,2616,1599,602,603,1566,555,534,540,804,...,2352,1788,1642,284,2513,278,1458,268,1105,327
Bait CreID,Cre01.g002150,Cre01.g002150,Cre01.g002300,Cre01.g002300,Cre01.g002500,Cre01.g002500,Cre01.g003200,Cre01.g003200,Cre01.g004300,Cre01.g004300,...,Cre17.g743747,Cre17.g743747,Cre17.g745847,Cre17.g745847,Cre17.g745997,Cre17.g745997,Cre24.g755197,Cre24.g755197,Venus-3xFLAG,Venus-3xFLAG
1,95,52,60,10,17,67,15,26,41,133,...,12,53,49,72,96,37,45,29,19,31
2,27,22,148,67,0,23,31,33,0,13,...,0,9,31,30,0,15,34,23,72,41
3,33,17,64,13,18,36,10,14,33,44,...,17,18,21,25,71,20,29,14,18,20
4,38,25,48,31,33,45,34,33,33,86,...,38,27,14,16,0,9,23,10,16,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# create dataframes for each replicate (assumes data sorted by creID so replicates are adjacent, in pairs)
start_col = 0 # column index of first bait in dataframe
stop = sorted_sheet.shape[1]
rep1_indices = np.arange(start_col, stop, 2) # column indices to keep for rep 1 (skip every other column)
rep1_df = sorted_sheet.iloc[1:, rep1_indices] # removes CreID row

rep2_indices = np.arange(start_col+1, stop, 2) # shift start point over by 1 column to get other set of replicates
rep2_df = sorted_sheet.iloc[1:, rep2_indices]

# create matrices for each replicate
rep1 = rep1_df.to_numpy().astype('float')
rep2 = rep2_df.to_numpy().astype('float')

In [64]:
rep1_df

Unnamed: 0,2263,1599,603,555,540,754,1549,985,1290,1151,...,1612,2461,1091,1926,1955,2352,1642,2513,1458,1105
1,95,60,17,15,41,61,47,13,51,32,...,33,15,0,26,31,12,49,96,45,19
2,27,148,0,31,0,37,42,18,110,60,...,41,22,120,17,0,0,31,0,34,72
3,33,64,18,10,33,34,31,31,53,29,...,26,19,0,9,9,17,21,71,29,18
4,38,48,33,34,33,31,53,52,49,32,...,10,29,10,25,28,38,14,0,23,16
5,21,22,0,0,3,26,0,1,30,0,...,6,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
rep2_df

Unnamed: 0,2616,602,1566,534,804,668,2211,1586,1945,835,...,1563,2707,2565,1941,1219,1788,284,278,268,327
1,52,10,67,26,133,36,22,84,27,15,...,58,34,0,38,47,53,72,37,29,31
2,22,67,23,33,13,26,22,23,52,51,...,128,31,60,21,0,9,30,15,23,41
3,17,13,36,14,44,13,0,69,12,19,...,0,13,11,11,23,18,25,20,14,20
4,25,31,45,33,86,38,21,68,0,34,...,56,55,0,28,25,27,16,9,10,26
5,15,0,22,0,7,7,1,38,2,0,...,44,11,0,1,0,0,16,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# check there's correct number of columns
print("Have expected number of columns: %r" %(sorted_sheet.shape[1] == len(unique_ID)*2))

Have expected number of columns: True


In [67]:
# calculate basic WD scores

data = (rep1 + rep2)/2 # average of 2 replicates
m = len(data) # number of prey/rows (j from 1 to m)
k = len(data[0]) # number of bait/cols (i from 1 to k)

data = (rep1 + rep2)/2

row_mean = np.mean(data,1)
row_std = np.std(data,1)

row_w = row_std / row_mean
for n in range(len(row_w)):
    if row_w[n] <= 1: row_w[n] = 1

f = np.zeros_like(data)
for j in range(m):
    for i in range(k):
        f[j][i] = data[j][i] > 0

p = np.greater(rep1, 0).astype(int) + np.greater(rep2, 0).astype(int)
        
        
WD = np.zeros_like(data)
f_sum = np.sum(f,1)
for j in range(m):
    for i in range(k):
        weight = (k/f_sum[j] * row_w[j])**p[j][i]
        WD[j][i] = np.sqrt(weight * data[j][i])
             

  row_w = row_std / row_mean
  weight = (k/f_sum[j] * row_w[j])**p[j][i]


In [68]:
# convert WD and p values to dataframe -- TO DO: FIX THIS

# take CreIDs from sorted sheet
WDdf = sorted_sheet[sorted_sheet.columns[::2]].copy()

# insert WD scores
WDdf.iloc[1:,:] = WD

# set bait and prey indices (start at 1)
rowIDs = np.arange(WDdf.shape[0]).astype('str')
rowIDs[0] = 'Bait CreID'
WDdf.index = rowIDs # prey
WDdf.columns = np.arange(WDdf.shape[1]) # bait

# add prey label columns
for ins in [0,1,2,3,4,5,6,7,8,9,10,11,12,13]:
    WDdf.insert(ins, sheet.iloc[12,ins], sheet.iloc[12:,ins].to_numpy())


In [69]:
WDdf.columns[:14]

Index(['Identified Proteins', 'Accession Number', 'Alternate ID',
       'Molecular Weight', 'Protein Grouping Ambiguity', 'Prey CreID',
       'Prey Gene Symbol', 'Prey Experimental Localization',
       'Prey PB-Chlamy Predicted Localization',
       'Prey PredAlgo Predicted Localization', 'Prey Defline', 'Prey Greencut',
       'Prey Arabidopsis Conservation',
       'Prey Arabidopsis Conservation Defline'],
      dtype='object')

In [70]:
# save sheet numbers and localizations
info_df = sheet[rep1_df.columns.to_numpy()].iloc[4:12,:].transpose() # bait info 8 rows
info_df.insert(3, 'Replicate 1 Sheet Number', sheet[rep1_df.columns.to_numpy()].iloc[0,:]) # rep 1 sheet nums
info_df.insert(4, 'Replicate 2 Sheet Number', sheet[rep2_df.columns.to_numpy()].iloc[0,:].to_numpy()) # rep 2 sheet nums
info_df.insert(5, 'Replicate 1 Plate ID', sheet[rep1_df.columns.to_numpy()].iloc[3,:]) # rep 1 plate IDs
info_df.insert(6, 'Replicate 2 Plate ID', sheet[rep2_df.columns.to_numpy()].iloc[3,:].to_numpy()) # rep 2 plate IDs
info_df.insert(0, 'CreID', sheet[rep1_df.columns.to_numpy()].iloc[2,:]) # CreID row

# use WD df format for p values
p_df = WDdf.copy()
p_df.iloc[1:,14:] = p

In [71]:
# convert WD and p values to dataframe -- TO DO: FIX THIS

# take CreIDs from sorted sheet
WDdf = sorted_sheet[sorted_sheet.columns[::2]].copy()

# insert WD scores
WDdf.iloc[1:,:] = WD

# set bait and prey indices (start at 1)
rowIDs = np.arange(WDdf.shape[0]).astype('str')
rowIDs[0] = 'Bait CreID'
WDdf.index = rowIDs # prey
WDdf.columns = np.arange(WDdf.shape[1]) # bait

# add prey label columns
for ins in [0,1,2,3,4,5,6,7,8,9,10,11,12,13]:
    WDdf.insert(ins, sheet.iloc[12,ins], sheet.iloc[12:,ins].to_numpy())
WDdf.columns[:14]
    
# save sheet numbers and localizations
info_df = sheet[rep1_df.columns.to_numpy()].iloc[4:12,:].transpose() # bait info 8 rows
info_df.insert(3, 'Replicate 1 Sheet Number', sheet[rep1_df.columns.to_numpy()].iloc[0,:]) # rep 1 sheet nums
info_df.insert(4, 'Replicate 2 Sheet Number', sheet[rep2_df.columns.to_numpy()].iloc[0,:].to_numpy()) # rep 2 sheet nums
info_df.insert(5, 'Replicate 1 Plate ID', sheet[rep1_df.columns.to_numpy()].iloc[3,:]) # rep 1 plate IDs
info_df.insert(6, 'Replicate 2 Plate ID', sheet[rep2_df.columns.to_numpy()].iloc[3,:].to_numpy()) # rep 2 plate IDs
info_df.insert(0, 'CreID', sheet[rep1_df.columns.to_numpy()].iloc[2,:]) # CreID row

# use WD df format for p values
p_df = WDdf.copy()
p_df.iloc[1:,14:] = p

In [72]:
WDdf

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait CreID,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,Cre17.g734500,Cre17.g738000,Cre17.g740950,Cre17.g741000,Cre17.g741050,Cre17.g743747,Cre17.g745847,Cre17.g745997,Cre24.g755197,Venus-3xFLAG
1,pacid=52538056 transcript=Cre12.g551050_4532.1...,Cre12.g551050_4532.1.p,Cre12.g551050,42 kDa,0,Cre12.g551050,Not Found,Not Found,Other,Other,...,6.837856,5.017614,0.0,5.734416,6.330624,5.779043,7.884822,8.266564,6.166164,5.068556
2,Venus-FLAG,Venus-FLAG,Venus-FLAG,31 kDa,0,Venus-FLAG,Not Found,Not Found,Not Found,Not Found,...,10.938153,6.125458,11.288517,5.186716,0.0,2.314004,6.571516,2.987366,6.352403,8.944166
3,pacid=52531343 transcript=Cre06.g258800_4532.1...,Cre06.g258800_4532.1.p,Cre06.g258800,131 kDa,0,Cre06.g258800,CWP2,Not Found,Secretory,Secretory,...,3.613707,4.018116,2.350513,3.1766,4.018116,4.202246,4.817552,6.775918,4.657809,4.37864
4,pacid=52515583 transcript=Cre13.g592500_4532.1...,Cre13.g592500_4532.1.p,Cre13.g592500,61 kDa,0,Cre13.g592500,Not Found,Not Found,Other,Other,...,5.871631,6.624093,2.260663,5.261684,5.261684,5.826979,3.958653,2.144654,4.15187,4.683941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,pacid=52510722 transcript=Cre03.g156600_4532.1...,Cre03.g156600_4532.1.p,Cre03.g156600,32 kDa,0,Cre03.g156600,GTRBP1,chloroplast/not homogeneous/not pyrenoid depleted,Chloroplast,Chloroplast,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13399,pacid=52540355 transcript=Cre04.g216300_4532.1...,Cre04.g216300_4532.1.p,Cre04.g216300,18 kDa,0,Cre04.g216300,Not Found,Not Found,Other,Chloroplast,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13400,pacid=52517051 transcript=Cre16.g653650_4532.1...,Cre16.g653650_4532.1.p,Cre16.g653650,22 kDa,0,Cre16.g653650,Not Found,Not Found,Chloroplast,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13401,pacid=52534323 transcript=Cre08.g377000_4532.1...,Cre08.g377000_4532.1.p,Cre08.g377000,10 kDa,0,Cre08.g377000,Not Found,secretory pathway/contractile vacuole; shapes ...,Secretory,Other,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
p_df

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait CreID,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,Cre17.g734500,Cre17.g738000,Cre17.g740950,Cre17.g741000,Cre17.g741050,Cre17.g743747,Cre17.g745847,Cre17.g745997,Cre24.g755197,Venus-3xFLAG
1,pacid=52538056 transcript=Cre12.g551050_4532.1...,Cre12.g551050_4532.1.p,Cre12.g551050,42 kDa,0,Cre12.g551050,Not Found,Not Found,Other,Other,...,2,2,0,2,2,2,2,2,2,2
2,Venus-FLAG,Venus-FLAG,Venus-FLAG,31 kDa,0,Venus-FLAG,Not Found,Not Found,Not Found,Not Found,...,2,2,2,2,0,1,2,1,2,2
3,pacid=52531343 transcript=Cre06.g258800_4532.1...,Cre06.g258800_4532.1.p,Cre06.g258800,131 kDa,0,Cre06.g258800,CWP2,Not Found,Secretory,Secretory,...,1,2,1,2,2,2,2,2,2,2
4,pacid=52515583 transcript=Cre13.g592500_4532.1...,Cre13.g592500_4532.1.p,Cre13.g592500,61 kDa,0,Cre13.g592500,Not Found,Not Found,Other,Other,...,2,2,1,2,2,2,2,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,pacid=52510722 transcript=Cre03.g156600_4532.1...,Cre03.g156600_4532.1.p,Cre03.g156600,32 kDa,0,Cre03.g156600,GTRBP1,chloroplast/not homogeneous/not pyrenoid depleted,Chloroplast,Chloroplast,...,0,0,0,0,0,0,0,0,0,0
13399,pacid=52540355 transcript=Cre04.g216300_4532.1...,Cre04.g216300_4532.1.p,Cre04.g216300,18 kDa,0,Cre04.g216300,Not Found,Not Found,Other,Chloroplast,...,0,0,0,0,0,0,0,0,0,0
13400,pacid=52517051 transcript=Cre16.g653650_4532.1...,Cre16.g653650_4532.1.p,Cre16.g653650,22 kDa,0,Cre16.g653650,Not Found,Not Found,Chloroplast,Other,...,0,0,0,0,0,0,0,0,0,0
13401,pacid=52534323 transcript=Cre08.g377000_4532.1...,Cre08.g377000_4532.1.p,Cre08.g377000,10 kDa,0,Cre08.g377000,Not Found,secretory pathway/contractile vacuole; shapes ...,Secretory,Other,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# calculate WD z scores
WD_zscore = zscore(WD, 1)
WDz_df = WDdf.copy()
WDz_df.iloc[1:,14:] = WD_zscore

In [75]:
WDz_df

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait CreID,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,Cre17.g734500,Cre17.g738000,Cre17.g740950,Cre17.g741000,Cre17.g741050,Cre17.g743747,Cre17.g745847,Cre17.g745997,Cre24.g755197,Venus-3xFLAG
1,pacid=52538056 transcript=Cre12.g551050_4532.1...,Cre12.g551050_4532.1.p,Cre12.g551050,42 kDa,0,Cre12.g551050,Not Found,Not Found,Other,Other,...,0.324703,-0.777006,-3.813937,-0.343158,0.017699,-0.316148,0.958383,1.189434,-0.081841,-0.746173
2,Venus-FLAG,Venus-FLAG,Venus-FLAG,31 kDa,0,Venus-FLAG,Not Found,Not Found,Not Found,Not Found,...,1.633123,0.213514,1.736471,-0.063389,-1.593324,-0.910758,0.345088,-0.712135,0.280456,1.044953
3,pacid=52531343 transcript=Cre06.g258800_4532.1...,Cre06.g258800_4532.1.p,Cre06.g258800,131 kDa,0,Cre06.g258800,CWP2,Not Found,Secretory,Secretory,...,-1.170106,-0.801726,-2.320762,-1.568271,-0.801726,-0.634,-0.073512,1.710383,-0.219023,-0.473321
4,pacid=52515583 transcript=Cre13.g592500_4532.1...,Cre13.g592500_4532.1.p,Cre13.g592500,61 kDa,0,Cre13.g592500,Not Found,Not Found,Other,Other,...,0.466096,0.955088,-1.880513,0.069718,0.069718,0.437079,-0.777064,-1.955902,-0.651501,-0.305731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,pacid=52510722 transcript=Cre03.g156600_4532.1...,Cre03.g156600_4532.1.p,Cre03.g156600,32 kDa,0,Cre03.g156600,GTRBP1,chloroplast/not homogeneous/not pyrenoid depleted,Chloroplast,Chloroplast,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042
13399,pacid=52540355 transcript=Cre04.g216300_4532.1...,Cre04.g216300_4532.1.p,Cre04.g216300,18 kDa,0,Cre04.g216300,Not Found,Not Found,Other,Chloroplast,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042
13400,pacid=52517051 transcript=Cre16.g653650_4532.1...,Cre16.g653650_4532.1.p,Cre16.g653650,22 kDa,0,Cre16.g653650,Not Found,Not Found,Chloroplast,Other,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042
13401,pacid=52534323 transcript=Cre08.g377000_4532.1...,Cre08.g377000_4532.1.p,Cre08.g377000,10 kDa,0,Cre08.g377000,Not Found,secretory pathway/contractile vacuole; shapes ...,Secretory,Other,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042


In [76]:
# calculate spectral z scores
data_zscore = zscore(data, 1)
z_df = WDdf.copy()
z_df.iloc[1:,14:] = data_zscore

In [77]:
z_df

Unnamed: 0,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108
Bait CreID,Identified Proteins,Accession Number,Alternate ID,Molecular Weight,Protein Grouping Ambiguity,Prey CreID,Prey Gene Symbol,Prey Experimental Localization,Prey PB-Chlamy Predicted Localization,Prey PredAlgo Predicted Localization,...,Cre17.g734500,Cre17.g738000,Cre17.g740950,Cre17.g741000,Cre17.g741050,Cre17.g743747,Cre17.g745847,Cre17.g745997,Cre24.g755197,Venus-3xFLAG
1,pacid=52538056 transcript=Cre12.g551050_4532.1...,Cre12.g551050_4532.1.p,Cre12.g551050,42 kDa,0,Cre12.g551050,Not Found,Not Found,Other,Other,...,0.221295,-0.891465,-2.189685,-0.49405,-0.12313,-0.467556,1.016124,1.334055,-0.229107,-0.86497
2,Venus-FLAG,Venus-FLAG,Venus-FLAG,31 kDa,0,Venus-FLAG,Not Found,Not Found,Not Found,Not Found,...,1.97848,-0.10103,2.175675,-0.369932,-1.051151,-0.88981,0.042384,-0.782249,-0.029323,0.974578
3,pacid=52531343 transcript=Cre06.g258800_4532.1...,Cre06.g258800_4532.1.p,Cre06.g258800,131 kDa,0,Cre06.g258800,CWP2,Not Found,Secretory,Secretory,...,-1.134727,-0.850501,-1.845293,-1.418954,-0.850501,-0.708388,-0.187306,1.944393,-0.329419,-0.566274
4,pacid=52515583 transcript=Cre13.g592500_4532.1...,Cre13.g592500_4532.1.p,Cre13.g592500,61 kDa,0,Cre13.g592500,Not Found,Not Found,Other,Other,...,0.363156,0.98611,-1.574923,-0.086755,-0.086755,0.328547,-0.882752,-1.609532,-0.778927,-0.46745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13398,pacid=52510722 transcript=Cre03.g156600_4532.1...,Cre03.g156600_4532.1.p,Cre03.g156600,32 kDa,0,Cre03.g156600,GTRBP1,chloroplast/not homogeneous/not pyrenoid depleted,Chloroplast,Chloroplast,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042
13399,pacid=52540355 transcript=Cre04.g216300_4532.1...,Cre04.g216300_4532.1.p,Cre04.g216300,18 kDa,0,Cre04.g216300,Not Found,Not Found,Other,Chloroplast,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042
13400,pacid=52517051 transcript=Cre16.g653650_4532.1...,Cre16.g653650_4532.1.p,Cre16.g653650,22 kDa,0,Cre16.g653650,Not Found,Not Found,Chloroplast,Other,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042
13401,pacid=52534323 transcript=Cre08.g377000_4532.1...,Cre08.g377000_4532.1.p,Cre08.g377000,10 kDa,0,Cre08.g377000,Not Found,secretory pathway/contractile vacuole; shapes ...,Secretory,Other,...,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042,-0.030042


In [93]:
# export WD scores, p values, z scores to Excel

with pd.ExcelWriter(WD_out_name, mode='w') as writer:  
    WDdf.to_excel(writer, sheet_name='WD scores')
    p_df.to_excel(writer, sheet_name='p values')
    z_df.to_excel(writer, sheet_name='z scores')
#     WDz_df.to_excel(writer, sheet_name='WD z scores')

In [27]:
# export bait info (manually insert into WD sheet)
with pd.ExcelWriter('bait_info' + WD_out_name, mode='w') as writer:  
    info_df.to_excel(writer, sheet_name='WD scores')