In [6]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import shephard

In [20]:
# Global variables for analysis

# Path of the txt folder
txt_path = '/Users/plutzer/Library/CloudStorage/Box-Box/CellBio-MajorLab/Users/Isaac/Experiments/009_CDKs_OXPO4/txt/'

# Path where outputs will go
output_folder_path = '/Users/plutzer/Library/CloudStorage/Box-Box/CellBio-MajorLab/Users/Isaac/Experiments/009_CDKs_OXPO4/analysis_output/'

# Other globals for analysis
# Experiment name to columns mapping
columns_mapping = {
    'Reporter intensity corrected 1 CDKs':'Ctrl 1',
    'Reporter intensity corrected 2 CDKs':'CDK2 1',
    'Reporter intensity corrected 3 CDKs':'Ctrl 2',
    'Reporter intensity corrected 4 CDKs':'CDK2 2',
    'Reporter intensity corrected 5 CDKs':'Ctrl 3',
    'Reporter intensity corrected 6 CDKs':'CDK2 3',
    'Reporter intensity corrected 7 CDKs':'CDK14 1',
    'Reporter intensity corrected 8 CDKs':'CDK16 1',
    'Reporter intensity corrected 9 CDKs':'CDK14 2',
    'Reporter intensity corrected 10 CDKs':'CDK16 2',
    'Reporter intensity corrected 11 CDKs':'CDK14 3',
    'Reporter intensity corrected 12 CDKs':'CDK16 3',
    'Reporter intensity corrected 13 CDKs':'CDK17 1',
    'Reporter intensity corrected 14 CDKs':'CDK18 1',
    'Reporter intensity corrected 15 CDKs':'CDK17 2',
    'Reporter intensity corrected 16 CDKs':'CDK18 2',
    'Reporter intensity corrected 17 CDKs':'CDK17 3',
    'Reporter intensity corrected 18 CDKs':'CDK18 3'
}

# Columns order for data tables
quant_cols_order = [
    'Ctrl 1',
    'Ctrl 2',
    'Ctrl 3',
    'CDK2 1',
    'CDK2 2',
    'CDK2 3',
    'CDK14 1',
    'CDK14 2',
    'CDK14 3',
    'CDK16 1',
    'CDK16 2',
    'CDK16 3',
    'CDK17 1',
    'CDK17 2',
    'CDK17 3',
    'CDK18 1',
    'CDK18 2',
    'CDK18 3'
]

In [14]:
# Functions

# Code for mixing correction:
def correct_mixing(dataset,columns):
    data = dataset[columns]
    sums = np.sum(data)
    corrected = data/(sums/np.mean(sums))
    return corrected



In [15]:
# Read in the phospho_data

phospho_path = txt_path + 'Phospho (STY)Sites.txt'

phospho_raw = pd.read_csv(phospho_path, sep='\t')

  phospho_raw = pd.read_csv(phospho_path, sep='\t')


In [16]:
# Filter the phospho data
phospho =  phospho_raw[phospho_raw['Localization prob'] >= 0.75]
phospho = phospho[phospho['Reverse'] != '+']
phospho = phospho[phospho['Potential contaminant'] != '+']
# phospho

In [24]:
# Preprocess the phospho data

data = phospho.copy()

cols_list = []
non_quant_cols = []
for col in data.columns:
    if 'Reporter intensity corrected' in col:
        cols_list.append(col)
    elif 'Reporter intensity' in col:
        skip = True
    else:
        non_quant_cols.append(col)

quant_subset = data[cols_list]

non_quant_subset = data[non_quant_cols]

column_names = [col.split('__')[0] for col in quant_subset.columns if '__' in col]

quant_summarized = pd.DataFrame()

for column_name in set(column_names):
    # Filter columns with the current column name
    related_columns = [col for col in quant_subset.columns if col.startswith(column_name)]

    # Sum the related columns and store the result in the summarized DataFrame
    quant_summarized[column_name] = quant_subset[related_columns].sum(axis=1)
    
quant_summarized_mixCorrected = correct_mixing(quant_summarized,quant_summarized.columns)

quant_summarized_mixCorrected.rename(columns=columns_mapping, inplace=True)

quant_summarized_mixCorrected_ordered = quant_summarized_mixCorrected[quant_cols_order]


# combine the non_quant and quant_summarized_mixCorrected dataframes
combined = pd.concat([non_quant_subset,quant_summarized_mixCorrected_ordered],axis=1)

combined.columns


Index(['Proteins', 'Positions within proteins', 'Leading proteins', 'Protein',
       'Protein names', 'Gene names', 'Fasta headers', 'Localization prob',
       'Score diff', 'PEP', 'Score', 'Delta score', 'Score for localization',
       'Localization prob CDKs', 'Score diff CDKs', 'PEP CDKs', 'Score CDKs',
       'Diagnostic peak', 'Number of Phospho (STY)', 'Amino acid',
       'Sequence window', 'Modification window', 'Peptide window coverage',
       'Phospho (STY) Probabilities', 'Phospho (STY) Score diffs',
       'Position in peptide', 'Charge', 'Mass error [ppm]', 'Intensity',
       'Intensity___1', 'Intensity___2', 'Intensity___3', 'Ratio mod/base',
       'Intensity CDKs', 'Ratio mod/base CDKs', 'Intensity CDKs___1',
       'Intensity CDKs___2', 'Intensity CDKs___3', 'Occupancy CDKs',
       'Occupancy ratioCDKs', 'Occupancy error scale CDKs', 'Taxonomy IDs',
       'Taxonomy names', 'Reverse', 'Potential contaminant', 'id',
       'Protein group IDs', 'Positions', 'Positi

In [8]:
# Read in the protein data

pg_path = txt_path + 'proteinGroups.txt'

pg = pd.read_csv(pg_path, sep='\t')
pg

  pg = pd.read_csv(pg_path, sep='\t')


Unnamed: 0,Protein IDs,Majority protein IDs,Peptide counts (all),Peptide counts (razor+unique),Peptide counts (unique),Protein names,Gene names,Fasta headers,Number of proteins,Peptides,...,Mod. peptide IDs,Evidence IDs,MS/MS IDs,Best MS/MS,Oxidation (M) site IDs,Phospho (STY) site IDs,Oxidation (M) site positions,Phospho (STY) site positions,Taxonomy IDs,Taxonomy names
0,P0DPI2;A0A0B4J2D5,P0DPI2;A0A0B4J2D5,9;9,9;9,9;9,,,sp|P0DPI2|GAL3A_HUMAN Glutamine amidotransfera...,2,9,...,15265;20510;24932;28159;34384;49888;51814;7590...,27208;36313;36314;44280;44281;50353;50354;6186...,32823;43812;43813;53420;53421;53422;60815;6081...,32823;43812;53421;60815;74762;107578;111786;16...,0,,243,,-1;-1,;
1,P0DPI3;A0A0U1RRI6;A0A0U1RR11,P0DPI3;A0A0U1RRI6;A0A0U1RR11,1;1;1,1;1;1,1;1;1,,,sp|P0DPI3|CENL2_HUMAN Centromere protein V-lik...,3,1,...,28983,51842;51843,62621;62622;62623,62623,,,,,-1;-1;-1,;;
2,A0AV96,A0AV96,3,3,3,RNA-binding protein 47,RBM47,sp|A0AV96|RBM47_HUMAN RNA-binding protein 47 O...,1,3,...,13145;65234;65235;75185,23520;118002;118003;136239,28398;141540;141541;163599,28398;141541;163599,1,,10,,-1,
3,A0AVF1,A0AVF1,7,7,7,Intraflagellar transport protein 56,TTC26,sp|A0AVF1|IFT56_HUMAN Intraflagellar transport...,1,7,...,782;9367;10588;24978;28393;52579;60160,1481;1482;16648;18876;18877;44349;50781;94047;...,1815;1816;1817;20170;22863;22864;53503;61317;1...,1815;20170;22864;53503;61317;113264;129707,,,,,-1,
4,A0AVK6,A0AVK6,1,1,1,Transcription factor E2F8,E2F8,sp|A0AVK6|E2F8_HUMAN Transcription factor E2F8...,1,1,...,8665,15474;15475,18743;18744,18744,,0,,102,-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7947,REV__Q9ULI3,REV__Q9ULI3,1,1,1,,,sp|Q9ULI3|HEG1_HUMAN Protein HEG homolog 1 OS=...,1,1,...,32525,58638,71000,71000,,11789;11790;11791;11792;11793,,804;805;806;807;812,-1,
7948,REV__Q9UM44,REV__Q9UM44,1,1,1,,,sp|Q9UM44|HHLA2_HUMAN HERV-H LTR-associating p...,1,1,...,48000,86101,103648,103648,,11794;14056,,260;262,-1,
7949,REV__Q9UPV0,REV__Q9UPV0,1,1,1,,,sp|Q9UPV0|CE164_HUMAN Centrosomal protein of 1...,1,1,...,593,1127,1374,1374,,14057;14058,,1126;1132,-1,
7950,REV__Q9Y490,REV__Q9Y490,1,1,1,,,sp|Q9Y490|TLN1_HUMAN Talin-1 OS=Homo sapiens O...,1,1,...,43104,77413,93424,93424,,,,,-1,
