In [2]:

import pandas as pd
import os
import numpy as np


In [3]:

def ensure_dirs_exists(path):
    """Create the directory for ``path`` if it does not already exist.

    If the last component of ``path`` has a file extension (for example
    ``out/data.csv``), ``path`` is treated as a file and its parent directory
    is created. Otherwise ``path`` itself is treated as a directory and is
    created, together with any missing parents.

    Args:
        path: File or directory path (``str`` or ``os.PathLike``).

    Returns:
        None.
    """
    path = os.fspath(path)
    # Only the last component decides "file vs. directory". A dot elsewhere in
    # the path (e.g. "/home/me/my.project/out") must not cause "out" to be
    # dropped.
    _, extension = os.path.splitext(os.path.basename(path))
    if extension:
        path = os.path.dirname(path)
    # A bare file name has an empty dirname: there is nothing to create, and
    # os.makedirs("") would raise FileNotFoundError.
    if path:
        # exist_ok=True avoids the race between an existence check and creation.
        os.makedirs(path, exist_ok=True)
    return

def writeToFile(filename, dataframe):
    """Save ``dataframe`` as ``<filename>_Transformed.csv`` and report the path.

    Args:
        filename: Output path stem; ``'_Transformed.csv'`` is appended to it.
        dataframe: Object exposing ``to_csv`` (a pandas DataFrame in this
            notebook). The index is written as the first CSV column.

    Returns:
        None.
    """
    destination = filename + "_Transformed.csv"
    dataframe.to_csv(destination, index=True)
    print(f"file saved to: {destination}")



In [4]:
def transform_raw(filename, sheetname):
    """Load a wide protein-intensity table and reshape it into long format.

    The input must contain the identifier columns ``Accession``,
    ``Description``, ``Genes``, ``Peptides`` and ``Detected_Imputed``. Every
    other column that survives the filter below is treated as one sample's
    intensity values.

    Args:
        filename: Path to a ``.xlsx`` or ``.csv`` file (``str`` or
            ``os.PathLike``).
        sheetname: Worksheet to read when ``filename`` is an Excel workbook;
            ignored for CSV input.

    Returns:
        A long-format DataFrame with one row per (protein, sample) and the
        columns ``Protein``, ``Origin``, ``Mixture``, ``Function``,
        ``Genotype``, ``BioFraction``, ``Description``, ``Gene``, ``Peptides``,
        ``Detected_Imputed``, ``Sample``, ``Intensity``, ``Total_Intensity``,
        ``Rel_Intensity`` and ``Abundance``.

    Raises:
        ValueError: If ``filename`` has neither a ``.xlsx`` nor a ``.csv``
            extension.
    """
    # Pick the reader from the file name's own extension(s), not from a
    # substring of the whole path: a directory called "csv_dump" or
    # "xlsx_exports" must not influence the choice. Scanning the suffixes from
    # the right keeps names such as "table.csv.gz" working.
    suffixes = os.path.basename(os.fspath(filename)).lower().split(".")[1:]
    file_kind = next((s for s in reversed(suffixes) if s in ("xlsx", "csv")), None)
    if file_kind == "xlsx":
        df = pd.read_excel(filename, sheet_name=sheetname)
    elif file_kind == "csv":
        df = pd.read_csv(filename)
    else:
        raise ValueError(
            "Unsupported input file {!r}: expected a .xlsx or .csv file".format(filename)
        )

    # NOTE(review): this changes a process-wide pandas display option as a
    # side effect of loading data. It is kept for backward compatibility;
    # consider moving it to a notebook configuration cell.
    pd.set_option('display.max_rows', None)

    # Drop every column whose name contains "_CV" or "SPQC" (case-insensitive).
    # str() guards against non-string column labels.
    colsdrop = [x for x in df.columns if '_CV' in str(x).upper() or 'SPQC' in str(x).upper()]
    df = df.drop(columns=colsdrop)

    print(df.columns, '\n', colsdrop)

    long_df = pd.melt(df,
            id_vars=['Accession', 'Description', 'Genes', 'Peptides', 'Detected_Imputed'],
            var_name='Sample',
            value_name='Intensity') # raw intensity

    # Per-protein total across all samples, merged back so each row can be
    # expressed as a fraction of its protein's total.
    abundance_sum = long_df.groupby('Accession')['Intensity'].sum().reset_index(name='Total_Intensity')
    long_df = long_df.merge(abundance_sum, on='Accession')
    long_df['Rel_Intensity'] = long_df['Intensity'] / long_df['Total_Intensity']
    # log2 of the raw intensity; a zero intensity yields -inf.
    long_df['Abundance'] = np.log2(long_df['Intensity'])

    # Origin: the text following the last "org=" in the description.
    long_df.insert(1, 'Origin', long_df['Description'].str.split("org=").str.get(-1))
    # Mixture: the text after the last "_" in the sample name, up to
    # "Normalized", e.g. "110129 Homo_H Normalized" -> "H". The strip removes
    # the space that otherwise remains before "Normalized" ("H ").
    long_df.insert(
        2,
        'Mixture',
        long_df['Sample'].str.split("_").str.get(-1).str.split("Normalized").str.get(0).str.strip(),
    )
    # Function: the description up to its first comma.
    long_df.insert(3, 'Function', long_df['Description'].str.split(",").str.get(0))
    long_df.insert(4, 'Genotype', 'WildType')
    # BioFraction: 1-based running count of rows within each
    # (Accession, Mixture) group, rendered as "r1", "r2", ...
    long_df.insert(5, 'BioFraction', long_df.groupby(['Accession', 'Mixture']).cumcount() + 1)
    long_df['BioFraction'] = 'r' + long_df['BioFraction'].astype(str)

    # The Description column is deliberately retained in the output.
    long_df = long_df.rename(columns={'Accession': 'Protein',
                                      'Genes': 'Gene'})
    return long_df
# Example input: workbook path (relative to the raw-data directory) and the
# worksheet to read from it.
filename = 'data_10297JBK/10297_SupplementalData_Proteome_122723.xlsx'
worksheet = 'Normalized Data'

# The raw-data directory is resolved two levels above the current working
# directory.
currdir = os.getcwd()
parent = os.path.dirname(currdir)
gparent = os.path.dirname(parent)
rawdatafile = "{}/example_data/rawdata/{}".format(gparent, filename)  # EXAMPLE DATA DIR, change for you.

# Convert the wide sheet to long format, then preview the first rows.
dataframe = transform_raw(rawdatafile, worksheet)

dataframe.head()

Index(['Accession', 'Description', 'Genes', 'Peptides', 'Detected_Imputed',
       '110129 Homo_H Normalized', '110130 Homo_H Normalized',
       '110131 Homo_H Normalized', '110132 Super_S1 Normalized',
       '110133 Super_S1 Normalized', '110134 Super_S1 Normalized',
       '110135 CrudeSyn_P2 Normalized', '110136 CrudeSyn_P2 Normalized',
       '110137 CrudeSyn_P2 Normalized', '110138 Syn_LP1 Normalized',
       '110139 Syn_LP1 Normalized', '110140 Syn_LP1 Normalized'],
      dtype='object') 
 ['110160_01 SPQC Normalized', '110160_02 SPQC Normalized', '110160_03 SPQC Normalized', 'Homo_H_CV', 'Super_S1_CV', 'CrudeSyn_P2_CV', 'Syn_LP1_CV', 'SPQC_CV']


Unnamed: 0,Protein,Origin,Mixture,Function,Genotype,BioFraction,Description,Gene,Peptides,Detected_Imputed,Sample,Intensity,Total_Intensity,Rel_Intensity,Abundance
0,A0A087WPF7,Mus musculus,H,Autism susceptibility gene 2 protein homolog,WildType,r1,"Autism susceptibility gene 2 protein homolog, ...",Auts2,2,2.2.2.2.2.2.1.2.-.-.1.1.1.2.2,110129 Homo_H Normalized,543.118684,3057.169232,0.177654,9.085124
1,A0A087WPF7,Mus musculus,H,Autism susceptibility gene 2 protein homolog,WildType,r2,"Autism susceptibility gene 2 protein homolog, ...",Auts2,2,2.2.2.2.2.2.1.2.-.-.1.1.1.2.2,110130 Homo_H Normalized,546.194965,3057.169232,0.17866,9.093272
2,A0A087WPF7,Mus musculus,H,Autism susceptibility gene 2 protein homolog,WildType,r3,"Autism susceptibility gene 2 protein homolog, ...",Auts2,2,2.2.2.2.2.2.1.2.-.-.1.1.1.2.2,110131 Homo_H Normalized,239.267457,3057.169232,0.078264,7.90248
3,A0A087WPF7,Mus musculus,S1,Autism susceptibility gene 2 protein homolog,WildType,r1,"Autism susceptibility gene 2 protein homolog, ...",Auts2,2,2.2.2.2.2.2.1.2.-.-.1.1.1.2.2,110132 Super_S1 Normalized,495.669246,3057.169232,0.162133,8.953234
4,A0A087WPF7,Mus musculus,S1,Autism susceptibility gene 2 protein homolog,WildType,r2,"Autism susceptibility gene 2 protein homolog, ...",Auts2,2,2.2.2.2.2.2.1.2.-.-.1.1.1.2.2,110133 Super_S1 Normalized,249.361333,3057.169232,0.081566,7.962094


In [5]:

def main(filename='data_10297JBK/10297_SupplementalData_Proteome_122723.xlsx',
         worksheet='Normalized Data'):
    """Transform one raw data file and save the long-format result as CSV.

    The input is read from ``<gparent>/example_data/rawdata/<filename>`` and
    the output is written to
    ``<gparent>/transformeddata/<filename without extension>_Transformed.csv``,
    where ``<gparent>`` is two levels above the current working directory.

    Args:
        filename: Input file path relative to the raw-data directory.
        worksheet: Worksheet name passed to ``transform_raw``.

    Returns:
        None.
    """
    ## SNCA == Normalized Data, LRRK2 == Table 4 Normalized Data
    ## Vps35 == Table 3 Normalized Data
    currdir = os.getcwd()
    gparent = os.path.dirname(os.path.dirname(currdir))
    rawdatafile = os.path.join(gparent, "example_data", "rawdata", filename)  # EXAMPLE DATA DIR, change for you.

    # do conversion
    dataframe = transform_raw(rawdatafile, worksheet)

    # write to file
    # splitext removes whatever extension the input has (not only ".xlsx") and
    # only at the end of the name.
    stem, _ = os.path.splitext(filename)
    newfilepath = os.path.join(gparent, "transformeddata", stem)
    # Pass the real output file name so that the *parent* directory is created.
    # Passing the extension-less stem would create a stray directory named
    # after the output file. The suffix must match the one writeToFile appends.
    ensure_dirs_exists(newfilepath + "_Transformed.csv")
    writeToFile(newfilepath, dataframe)


if __name__ == "__main__":
    main()


Index(['Accession', 'Description', 'Genes', 'Peptides', 'Detected_Imputed',
       '110129 Homo_H Normalized', '110130 Homo_H Normalized',
       '110131 Homo_H Normalized', '110132 Super_S1 Normalized',
       '110133 Super_S1 Normalized', '110134 Super_S1 Normalized',
       '110135 CrudeSyn_P2 Normalized', '110136 CrudeSyn_P2 Normalized',
       '110137 CrudeSyn_P2 Normalized', '110138 Syn_LP1 Normalized',
       '110139 Syn_LP1 Normalized', '110140 Syn_LP1 Normalized'],
      dtype='object') 
 ['110160_01 SPQC Normalized', '110160_02 SPQC Normalized', '110160_03 SPQC Normalized', 'Homo_H_CV', 'Super_S1_CV', 'CrudeSyn_P2_CV', 'Syn_LP1_CV', 'SPQC_CV']
file saved to: /home/poojaparameswaran/Documents/SoderlingLab/SpatialProteomics/transformeddata/data_10297JBK/10297_SupplementalData_Proteome_122723_Transformed.csv
