# Multiconsensus Builder

The purpose is to build a multiconsensus file from single run files obtained from PD so to be loaded on MSStats for differential analysis

In [33]:
import numpy as np
import pandas as pd
import os

In [34]:
cd C:\Users\rmagni\Desktop\GMU_projects\Uganda\TB_uganda\data\PD

C:\Users\rmagni\Desktop\GMU_projects\Uganda\TB_uganda\data\PD


- First we combine all the PSM files so we have the full list of peptides

## Combine all PSM files one after the other

In [12]:
# Load all the files
path = os.getcwd()
files = os.listdir(path)
files[:5]

['.~lock.Ruben_Sample_117_PSMs.txt#',
 'Ruben_Sample_117_Proteins.txt',
 'Ruben_Sample_117_PSMs.txt',
 'Ruben_Sample_119_Proteins.txt',
 'Ruben_Sample_119_PSMs.txt']

In [16]:
df_psm = pd.DataFrame()
path =r"C:\Users\rmagni\Desktop\GMU_projects\Uganda\TB_uganda\data\PD"
os.chdir(path)
files = os.listdir(path)
files_txt = [f for f in files if f.endswith("PSMs.txt")]
for f in files_txt:
    data = pd.read_csv(f, delimiter="\t")
    df_psm = df_psm.append(data)

In [35]:
# Master accession proteins have multiple ids. If they are multiple I want to replicate the row, so then I can match with the psms files.
#Set the columns not to be touched as the index
df_psm2 = df_psm.assign(var1=df_psm['Master Protein Accessions'].str.split('; ')).explode('var1')

In [39]:
# make sure that the Master protein accession does not contain blank spaces
df_psm2['Master Protein Accessions'] = df_psm2['Master Protein Accessions'].str.strip()

## Combine all Protein files one after the other

In [53]:
df_proteins = pd.DataFrame()
path =r"C:\Users\rmagni\Desktop\GMU_projects\Uganda\TB_uganda\data\PD"
os.chdir(path)
files = os.listdir(path)
files_txt = [f for f in files if f.endswith("Proteins.txt")]
for f in files_txt:
    data = pd.read_csv(f, delimiter="\t")
    df_proteins = df_proteins.append(data)

## Find all proteins from TB and merge Proteins and Peptides

In [66]:
# Find Tb proteins
Pattern = r"[Mm]ycobacterium"
tb_proteins_mask = df_proteins.Description.str.contains(Pattern)
tb_proteins = df_proteins[tb_proteins_mask]

In [67]:
# Select relevant columns in database
tb_proteins = tb_proteins[['Accession', 'Description', 'Biological Process', 'Cellular Component', 'Molecular Function', 'WikiPathways', ]]

In [68]:
# merge peptides and proteins databases
merged = pd.merge(left=df_psm2, right=tb_proteins, how='left', left_on='Master Protein Accessions', right_on='Accession')

In [69]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487758 entries, 0 to 487757
Data columns (total 46 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   PSMs Workflow ID            487758 non-null  int64  
 1   PSMs Peptide ID             487758 non-null  int64  
 2   Checked                     487758 non-null  bool   
 3   Confidence                  487758 non-null  object 
 4   Identifying Node            487758 non-null  object 
 5   PSM Ambiguity               487758 non-null  object 
 6   Annotated Sequence          487758 non-null  object 
 7   Modifications               48209 non-null   object 
 8   # Proteins                  487758 non-null  int64  
 9   Master Protein Accessions   487156 non-null  object 
 10  Protein Accessions          487758 non-null  object 
 11  # Missed Cleavages          487758 non-null  int64  
 12  Charge                      487758 non-null  int64  
 13  DeltaScore    

In [70]:
# Filter only the entries containing TB
Pattern = r"[Mm]ycobacterium"
merged_mask = merged.Description.notnull()
merged = merged[merged_mask]

In [77]:
# I need to drop duplcates based on 2 columns
merged = merged.drop_duplicates(subset=['Annotated Sequence', 'Spectrum File'])

In [78]:
# create dummy columns for each Spectrum file
merged_dummy_col = pd.get_dummies(merged, columns = ['Spectrum File'])

In [80]:
merged_dummy_col

Unnamed: 0,PSMs Workflow ID,PSMs Peptide ID,Checked,Confidence,Identifying Node,PSM Ambiguity,Annotated Sequence,Modifications,# Proteins,Master Protein Accessions,Protein Accessions,# Missed Cleavages,Charge,DeltaScore,DeltaCn,Rank,Search Engine Rank,m/z [Da],MH+ [Da],Theo. MH+ [Da],DeltaM [ppm],Deltam/z [Da],Activation Type,MS Order,Isolation Interference [%],Ion Inject Time [ms],RT [min],First Scan,File ID,Amanda Score,CharmeRT Combined Score,Search Space,MS Amanda Rank,Search Depth,XCorr,# Protein Groups,Percolator q-Value,Percolator PEP,var1,Accession,Description,Biological Process,Cellular Component,Molecular Function,WikiPathways,Spectrum File_Ruben_Sample_117.raw,Spectrum File_Ruben_Sample_120.raw,Spectrum File_Ruben_Sample_489.raw,Spectrum File_Ruben_Sample_491.raw,Spectrum File_Ruben_Sample_498.raw,Spectrum File_Ruben_Sample_500.raw,Spectrum File_Ruben_Sample_504.raw,Spectrum File_Ruben_Sample_514.raw,Spectrum File_Ruben_Sample_515.raw,Spectrum File_Ruben_Sample_527.raw,Spectrum File_Ruben_Sample_528.raw,Spectrum File_Ruben_Sample_535.raw,Spectrum File_Ruben_Sample_538.raw,Spectrum File_Ruben_Sample_542.raw,Spectrum File_Ruben_Sample_644.raw,Spectrum File_Ruben_Sample_651.raw,Spectrum File_Ruben_Sample_656.raw,Spectrum File_Ruben_Sample_667.raw,Spectrum File_Ruben_Sample_684.raw,Spectrum File_Ruben_Sample_700.raw,Spectrum File_Ruben_Sample_706.raw,Spectrum File_Ruben_Sample_718.raw,Spectrum File_Ruben_Sample_721.raw,Spectrum File_Ruben_Sample_722.raw,Spectrum File_Ruben_Sample_730.raw,Spectrum File_Ruben_Sample_732.raw,Spectrum File_Ruben_Sample_734.raw,Spectrum File_Ruben_Sample_737.raw,Spectrum File_Ruben_Sample_738.raw,Spectrum File_Ruben_Sample_745.raw,Spectrum File_Ruben_Sample_746.raw,Spectrum File_Ruben_Sample_754.raw,Spectrum File_Ruben_Sample_757.raw,Spectrum File_Ruben_Sample_758.raw,Spectrum File_Ruben_Sample_771.raw,Spectrum File_Ruben_Sample_785.raw,Spectrum File_Ruben_Sample_786.raw,Spectrum File_Ruben_Sample_787.raw,Spectrum File_Ruben_Sample_791.raw,Spectrum File_Ruben_Sample_794.raw,Spectrum File_Ruben_Sample_795.raw,Spectrum File_Ruben_Sample_800.raw,Spectrum File_Ruben_Sample_817.raw,Spectrum File_Ruben_Sample_818.raw,Spectrum File_Ruben_Sample_820.raw,Spectrum File_Ruben_Sample_822.raw,Spectrum File_Ruben_Sample_840.raw,Spectrum File_Ruben_Sample_845.raw,Spectrum File_Ruben_Sample_850 (2).raw,Spectrum File_Ruben_Sample_861.raw,Spectrum File_Ruben_Sample_866.raw,Spectrum File_Ruben_Sample_872.raw,Spectrum File_Ruben_Sample_876.raw,Spectrum File_Ruben_Sample_886.raw,Spectrum File_Ruben_Sample_888.raw
962,-429,512384,False,High,Sequest HT (A2),Unambiguous,VSGPDPVPGcLSSINGQPcRPPHcVASVSPARPSAGSP,C10(Carbamidomethyl); C19(Carbamidomethyl); C2...,1,A5U2I9,A5U2I9,0,4,0.3129,0.0,1,1,963.71503,3851.83828,3851.83742,0.22,0.00021,HCD,MS2,0.000000,22.180,61.2723,57776,F1,,,0,0,0,1.47,1,0.003236,0.18450,A5U2I9,A5U2I9,Coproporphyrin III ferrochelatase OS=Mycobacte...,metabolic process,cytoplasm,catalytic activity;metal ion binding,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4537,-448,115466,False,High,MS Amanda 2.0 (A4),Selected,VFDETIGGDAHTWLR,,1,Q10690,Q10690,0,3,0.1057,0.0,1,1,572.95001,1716.83548,1716.83401,0.86,0.00049,HCD,MS2,11.732860,22.166,35.4511,36584,F3,83.07,83.07,123,1,1,,1,0.003180,0.08999,Q10690,Q10690,Uncharacterized protein Rv2082 OS=Mycobacteriu...,regulation of biological process;response to s...,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21163,-519,913925,False,High,Sequest HT (A2),Selected,AFGISGEVTFDYGIAYDR,,1,P9WHH9,P9WHH9,0,3,0.3689,0.0,1,1,660.98175,1980.93070,1980.93378,-1.56,-0.00103,HCD,MS2,0.000000,22.161,80.9755,89801,F10,,,0,0,0,2.06,1,0.001017,0.04318,P9WHH9,P9WHH9,Dihydrolipoyl dehydrogenase OS=Mycobacterium t...,cellular homeostasis;metabolic process;regulat...,cytoplasm;cytosol;extracellular;membrane,antioxidant activity;catalytic activity;nucleo...,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25589,-538,720569,False,High,Sequest HT (A2),Selected,TTGTYLAVATIGTESDR,,1,P9WLH0,P9WLH0,0,2,0.4444,0.0,1,1,878.43903,1755.87078,1755.87593,-2.94,-0.00258,HCD,MS2,0.000000,22.202,61.0170,61987,F12,,,0,0,0,1.44,1,0.003934,0.03517,P9WLH0,P9WLH0,Uncharacterized protein MT2296 OS=Mycobacteriu...,,membrane,,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36516,-579,318846,False,High,Sequest HT (A2),Selected,HLAEQGLR,,2,P9WPS4,P9WPS4; P9WPS5,0,3,0.0560,0.0,1,1,308.50726,923.50724,923.50574,1.62,0.00050,HCD,MS2,52.934190,22.172,8.6825,7630,F17,,,0,0,0,1.25,1,0.003429,0.08128,P9WPS4,P9WPS4,Probable cation-transporting ATPase I OS=Mycob...,,membrane,catalytic activity;metal ion binding;nucleotid...,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452576,-1988,555611,False,High,Sequest HT (A2),Selected,VHSETAIDAAGASVVSVALGmAER,M21(Oxidation),1,A1KFY5,A1KFY5,0,3,0.0720,0.0,1,1,786.39978,2357.18479,2357.17655,3.49,0.00275,HCD,MS2,7.886601,22.214,56.8018,55931,F156,,,0,0,0,1.25,1,0.001431,0.09492,A1KFY5,A1KFY5,Glutamyl-tRNA reductase OS=Mycobacterium bovis...,metabolic process,,catalytic activity;nucleotide binding,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
458093,-1999,422000,False,High,Sequest HT (A2),Unambiguous,SLDVLTAAR,,1,B2HHL4,B2HHL4,0,3,0.2645,0.0,1,1,315.85052,945.53702,945.53637,0.69,0.00022,HCD,MS2,75.167280,22.246,19.1116,19515,F158,,,0,0,0,1.21,1,0.000755,0.02968,B2HHL4,B2HHL4,Lipoyl synthase OS=Mycobacterium marinum (stra...,metabolic process,cytoplasm,catalytic activity;metal ion binding,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
467522,-2008,420130,False,High,Sequest HT (A2),Unambiguous,VGTEVIR,,2,P9WQL6,P9WQL6; P9WQL7,0,2,0.1131,0.0,1,1,387.23001,773.45275,773.45158,1.51,0.00058,HCD,MS2,50.542430,22.158,21.6020,21449,F159,,,0,0,0,1.68,1,0.002535,0.06967,P9WQL6,P9WQL6,Fluoroquinolones export ATP-binding protein MT...,response to stimulus;transport,membrane,catalytic activity;nucleotide binding;transpor...,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
475688,-2026,190183,False,High,MS Amanda 2.0 (A4),Unambiguous,AGLAAGQKPTAVIFGCADSR,,1,P9WPJ9,P9WPJ9,0,2,0.4678,0.0,1,1,967.00476,1933.00224,1932.99601,3.23,0.00312,HCD,MS2,91.255360,22.144,74.9947,76653,F161,70.61,70.61,201,1,1,,1,0.001718,0.15440,P9WPJ9,P9WPJ9,Carbonic anhydrase 2 OS=Mycobacterium tubercul...,,,catalytic activity;metal ion binding;protein b...,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [81]:
# Change name of dummy columns
def clean_col(col):
    col = col.replace("Spectrum File_Ruben_Sample_","")
    col = col.replace(".raw", "")
    return col

new_columns = []
for c in merged_dummy_col.columns:
    clean_c = clean_col(c)
    new_columns.append(clean_c)
    
merged_dummy_col.columns = new_columns

In [84]:
# copy database
merged_dummy_col = merged_dummy_col.drop(columns=['850 (2)'])

In [85]:
cd C:\Users\rmagni\Desktop\GMU_projects\Uganda\TB_uganda\data\

C:\Users\rmagni\Desktop\GMU_projects\Uganda\TB_uganda\data


In [86]:
# create a list of negatives
clinical_info = pd.read_csv('Uganda_LAM_clinical_data.csv')

In [87]:
list_of_neg = clinical_info[clinical_info['tbpos'] == 0].id.tolist()

In [88]:
# create a list with only neg samples as strings
list_of_neg = [str(i) for i in list_of_neg]

In [89]:
# remove rows where negative proteins were identified
for col in list_of_neg:
    if col in merged_dummy_col.columns:
        indexNames = merged_dummy_col[merged_dummy_col[col]> 0].index
        merged_dummy_col = merged_dummy_col.drop(indexNames)

In [95]:
merged_dummy_col

Unnamed: 0,PSMs Workflow ID,PSMs Peptide ID,Checked,Confidence,Identifying Node,PSM Ambiguity,Annotated Sequence,Modifications,# Proteins,Master Protein Accessions,Protein Accessions,# Missed Cleavages,Charge,DeltaScore,DeltaCn,Rank,Search Engine Rank,m/z [Da],MH+ [Da],Theo. MH+ [Da],DeltaM [ppm],Deltam/z [Da],Activation Type,MS Order,Isolation Interference [%],Ion Inject Time [ms],RT [min],First Scan,File ID,Amanda Score,CharmeRT Combined Score,Search Space,MS Amanda Rank,Search Depth,XCorr,# Protein Groups,Percolator q-Value,Percolator PEP,var1,Accession,Description,Biological Process,Cellular Component,Molecular Function,WikiPathways,117,120,489,491,498,500,504,514,515,527,528,535,538,542,644,651,656,667,684,700,706,718,721,722,730,732,734,737,738,745,746,754,757,758,771,785,786,787,791,794,795,800,817,818,820,822,840,845,861,866,872,876,886,888
962,-429,512384,False,High,Sequest HT (A2),Unambiguous,VSGPDPVPGcLSSINGQPcRPPHcVASVSPARPSAGSP,C10(Carbamidomethyl); C19(Carbamidomethyl); C2...,1,A5U2I9,A5U2I9,0,4,0.3129,0.0,1,1,963.71503,3851.83828,3851.83742,0.22,0.00021,HCD,MS2,0.0,22.18,61.2723,57776,F1,,,0,0,0,1.47,1,0.003236,0.1845,A5U2I9,A5U2I9,Coproporphyrin III ferrochelatase OS=Mycobacte...,metabolic process,cytoplasm,catalytic activity;metal ion binding,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4537,-448,115466,False,High,MS Amanda 2.0 (A4),Selected,VFDETIGGDAHTWLR,,1,Q10690,Q10690,0,3,0.1057,0.0,1,1,572.95001,1716.83548,1716.83401,0.86,0.00049,HCD,MS2,11.73286,22.166,35.4511,36584,F3,83.07,83.07,123,1,1,,1,0.00318,0.08999,Q10690,Q10690,Uncharacterized protein Rv2082 OS=Mycobacteriu...,regulation of biological process;response to s...,,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21163,-519,913925,False,High,Sequest HT (A2),Selected,AFGISGEVTFDYGIAYDR,,1,P9WHH9,P9WHH9,0,3,0.3689,0.0,1,1,660.98175,1980.9307,1980.93378,-1.56,-0.00103,HCD,MS2,0.0,22.161,80.9755,89801,F10,,,0,0,0,2.06,1,0.001017,0.04318,P9WHH9,P9WHH9,Dihydrolipoyl dehydrogenase OS=Mycobacterium t...,cellular homeostasis;metabolic process;regulat...,cytoplasm;cytosol;extracellular;membrane,antioxidant activity;catalytic activity;nucleo...,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25589,-538,720569,False,High,Sequest HT (A2),Selected,TTGTYLAVATIGTESDR,,1,P9WLH0,P9WLH0,0,2,0.4444,0.0,1,1,878.43903,1755.87078,1755.87593,-2.94,-0.00258,HCD,MS2,0.0,22.202,61.017,61987,F12,,,0,0,0,1.44,1,0.003934,0.03517,P9WLH0,P9WLH0,Uncharacterized protein MT2296 OS=Mycobacteriu...,,membrane,,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100002,-787,133454,False,High,MS Amanda 2.0 (A4),Unambiguous,VFDETIGGDAHTWLR,,1,Q10690,Q10690,0,3,0.4616,0.0,1,1,572.95099,1716.83841,1716.83401,2.56,0.00147,HCD,MS2,33.14251,22.195,35.6428,37711,F35,140.04,140.04,134,1,1,,1,0.001249,0.02875,Q10690,Q10690,Uncharacterized protein Rv2082 OS=Mycobacteriu...,regulation of biological process;response to s...,,,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101338,-787,999132,False,High,Sequest HT (A2),Unambiguous,QLVLGDNLDTEHIDAcYDAGVLR,C16(Carbamidomethyl),1,P46732,P46732,0,3,0.5464,0.0,1,1,863.09094,2587.25827,2587.24569,4.86,0.00419,HCD,MS2,52.37561,22.165,84.8522,92665,F35,,,0,0,0,0.97,1,0.004664,0.0636,P46732,P46732,18 kDa antigen 2 OS=Mycobacterium intracellula...,,,,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
159614,-987,99260,False,High,MS Amanda 2.0 (A4),Unambiguous,VGTEVIR,,2,P9WQL6,P9WQL6; P9WQL7,0,2,0.2769,0.0,1,1,387.22964,773.45201,773.45158,0.56,0.00022,HCD,MS2,80.01502,22.191,23.8789,23590,F56,151.12,151.12,606,1,1,,1,0.001523,0.02758,P9WQL6,P9WQL6,Fluoroquinolones export ATP-binding protein MT...,response to stimulus;transport,membrane,catalytic activity;nucleotide binding;transpor...,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
191638,-1038,522930,False,High,Sequest HT (A2),Unambiguous,FGSPELQQGWGVSAVSGDR,,1,I6Y3Q0,I6Y3Q0,0,3,0.4194,0.0,1,1,659.6507,1976.93753,1976.94608,-4.32,-0.00285,HCD,MS2,22.35029,22.168,30.2839,31232,F62,,,0,0,0,1.24,1,0.001391,0.06051,I6Y3Q0,I6Y3Q0,Acyl-CoA dehydrogenase FadE27 OS=Mycobacterium...,metabolic process,,catalytic activity;nucleotide binding,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197017,-1068,560498,False,High,Sequest HT (A2),Unambiguous,FGSPELQQGWGVSAVSGDR,,1,I6Y3Q0,I6Y3Q0,0,3,0.3077,0.0,1,1,659.65277,1976.94376,1976.94608,-1.17,-0.00077,HCD,MS2,78.21826,22.198,30.1519,30826,F63,,,0,0,0,1.3,1,0.002073,0.08867,I6Y3Q0,I6Y3Q0,Acyl-CoA dehydrogenase FadE27 OS=Mycobacterium...,metabolic process,,catalytic activity;nucleotide binding,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
219786,-1137,568797,False,High,Sequest HT (A2),Unambiguous,FPIIDDR,,1,Q02251,Q02251,0,2,0.2848,0.0,1,1,438.23505,875.46282,875.46214,0.77,0.00034,HCD,MS2,67.71341,22.16,33.3602,35692,F72,,,0,0,0,1.51,1,0.002768,0.04,Q02251,Q02251,Mycocerosic acid synthase OS=Mycobacterium bov...,metabolic process,membrane,catalytic activity,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [97]:
merged_dummy_col = merged_dummy_col.drop(['PSMs Workflow ID', 'Checked', 'Confidence', 'PSM Ambiguity', '# Proteins', 'Protein Accessions', 'DeltaScore', 'DeltaCn', 'Rank', 'Search Engine Rank', 'MS Order', 'Isolation Interference [%]', 'Ion Inject Time [ms]', 'Amanda Score', 'CharmeRT Combined Score', 'Search Space', 'MS Amanda Rank', 'Search Depth', 'XCorr', '# Protein Groups', 'var1', 'WikiPathways'], axis=1)

In [98]:
merged_dummy_col

Unnamed: 0,PSMs Peptide ID,Identifying Node,Annotated Sequence,Modifications,Master Protein Accessions,# Missed Cleavages,Charge,m/z [Da],MH+ [Da],Theo. MH+ [Da],DeltaM [ppm],Deltam/z [Da],Activation Type,RT [min],First Scan,File ID,Percolator q-Value,Percolator PEP,Accession,Description,Biological Process,Cellular Component,Molecular Function,117,120,489,491,498,500,504,514,515,527,528,535,538,542,644,651,656,667,684,700,706,718,721,722,730,732,734,737,738,745,746,754,757,758,771,785,786,787,791,794,795,800,817,818,820,822,840,845,861,866,872,876,886,888
962,512384,Sequest HT (A2),VSGPDPVPGcLSSINGQPcRPPHcVASVSPARPSAGSP,C10(Carbamidomethyl); C19(Carbamidomethyl); C2...,A5U2I9,0,4,963.71503,3851.83828,3851.83742,0.22,0.00021,HCD,61.2723,57776,F1,0.003236,0.1845,A5U2I9,Coproporphyrin III ferrochelatase OS=Mycobacte...,metabolic process,cytoplasm,catalytic activity;metal ion binding,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4537,115466,MS Amanda 2.0 (A4),VFDETIGGDAHTWLR,,Q10690,0,3,572.95001,1716.83548,1716.83401,0.86,0.00049,HCD,35.4511,36584,F3,0.00318,0.08999,Q10690,Uncharacterized protein Rv2082 OS=Mycobacteriu...,regulation of biological process;response to s...,,,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21163,913925,Sequest HT (A2),AFGISGEVTFDYGIAYDR,,P9WHH9,0,3,660.98175,1980.9307,1980.93378,-1.56,-0.00103,HCD,80.9755,89801,F10,0.001017,0.04318,P9WHH9,Dihydrolipoyl dehydrogenase OS=Mycobacterium t...,cellular homeostasis;metabolic process;regulat...,cytoplasm;cytosol;extracellular;membrane,antioxidant activity;catalytic activity;nucleo...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25589,720569,Sequest HT (A2),TTGTYLAVATIGTESDR,,P9WLH0,0,2,878.43903,1755.87078,1755.87593,-2.94,-0.00258,HCD,61.017,61987,F12,0.003934,0.03517,P9WLH0,Uncharacterized protein MT2296 OS=Mycobacteriu...,,membrane,,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100002,133454,MS Amanda 2.0 (A4),VFDETIGGDAHTWLR,,Q10690,0,3,572.95099,1716.83841,1716.83401,2.56,0.00147,HCD,35.6428,37711,F35,0.001249,0.02875,Q10690,Uncharacterized protein Rv2082 OS=Mycobacteriu...,regulation of biological process;response to s...,,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101338,999132,Sequest HT (A2),QLVLGDNLDTEHIDAcYDAGVLR,C16(Carbamidomethyl),P46732,0,3,863.09094,2587.25827,2587.24569,4.86,0.00419,HCD,84.8522,92665,F35,0.004664,0.0636,P46732,18 kDa antigen 2 OS=Mycobacterium intracellula...,,,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
159614,99260,MS Amanda 2.0 (A4),VGTEVIR,,P9WQL6,0,2,387.22964,773.45201,773.45158,0.56,0.00022,HCD,23.8789,23590,F56,0.001523,0.02758,P9WQL6,Fluoroquinolones export ATP-binding protein MT...,response to stimulus;transport,membrane,catalytic activity;nucleotide binding;transpor...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
191638,522930,Sequest HT (A2),FGSPELQQGWGVSAVSGDR,,I6Y3Q0,0,3,659.6507,1976.93753,1976.94608,-4.32,-0.00285,HCD,30.2839,31232,F62,0.001391,0.06051,I6Y3Q0,Acyl-CoA dehydrogenase FadE27 OS=Mycobacterium...,metabolic process,,catalytic activity;nucleotide binding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197017,560498,Sequest HT (A2),FGSPELQQGWGVSAVSGDR,,I6Y3Q0,0,3,659.65277,1976.94376,1976.94608,-1.17,-0.00077,HCD,30.1519,30826,F63,0.002073,0.08867,I6Y3Q0,Acyl-CoA dehydrogenase FadE27 OS=Mycobacterium...,metabolic process,,catalytic activity;nucleotide binding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
219786,568797,Sequest HT (A2),FPIIDDR,,Q02251,0,2,438.23505,875.46282,875.46214,0.77,0.00034,HCD,33.3602,35692,F72,0.002768,0.04,Q02251,Mycocerosic acid synthase OS=Mycobacterium bov...,metabolic process,membrane,catalytic activity,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [100]:
merged_dummy_col.to_csv('TB_proteins_PD.csv')

In [46]:
pd1 = pd.read_csv('peptide_5T.csv')
pd2 = pd.read_csv('peptide_6ANL.csv')
pd3 = pd.read_csv('peptide_6DNL.csv')

In [47]:
list_d = [pd1, pd2, pd3]

In [23]:
def clean_db(list_):
    list_db = []
    for db_ in list_:
        db_ = db_.iloc[:,[0, 7, 13]]
        list_db.append(db_)
    return list_db

In [24]:
def merging_db(list_db):
    df = list_db[0]
    for df_ in list_db[1:]:
        df = df.merge(df_, on = 'Peptide', how='outer')
    return df

In [50]:

df_final = merging_db(clean_db(list_d))

In [51]:
df_final.columns

Index(['Peptide', 'Area Sample 3', '#Feature Sample 3', 'Accession_x',
       'Area Sample 4', '#Feature Sample 4', 'Accession_y', 'Area Sample 5',
       '#Feature Sample 5', 'Accession'],
      dtype='object')

In [60]:
df_final


Unnamed: 0,Peptide,Area Sample 3,#Feature Sample 3,Accession_x,Area Sample 4,#Feature Sample 4,Accession_y,Area Sample 5,#Feature Sample 5,Accession
0,GLVEPVDVVDNADGTQTVNYVPSR,14800000.0,2.0,VFV42729.1:VFV42728.1:VFV42730.1,8.330000e+07,2.0,VFV42729.1:VFV42728.1,104000000.0,2.0,VFV42729.1:VFV42728.1
1,NLHQSGFSLSGAQIDDNIPR,21800000.0,2.0,VFV40282.1:VFV40281.1,9.020000e+06,1.0,VFV40282.1:VFV40281.1,10700000.0,1.0,VFV40282.1:VFV40281.1
2,LVQDVANNTNEEAGDGTTTATVLAR,7490000.0,2.0,NP_002147.2:NP_955472.1,1.800000e+07,2.0,NP_002147.2:NP_955472.1,40000000.0,2.0,NP_002147.2:NP_955472.1
3,GDLENAFLNLVQC(+57.02)IQNKPLYFADR,905000000.0,2.0,VFV25874.1,5.220000e+09,2.0,VFV25874.1,346000000.0,2.0,VFV25874.1
4,VGAIPANALDDGQWSQGLISAAR,1750000.0,1.0,VFV44382.1,5.930000e+06,1.0,VFV44382.1,9730000.0,1.0,VFV44382.1
...,...,...,...,...,...,...,...,...,...,...
3406,HELVVQAR,,,,,,,0.0,0.0,VFV46470.1:VFV46472.1:VFV46471.1
3407,VKGEYDVTVPK,,,,,,,6340000.0,1.0,VFV19178.1
3408,AHGPGLEGGLVGKPAEFTIDTK,,,,,,,1050000.0,1.0,VFV17541.1:VFV17543.1:VFV17540.1:VFV17542.1
3409,GQGVYLGMPGC(+57.02)LPVYDALAGEFIR,,,,,,,3470000.0,1.0,VFV23786.1


## 1% FDR for Multiconsensus

In [16]:
cd C:\Users\rmagni\Desktop\GMU\ASSIP_2020\Lung Cancer Microbiome Uniprot DB Results\For Ruben\PEAKS_DB_1_FDR_PEPTIDES_csv_ONLY

C:\Users\rmagni\Desktop\GMU\ASSIP_2020\Lung Cancer Microbiome Uniprot DB Results\For Ruben\PEAKS_DB_1_FDR_PEPTIDES_csv_ONLY


In [43]:
pd1 = pd.read_csv('peptide_Sample_6D_1%_FDR.csv')
pd2 = pd.read_csv('peptide_Sample_6T _1% _FDR.csv')
pd3 = pd.read_csv('peptide_Sample_7D_1%_FDR.csv')
pd4 = pd.read_csv('peptide_Sample_7T_1%_FDR.csv')
pd5 = pd.read_csv('peptide_Sample_8D_1%_FDR.csv')
pd6 = pd.read_csv('peptide_Sample_8T_1%_FDR.csv')
pd7 = pd.read_csv('peptide_Sample_10D_1%_FDR.csv')
pd8 = pd.read_csv('peptide_Sample_10T_1%_FDR.csv')
pd9 = pd.read_csv('peptide_Sample_11D_1%_FDR.csv')
pd10 = pd.read_csv('peptide_Sample_11T_1%_FDR.csv')

In [44]:
list_d = [pd1, pd2, pd3, pd4, pd5, pd6, pd7, pd8, pd9, pd10]

In [45]:
def clean_db(list_):
    list_db = []
    for db_ in list_:
        db_ = db_.iloc[:,[0, 7, 13]]
        list_db.append(db_)
    return list_db

In [46]:
cleaned_result = clean_db(list_d)

In [47]:
merged_result = merging_db(cleaned_result)

In [48]:
merged_result.columns

Index(['Peptide', 'Area Sample 6D', 'Accession_x', 'Area Sample 6T',
       'Accession_y', 'Area Sample 7D', 'Accession_x', 'Area Sample 7T',
       'Accession_y', 'Area Sample 8D', 'Accession_x', 'Area Sample 8T',
       'Accession_y', 'Area Sample 10D', 'Accession_x', 'Area Sample 10T',
       'Accession_y', 'Area Sample 11D', 'Accession_x', 'Area Sample 11T',
       'Accession_y'],
      dtype='object')

In [49]:
merged_result.to_csv("multiconsensus.csv")

PermissionError: [Errno 13] Permission denied: 'multiconsensus.csv'

In [53]:
def merging_db_2(list_db):
    df = list_db[0]
    for df_ in list_db[1:]:
        df = df.merge(df_, on = ['Peptide', 'Accession'] , how='outer')
    df = df.drop_duplicates('Peptide')
    return df

In [54]:
merged_result_2 = merging_db_2(cleaned_result)

In [56]:
merged_result_2.to_csv("multiconsensus_fdr_1.csv")

## 5% FDR Multiconsensus

In [57]:
cd C:\Users\rmagni\Desktop\GMU\ASSIP_2020\Lung Cancer Microbiome Uniprot DB Results\For Ruben\PEAKS_DB_5_FDR_PEPTIDES_csv_ONLY

C:\Users\rmagni\Desktop\GMU\ASSIP_2020\Lung Cancer Microbiome Uniprot DB Results\For Ruben\PEAKS_DB_5_FDR_PEPTIDES_csv_ONLY


In [68]:
pd11 = pd.read_csv('peptide_Sample_6D_5%_FDR.csv')
pd12 = pd.read_csv('peptide_Sample_6T _5% _FDR.csv')
pd13 = pd.read_csv('peptide_Sample_7D_5%_FDR.csv')
pd14 = pd.read_csv('peptide_Sample_7T_5%_FDR.csv')
pd15 = pd.read_csv('peptide_Sample_8D_5%_FDR.csv')
pd16 = pd.read_csv('peptide_Sample_8T_5%_FDR.csv')
pd17 = pd.read_csv('peptide_Sample_10D_5%_FDR.csv')
pd18 = pd.read_csv('peptide_Sample_10T_5%_FDR.csv')
pd19 = pd.read_csv('peptide_Sample_11D_5%_FDR.csv')
pd20 = pd.read_csv('peptide_Sample_11T_5%_FDR.csv')

In [69]:
list_d = [pd11, pd12, pd13, pd14, pd15, pd16, pd17, pd18, pd19, pd20]

In [70]:
cleaned_result_2 = clean_db(list_d)

In [71]:
merged_result_2 = merging_db_2(cleaned_result_2)

In [72]:
merged_result_2.columns

Index(['Peptide', 'Area Sample 6D', 'Accession', 'Area Sample 6T',
       'Area Sample 7D', 'Area Sample 7T', 'Area Sample 8D', 'Area Sample 8T',
       'Area Sample 10D', 'Area Sample 10T', 'Area Sample 11D',
       'Area Sample 11T'],
      dtype='object')

In [73]:
merged_result_2.to_csv("multiconsensus_fdr_5.csv")

## 1% FDR Proteins

In [111]:
cd C:\Users\rmagni\Desktop\GMU\ASSIP_2020\Lung Cancer Microbiome Uniprot DB Results\For Ruben\others

C:\Users\rmagni\Desktop\GMU\ASSIP_2020\Lung Cancer Microbiome Uniprot DB Results\For Ruben\others


In [112]:
pd1 = pd.read_csv('proteins_Sample_6D_1%_FDR.csv')
pd2 = pd.read_csv('proteins_Sample_6T _1% _FDR.csv')
pd3 = pd.read_csv('proteins_Sample_7D_1%_FDR.csv')
pd4 = pd.read_csv('proteins_Sample_7T_1%_FDR.csv')
pd5 = pd.read_csv('proteins_Sample_8D_1%_FDR.csv')
pd6 = pd.read_csv('proteins_Sample _8T_1%_FDR.csv')
pd7 = pd.read_csv('proteins_Sample_10D_1%_FDR.csv')
pd8 = pd.read_csv('proteins_Sample_10T_1%_FDR.csv')
pd9 = pd.read_csv('proteins_Sample_11D_1%_FDR.csv')
pd10 = pd.read_csv('proteins_Sample_11T_1%_FDR.csv')

In [113]:
list_d = [pd1, pd2, pd3, pd4, pd5, pd6, pd7, pd8, pd9, pd10]

In [114]:
def clean_db_prot(list_):
    list_db = []
    for db_ in list_:
        db_ = db_.iloc[:,[2, 6, 12]]
        list_db.append(db_)
    return list_db

In [115]:
cleaned_result_prot_1 = clean_db_prot(list_d)

In [116]:
def merging_db_prot(list_db):
    df = list_db[0]
    for df_ in list_db[1:]:
        df = df.merge(df_, on = ['Accession', 'Description'] , how='outer')
    return df

In [117]:
merged_result_prot_1 = merging_db_prot(cleaned_result_prot_1)

In [118]:
merged_result_prot_1.to_csv("multiconsensus_prot_fdr_1.csv")

## 5% FDR Proteins

In [119]:
pd11 = pd.read_csv('proteins_Sample_6D_5%_FDR.csv')
pd12 = pd.read_csv('proteins_Sample_6T _5% _FDR.csv')
pd13 = pd.read_csv('proteins_Sample_7D_5%_FDR.csv')
pd14 = pd.read_csv('proteins_Sample_7T_5%_FDR.csv')
pd15 = pd.read_csv('proteins_Sample_8D_5%_FDR.csv')
pd16 = pd.read_csv('proteins_Sample_8T_5%_FDR.csv')
pd17 = pd.read_csv('proteins_Sample_10D_5%_FDR.csv')
pd18 = pd.read_csv('proteins_Sample_10T_5%_FDR.csv')
pd19 = pd.read_csv('proteins_Sample_11D_5%_FDR.csv')
pd20 = pd.read_csv('proteins_Sample_11T_5%_FDR.csv')

In [120]:
list_d = [pd11, pd12, pd13, pd14, pd15, pd16, pd17, pd18, pd19, pd20]

In [121]:
cleaned_result_prot_5 = clean_db_prot(list_d)

In [122]:
merged_result_prot_5 = merging_db_prot(cleaned_result_prot_5)

In [123]:
merged_result_prot_5.to_csv("multiconsensus_prot_fdr_5.csv")