In [29]:
import pandas as pd
import numpy as np
import re

In [30]:
# Load the filtered results table (tab-separated). low_memory=False makes pandas
# parse the file in a single pass instead of in chunks, avoiding mixed-type inference.
data = pd.read_csv(
    'data/filtered_results.tsv',
    sep='\t',
    low_memory=False,
)

Peptidoform intensities: 40,921 resolved peptidoforms, each detected in at least one sample (used for the identifiability models).

In [31]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(40921, 707)

In [32]:
# Column labels of the loaded table (pandas abbreviates the middle of long listings).
data.columns

Index(['rowid', 'ccms_row_id', 'Peptidoform', 'Peptidoform ID',
       'Unmod peptidoform', 'Total', 'Total- Unmodified sequence',
       'Peptidoforms- Unmodified sequence', 'Proteins', 'Mass',
       ...
       '_dyn_#Patient_M2_healthyMale.Timepoint_2_unmod',
       '_dyn_#Patient_M2_healthyMale.Timepoint_3',
       '_dyn_#Patient_M2_healthyMale.Timepoint_3_unmod',
       '_dyn_#Patient_M3_healthyMale.Timepoint_1',
       '_dyn_#Patient_M3_healthyMale.Timepoint_1_unmod',
       '_dyn_#Patient_M3_healthyMale.Timepoint_2',
       '_dyn_#Patient_M3_healthyMale.Timepoint_2_unmod',
       '_dyn_#Patient_M3_healthyMale.Timepoint_3',
       '_dyn_#Patient_M3_healthyMale.Timepoint_3_unmod', 'id'],
      dtype='object', length=707)

In [21]:
# Preview the first rows of the table (head() shows 5 rows by default).
data.head()

Unnamed: 0,rowid,ccms_row_id,Peptidoform,Peptidoform ID,Unmod peptidoform,Total,Total- Unmodified sequence,Peptidoforms- Unmodified sequence,Proteins,Mass,...,_dyn_#Patient_M2_healthyMale.Timepoint_2_unmod,_dyn_#Patient_M2_healthyMale.Timepoint_3,_dyn_#Patient_M2_healthyMale.Timepoint_3_unmod,_dyn_#Patient_M3_healthyMale.Timepoint_1,_dyn_#Patient_M3_healthyMale.Timepoint_1_unmod,_dyn_#Patient_M3_healthyMale.Timepoint_2,_dyn_#Patient_M3_healthyMale.Timepoint_2_unmod,_dyn_#Patient_M3_healthyMale.Timepoint_3,_dyn_#Patient_M3_healthyMale.Timepoint_3_unmod,id
0,1,1,.SPLFM+15.995GK.,SPLFM+15.995GK,.SPLFMGK.,11679,11681,2,sp|P01009|A1AT_HUMAN;tr|A0A024R6I7|A0A024R6I7_...,795.406,...,84453000.0,,92032000.0,,,,51720000.0,,49329000.0,0
1,2,2,.EPQVYTLPPSREEM+15.995TK.,EPQVYTLPPSREEM+15.995TK,.EPQVYTLPPSREEMTK.,11411,13133,46,sp|P01859|IGHG2_HUMAN;sp|P01860|IGHG3_HUMAN;tr...,1920.9381,...,,,,,,,,,,1
2,3,3,.AVM+15.995DDFAAFVEK.,AVM+15.995DDFAAFVEK,.AVMDDFAAFVEK.,10949,18495,81,sp|P02768-2|ALBU_HUMAN;sp|P02768-3|ALBU_HUMAN;...,1358.6318,...,1911700000.0,,1625600000.0,,43830000.0,,1484500000.0,,1147100000.0,2
3,4,4,.EFNAETFTFHADIC-33.988TLSEK.,EFNAETFTFHADIC-33.988TLSEK,.EFNAETFTFHADICTLSEK.,10198,22966,174,sp|P02768-2|ALBU_HUMAN;sp|P02768-3|ALBU_HUMAN;...,2169.0257,...,,,,,,,,,,3
4,5,5,.M+15.995ADEAGSEADHEGTHSTK.,M+15.995ADEAGSEADHEGTHSTK,.MADEAGSEADHEGTHSTK.,9608,18724,131,sp|P02671-2|FIBA_HUMAN;sp|P02671|FIBA_HUMAN,1888.7728,...,22109000.0,,1078800000.0,,6243600.0,,1199000000.0,,644940000.0,4


### Investigation

In [50]:
# Parse the patient ID and timepoint number out of every per-sample column name.
# Columns that do not follow the `_dyn_#Patient_<id>.Timepoint_<n>` scheme are skipped.
# The matched columns visible in the column listing come in pairs (a base column and
# its `_unmod` counterpart), so the same (patient, timepoint) pair can be collected
# more than once.
pattern = r'_dyn_#Patient_([^\.]+)\.Timepoint_(\d+)'
patient_timepoints = []
for col in data.columns:
	match = re.search(pattern, col)
	if match is None:
		continue
	# groups() yields (patient_id, timepoint) — the two capture groups, in order.
	patient_timepoints.append(match.groups())

In [51]:
# One row per matched column: the parsed patient ID and timepoint (both kept as strings).
pt_timepoints = pd.DataFrame(
    patient_timepoints,
    columns=['patient_id', 'timepoint'],
)

We have 52 patients + 6 healthy individuals = 58 distinct IDs.

In [54]:
# Count the distinct patient_id values parsed from the column names.
# NOTE: despite the name, this total covers every parsed ID, healthy individuals included.
num_patients = pt_timepoints['patient_id'].nunique()
num_patients

58

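Each timepoint contributes two columns (the intensity column and its `_unmod` counterpart), so the column counts in the output below are twice the number of timepoints (e.g. 14 columns = 7 timepoints).

- Patient 44 &rarr; Has samples for only 6 timepoints
- Patients 24, 31, 33, 35, 49, 50, 51, 53, 54 &rarr; Have samples for only 2 timepoints
- Healthy individuals &rarr; Have samples for 3 timepoints
- All other patients &rarr; Have samples for 7 timepoints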

*Note*: Patient IDs range from 1 to 58, but there are only 52 actual patients (6 IDs in that range are missing).

In [56]:
# Number of matched columns per patient ID (each row of pt_timepoints is one matched column).
pt_timepoints['patient_id'].value_counts()

patient_id
01                  14
26                  14
28                  14
29                  14
30                  14
02                  14
36                  14
37                  14
38                  14
39                  14
40                  14
41                  14
42                  14
43                  14
45                  14
46                  14
47                  14
48                  14
52                  14
57                  14
58                  14
27                  14
34                  14
25                  14
15                  14
03                  14
05                  14
06                  14
07                  14
08                  14
09                  14
11                  14
12                  14
14                  14
10                  14
16                  14
22                  14
19                  14
20                  14
21                  14
17                  14
23                  14
44                  12


In [59]:
# Count the distinct values in the 'Unmod peptidoform' column.
data['Unmod peptidoform'].nunique()

6677

In [60]:
# Count the distinct values in the 'Peptidoform' column; a result equal to the row
# count from data.shape means every row holds a different peptidoform.
data['Peptidoform'].nunique()

40921

In [61]:
# Count the distinct values in the 'Peptidoform ID' column; a result equal to the row
# count from data.shape means the ID is unique per row.
data['Peptidoform ID'].nunique()

40921