/
infections.py
549 lines (437 loc) · 20.1 KB
/
infections.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
from abc import abstractmethod
from datetime import datetime
from functools import partial
from typing import List, Iterable, Union
from math import log2
from warnings import warn
from pandas import DataFrame, Series, read_table, read_excel
from rpy2.robjects import r
from sqlalchemy.orm.exc import NoResultFound
import imports.protein_data as importers
from imports.sites.site_importer import SiteImporter
from imports.sites.uniprot.importer import UniprotToRefSeqTrait, UniprotIsoformsTrait, UniprotSequenceAccessionTrait
from models import Site, EventModulatingPTM, RegulatorySiteAssociation
CANONICAL_PHOSPHOSITE_RESIDUES = {'S', 'T', 'Y'}
def maybe_split_ints(value: Union[str, int], sep: str):
return (
[int(number) for number in value.split(sep)]
if isinstance(value, str) else
[int(value)]
)
def read_excel_or_text(file_path: str, **excel_kwargs: dict):
"""Read table from a given excel sheet, unless a tsv file is given."""
if file_path.endswith('.xlsx'):
return read_excel(file_path, **excel_kwargs)
else:
assert file_path.endswith('.tsv')
return read_table(file_path)
def check_proper_probability(data: Series, scale: int):
# is probability 0-100?
assert data.max() <= scale
assert data.min() >= 0
if scale == 100:
# are we using 0-100 rather than 0-1?
assert data.max() > 1
def p_adjust(x, *args, **kwargs):
if not isinstance(x, list):
x = list(x)
return r['p.adjust'](x, *args, **kwargs)
def log2fc(before, after):
if before and after:
return log2(after) - log2(before)
if before == 0 and after == 0:
return 0
if before == 0:
return float('Inf')
if after == 0:
return -float('Inf')
class SiteModulatedUponEventImporter:
@property
@abstractmethod
def event_reference(self) -> str:
"""The reference to the study providing the event PTM modulation data, e.g. `(Somebody et al, 2022)`"""
@property
@abstractmethod
def event_name(self) -> str:
"""The name of the event modulating PTM site occupancy, e.g. `HIV infection`"""
@property
@abstractmethod
def effect_size_type(self) -> str:
"""The effect size type, e.g. `log2FC`"""
def get_or_create_event(self):
try:
event = EventModulatingPTM.query.filter_by(name=self.event_name).one()
event.reference = self.event_reference
except NoResultFound:
event = EventModulatingPTM(
name=self.event_name,
reference=self.event_reference
)
return event
def process_event_associated_sites(self, sites: DataFrame, canonical: set):
"""Process provided sites for a *single* PTM type.
- remove non-canonnical sites,
- add sequence accessions and identifiers,
- map sites to isoforms in the database
- add event data
Note: make sure to only provide significant sites to this method
as it does not implement any significance-based filterering.
"""
assert not sites[['protein_accession', 'residue', 'position']].duplicated().any()
is_canonical = sites['residue'].isin(canonical)
if any(~is_canonical):
warn(
f'Removing {sum(~is_canonical)} phosphorylation sites'
f' mapped to non-canonical aminoacids: {set(sites[~is_canonical].residue)}'
f' keeping {sum(is_canonical)} sites'
)
sites = sites[is_canonical].copy()
assert all(sites['position'] > 0)
assert len(self.site_types) == 1
sites['mod_type'] = self.site_types[0]
sites = self.add_sequence_accession(sites)
print(
f'After mapping to UniProt sequence accessions got: {len(sites)} sites'
f' (each protein has multiple UniProt isoforms)'
)
sites = self.add_nm_refseq_identifiers(sites)
print(
f'After mapping to RefSeq identifiers got: {len(sites)} sites'
f' (each UniProt isoform can be mapped to one or more RefSeq isoforms)'
)
mapped_sites = self.map_sites_to_isoforms(sites)
event = self.get_or_create_event()
mapped_sites['event'] = event
mapped_sites['pub_med_ids'] = self.pubmed_id
mapped_sites['pub_med_ids'] = mapped_sites['pub_med_ids'].apply(lambda pubmed_id: [pubmed_id])
import_time = datetime.utcnow().strftime('%Y-%m-%d_%H-%M')
mapped_sites.to_csv(f'{self.__class__.__name__}_{import_time}.csv')
return mapped_sites
def add_site(
self, refseq, position: int, residue, mod_type, pub_med_ids: Iterable[int],
adj_p_val: float, effect_size: float, event: EventModulatingPTM
):
site, created = super().add_site(refseq, position, residue, mod_type, pubmed_ids=pub_med_ids)
association = RegulatorySiteAssociation(
event=event,
effect_size=effect_size,
adjusted_p_value=adj_p_val,
effect_size_type=self.effect_size_type,
site_type=self.site_types_map[mod_type],
site=site
)
site.associations.add(association)
return site, created
class HerpesSimplexPhosphoImporter(
SiteModulatedUponEventImporter,
UniprotToRefSeqTrait, UniprotIsoformsTrait, UniprotSequenceAccessionTrait,
SiteImporter
):
"""Enterovirus infection related phosphorylation sites, based on:
Time-resolved Global and Chromatin Proteomics during Herpes Simplex Virus Type 1 (HSV-1) Infection
https://doi.org/10.1074/mcp.m116.065987
human foreskin fibroblast
"""
requires = {importers.proteins_and_genes, importers.sequences}
requires.update(SiteImporter.requires)
source_name = '(Kulej et al., 2017)'
site_types = ['phosphorylation (HSV-1)']
adj_p_threshold = 0.05
event_name = 'HSV-1 infection'
event_reference = '(Kulej et al., 2017)'
effect_size_type = 'log2FC'
pubmed_id = 28179408
def __init__(
self, sprot_canonical=None, sprot_splice=None, mappings_path=None,
):
SiteImporter.__init__(self)
UniprotToRefSeqTrait.__init__(self, mappings_path)
UniprotIsoformsTrait.__init__(self, sprot_canonical, sprot_splice)
def load_sites(self, file_path='data/sites/2017_Kulej/TABLE_S3_Time_course-Host_phosphosites.xlsx') -> List[Site]:
sites = read_excel_or_text(
file_path,
sheet_name='A)Total identified phosphosites',
skiprows=1
)
sites.rename(columns={
'Protein IDs': 'protein_accession',
'Localization prob': 'probability'
}, inplace=True)
sites['probability'] = sites['probability'] * 100
check_proper_probability(sites['probability'], scale=100)
# select only sites with >75% probability
sites = sites[sites['probability'] > 75]
print(f'Number of sites with probability > 75%: {len(sites)}')
sites["adj_p_val"] = list(p_adjust(sites["ANOVA p-value"], method='BH'))
is_site_significant = sites["adj_p_val"] < self.adj_p_threshold
print(f'Keeping {sum(is_site_significant)} out of {len(is_site_significant)} available sites')
# select significant sites only
sites = sites.loc[is_site_significant].copy()
sites['residue'] = sites['Mod site'].str[0]
sites['position'] = sites['Mod site'].str[1:].apply(int)
sites['effect_size'] = sites.apply(
lambda site: log2fc(before=site['Mock'], after=site['15hpi']),
axis=1
)
n_sites_not_phosphorylated_at_15h = sum(sites['effect_size'] != 0)
warn(f'{n_sites_not_phosphorylated_at_15h} sites were significant in ANOVA but not phosphorylated at 15hpi')
mapped_sites = self.process_event_associated_sites(
sites,
canonical=CANONICAL_PHOSPHOSITE_RESIDUES
)
return self.create_site_objects(
mapped_sites,
columns=[
'refseq', 'position', 'residue', 'mod_type',
'pub_med_ids', 'adj_p_val', 'effect_size', 'event'
]
)
class EnterovirusPhosphoImporter(
SiteModulatedUponEventImporter,
UniprotToRefSeqTrait, UniprotIsoformsTrait, UniprotSequenceAccessionTrait,
SiteImporter
):
"""Enterovirus infection related phosphorylation sites, based on:
Giansanti, P., Strating, J.R.P.M., Defourny, K.A.Y. et al.
Dynamic remodelling of the human host cell proteome and phosphoproteome upon enterovirus infection.
Nat Commun 11, 4332 (2020). https://doi.org/10.1038/s41467-020-18168-3
CC BY 4.0 - http://creativecommons.org/licenses/by/4.0/
Enterovirus CVB3 strain Nancy
6 cell lines were tested in this paper: HeLa R19, BGM, HAP1, HuH7, A549, BGM, Vero E6
We use Supplementary Data 4 which provides the phosphoproteome changes
"""
requires = {importers.proteins_and_genes, importers.sequences}
requires.update(SiteImporter.requires)
source_name = '(Giansanti et al., 2020)'
site_types = ['phosphorylation (enterovirus)']
adj_p_threshold = 0.05
event_name = 'CVB3 enterovirus infection'
event_reference = '(Giansanti et al., 2020)'
effect_size_type = 'log2FC'
pubmed_id = 32859902
def __init__(
self, sprot_canonical=None, sprot_splice=None, mappings_path=None,
):
SiteImporter.__init__(self)
UniprotToRefSeqTrait.__init__(self, mappings_path)
UniprotIsoformsTrait.__init__(self, sprot_canonical, sprot_splice)
def load_sites(self, file_path='data/sites/2020_Giansanti/41467_2020_18168_MOESM7_ESM.xlsx') -> List[Site]:
# 4.2 = Quantified phosphosites (in 3 out of 4 biological replica)
sites = read_excel_or_text(file_path, sheet_name='Supplementary Data 4.2')
is_site_significant = (
(sites["Welch's T-test q-value 10h_CT"] < self.adj_p_threshold)
&
(~(sites["Welch's T-test q-value Mock_10h_CT"] < self.adj_p_threshold))
)
print(f'Keeping {sum(is_site_significant)} out of {len(is_site_significant)} available sites')
# select significant sites only
sites = sites.loc[is_site_significant].copy()
# split rows where a single site was mapped to more than one protein into multiple rows
columns_to_split = {
'Positions within proteins': partial(maybe_split_ints, sep=';'),
'Proteins': partial(str.split, sep=';')
}
for column, func in columns_to_split.items():
sites[column] = sites[column].apply(func)
sites = sites.explode(list(columns_to_split.keys()))
print(
f'After splitting sites to mapped proteins we have {len(sites)} sites'
)
sites.rename(columns={
'Amino acid': 'residue',
'Proteins': 'protein_accession',
'Positions within proteins': 'position',
"Welch's T-test q-value 10h_CT": 'adj_p_val',
# note: while this is not directly named as "fold change",
# is it obvious that it is it after looking at the Source Data
# for Figure 1E (where interestingly authors used max(fold change))
'Log2 10h_CT': 'effect_size'
}, inplace=True)
# if multiple multiplicities were measured (and significant) for the same site,
# import the data for lower multiplicity
sites = (
sites.sort_values('Multiplicity')
.drop_duplicates(subset=['protein_accession', 'residue', 'position'])
)
print(f'After removing higher multiplicity duplicates: {len(sites)} sites')
mapped_sites = self.process_event_associated_sites(
sites,
canonical=CANONICAL_PHOSPHOSITE_RESIDUES
)
return self.create_site_objects(
mapped_sites,
columns=[
'refseq', 'position', 'residue', 'mod_type',
'pub_med_ids', 'adj_p_val', 'effect_size', 'event'
]
)
class HIVPhosphoImporter(
SiteModulatedUponEventImporter,
UniprotToRefSeqTrait, UniprotIsoformsTrait, UniprotSequenceAccessionTrait,
SiteImporter
):
"""HIV infection related phosphorylation sites, based on:
Greenwood, E.J., Matheson, N.J., Wals, K., van den Boomen, D.J., Antrobus, R., Williamson, J.C., and Lehner, P.J.
Temporal proteomic analysis of HIV infection reveals remodelling of the host phosphoproteome by lentiviral Vif variants.
eLife 5, e18296 (2016). https://doi.org/10.7554/eLife.18296.001
CC BY 4.0 - http://creativecommons.org/licenses/by/4.0/
Human CEM-T4 T cells
We use `Figure 6—source data 1` which provides the phosphoproteome data.
"""
requires = {importers.proteins_and_genes, importers.sequences}
requires.update(SiteImporter.requires)
source_name = '(Greenwood et al., 2016)'
site_types = ['phosphorylation (HIV)']
adj_p_threshold = 0.05
event_name = 'HIV infection'
event_reference = '(Greenwood et al., 2016)'
effect_size_type = 'log2FC'
pubmed_id = 27690223
def __init__(
self, sprot_canonical=None, sprot_splice=None, mappings_path=None,
):
SiteImporter.__init__(self)
UniprotToRefSeqTrait.__init__(self, mappings_path)
UniprotIsoformsTrait.__init__(self, sprot_canonical, sprot_splice)
def load_sites(self, file_path='data/sites/2016_Greenwood/elife-18296-fig6-data1-v3.xlsx') -> List[Site]:
sites = read_excel_or_text(file_path, sheet_name='Figure 6 - source data 1')
# split peptides into sites
columns_to_split = {
'Phosphosite Probabilities': partial(str.split, sep='; ')
}
for column, func in columns_to_split.items():
sites[column] = sites[column].apply(func)
sites = sites.explode(list(columns_to_split.keys()))
print(f'Number of considered sites: {len(sites)}')
site_data = (
sites['Phosphosite Probabilities'].str.extract(
r'^\s?(?P<residue>\w)\((?P<position_offset>\d+)\): (?P<probability>\d+\.\d+)\s?$',
)
)
assert len(sites) == len(site_data)
sites[site_data.columns] = site_data
peptide_data = (
sites['Position in Protein'].str.extract(
r'(?P<peptide_protein>\w+) \[(?P<peptide_start>\d+)-(?P<peptide_end>\d+)\]'
)
)
assert len(peptide_data) == len(sites)
sites[peptide_data.columns] = peptide_data
column_types = {
'probability': float,
'position_offset': int,
'peptide_start': int,
'peptide_end': int
}
for column, type_ in column_types.items():
sites[column] = sites[column].apply(type_)
assert all(
sites['Peptide Sequence'].str.len()
==
sites['peptide_end'] - sites['peptide_start'] + 1
)
assert all(sites['peptide_protein'] == sites['Protein Accession'])
check_proper_probability(sites['probability'], scale=100)
# select only sites with >75% probability
sites = sites[sites['probability'] > 75]
print(f'Number of sites with probability > 75%: {len(sites)}')
is_site_significant = (
(sites["q-value HIV WT vs Mock"] < self.adj_p_threshold)
)
print(f'Keeping {sum(is_site_significant)} out of {len(is_site_significant)} available sites')
# select significant sites only
sites = sites.loc[is_site_significant].copy()
# calculate position
sites['position'] = sites['peptide_start'] + sites['position_offset'] - 1
sites.rename(columns={
'Protein Accession': 'protein_accession',
"q-value HIV WT vs Mock": 'adj_p_val',
'Log2 (fold change) HIV WT vs Mock': 'effect_size'
}, inplace=True)
# some combinations of PTMs were called multiple times for a single peptide;
# as we have no way to select the most likely call, we conservatively choose
# the one which has the least significant results (to avoid inflating FDR)
sites = (
sites
.sort_values('adj_p_val', ascending=False)
.drop_duplicates(subset=['protein_accession', 'residue', 'position'], keep='first')
)
mapped_sites = self.process_event_associated_sites(
sites,
canonical=CANONICAL_PHOSPHOSITE_RESIDUES
)
return self.create_site_objects(
mapped_sites,
columns=[
'refseq', 'position', 'residue', 'mod_type',
'pub_med_ids', 'adj_p_val', 'effect_size', 'event'
]
)
class CovidPhosphoImporter(
SiteModulatedUponEventImporter,
UniprotToRefSeqTrait, UniprotIsoformsTrait, UniprotSequenceAccessionTrait,
SiteImporter
):
"""SARS-CoV-2 infection related phosphorylation sites, based on:
The Global Phosphorylation Landscape of SARS-CoV-2 Infection. Bouhaddou et al., 2020
Published: 22-06-2020 | Version 1 | DOI: 10.17632/dpkbh2g9hy.1
CC BY 4.0 - http://creativecommons.org/licenses/by/4.0
Vero E6 is a cell line originating from the kidney of a female African green monkey (Chlorocebus sabaeus)
(Osada et al., 2014) in (Bouhaddou et al., 2020).
They aligned the Chlorocebus sabaeus sequences with human sequences
to map the discovered sites to human protein orthologs.
The phosphorylation was measured in infected and control cell lines (control had a mock injection)
at 0-th and 24th hour of the experiment; the control is said to not changed much in the 24 hours (r=0.77)
and therefore the authors used the 0th measure as a reference (Bouhaddou et al., 2020).
We use Supplementary Table 1 which contains proteomic data of Vero E6 cells upon SARS-CoV-2 infection:
- PhosphoDataFull: full list of unfiltered phosphorylation sites occurring upon SARS-CoV-2 infection,
- AbundanceDataFull: full list of protein abundance measurements,
- PhosphoDataFiltered: filtered list of all detected phosphorylation sites
collapsed into single-site measurements.
- PhosphoNOverexpressionFull: the full list of unfiltered phosphorylation sites
upon N protein overexpression in Vero E6 cells.
Article link: https://www.cell.com/cell/fulltext/S0092-8674(20)30811-4
"""
requires = {importers.proteins_and_genes, importers.sequences}
requires.update(SiteImporter.requires)
source_name = 'Vero E6 (Bouhaddou et al., 2020)'
site_types = ['phosphorylation (SARS-CoV-2)']
adj_p_threshold = 0.05
event_name = 'SARS-CoV-2 infection'
event_reference = '(Bouhaddou et al., 2020)'
effect_size_type = 'log2FC'
pubmed_id = 32645325
def __init__(
self, sprot_canonical=None, sprot_splice=None, mappings_path=None,
):
SiteImporter.__init__(self)
UniprotToRefSeqTrait.__init__(self, mappings_path)
UniprotIsoformsTrait.__init__(self, sprot_canonical, sprot_splice)
def load_sites(self, file_path='data/sites/COVID/Supplemental Tables/TableS1.xlsx') -> List[Site]:
sites = read_excel_or_text(file_path, sheet_name='PhosphoDataFiltered')
is_site_significant = (
(sites['Inf_24Hr.adj.pvalue'] < self.adj_p_threshold)
&
(~(sites['Ctrl_24Hr.adj.pvalue'] < self.adj_p_threshold))
)
print(f'Keeping {sum(is_site_significant)} out of {len(is_site_significant)} available sites')
# select significant sites only
sites = sites.loc[is_site_significant].copy()
sites['residue'] = sites.site.str[0]
sites['position'] = sites.site.str[1:].apply(int)
sites.rename(columns={
'uniprot': 'protein_accession',
'Inf_24Hr.adj.pvalue': 'adj_p_val',
'Inf_24Hr.log2FC': 'effect_size'
}, inplace=True)
mapped_sites = self.process_event_associated_sites(
sites,
canonical=CANONICAL_PHOSPHOSITE_RESIDUES
)
return self.create_site_objects(
mapped_sites,
columns=[
'refseq', 'position', 'residue', 'mod_type',
'pub_med_ids', 'adj_p_val', 'effect_size', 'event'
]
)