In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import pandas as pd
from mhclovac.utils import validate_sequence


In [23]:
class MhcNameParser:

    def __init__(self, xml_file):
        self.tree = ET.parse(xml_file)
        self.parsed_data = self._parse_xml()

    def _parse_xml(self):
        root = self.tree.getroot()
        data = {}
        for child in root:
            child_data = {}
            for tag in child:
                if tag.tag == 'DisplayedRestriction':
                    child_data['name'] = tag.text
                    child_data['synonyms'] = [tag.text]
                if tag.tag == 'Synonyms':
                    child_data['synonyms'].extend(tag.text.split('|'))
                if tag.tag == 'RestrictionLevel':
                    child_data['restriction_level'] = tag.text
                if tag.tag == 'Organism':
                    child_data['species'] = tag.text
                if tag.tag == 'Class':
                    child_data['class'] = tag.text
                if tag.tag == 'Chain1Name':
                    child_data['chain1'] = tag.text
                if tag.tag == 'Chain2Name':
                    child_data['chain2'] = tag.text
            data[child_data['name']] = child_data
        return data

    @functools.lru_cache
    def get_allele_name(self, name):
        for allele in self.parsed_data.values():
            if name in allele['synonyms']:
                if allele['restriction_level'] in ['complete molecule', 'partial molecule']:
                    return allele['name'].split(' ')[0]

    @functools.lru_cache
    def get_species(self, partial):
        for allele_dict in self.parsed_data.values():
            if partial in allele_dict['synonyms']:
                return allele_dict['species']


In [26]:
raw_data = pd.read_csv(
    './data/mhc_ligand_full_single_file.zip',
    sep=',',
    nrows=1000,
    skiprows=1
)

for i, col in enumerate(raw_data.columns):
    print(i, col)

raw_data = pd.read_csv(
    './data/mhc_ligand_full_single_file.zip',
    sep=',',
    skiprows=2,
    usecols=[10, 11, 43, 91, 94, 96, 107, 111],
    names=['peptyde_type', 'peptide', 'species', 'assay_group', 'qual_meas', 'quant_meas', 'mhc_allele', 'mhc_class']
)

raw_data.head()

0 IEDB IRI
1 IEDB IRI.1
2 Type
3 PMID
4 Submission ID
5 Authors
6 Journal
7 Date
8 Title
9 Epitope IRI
10 Object Type
11 Name
12 Reference Name
13 Modified residues
14 Modifications
15 Starting Position
16 Ending Position
17 IRI
18 Synonyms
19 Source Molecule
20 Source Molecule IRI
21 Molecule Parent
22 Molecule Parent IRI
23 Source Organism
24 Source Organism IRI
25 Species
26 Species IRI
27 Comments
28 Epitope Relation
29 Object Type.1
30 Name.1
31 Starting Position.1
32 Ending Position.1
33 IRI.1
34 Synonyms.1
35 Source Molecule.1
36 Source Molecule IRI.1
37 Molecule Parent.1
38 Molecule Parent IRI.1
39 Source Organism.1
40 Source Organism IRI.1
41 Species.1
42 Species IRI.1
43 Name.2
44 IRI.2
45 Geolocation
46 Geolocation IRI
47 Sex
48 Age
49 MHC Types Present
50 Process Type
51 Disease
52 Disease IRI
53 Disease Stage
54 Epitope Relation.1
55 Object Type.2
56 Name.3
57 Reference Name.1
58 Starting Position.2
59 Ending Position.2
60 IRI.3
61 Source Molecule.2
62 Source Molecule IRI.

Unnamed: 0,peptyde_type,peptide,species,assay_group,qual_meas,quant_meas,mhc_allele,mhc_class
0,Linear peptide,KLEDLERDL,Homo sapiens (human),qualitative binding,Positive-Low,,HLA-A*02:01,I
1,Linear peptide,LITGRLQSL,Homo sapiens (human),qualitative binding,Positive-High,,HLA-A2,I
2,Linear peptide,TRVAFAGL,Mus musculus (mouse),qualitative binding,Positive,,H2-Kb,I
3,Linear peptide,RNTDFFGL,Mus musculus (mouse),qualitative binding,Positive,,H2-Kb,I
4,Linear peptide,EVMPVSMAK,Homo sapiens (human),dissociation constant KD (~EC50),Positive-Intermediate,473.0,HLA-A*03:01,I


In [27]:
assay_groups = [
    'qualitative binding',
    'dissociation constant KD (~EC50)',
    'half maximal inhibitory concentration (IC50)',
    'ligand presentation',
    'dissociation constant KD (~IC50)',
    'dissociation constant KD'
]

data = raw_data[raw_data['assay_group'].isin(assay_groups)]


In [28]:
data['peptide'] = data['peptide'].apply(lambda x: x.upper())
valid_pep_mask = data['peptide'].apply(validate_sequence)
data = data[valid_pep_mask]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['peptide'] = data['peptide'].apply(lambda x: x.upper())


In [29]:
xml_file = './data/MhcAlleleNameList.xml'
name_parser = MhcNameParser(xml_file)

data['mhc_allele'] = data['mhc_allele'].apply(lambda x: name_parser.get_allele_name(x))
data['species'] = data['mhc_allele'].apply(lambda x: name_parser.get_species(x))

data.dropna(subset=['mhc_allele', 'species', 'peptide'], inplace=True)

In [30]:
output_name = './data/mhc_full_cleaned.csv'
data.to_csv(output_name, index=False)