In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import pandas as pd
from mhclovac.utils import validate_sequence


In [12]:
class MhcNameParser:

    def __init__(self, xml_file):
        self.tree = ET.parse(xml_file)
        self.parsed_data = self._parse_xml()

    def _parse_xml(self):
        root = self.tree.getroot()
        data = {}
        for child in root:
            child_data = {}
            for tag in child:
                if tag.tag == 'DisplayedRestriction':
                    child_data['name'] = tag.text
                    child_data['synonyms'] = [tag.text]
                if tag.tag == 'Synonyms':
                    child_data['synonyms'].extend(tag.text.split('|'))
                if tag.tag == 'RestrictionLevel':
                    child_data['restriction_level'] = tag.text
                if tag.tag == 'Organism':
                    child_data['species'] = tag.text
                if tag.tag == 'Class':
                    child_data['class'] = tag.text
                if tag.tag == 'Chain1Name':
                    child_data['chain1'] = tag.text
                if tag.tag == 'Chain2Name':
                    child_data['chain2'] = tag.text
            data[child_data['name']] = child_data
        return data

    @functools.lru_cache
    def get_allele_name(self, name):
        for allele in self.parsed_data.values():
            if name in allele['synonyms']:
                if allele['restriction_level'] in ['complete molecule', 'partial molecule']:
                    return allele['name'].split(' ')[0]

    @functools.lru_cache
    def get_species(self, partial):
        for allele_dict in self.parsed_data.values():
            if partial in allele_dict['synonyms']:
                return allele_dict['species']


In [13]:
raw_data = pd.read_csv(
    './data/mhc_ligand_full_single_file.zip',
    sep=',',
    skiprows=2,
    usecols=[10, 11, 43, 91, 94, 96, 107, 110],
    names=['peptyde_type', 'peptide', 'species', 'assay_group', 'qual_meas', 'quant_meas', 'mhc_allele', 'mhc_class']
)

raw_data.head()

  raw_data = pd.read_csv(


Unnamed: 0,peptyde_type,peptide,species,assay_group,qual_meas,quant_meas,mhc_allele,mhc_class
0,Linear peptide,KLEDLERDL,Homo sapiens (human),qualitative binding,Positive-Low,,HLA-A*02:01,
1,Linear peptide,LITGRLQSL,Homo sapiens (human),qualitative binding,Positive-High,,HLA-A2,
2,Linear peptide,TRVAFAGL,Mus musculus (mouse),qualitative binding,Positive,,H2-Kb,
3,Linear peptide,RNTDFFGL,Mus musculus (mouse),qualitative binding,Positive,,H2-Kb,
4,Linear peptide,EVMPVSMAK,Homo sapiens (human),dissociation constant KD (~EC50),Positive-Intermediate,473.0,HLA-A*03:01,


In [14]:
assay_groups = [
    'qualitative binding',
    'dissociation constant KD (~EC50)',
    'half maximal inhibitory concentration (IC50)',
    'ligand presentation',
    'dissociation constant KD (~IC50)',
    'dissociation constant KD'
]

data = raw_data[raw_data['assay_group'].isin(assay_groups)]


In [17]:
data['peptide'] = data['peptide'].apply(lambda x: x.upper())
valid_pep_mask = data['peptide'].apply(validate_sequence)
data = data[valid_pep_mask]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['peptide'] = data['peptide'].apply(lambda x: x.upper())


In [19]:
xml_file = './data/MhcAlleleNameList.xml'
name_parser = MhcNameParser(xml_file)

data['mhc_allele'] = data['mhc_allele'].apply(lambda x: name_parser.get_allele_name(x))
data['species'] = data['mhc_allele'].apply(lambda x: name_parser.get_species(x))

data.dropna(subset=['mhc_allele', 'species', 'peptide'], inplace=True)

In [20]:
output_name = './data/mhc_full_cleaned.csv'
data.to_csv(output_name, index=False)