In [74]:
import pandas as pd

In [75]:
# Crystal structures: metadata and aligned sequences, each indexed by the
# first column of its CSV file.
c_met = pd.read_csv("../HA_data/metadata/crystals_metadata.csv", index_col=0)
c_seq = pd.read_csv("../HA_data/aligned_sequences/sequence_alignment/crystals_aln_seq.csv", index_col=0)

# Models: NOT the same as the crystal files (presumably a different file
# layout -- TODO confirm). A header row had to be added to the metadata file
# by hand: id,chain,host,location,year
# "-" entries in the model metadata are read as missing values.
m_met = pd.read_csv("../HA_data/metadata/models_metadata.csv", index_col=0, na_values=["-"])
m_seq = pd.read_csv("../HA_data/aligned_sequences/sequence_alignment/models_aln_seq.csv", index_col=0)
# Model id -> template_id mapping.
m_template = pd.read_csv("../HA_data/models_template_mapping.csv", index_col=0)

## Crystal data

In [76]:
# Preview the first rows of the crystal metadata.
c_met.head()

Unnamed: 0_level_0,chain,subtype,host,location,year,continent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1ha0,A,H3N2,,Aichi,1968,Asia
4eef,A,H1N1,,BrevigMission,1918,North America
4cqp,A,H5N1,,Vietnam,2004,Asia
4cqq,A,H5N1,,Vietnam,2004,Asia
4cqr,A,H5N1,,Vietnam,2004,Asia


In [77]:
# Scratch check: str.split() treats its argument as a literal separator, not a
# regular expression, so the two-character text \w never matches and the label
# comes back unsplit. The raw string avoids the invalid-escape warning that the
# plain "\w" literal triggers; the separator value is unchanged.
"H3N2".split(r"\w")

['H3N2']

In [78]:
# Number of distinct (non-missing) locations in the crystal metadata.
c_met["location"].nunique()

51

In [79]:
# How often each raw host label occurs (missing hosts are not counted).
c_met["host"].value_counts()

avian                     54
human                     14
canine                     7
equine                     7
reassortant X31            6
swine                      5
bat                        2
reassortant IDCDC_RG29     1
dtype: int64

In [80]:
def clean_host(x):
    """Normalise one raw host label from the metadata table.

    Missing values are reported as "human", any label containing
    "reassortant" collapses to plain "reassortant", and every other label
    is returned with underscores replaced by spaces.
    """
    # Guard clauses: handle the two special cases first, then the general one.
    if pd.isnull(x):
        return "human"
    if "reassortant" in x:
        return "reassortant"
    return x.replace("_", " ")

In [81]:
# Replace the raw host labels with their cleaned form, value by value.
c_met["host"] = c_met["host"].apply(clean_host)

In [82]:
# Derive a coarse host class from the cleaned host label. Hosts that have no
# entry in the mapping end up as NaN.
c_met["host_class"] = c_met["host"].map({
    "human": "human",
    "avian": "avian",
    "reassortant": "reassortant",
    "canine": "mammalian",
    "equine": "mammalian",
    "swine": "mammalian",
    "bat": "mammalian",
})

In [83]:
# Size of each host class in the crystal metadata.
c_met["host_class"].value_counts()

human          153
avian           54
mammalian       21
reassortant      7
dtype: int64

In [84]:
# Split each subtype label (e.g. "H3N2") around the "N": the digits after the
# leading "H" become column H, the part after the "N" becomes column N.
# Labels ending in "x" get an empty string in N instead of a number.
c_met["H"] = c_met["subtype"].apply(lambda label: int(label.split("N")[0][1:]))
c_met["N"] = c_met["subtype"].apply(
    lambda label: "" if label[-1] == "x" else int(label.split("N")[1])
)

In [85]:
# Preview the first rows of the aligned crystal sequences.
c_seq.head()

Unnamed: 0_level_0,sequence
p_id,Unnamed: 1_level_1
4bgx,-DQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLD...
4bgy,-DQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLD...
4bgw,-DQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLD...
3znm,-DQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLD...
3znl,-DQICIGYHANNSTEQVDTIMEKNVTVTHAQDILEKTHNGKLCDLD...


In [86]:
# List the ids that have a sequence but no metadata row. According to the
# original note the reverse gap exists (metadata entries whose sequence is
# missing), but not this one.
c_seq.index.difference(c_met.index)

Index([], dtype='object')

In [87]:
# Outer join on the index: keep every id found in either the metadata or the
# sequence table; fields missing on one side become NaN.
c_all = c_met.join(c_seq, how="outer")

In [88]:
# Preview the merged crystal table (metadata plus sequence).
c_all.head()

Unnamed: 0,chain,subtype,host,location,year,continent,host_class,H,N,sequence
1eo8,A,H3N2,human,Aichi,1968.0,Asia,human,3,2,TATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN-...
1ha0,A,H3N2,human,Aichi,1968.0,Asia,human,3,2,TATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN-...
1hgd,A,H3N2,reassortant,,,,reassortant,3,2,TATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN-...
1hge,A,H3N2,reassortant,,,,reassortant,3,2,TATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN-...
1hgf,A,H3N2,reassortant,,,,reassortant,3,2,TATLCLGHHAVPNGTLVKTITDDQIEVTNATELVQSSSTGKICNN-...


In [106]:
# Label the index "p_id" (the metadata file called this column "id"); the
# sequence tables already use "p_id" for their index.
c_all.index.name = "p_id"  # was: id

In [107]:
# Export the merged crystal table. "%.0f" prints float columns without a
# fractional part (e.g. year 1968.0 -> 1968); original note: "for missing H/N".
c_all.to_csv("../site/data/crystals_metadata.csv", float_format="%.0f")

## Model data

In [108]:
# Preview the first rows of the model metadata.
m_met.head()

Unnamed: 0_level_0,chain,host,location,year,subtype,continent,host_class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAA43090,A,human,Berlin,1964,H2N2,Europe,human
AAA43117,A,avian,,1972,H2N9,Europe,avian
AAA43185,A,human,Japan,1957,H2N2,Asia,human
AAA43205,A,avian,Ontario,1966,H5N9,North America,avian
AAA43243,A,avian,Hong Kong,1978,H2N2,Asia,avian


In [109]:
# How often each subtype label occurs among the models.
m_met["subtype"].value_counts()

H1N1     383
H3N2     259
H1N2     197
H5N1     131
H9N2      50
H2N2      25
H3N8      17
H5N2      17
H7N3      14
H10N7     14
H7N7      12
H3N1       9
H2N3       8
H5N5       5
H2N7       4
H2N5       4
H5N3       4
H3N6       4
H9Nx       3
H3N5       3
H9N1       3
H7N2       3
H7N1       3
H5N9       2
H9N5       2
H2N1       2
H10N8      2
H3N7       1
H3Nx       1
H9N7       1
H5N6       1
H7N8       1
H7N4       1
H3N9       1
H2N9       1
H9N9       1
H7N9       1
H5N8       1
H5N7       1
H10N3      1
H9N6       1
H3N3       1
dtype: int64

In [110]:
# How often each host label occurs among the models (missing hosts are not counted).
m_met["host"].value_counts()

swine             485
human             383
avian             326
emu                 2
ferret              1
giant anteater      1
dtype: int64

In [111]:
# Treat a missing host as "human", the same default clean_host() applies to
# the crystal metadata.
m_met["host"] = m_met["host"].fillna("human")

In [112]:
# Derive a coarse host class from the host label. An emu is a bird, so it is
# classed as 'avian' rather than 'mammalian'. Hosts that have no entry in the
# mapping end up as NaN.
m_met["host_class"] = m_met.host.map({'human': 'human', 'avian': 'avian', 'reassortant': 'reassortant',
                                      'emu': 'avian',
                                      'ferret': 'mammalian', 'giant anteater': 'mammalian',
                                      'swine': 'mammalian'})

In [113]:
# Preview the first 20 rows of the model metadata after the host clean-up.
m_met.head(20)

Unnamed: 0_level_0,chain,host,location,year,subtype,continent,host_class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAA43090,A,human,Berlin,1964,H2N2,Europe,human
AAA43117,A,avian,,1972,H2N9,Europe,avian
AAA43185,A,human,Japan,1957,H2N2,Asia,human
AAA43205,A,avian,Ontario,1966,H5N9,North America,avian
AAA43243,A,avian,Hong Kong,1978,H2N2,Asia,avian
AAA43247,A,human,Korea,1968,H2N2,Asia,human
AAA43248,A,human,Krasnodar,1959,H2N2,Europe,human
AAA43578,A,avian,Ontario,1976,H2N3,North America,avian
AAA43678,A,human,Singapore,1957,H2N2,Asia,human
AAA72339,A,swine,Nebraska,1992,H1N1,North America,mammalian


In [114]:
# Number of distinct (non-missing) locations in the model metadata.
m_met["location"].nunique()

313

In [115]:
# Preview the first rows of the aligned model sequences.
m_seq.head()

Unnamed: 0_level_0,sequence
p_id,Unnamed: 1_level_1
AAA43090,GDQICIGYHAN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...
AAA43247,GDQICIGYHAN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...
ABO38701,GDQICIGYHAN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...
ABO44057,GDQICIGYHAN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...
ABO52247,GDQICIGYHAN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...


In [116]:
# Preview the model -> template_id mapping.
m_template.head()

Unnamed: 0_level_0,template_id
p_id,Unnamed: 1_level_1
AAA43578,2wr5
AEK49568,4wsw
AGL58994,4wsw
AGL60617,4wsw
AGL60775,4wsw


In [117]:
# Combine metadata, template mapping and sequences with outer joins on the
# index, so an id present in any of the three tables is kept.
m_all = m_met.join(m_template, how="outer").join(m_seq, how="outer")
# Use the same index label as the sequence and template tables.
m_all.index.name = "p_id"

In [118]:
# Show the merged rows that have no subtype.
m_all[m_all["subtype"].isnull()]

Unnamed: 0_level_0,chain,host,location,year,subtype,continent,host_class,template_id,sequence
p_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ADZ05627,A,human,LyonCHU,2003,,North America,human,4o5n,TATLCLGH-AVPNGTIVKTITNDQIEVTNATELVQ-S----TGICD...
ADZ05628,A,human,Reunion,2004,,Africa,human,2yp5,TATLCLGH-AVPNGTIVKTITNDQIEVTNATELVQ-S----TGICD...
CAC18524,A,human,Fiji,1983,,Oceania,human,4edb,-DTICIGYHAN-STDTVDTVLEKNVTVTHSVN-LED--NHNGKLCK...


In [119]:
# Keep only the rows with a subtype; copy so that the columns added later are
# written to an independent frame rather than to a slice of the old one.
m_all = m_all.loc[m_all["subtype"].notnull()].copy()

In [120]:
# Split each subtype label (e.g. "H2N9") around the "N": the digits after the
# leading "H" become column H, the part after the "N" becomes column N.
# Labels ending in "x" (e.g. "H9Nx") get an empty string in N instead of a number.
m_all["H"] = m_all["subtype"].apply(lambda label: int(label.split("N")[0][1:]))
m_all["N"] = m_all["subtype"].apply(
    lambda label: "" if label[-1] == "x" else int(label.split("N")[1])
)

In [121]:
# Preview the merged model table with the new H and N columns.
m_all.head()

Unnamed: 0_level_0,chain,host,location,year,subtype,continent,host_class,template_id,sequence,H,N
p_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAA43090,A,human,Berlin,1964,H2N2,Europe,human,3ku6,GDQICIGYHAN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...,2,2
AAA43117,A,avian,,1972,H2N9,Europe,avian,3ku6,GDQICIGYHSN-STEKVDTILERNVTVTHAKDILEK--THNGKLCK...,2,9
AAA43185,A,human,Japan,1957,H2N2,Asia,human,3ku6,GDQICIGYHAN-STEKVDTNLERNVTVTHAKDILEK--THNGKLCK...,2,2
AAA43205,A,avian,Ontario,1966,H5N9,North America,avian,1jso,-DQICIGYHAN-STKQVDTIMEKNVTVTYAQDILEK--EHNGKLCS...,5,9
AAA43243,A,avian,Hong Kong,1978,H2N2,Asia,avian,3ku6,GDQICIGYHAN-STETVDTILERNVTVTHAKNILEK--THNGKLCK...,2,2


In [122]:
# WARNING: as of now without serotype or subtype
# (presumably: the models lacking a subtype, removed from m_all earlier, are
# absent from this export -- TODO confirm).
# "%.0f" prints float columns without a fractional part.
m_all.to_csv("../site/data/models_metadata.csv", float_format="%.0f")

In [124]:
# Number of distinct subtype labels among the exported models; dropna=False
# counts a missing value as a label of its own, exactly as unique() would.
m_all["subtype"].nunique(dropna=False)

42