In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import json
import bokeh

from Bio import AlignIO, SeqIO
from networkx.readwrite import json_graph
from copy import deepcopy


%matplotlib inline

In [3]:
# Load in Minto Flats' data sheet that contains the age class of the ducks
duck_ages = pd.read_csv('Raw Data/20140916 Strain Names with Ages.csv')

# If we get the unique values of the ages, we should get: 
# array(['1B', '1C', '2A', '2B', '2C', '3', 'AHY', 'HY'], dtype=object) 
np.unique(duck_ages['Age'].values)

# Based on that, do a replace.
old_ages = ['1B', '1C', '2A', '2B', '2C', '3', 'AHY', 'HY']
new_ages = ['DKL', 'DKL', 'DKL', 'DKL', 'DKL', 'DKL', 'ADT', 'FLG']

duck_ages = duck_ages.replace(old_ages, new_ages)
duck_ages

# Ensure that the common names do not contain the subtype information in it.
duck_ages['Common Name'] = duck_ages['Common Name'].str.split('(').str[0]
duck_ages = duck_ages.set_index('Common Name')
duck_ages['Age1'] = duck_ages['Age']
duck_ages['Age2'] = duck_ages['Age']
duck_ages

Unnamed: 0_level_0,Age,Age1,Age2
Common Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A/northern pintail/Interior Alaska/10BM00303R0/2010,ADT,ADT,ADT
A/northern pintail/Interior Alaska/10BM00849R2/2010,ADT,ADT,ADT
A/mallard/Interior Alaska/10BM01929R0/2010,ADT,ADT,ADT
A/mallard/Interior Alaska/10BM02111R0/2010,ADT,ADT,ADT
A/mallard/Interior Alaska/10BM02530R0/2010,ADT,ADT,ADT
A/northern pintail/Interior Alaska/10BM02539R0/2010,ADT,ADT,ADT
A/northern pintail/Interior Alaska/10BM02561R0/2010,ADT,ADT,ADT
A/northern pintail/Interior Alaska/10BM02585R0/2010,ADT,ADT,ADT
A/northern shoveler/Interior Alaska/10BM02593R0/2010,ADT,ADT,ADT
A/mallard/Interior Alaska/10BM02644R0/2010,ADT,ADT,ADT


In [5]:
# We will load the undirected version of the transmission graph and use the links to plot the arc plots.

# Load data:
data = pd.read_csv('3.10 - Full Transmission as CSV/Undirected Full Transmissions.csv', 
                   index_col=0, 
                   parse_dates=['Isolation Date1', 'Isolation Date2'])

# To perform merge, set index of data to be Isolate1.
data = data.set_index("Isolate1").join(duck_ages['Age1'])
data.index.name = 'Isolate1'
data = data.reset_index().set_index("Isolate2").join(duck_ages['Age2'])
data.index.name = "Isolate2"
data = data.reset_index()

# Impute missing age data with adult ages by default.
data['Age1'] = data['Age1'].fillna('ADT')
data['Age2'] = data['Age2'].fillna('ADT')

In [6]:
# Create NetworkX Graph
G = nx.DiGraph()

# Add in node information
for row in data.iterrows():
    G.add_node(row[1]['Isolate1'], 
               host = row[1]['Host1'],
               isolation_date = row[1]['Isolation Date1'],
               state = row[1]['State1'],
               subtype = row[1]['Subtype1'],
               age = row[1]['Age1'])
    G.add_node(row[1]['Isolate2'],
               host = row[1]['Host2'],
               isolation_date = row[1]['Isolation Date2'],
               state = row[1]['State2'],
               subtype = row[1]['Subtype2'],
               age = row[1]['Age2'])
    
    G.add_edge(row[1]['Isolate1'],
               row[1]['Isolate2'],
               tr_timeclass = row[1]['Time Class'],
               time_delta = row[1]['Time Delta'],
               trtype = row[1]['Transmission Type'],
               weight = row[1]['Weight'])

In [7]:
for edge in G.edges(data=True):
    node1 = edge[0]
    node2 = edge[1]
    if G.node[node1]['age'] == 'DKL' and G.node[node2]['age'] == 'DKL':
        edge[2]['tr_age'] = 'DD'
    elif G.node[node1]['age'] == 'DKL' and G.node[node2]['age'] == 'FLG':
        edge[2]['tr_age'] = 'DF'
    elif G.node[node1]['age'] == 'DKL' and G.node[node2]['age'] == 'ADT':
        edge[2]['tr_age'] = 'DA'
    elif G.node[node1]['age'] == 'FLG' and G.node[node2]['age'] == 'FLG':
        edge[2]['tr_age'] = 'FF'
    elif G.node[node1]['age'] == 'FLG' and G.node[node2]['age'] == 'ADT':
        edge[2]['tr_age'] = 'FA'
    elif G.node[node1]['age'] == 'ADT' and G.node[node2]['age'] == 'ADT':
        edge[2]['tr_age'] = 'AA'
    else:
        edge[2]['tr_age'] = 'NA'

In [9]:
len(G.edges(data=True))

1213

In [10]:
# This is a custom function for serializing dates such that it is JSON compatible. 
# It gets used right at the last step prior to the JSON dump.
from dateutil.tz import tzutc

UTC = tzutc()

def serialize_date(dt):
    """
    Serialize a date/time value into an ISO8601 text representation
    adjusted (if needed) to UTC timezone.

    For instance:
    >>> serialize_date(datetime(2012, 4, 10, 22, 38, 20, 604391))
    '2012-04-10T22:38:20.604391Z'
    """
    #if dt.tzinfo:
    #    dt = dt.astimezone(UTC)
    return dt.isoformat()

In [11]:
# Dump the graph as a JSON node-link file.
# Prior to doing this, serialize all of the node dates to be JSON serializable.
G_serialized = deepcopy(G)

for node in G_serialized.nodes(data=True):
    node[1]['isolation_date'] = serialize_date(node[1]['isolation_date'])

jsongraph = json_graph.node_link_data(G_serialized) # node-link format to serialize
json.dump(jsongraph, open('viz/arcage_all.json','w'))

In [12]:
len(G_serialized.edges(data=True))

1213

In [13]:
tr_ages = [edge[2]['tr_age'] for edge in G_serialized.edges(data=True)]
# tr_ages = [item['tr_age'] for item in tr_ages]
np.unique(tr_ages)

array(['AA', 'DA', 'DD', 'DF', 'FA', 'FF', 'NA'], 
      dtype='<U2')