In [1]:
import numpy as np
import pandas as pd
import feather
import plotly
from tqdm import tqdm

In [2]:
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.grid_objs import Grid, Column
from plotly.graph_objs import *
from IPython.display import display, HTML
from datetime import datetime, timedelta

In [3]:
init_notebook_mode(connected=True)

# Get dataframe

In [4]:
antiflu = feather.read_dataframe('..\\..\\Data\\Datathon_2017\\antiflu_sample_data')

In [5]:
postcode_populations_lat_lon = feather.read_dataframe('..\\..\\Data\\Datathon_2017\\postcode_populations_lat_lon')

In [6]:
antiflu.columns

Index(['Patient_ID', 'Store_ID', 'Prescriber_ID', 'Drug_ID',
       'SourceSystem_Code', 'Prescription_Week', 'Dispense_Week', 'Drug_Code',
       'NHS_Code', 'IsDeferredScript', 'Script_Qty', 'Dispensed_Qty',
       'MaxDispense_Qty', 'PatientPrice_Amt', 'WholeSalePrice_Amt',
       'GovernmentReclaim_Amt', 'RepeatsTotal_Qty', 'RepeatsLeft_Qty',
       'StreamlinedApproval_Code', 'gender', 'year_of_birth', 'postcode',
       'lat', 'long', 'MasterProductCode', 'MasterProductFullName',
       'BrandName', 'FormCode', 'StrengthCode', 'PackSizeNumber',
       'GenericIngredientName', 'EthicalSubCategoryName',
       'EthicalCategoryName', 'ManufacturerCode', 'ManufacturerName',
       'ManufacturerGroupID', 'ManufacturerGroupCode', 'ChemistListPrice',
       'ATCLevel5Code', 'ATCLevel4Code.x', 'ATCLevel3Code.x',
       'ATCLevel2Code.x', 'ATCLevel1Code.x', 'ATCLevel1Code.y',
       'ATCLevel1Name', 'ATCLevel2Code.y', 'ATCLevel2Name', 'ATCLevel3Code.y',
       'ATCLevel3Name', 'ATCLevel4C

In [7]:
antiflu[['Patient_ID','lat','long','total_pop']].head(2)

Unnamed: 0,Patient_ID,lat,long,total_pop
0,51.0,-34.861994,138.592267,22263.0
1,51.0,-34.861994,138.592267,22263.0


# init_params

In [8]:
grid_res = 50
lat_spacing = [ -42, -25 ]
lon_spacing = [ 135, 155 ]

In [9]:
lat_grid = np.linspace(lat_spacing[0],lat_spacing[1],grid_res)
lon_grid = np.linspace(lon_spacing[0],lon_spacing[1],grid_res)

In [10]:
def between(x,num1,num2):
    if (x < num2 and x >= num1):
        return(True)
    else:
        return(False)

In [11]:
def getRecordsBetweenDates(date_1,date_2):
    return antiflu[antiflu.apply( lambda x: between(x['Prescription_Week'],date_1,date_2),axis=1 ) ]

In [12]:
def addDaysToDate(date,days_num):
    return datetime.strftime((datetime.strptime(date,'%Y-%m-%d') + timedelta(days=days_num)),'%Y-%m-%d')

# 1 years worth of data here

In [42]:
date_list = [['2013-05-05','2013-05-19']]

for i in range(26):
    date_list.append([date_list[-1][1], addDaysToDate(date_list[-1][1],14)])

In [43]:
date_list

[['2013-05-05', '2013-05-19'],
 ['2013-05-19', '2013-06-02'],
 ['2013-06-02', '2013-06-16'],
 ['2013-06-16', '2013-06-30'],
 ['2013-06-30', '2013-07-14'],
 ['2013-07-14', '2013-07-28'],
 ['2013-07-28', '2013-08-11'],
 ['2013-08-11', '2013-08-25'],
 ['2013-08-25', '2013-09-08'],
 ['2013-09-08', '2013-09-22'],
 ['2013-09-22', '2013-10-06'],
 ['2013-10-06', '2013-10-20'],
 ['2013-10-20', '2013-11-03'],
 ['2013-11-03', '2013-11-17'],
 ['2013-11-17', '2013-12-01'],
 ['2013-12-01', '2013-12-15'],
 ['2013-12-15', '2013-12-29'],
 ['2013-12-29', '2014-01-12'],
 ['2014-01-12', '2014-01-26'],
 ['2014-01-26', '2014-02-09'],
 ['2014-02-09', '2014-02-23'],
 ['2014-02-23', '2014-03-09'],
 ['2014-03-09', '2014-03-23'],
 ['2014-03-23', '2014-04-06'],
 ['2014-04-06', '2014-04-20'],
 ['2014-04-20', '2014-05-04'],
 ['2014-05-04', '2014-05-18']]

In [44]:
getRecordsBetweenDates('2013-05-02','2013-05-10')

Unnamed: 0,Patient_ID,Store_ID,Prescriber_ID,Drug_ID,SourceSystem_Code,Prescription_Week,Dispense_Week,Drug_Code,NHS_Code,IsDeferredScript,...,ATCLevel2Code.y,ATCLevel2Name,ATCLevel3Code.y,ATCLevel3Name,ATCLevel4Code.y,ATCLevel4Name,ATCLevel5Name,Prescription_Year,total_pop,working_age_pct
364,6524.0,1216.0,6522.0,2720.0,F,2013-05-05,2013-05-12,DOXY13,9106G,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,22646.0,67.0
365,6524.0,1216.0,6522.0,2720.0,F,2013-05-05,2013-05-19,DOXY13,9106G,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,22646.0,67.0
366,6524.0,1216.0,6522.0,2720.0,F,2013-05-05,2013-06-09,DOXY13,9106G,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,22646.0,67.0
636,9796.0,442.0,22271.0,2735.0,F,2013-05-05,2013-09-01,DOXY28,9107H,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,20846.0,68.5
637,9796.0,442.0,22271.0,2735.0,F,2013-05-05,2013-10-06,DOXY28,9107H,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,20846.0,68.5
638,9796.0,442.0,22271.0,2735.0,F,2013-05-05,2013-11-17,DOXY28,9107H,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,20846.0,68.5
1088,108125.0,1607.0,16619.0,1698.0,F,2013-05-05,2013-06-30,CILI2,2965C,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01C,"BETA-LACTAM ANTIBACTERIALS, PENICILLINS",J01CE,BETA-LACTAMASE SENSITIVE PENICILLINS,PHENOXYMETHYLPENICILLIN,2013.0,8312.0,76.6
1089,108125.0,1607.0,16619.0,1698.0,F,2013-05-05,2013-06-30,CILI2,2965C,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01C,"BETA-LACTAM ANTIBACTERIALS, PENICILLINS",J01CE,BETA-LACTAMASE SENSITIVE PENICILLINS,PHENOXYMETHYLPENICILLIN,2013.0,8312.0,76.6
1803,116636.0,1798.0,0.0,9556.0,F,2013-05-05,2013-06-09,MPDO1,2707L,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,21359.0,67.4
1804,116636.0,1798.0,0.0,9556.0,F,2013-05-05,2013-07-07,MPDO1,2707L,0.0,...,J01,ANTIBACTERIALS FOR SYSTEMIC USE,J01A,TETRACYCLINES,J01AA,TETRACYCLINES,DOXYCYCLINE,2013.0,21359.0,67.4


# Generate Data

In [45]:
pop_matrix = np.zeros((grid_res,grid_res))

tmp_pop_data = postcode_populations_lat_lon[postcode_populations_lat_lon['YEAR'] == int(date_list[-1][0][0:4])]

In [46]:
    for i in tqdm(range(grid_res-1)):
        for j in range(grid_res-1):
            pop_matrix[i,j] = tmp_pop_data[tmp_pop_data.apply(lambda x: ( between(x['lat'], lat_grid[i], lat_grid[i+1]) and between(x['long'], lon_grid[j], lon_grid[j+1]) ) ,axis=1) ]['total_pop'].sum()        

100%|██████████████████████████████████████████████████████████████████████████████████| 49/49 [01:07<00:00,  1.37s/it]


In [47]:
def generateCountMatrix(tmp_df):
    
    count_matrix = np.zeros((grid_res,grid_res))
    for i in range(grid_res-1):
        for j in range(grid_res-1):
            count_matrix[i,j] = tmp_df[tmp_df.apply(lambda x: ( between(x['lat'], lat_grid[i], lat_grid[i+1]) and between(x['long'], lon_grid[j], lon_grid[j+1]) ) ,axis=1) ]['Patient_ID'].count()
    return count_matrix

# test

In [48]:
count_matrix = generateCountMatrix(tmp_df)

In [49]:
np.sum(count_matrix)

220.0

In [50]:
def div0( a, b ):
    """ ignore / 0, div0( [-1, 0, 1], 0 ) -> [0, 0, 0] """
    with np.errstate(divide='ignore', invalid='ignore'):
        c = np.true_divide( a, b )
        c[ ~ np.isfinite( c )] = 0  # -inf inf NaN
    return c.T

In [51]:
antiflu_ratio_matrix = div0(count_matrix, pop_matrix)*1000

In [52]:
trace_heat = Heatmap(z = antiflu_ratio_matrix)

In [53]:
iplot([trace_heat])

In [54]:
grid_res

50

In [58]:
#scale needs to be ebtween 0 and 1
colscl = [
        # Let first 10% (0.1) of the values have color rgb(0, 0, 0)
        [0, 'rgb(255, 255, 255)'],
        [0.2, 'rgb(255, 255, 255)'],

        # Let values between 10-20% of the min and max of z
        # have color rgb(20, 20, 20)
        [0.2, 'rgb(255, 255, 255)'],
        [0.4, 'rgb(255, 216, 32)'],

        # Values between 20-30% of the min and max of z
        # have color rgb(40, 40, 40)
        [0.4, 'rgb(255, 216, 32)'],
        [0.8, 'rgb(234, 35, 0)'],

        [0.8, 'rgb(234, 35, 0)'],
        [1, 'rgb(0, 0, 0)']
    ]

In [56]:

data_pts = []


for date_item in tqdm(date_list):

    tmp_df = getRecordsBetweenDates(date_item[0],date_item[1])

    x = np.transpose(np.array(tmp_df[['long']]))[0]
    y = np.transpose(np.array(tmp_df[['lat']]))[0]

    xv, yv = np.meshgrid(lat_grid[:(grid_res)], lon_grid[:(grid_res)])

    #H, xedges, yedges = np.histogram2d(x,y,bins=(lon_grid,lat_grid))
    count_matrix = generateCountMatrix(tmp_df)

    H = div0(count_matrix, pop_matrix)*1000

    H = H.reshape((grid_res)*(grid_res),1)
    xv = xv.reshape((grid_res)*(grid_res),1)
    yv = yv.reshape((grid_res)*(grid_res),1)

    df = pd.DataFrame(np.concatenate((H,xv,yv), axis = 1), columns=['density','lat','lon'])
    df_reduced = df[df['density'] > 0]
    #df_reduced = df

    data_pt = [ dict(
        lat = df_reduced['lat'],
        lon = df_reduced['lon'],
        text = df_reduced['density'].astype(str),
        marker = dict(
            symbol = "square-dot",
            color = df_reduced['density'],
            colorscale= colscl,
            reversescale = False,
            opacity = 0.7,
            size = 8,
            colorbar = dict(
                thickness = 10,
                titleside = "right",
                outlinecolor = "rgb(212,212,212)",
                ticks = "outside",
                ticklen = 3,
                showticksuffix = "last",
                dtick = 0.1
            ),
        ),
        type = 'scattergeo'
    ) ]
    
    if (np.sum(H) > 0):
        data_pts.append({'data' : data_pt})

100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [04:31<00:00, 10.63s/it]


# total histogram

In [57]:
data = [ dict(
    lat = df_reduced['lat'],
    lon = df_reduced['lon'],
    text = df_reduced['density'].astype(str),
    marker = dict(
        color = df_reduced['density'],
        colorscale= colscl,
        reversescale = False,
        opacity = 0.7,
        size = 10,
        colorbar = dict(
            thickness = 10,
            titleside = "right",
            outlinecolor = "rgb(212,212,212)",
            ticks = "outside",
            ticklen = 3,
            showticksuffix = "last",
            dtick = 0.1
        ),
    ),
    type = 'scattergeo'
) ]

figure = {'data': data_pts[0]['data'],
         'layout': dict(
                            geo = dict(
                            scope = 'world',
                            showland = True,
                            landcolor = "rgb(212, 212, 212)",
                            subunitcolor = "rgb(255, 255, 255)",
                            countrycolor = "rgb(255, 255, 255)",
                            showlakes = True,
                            lakecolor = "rgb(255, 255, 255)",
                            showsubunits = True,
                            showcountries = True,
                            resolution = 50,
                            lonaxis = dict(
                                showgrid = True,
                                gridwidth = 0.5,
                                range= lon_spacing,
                                dtick = (lon_spacing[1]-lon_spacing[0])/grid_res*10
                            ),
                            lataxis = dict (
                                showgrid = True,
                                gridwidth = 0.5,
                                range= lat_spacing,
                                dtick = (lat_spacing[1]-lat_spacing[0])/grid_res*10
                            )
                            ),
         updatemenus = [{'type': 'buttons',
                                  'buttons': [{'label': 'Play',
                                               'method': 'animate',
                                               'args': [None]}]}],
            title = 'AU drugs for systemic use scatter'
        ),
        'frames' : data_pts
         }


iplot(figure)