In [2]:
%matplotlib inline
%load_ext autoreload

In [3]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append('../../mbspbs10pc')

from mbspbs10pc import utils

# Check MBS-PBS 10% data availability

In [4]:
ROOT = os.path.join('..','..','..','data')

mbs_files = filter(lambda x: x.startswith('MBS'), os.listdir(ROOT))
pbs_files = filter(lambda x: x.startswith('PBS'), os.listdir(ROOT))
sample_pin_lookout = filter(lambda x: x.startswith('SAMPLE'), os.listdir(ROOT))[0]

print('MBS files:')
for mbs in mbs_files:
    print('{}'.format(os.path.join(ROOT, mbs)))
    
print('PBS files:')
for pbs in pbs_files:
    print('{}'.format(os.path.join(ROOT, pbs)))
    
print('Sample PIN lookout: {}'.format(os.path.join(ROOT, sample_pin_lookout)))

MBS files:
../../../data/MBS_SAMPLE_10PCT_2014.csv
../../../data/MBS_SAMPLE_10PCT_2010.csv
../../../data/MBS_SAMPLE_10PCT_2012.csv
../../../data/MBS_SAMPLE_10PCT_2008.csv
../../../data/MBS_SAMPLE_10PCT_2009.csv
../../../data/MBS_SAMPLE_10PCT_2013.csv
../../../data/MBS_SAMPLE_10PCT_2011.csv
PBS files:
../../../data/PBS_SAMPLE_10PCT_2012.csv
../../../data/PBS_SAMPLE_10PCT_2011.csv
../../../data/PBS_SAMPLE_10PCT_2009.csv
../../../data/PBS_SAMPLE_10PCT_2013.csv
../../../data/PBS_SAMPLE_10PCT_2008.csv
../../../data/PBS_SAMPLE_10PCT_2010.csv
Sample PIN lookout: ../../../data/SAMPLE_PIN_LOOKUP.csv


# PBS take a look

In [5]:
filename = pbs_files[0]
df = pd.read_csv(os.path.join(ROOT, '{}'.format(filename)), header=0, index_col=0, nrows=1000)
df.shape

(1000, 13)

In [6]:
df.head()

Unnamed: 0_level_0,SPPLY_DT,ITM_CD,PBS_RGLTN24_ADJST_QTY,BNFT_AMT,PTNT_CNTRBTN_AMT,SRT_RPT_IND,RGLTN24_IND,DRG_TYP_CD,MJR_SPCLTY_GRP_CD,UNDR_CPRSCRPTN_TYP_CD,PRSCRPTN_CNT,PTNT_CTGRY_DRVD_CD,PTNT_STATE
PTNT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
4599420094,11DEC2012,02236Q,30,0.0,16.01,R,N,GE,1.0,U,1,G2,VIC
5850597601,26NOV2012,08386J,56,72.91,0.0,R,N,GE,1.0,,1,C0,NSW
1237668038,25FEB2012,08214H,30,22.6,35.4,R,N,GE,1.0,,1,G2,VIC
4493310342,18JAN2012,02055E,120,18.92,5.8,R,N,GE,1.0,,1,C1,VIC
2902208774,26DEC2012,02751T,30,11.56,0.0,,N,GE,1.0,,1,C0,SA


In [7]:
dd = pd.read_csv(os.path.join('..', 'data', 'drugs_used_in_diabetes.csv'), header=0)

# Fix 6-digit notation
dd_set = set()
for item in dd.values.ravel():
    if len(item)<6:
        dd_set.add(str(0)+item)
    else:
        dd_set.add(item)

In [8]:
print(dd_set)

set(['09061X', '09180E', '10649L', '01763T', '09449H', '08694N', '10626G', '08811R', '08874C', '01533Q', '10038H', '09040T', '03415R', '05476F', '02939Q', '01762R', '01801T', '10510E', '10627H', '08695P', '09039R', '03439B', '10206E', '08571D', '10515K', '10677Y', '08435Y', '05474D', '03423E', '01761Q', '10048W', '10089B', '08188Y', '02944Y', '01531N', '02440K', '08696Q', '08212F', '10516L', '01711C', '09181F', '01921D', '09060W', '03424F', '10640B', '10055F', '08189B', '08810Q', '09435N', '10128C', '10888C', '08452W', '10033C', '09302N', '08390N', '02430X', '09451K', '08609D', '08533D', '02062M', '10032B', '10650M', '05475E', '09059T', '02449X', '09450J', '02933J', '10202Y', '08084L', '10035E', '09062Y', '10633P', '09182G', '10011X', '02986E', '10044P', '08983T', '08535F', '08450R', '08607B', '01713E', '10639Y', '10045Q', '08451T', '08838E', '10051B', '09224L', '10090C', '01426C', '03387G'])


In [20]:
for filename in pbs_files:
    reader = pd.read_csv(os.path.join(ROOT, '{}'.format(filename)), chunksize=1000)
    
    for i, chunk in enumerate(reader):
        idx = chunk.loc[chunk['ITM_CD'].isin(dd_set)]['ITM_CD'].index
        if len(idx)> 0: print(idx)
    break

Int64Index([ 59,  84,  94,  97, 104, 108, 144, 149, 163, 237, 240, 283, 378,
            385, 399, 416, 454, 464, 521, 539, 541, 551, 557, 568, 594, 639,
            748, 753, 814, 843, 875, 985, 999],
           dtype='int64')
Int64Index([1015, 1027, 1095, 1176, 1177, 1203, 1235, 1249, 1257, 1300, 1319,
            1329, 1353, 1357, 1377, 1410, 1425, 1432, 1448, 1486, 1523, 1549,
            1560, 1642, 1671, 1673, 1687, 1725, 1761, 1780, 1783, 1839, 1856,
            1870, 1926, 1935, 1938],
           dtype='int64')
Int64Index([2039, 2040, 2093, 2100, 2128, 2202, 2225, 2226, 2238, 2309, 2312,
            2315, 2329, 2343, 2406, 2444, 2467, 2486, 2595, 2603, 2625, 2639,
            2659, 2662, 2668, 2736, 2757, 2790, 2841, 2900, 2928, 2953],
           dtype='int64')
Int64Index([3000, 3011, 3019, 3022, 3052, 3079, 3100, 3128, 3185, 3194, 3246,
            3277, 3280, 3284, 3287, 3299, 3302, 3373, 3399, 3471, 3481, 3486,
            3527, 3539, 3604, 3606, 3694, 3737, 3774, 3784, 3796

Int64Index([37031, 37032, 37046, 37068, 37069, 37138, 37150, 37203, 37239,
            37362, 37383, 37403, 37479, 37486, 37490, 37506, 37511, 37520,
            37524, 37543, 37555, 37574, 37579, 37593, 37600, 37628, 37668,
            37674, 37701, 37708, 37711, 37758, 37762, 37802, 37817, 37840,
            37919, 37941, 37943, 37972, 37974],
           dtype='int64')
Int64Index([38116, 38132, 38178, 38190, 38272, 38278, 38362, 38383, 38389,
            38401, 38418, 38422, 38450, 38469, 38480, 38485, 38495, 38502,
            38506, 38519, 38558, 38620, 38716, 38799, 38821, 38835, 38849,
            38866, 38886, 38897, 38942, 38945],
           dtype='int64')
Int64Index([39057, 39095, 39166, 39246, 39258, 39264, 39296, 39302, 39324,
            39328, 39342, 39347, 39357, 39362, 39365, 39376, 39405, 39407,
            39438, 39469, 39475, 39486, 39490, 39515, 39531, 39567, 39589,
            39600, 39603, 39625, 39646, 39659, 39684, 39688, 39698, 39748,
            39797, 39832, 3

Int64Index([74009, 74010, 74028, 74043, 74070, 74106, 74112, 74136, 74162,
            74166, 74242, 74307, 74324, 74354, 74370, 74427, 74486, 74494,
            74546, 74547, 74653, 74751, 74788, 74872, 74934, 74991],
           dtype='int64')
Int64Index([75042, 75083, 75118, 75120, 75180, 75186, 75224, 75251, 75284,
            75341, 75374, 75396, 75401, 75411, 75503, 75521, 75556, 75578,
            75594, 75634, 75648, 75651, 75707, 75718, 75754, 75777, 75848,
            75873, 75882, 75900, 75926, 75928, 75994],
           dtype='int64')
Int64Index([76005, 76016, 76057, 76059, 76061, 76083, 76096, 76127, 76144,
            76155, 76157, 76244, 76261, 76264, 76266, 76281, 76309, 76376,
            76394, 76478, 76505, 76517, 76545, 76562, 76608, 76625, 76626,
            76638, 76680, 76701, 76720, 76742, 76763, 76817, 76842, 76903,
            76982],
           dtype='int64')
Int64Index([77014, 77023, 77035, 77129, 77153, 77171, 77178, 77227, 77287,
            77339, 77342, 77

Int64Index([118056, 118103, 118124, 118149, 118164, 118168, 118177, 118198,
            118226, 118252, 118305, 118411, 118451, 118478, 118481, 118492,
            118493, 118542, 118550, 118576, 118642, 118689, 118710, 118766,
            118795, 118796, 118823, 118826, 118835, 118841, 118864, 118919,
            118930, 118952, 118981],
           dtype='int64')
Int64Index([119023, 119111, 119126, 119146, 119157, 119218, 119231, 119244,
            119282, 119285, 119301, 119341, 119383, 119626, 119638, 119714,
            119779, 119811, 119843, 119856, 119902, 119918, 119921, 119944,
            119955, 119958],
           dtype='int64')
Int64Index([120063, 120077, 120089, 120211, 120359, 120396, 120397, 120434,
            120476, 120522, 120528, 120531, 120546, 120591, 120679, 120691,
            120709, 120779, 120816, 120827, 120830, 120866, 120877, 120898,
            120918, 120940, 120973],
           dtype='int64')
Int64Index([121016, 121043, 121044, 121045, 121060, 121082,

KeyboardInterrupt: 

In [40]:
import multiprocessing as mp
from multiprocessing import Manager

manager = Manager()
results = manager.dict()

def process_chunk(i, chunk, results):
    # process data frame
    idx = chunk.loc[chunk['ITM_CD'].isin(dd_set)]['PTNT_ID']
    if len(idx)>0: results[i] = idx.values

reader = pd.read_csv(os.path.join(ROOT, '{}'.format(filename)), chunksize=5000)
pool = mp.Pool(8) # use 4 processes

jobs = []
for i, chunk in enumerate(reader):
    # process each data frame
    f = pool.apply_async(process_chunk, [i, chunk, results])
    jobs.append(f)

for f in jobs:
    f.get()




In [46]:
pbs_years = [s.split('_')[-1].split('.')[0] for s in pbs_files]

In [50]:
results[0]

array([3928691704, 5156241855, 7146682928, 9669167460, 9994115866,
       7798245800, 2298601413, 6857553007,  488112048, 4929049992,
        960577885, 7168070086,  586106903, 7960752450, 5267384775,
       4308515754, 7146682928, 9994115866, 6647794797, 1035365209,
       6795166169, 2785437590, 3827086426, 2678863910, 1228293469,
       9450559749, 9630518521, 7222369217, 7146682928, 1825040432,
       2715095631, 6535963265, 6127458217, 6877386662, 2112734019,
       4067353197, 5637523093, 8283549141, 6775240862, 2715095631,
       2867666705, 3280762999, 4887647344, 9551026230, 5742324091,
       9330854747, 1247754848, 8745754917, 3661950237, 7827501575,
       7817130820, 1291139382,  510595601, 4619024211,  651507472,
       5227872388, 9518153326, 8918545549, 9706196330, 6897718194,
       9127260023, 3844363419, 9009769893, 2249439106,  254840024,
       5844767311, 4885240458,  400006677, 3534788118, 3073049706,
       2109925903, 1838164895, 3032180721, 8023589768, 3007244

In [34]:
a

{'a', 'ads', 'b', 'c'}

In [23]:
rr = dict(results)

In [26]:
rr[0]

Int64Index([ 59,  84,  94,  97, 104, 108, 144, 149, 163, 237, 240, 283, 378,
            385, 399, 416, 454, 464, 521, 539, 541, 551, 557, 568, 594, 639,
            748, 753, 814, 843, 875, 985, 999],
           dtype='int64')

In [None]:

#    print(df.loc[df['ITM_CD'].isin(dd_set)]['ITM_CD'].index)
#    df = pd.read_csv(os.path.join(ROOT, '{}'.format(filename)), header=0, index_col=0)