In [1]:
%matplotlib inline
%load_ext autoreload

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append('../../mbspbs10pc')

from mbspbs10pc import utils

# MBS-PBS 10% data availability

In [3]:
ROOT = os.path.join('..','..','..','data')

mbs_files = filter(lambda x: x.startswith('MBS'), os.listdir(ROOT))
pbs_files = filter(lambda x: x.startswith('PBS'), os.listdir(ROOT))
sample_pin_lookout = filter(lambda x: x.startswith('SAMPLE'), os.listdir(ROOT))[0]

print('MBS files:')
for mbs in mbs_files:
    print('{}'.format(os.path.join(ROOT, mbs)))
    
print('PBS files:')
for pbs in pbs_files:
    print('{}'.format(os.path.join(ROOT, pbs)))
    
print('Sample PIN lookout: {}'.format(os.path.join(ROOT, sample_pin_lookout)))

MBS files:
../../../data/MBS_SAMPLE_10PCT_2010.csv
../../../data/MBS_SAMPLE_10PCT_2013.csv
../../../data/MBS_SAMPLE_10PCT_2008.csv
../../../data/MBS_SAMPLE_10PCT_2009.csv
../../../data/MBS_SAMPLE_10PCT_2011.csv
../../../data/MBS_SAMPLE_10PCT_2014.csv
../../../data/MBS_SAMPLE_10PCT_2012.csv
PBS files:
../../../data/PBS_SAMPLE_10PCT_2010.csv
../../../data/PBS_SAMPLE_10PCT_2011.csv
../../../data/PBS_SAMPLE_10PCT_2012.csv
../../../data/PBS_SAMPLE_10PCT_2009.csv
../../../data/PBS_SAMPLE_10PCT_2013.csv
../../../data/PBS_SAMPLE_10PCT_2008.csv
Sample PIN lookout: ../../../data/SAMPLE_PIN_LOOKUP.csv


# Monthly breakdown

In [118]:
filename = pbs_files[-1]
print(filename)
df = pd.read_csv(os.path.join(ROOT, '{}'.format(filename)), header=0, index_col=0, nrows=1000)
df['SPPLY_DT'] = pd.to_datetime(df['SPPLY_DT'], format='%d%b%Y')
df.shape

PBS_SAMPLE_10PCT_2008.csv


(1000, 13)

In [119]:
df.head(5)

Unnamed: 0_level_0,SPPLY_DT,ITM_CD,PBS_RGLTN24_ADJST_QTY,BNFT_AMT,PTNT_CNTRBTN_AMT,SRT_RPT_IND,RGLTN24_IND,DRG_TYP_CD,MJR_SPCLTY_GRP_CD,UNDR_CPRSCRPTN_TYP_CD,PRSCRPTN_CNT,PTNT_CTGRY_DRVD_CD,PTNT_STATE
PTNT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
9772479275,2008-05-05,01171P,1,3.63,5.0,,N,GE,1.0,,1,C1,QLD
8114957183,2008-06-06,08681X,20,14.74,5.0,,N,GE,1.0,,1,C1,WA
8866313366,2008-12-14,01325R,60,6.6,5.0,,N,GE,1.0,,1,G1,WA
8493121914,2008-11-15,02236Q,30,24.23,0.0,R,N,GE,1.0,,1,C0,VIC
5124189855,2008-09-01,02745L,2,45.29,5.0,,N,GE,2.0,,1,G1,VIC


In [114]:
df['SPPLY_DT'].values[0][-4:]

'2008'

In [106]:
#dt['SPPLY_DT'] >= '2008-05-01'
dt[np.logical_and(dt['SPPLY_DT'] >= datetime.date(year=2008, month=5, day=1), dt['SPPLY_DT'] < datetime.date(year=2008, month=5, day=31))]

Unnamed: 0_level_0,SPPLY_DT
PTNT_ID,Unnamed: 1_level_1
9772479275,2008-05-05
6884083990,2008-05-28
4676167119,2008-05-25
1823483519,2008-05-28
2091335009,2008-05-29
3042221718,2008-05-26
5312344996,2008-05-27
2673986956,2008-05-19
8230958129,2008-05-16
8079814973,2008-05-16


In [120]:
months = [datetime.date(2000, m, 1).strftime('%b').upper() for m in range(1, 13)] # JAN, FEB, MAR...
indexes = {m: set() for m in months}

indexes

{'APR': set(),
 'AUG': set(),
 'DEC': set(),
 'FEB': set(),
 'JAN': set(),
 'JUL': set(),
 'JUN': set(),
 'MAR': set(),
 'MAY': set(),
 'NOV': set(),
 'OCT': set(),
 'SEP': set()}

In [124]:
indexes

{'APR': {0, 111},
 'AUG': set(),
 'DEC': set(),
 'FEB': set(),
 'JAN': set(),
 'JUL': set(),
 'JUN': set(),
 'MAR': set(),
 'MAY': set(),
 'NOV': set(),
 'OCT': set(),
 'SEP': set()}

In [129]:
import calendar
first_day, last_day = calendar.monthrange(2008, 2)

In [130]:
last_day

29

In [50]:
df.loc[df['BNFT_AMT'] > 10]

Unnamed: 0_level_0,SPPLY_DT,ITM_CD,PBS_RGLTN24_ADJST_QTY,BNFT_AMT,PTNT_CNTRBTN_AMT,SRT_RPT_IND,RGLTN24_IND,DRG_TYP_CD,MJR_SPCLTY_GRP_CD,UNDR_CPRSCRPTN_TYP_CD,PRSCRPTN_CNT,PTNT_CTGRY_DRVD_CD,PTNT_STATE
PTNT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8114957183,06JUN2008,08681X,20,14.74,5.0,,N,GE,1.0,,1,C1,WA
8493121914,15NOV2008,02236Q,30,24.23,0.0,R,N,GE,1.0,,1,C0,VIC
5124189855,01SEP2008,02745L,2,45.29,5.0,,N,GE,2.0,,1,G1,VIC
8503205429,15JAN2008,01906H,30,16.57,5.0,R,N,GE,1.0,,1,C1,SA
1975405202,02NOV2008,08601Q,30,26.94,31.3,R,N,GE,1.0,,1,G2,VIC
7832415747,11OCT2008,09090K,1,1768.94,5.0,R,N,GE,2.0,,1,C1,QLD
3451126260,03DEC2008,08213G,30,42.27,0.0,R,N,GE,1.0,,1,C0,NSW
3675557224,20FEB2008,08333N,60,32.60,31.3,R,N,GE,1.0,,1,G2,NSW
5787464474,30JUN2008,09062Y,56,68.12,31.3,R,N,GE,1.0,,1,G2,VIC
9711598506,23NOV2008,08601Q,30,53.24,5.0,,N,GE,1.0,,1,C1,VIC


In [48]:
df

Unnamed: 0_level_0,SPPLY_DT,ITM_CD
PTNT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
9772479275,05MAY2008,01171P
8114957183,06JUN2008,08681X
8866313366,14DEC2008,01325R
8493121914,15NOV2008,02236Q
5124189855,01SEP2008,02745L
8503205429,15JAN2008,01906H
1975405202,02NOV2008,08601Q
7832415747,11OCT2008,09090K
6346262948,12SEP2008,03119E
3451126260,03DEC2008,08213G


In [17]:
import cPickle as pkl

with open('../tmp/df.pkl','rb') as f:
    dd = pkl.load(f)

In [18]:
dd

{'PBS_SAMPLE_10PCT_2008.csv': [7589068803,
  265814024,
  3273654288,
  376307733,
  6301941784,
  1914437660,
  5465047071,
  1105199138,
  2194866182,
  1837629495,
  9817686072,
  8901492793,
  2757361726,
  1616642115,
  8693743687,
  5181538377,
  4051391842,
  8793489486,
  3193176143,
  1449787474,
  4994236504,
  6497632347,
  1837498468,
  4913889387,
  9042002029,
  3486122104,
  1781137536,
  1217921156,
  8322678919,
  5219177836,
  830079134,
  7775191214,
  8702132403,
  1005453493,
  2517106871,
  169607357,
  4106879174,
  5006164172,
  4859756763,
  8078885085,
  2830500064,
  9914810595,
  9019064548,
  4669964526,
  2391015667,
  8566341877,
  6246498554,
  7790788862,
  4898029837,
  7919108375,
  8271036696,
  4717936921,
  3610902817,
  8488354097,
  3250716986,
  2963669314,
  7098990915,
  9867886929,
  4902879574,
  9947185495,
  4623040860,
  9708241246,
  2586575203,
  5583667556,
  5216583777,
  6230770031,
  7450788209,
  7443579250,
  4818862452,
  3122135