In [1]:
%matplotlib inline
%load_ext autoreload

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append('../../mbspbs10pc')

from mbspbs10pc import utils

# Check MBS-PBS 10% data availability

In [3]:
# Locate the MBS/PBS extracts and the SAMPLE* lookup file inside the data folder.
ROOT = os.path.join('..','..','..','data')

# List the directory once and build real lists. Indexing the result of filter()
# with [0] only works on Python 2, where filter() returns a list.
data_files = os.listdir(ROOT)
mbs_files = [name for name in data_files if name.startswith('MBS')]
pbs_files = [name for name in data_files if name.startswith('PBS')]
sample_pin_candidates = [name for name in data_files if name.startswith('SAMPLE')]
if not sample_pin_candidates:
    # Fail with an explicit message instead of a bare IndexError.
    raise IOError('No SAMPLE* file found in {}'.format(ROOT))
sample_pin_lookout = sample_pin_candidates[0]

print('MBS files:')
for mbs in mbs_files:
    print(os.path.join(ROOT, mbs))

print('PBS files:')
for pbs in pbs_files:
    print(os.path.join(ROOT, pbs))

print('Sample PIN lookout: {}'.format(os.path.join(ROOT, sample_pin_lookout)))

MBS files:
../../../data/MBS_SAMPLE_10PCT_2010.csv
../../../data/MBS_SAMPLE_10PCT_2013.csv
../../../data/MBS_SAMPLE_10PCT_2008.csv
../../../data/MBS_SAMPLE_10PCT_2009.csv
../../../data/MBS_SAMPLE_10PCT_2011.csv
../../../data/MBS_SAMPLE_10PCT_2014.csv
../../../data/MBS_SAMPLE_10PCT_2012.csv
PBS files:
../../../data/PBS_SAMPLE_10PCT_2010.csv
../../../data/PBS_SAMPLE_10PCT_2011.csv
../../../data/PBS_SAMPLE_10PCT_2012.csv
../../../data/PBS_SAMPLE_10PCT_2009.csv
../../../data/PBS_SAMPLE_10PCT_2013.csv
../../../data/PBS_SAMPLE_10PCT_2008.csv
Sample PIN lookout: ../../../data/SAMPLE_PIN_LOOKUP.csv


# A first look at the PBS data

In [4]:
# Load the full 2010 PBS extract, using the first CSV column as the row index,
# and display its (rows, columns) shape.
filename = '../../../data/PBS_SAMPLE_10PCT_2010.csv'
print(filename)
df = pd.read_csv(filename, header=0, index_col=0)
df.shape

../../../data/PBS_SAMPLE_10PCT_2010.csv


  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


(18312638, 13)

In [5]:
df.head()

Unnamed: 0_level_0,SPPLY_DT,ITM_CD,PBS_RGLTN24_ADJST_QTY,BNFT_AMT,PTNT_CNTRBTN_AMT,SRT_RPT_IND,RGLTN24_IND,DRG_TYP_CD,MJR_SPCLTY_GRP_CD,UNDR_CPRSCRPTN_TYP_CD,PRSCRPTN_CNT,PTNT_CTGRY_DRVD_CD,PTNT_STATE
PTNT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3776951855,31JAN2010,08173E,30,11.26,33.3,R,N,GE,1.0,,1,G2,WA
4526557106,02DEC2010,08814X,192,16.64,0.0,R,N,GE,1.0,,1,C0,NSW
980508493,12MAY2010,03162K,25,2.27,5.4,,N,GE,1.0,,1,C1,VIC
9449546678,10SEP2010,01884E,20,8.55,0.0,,N,GE,1.0,,1,C1,VIC
8671809277,16JUN2010,08567X,1,27.49,0.0,,N,GE,1.0,,1,C0,NSW


In [6]:
# Replace missing UNDR_CPRSCRPTN_TYP_CD values with the code 'C'.
uct = df['UNDR_CPRSCRPTN_TYP_CD'].fillna('C')

In [8]:
# Distinct codes present after filling the missing values (np.unique flattens its input).
np.unique(uct.values)

array(['C'], dtype=object)

In [9]:
# Load the full 2013 PBS extract the same way.
# NOTE: this rebinds `filename` and `df`, so the 2010 frame is no longer available.
filename = '../../../data/PBS_SAMPLE_10PCT_2013.csv'
print(filename)
df = pd.read_csv(filename, header=0, index_col=0)
df.shape

../../../data/PBS_SAMPLE_10PCT_2013.csv


(25895471, 13)

In [10]:
# Replace missing UNDR_CPRSCRPTN_TYP_CD values with 'C', this time for the frame currently bound to `df`.
uct = df['UNDR_CPRSCRPTN_TYP_CD'].fillna('C')

In [12]:
# Patient category of the first 25 rows whose code is 'U',
# selected with a single .loc call instead of chained indexing.
df.loc[uct == 'U', 'PTNT_CTGRY_DRVD_CD'].head(25)

PTNT_ID
5847446252    G2
5675097669    G2
4609807887    G2
6659433488    G2
2338975510    G2
8651180199    G2
7020190531    G2
9860999313    G2
7250732616    G2
2638236226    G2
8401999670    G2
5723488179    G2
4514226803    G2
3464425673    G2
8250874993    G2
6922442252    G2
8374010284    G2
3795636265    G2
6865105313    G2
1442473148    G2
750268975     G2
5685881534    G2
4119398502    G2
664686344     G2
651791418     G2
Name: PTNT_CTGRY_DRVD_CD, dtype: object

# Drugs used in diabetes

In [None]:
# Build `dd`: a one-column table of item codes for drugs used in diabetes,
# with the codes listed in metformins_sulfonamides.csv removed.
_dd = pd.read_csv(os.path.join('..', 'data', 'drugs_used_in_diabetes.csv'), header=0)
ms = pd.read_csv(os.path.join('..', 'data', 'metformins_sulfonamides.csv'), header=0)


def _six_char_codes(frame):
    """Return the set of codes found in `frame`, left-padded with zeros to 6 characters.

    Missing cells are ignored. zfill pads codes of any length below 6, whereas
    prepending a single '0' only repairs codes that are exactly 5 characters long.
    """
    codes = pd.Series(frame.values.ravel()).dropna().astype(str)
    return set(codes.str.zfill(6))


# Fix 6-digit notation
dd_codes = _six_char_codes(_dd)
# Normalise the exclusion list the same way; otherwise a code stored without its
# leading zero there would never match its padded counterpart and would not be
# excluded.
ms_codes = _six_char_codes(ms)

# NOTE(review): the original cell carried a FIXME on this exclusion step --
# confirm with the author that excluding these codes is the intended behaviour.
# The codes are sorted so the row order of `dd` is reproducible between runs.
dd = pd.DataFrame(data=sorted(dd_codes - ms_codes), columns=_dd.columns)
print(dd.shape)

In [None]:
dd.head()

In [None]:
# Flatten the table into a set of item codes for fast membership tests.
dd_set = set(dd.values.ravel().tolist())

# Filter diabetes

In [None]:
# Read only the cost, item-code and supply-date columns of the 2011 PBS extract.
filename = 'PBS_SAMPLE_10PCT_2011.csv'
cost_columns = ['BNFT_AMT', 'PTNT_CNTRBTN_AMT', 'ITM_CD', 'SPPLY_DT']
costs = pd.read_csv(os.path.join(ROOT, filename), header=0, usecols=cost_columns)
costs.head()

In [None]:
# Keep only the rows whose item code is in the diabetes-drug set. Take an explicit
# copy so that adding columns to dd_costs afterwards writes to an independent
# frame instead of a slice of `costs` (which raises SettingWithCopyWarning).
dd_costs = costs.loc[costs['ITM_CD'].isin(dd_set)].copy()
print(dd_costs.shape)

In [None]:
# Total cost per record = benefit amount + patient contribution.
# assign() returns a new frame, so this never writes into a slice of `costs`
# and re-running the cell gives the same result.
dd_costs = dd_costs.assign(TOT=dd_costs['BNFT_AMT'] + dd_costs['PTNT_CNTRBTN_AMT'])

In [None]:
# Show the five records with the smallest total cost.
dd_costs.sort_values(by='TOT').head()

In [None]:
# Histogram of the total cost per record (200 bins), x-axis clipped to [0, 200].
# The title is hard-coded; it is only correct while dd_costs holds the 2011 data.
plt.hist(dd_costs['TOT'], bins=200)
plt.xlim([0,200])
plt.title(2011)

In [None]:
# Load the 2012 extract here so the figure matches its title after
# Restart & Run All. The original cell re-plotted whatever dd_costs held
# (the 2011 data on a fresh run) under a 2012 title.
costs_2012 = pd.read_csv(os.path.join(ROOT, 'PBS_SAMPLE_10PCT_2012.csv'), header=0,
                         usecols=['BNFT_AMT', 'PTNT_CNTRBTN_AMT', 'ITM_CD'])
dd_costs_2012 = costs_2012.loc[costs_2012['ITM_CD'].isin(dd_set)]  # filter for diabetes

# Total cost per record = benefit amount + patient contribution.
plt.hist(dd_costs_2012['BNFT_AMT'] + dd_costs_2012['PTNT_CNTRBTN_AMT'], bins=200)
plt.xlim([0,200])
plt.title(2012)

In [None]:
#costs[np.logical_and(costs['PTNT_CNTRBTN_AMT'] > 35.4, costs['ITM_CD'].isin(dd_set))]

In [None]:
# For every year, count the diabetes-drug records whose total cost is above the
# threshold listed for that year, and the remaining records.
# The original loop stopped at a leftover `break` after the first year, so both
# accumulators stayed empty.
over_copayment = []
below_copayment = []
thresholds = [31.3, 32.9, 33.3, 34.2, 35.4, 36.1]  # paired in order with 2008..2013

for year, thresh in zip(np.arange(2008,2014), thresholds):
    print('Reading {}'.format(year))
    filename = 'PBS_SAMPLE_10PCT_'+str(year)+'.csv'
    costs = pd.read_csv(os.path.join(ROOT, '{}'.format(filename)), header=0,
                        usecols=['BNFT_AMT', 'PTNT_CNTRBTN_AMT', 'ITM_CD', 'SPPLY_DT'])
    dd_costs = costs.loc[costs['ITM_CD'].isin(dd_set)] # filter for diabetes

    # Total cost per record = benefit amount + patient contribution.
    c = dd_costs['BNFT_AMT'] + dd_costs['PTNT_CNTRBTN_AMT']

    n_over = int((c > thresh).sum())
    over_copayment.append(n_over)
    # NOTE(review): "below" is taken as every record not strictly above the
    # threshold (ties and missing totals included) -- confirm the intended split.
    below_copayment.append(len(c) - n_over)

# After the loop `costs`, `dd_costs`, `c` and `thresh` refer to the last year
# processed; the cells below that read them therefore report on that year.

In [None]:
# Number of records whose total cost exceeds the current threshold.
int((c > thresh).sum())

In [None]:
len(c)

# Costs plot

In [None]:
#idx = np.where(np.logical_and(costs['PTNT_CNTRBTN_AMT'].values > 0, costs['PTNT_CNTRBTN_AMT'].values < 50))

In [None]:
# Benefit amounts of the diabetes-drug records currently held in dd_costs;
# print the smallest one.
xx = dd_costs['BNFT_AMT']
print(xx.min())

In [None]:
# Histogram of the benefit amounts (25 bins); the trailing ';' hides the repr of plt.hist's return value.
plt.figure(dpi=100)
plt.hist(xx, bins=25);

# Summary plots

In [None]:
# cPickle exists only on Python 2; fall back to the standard pickle module elsewhere.
try:
    import cPickle as pkl
except ImportError:
    import pickle as pkl

# Load the yearly table; the cells below index it by CSV file name.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# produced by this project.
# NOTE(review): a pickle written by Python 2 may need pkl.load(f, encoding='latin1')
# when read from Python 3 -- confirm how the file was produced.
with open('../tmp/3_df_yearly_no_copayment.pkl','rb') as f:
    dfy = pkl.load(f)

In [None]:
# One point per year: the number of entries stored in dfy for that year's PBS file.
years = np.arange(2008, 2014)
lengths = [len(dfy['PBS_SAMPLE_10PCT_{}.csv'.format(year)]) for year in years]

plt.figure(dpi=100)
plt.plot(years, lengths, '-o')
plt.ylabel('#people using drugs for diabetes')
plt.ylim([0, 50000])
plt.savefig('../tmp/people_using_dd.png');

In [None]:
# For each target year, count the subjects present in that year but absent from
# every earlier year, i.e. subjects seen with diabetes control drugs for the
# first time.
starting = []
for target_year in np.arange(2009, 2014):
    positive_subjects = set(dfy['PBS_SAMPLE_10PCT_'+str(target_year)+'.csv'])

    # Walk backwards through the preceding years, discarding anyone already seen.
    for year in range(target_year - 1, 2007, -1):
        curr = set(dfy['PBS_SAMPLE_10PCT_'+str(year)+'.csv'])
        positive_subjects = positive_subjects - curr
    starting.append(len(positive_subjects))

In [None]:
# Bar chart of first-time users per year; the title reports the peak year.
start_years = np.arange(2009, 2014)
positions = np.arange(len(starting))
peak = np.argmax(starting)

plt.figure(dpi=100)
plt.bar(positions, starting)
plt.xticks(positions, start_years)
plt.ylabel('#people started using drugs for diabetes')
plt.title('Max value is: {} achieved in {}'.format(starting[peak], start_years[peak]))
plt.savefig('../tmp/people_started_using_dd.png');

# Monthly summary plots

In [None]:
# cPickle exists only on Python 2; fall back to the standard pickle module elsewhere.
try:
    import cPickle as pkl
except ImportError:
    import pickle as pkl

# NOTE: this rebinds `dfy` to a different file; the cells below index each
# yearly entry by month number (1..12).
# NOTE(security): unpickling can execute arbitrary code -- only load files
# produced by this project.
with open('../tmp/df3.pkl','rb') as f:
    dfy = pkl.load(f)

In [None]:
# Enumerate every (month, year) pair from 1-2008 to 12-2013 in chronological order.
month_year_pairs = [(month, year) for year in range(2008, 2014) for month in range(1, 13)]

# Tick labels of the form 'month-year'.
xaxis = ['{}-{}'.format(month, year) for month, year in month_year_pairs]

# Number of entries stored for each month.
lengths = np.array([len(dfy['PBS_SAMPLE_10PCT_{}.csv'.format(year)][month])
                    for month, year in month_year_pairs])

In [None]:
# Monthly time series of the per-month counts, one tick per month.
positions = np.arange(len(lengths))

plt.figure(figsize=(20, 4))
plt.plot(positions, lengths, '-o')
plt.ylabel('#people using drugs for diabetes', fontsize=15)
plt.yticks(fontsize=15)
plt.xticks(positions, xaxis, rotation='vertical', fontsize=15)
plt.title('Not year consistent (co-payment included)', fontsize=15)
plt.savefig('../tmp/people_using_dd_monthly_breakdown_yes_copay.png');

In [None]:
#plt.figure(figsize=(20, 4))
#plt.plot(np.arange(len(lengths)), lengths, '-o')
#plt.ylabel('#people using drugs for diabetes', fontsize=15)
#plt.yticks(fontsize=15)
#plt.xticks(np.arange(len(lengths)), xaxis, rotation='vertical', fontsize=15)
#plt.title('Not year consistent', fontsize=15)
#plt.savefig('../tmp/people_using_dd_monthly_breakdown.png');

In [None]:
#plt.figure(figsize=(20, 4))
#plt.plot(np.arange(len(lengths)), lengths, '-o')
#plt.ylabel('#people using drugs for diabetes', fontsize=15)
#plt.yticks(fontsize=15)
#plt.xticks(np.arange(len(lengths)), xaxis, rotation='vertical', fontsize=15)
#plt.title('Year consistency', fontsize=15)
#plt.savefig('../tmp/people_using_dd_year_consistent_monthly_breakdown.png');

In [None]:
# For each month (in chronological order), count the subjects that appear in that
# month but in none of the earlier months.
#
# A running set of everyone seen so far gives the same counts as re-filtering
# against every earlier month, without the quadratic number of passes and
# without the per-month debug output.
month_year = [x.split('-') for x in xaxis]
starting = []
seen_before = set()
for target_month, target_year in month_year:
    current = set(dfy['PBS_SAMPLE_10PCT_'+str(target_year)+'.csv'][int(target_month)])
    starting.append(len(current - seen_before))
    seen_before |= current

In [None]:
# Drop the first 12 entries (the 2008 months, since xaxis starts at 1-2008).
# NOTE(review): presumably because the first year has no history to compare against -- confirm.
_starting = starting[12:]
tick_labels = xaxis[12:]
peak = np.argmax(_starting)

plt.figure(figsize=(20, 4))
plt.bar(np.arange(len(_starting)), _starting)
plt.xticks(np.arange(len(tick_labels)), tick_labels, rotation='vertical', fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel('#people started using drugs for diabetes', fontsize=15)
plt.title('Max value is: {} achieved in {} (co-payment included)'.format(_starting[peak],
                                                                         tick_labels[peak]), fontsize=15);
plt.savefig('../tmp/people_started_using_dd_monthly_breakdown_copayment.png');

In [None]:
# NOTE: rebinds `dd` (previously the diabetes item-code table) to the object loaded from the pickle.
dd = dfy

In [None]:
# Distinct patient ids appearing in the 2008 PBS extract.
ptnt_ids_2008 = pd.read_csv('../../../data/PBS_SAMPLE_10PCT_2008.csv', header=0, usecols=['PTNT_ID'])
curr = set(ptnt_ids_2008.values.ravel())

In [None]:
len(curr)

In [None]:
# Elements stored under the 2008 file name.
# NOTE(review): presumably patient ids, which holds only if `dd` is the yearly
# table; the monthly table loaded from df3.pkl is indexed by month under this
# key -- confirm which pickle is expected here.
diabete = set(dd['PBS_SAMPLE_10PCT_2008.csv'])

In [None]:
len(diabete)

In [None]:
# Everyone in `curr` who is not in `diabete`.
out = curr.difference(diabete)

In [None]:
len(out)

In [None]:
len(diabete) + len(curr)

In [None]:
# Sanity check: `out` was built by removing `diabete`, so the overlap should be 0.
len(out.intersection(diabete))

In [None]:
# Scratch check: the union of two equal sets is that same set.
a = {1, 2, 3}
b = {1, 2, 3}

print(a | b)

In [None]:
# Load the two class files (class 1 -> pos, class 0 -> neg) written to ../tmp.
pos = pd.read_csv('../tmp/3_df_yearly_no_copayment_class_1.csv', header=0)
neg = pd.read_csv('../tmp/3_df_yearly_no_copayment_class_0.csv', header=0)

In [None]:
# Values that occur in both class files (sorted, unique).
ii = np.intersect1d(pos.values.ravel(), neg.values.ravel())


In [None]:
# Toy example: turn an {id: [sequence]} mapping into a one-column frame indexed by id.
dd = {123123: ['gp-5-4-gp-f-4'], 4532432: ['gp-4-3-pg-7-s-3']}
df = pd.DataFrame.from_dict(dd, orient='index')
df = df.rename(columns={0: 'Sequence'})
df