In [1]:
# Data is available here!
# https://opendata.nhsbsa.net/dataset/english-prescribing-data-epd/resource/352fad86-d416-4076-aa2c-1e73d42f51cf

In [2]:
# Definitions :)

# EPD -- English prescribing data
# BNF -- British National Formulary
# ADQ -- Average Daily Quantity (Seen in "ADQUSAGE")

# A formulary is a list of pharmaceutical drugs, often decided upon by a group of people, 
# for various reasons such as insurance coverage or use at a medical facility.

# BNF_CODE -- The unique code used to refer to a BNF Presentation. For example, 0501013B0AAABAB
# BNF_DESCRIPTION -- The name given to the specific type, strength, and formulation of a drug; or, the specific type of an appliance. For example, Amoxicillin 500mg capsules

# BNF_CHEMICAL_SUBSTANCE -- A unique code used to refer to a BNF Chemical Substance. For example, 0501013B0
# CHEMICAL_SUBSTANCE_BNF_DESCR -- The name of the main active ingredient in a drug or the type of an appliance. Determined by the British National Formulary (BNF) for drugs, or the NHS BSA for appliances. For example, Amoxicillin

In [3]:
# Some observations:

# 1. Counterintuitively, a chemical substance has multiple drugs associated with it, 
# but a drug only ever has a single chemical substance associated with it.
# This means we can calculate total cost just by summing over chemical substances->drugs->records

# 2. The problem sheet doesn't make it obvious that there should only be ONE formulary (and corresponding formulary object)
# in this exercise. I spent ages trying to figure out where a "formulary id" should come from, but I don't think such a thing exists!

In [4]:
import requests
import pandas as pd
import random
import locale

from dataclasses import dataclass, field
from typing import List

In [5]:
# Switch the process to the UK locale so that locale.currency() can render costs as £... later on.
# Not especially relevant to the exercise itself.
# NOTE: setlocale raises locale.Error if the 'en_GB' locale is not available on this machine.
# The trailing ';' stops the notebook from echoing the call's return value.
locale.setlocale(locale.LC_ALL, 'en_GB');

In [6]:
# Define some OOP classes

@dataclass
class PracticePrescribingRecord:
    """One row of the English Prescribing Data (EPD).

    (1) in the exercise: a practice prescribing record stores who prescribed
    (practice name and code), how much (total quantity) and what it cost
    (actual cost). Fields are declared in that order, so positional
    construction follows the same sequence.
    """

    practice_name: str      # taken from the PRACTICE_NAME column
    practice_code: str      # taken from the PRACTICE_CODE column
    total_quantity: float   # taken from the TOTAL_QUANTITY column
    actual_cost: float      # taken from the ACTUAL_COST column; this is what the cost totals add up
    
@dataclass
class Drug:
    """A single BNF presentation and the prescribing records that mention it.

    (2) in the exercise: a drug carries its BNF description and BNF code, plus
    the list of practice prescribing records for that drug.
    """

    drug_name: str   # BNF description, e.g. "Amoxicillin 500mg capsules"
    drug_code: str   # BNF code, e.g. "0501013B0AAABAB"

    # Defaults to a fresh empty list per instance, so a Drug can be created first
    # and have records appended as they are discovered.
    # repr=False keeps the record list out of the printed representation.
    practice_prescribing_records: List[PracticePrescribingRecord] = field(default_factory=list, repr=False)

    def total_cost(self) -> float:
        """Return the sum of ``actual_cost`` over this drug's prescribing records.

        Returns 0 when the drug has no records.
        """
        return sum(record.actual_cost for record in self.practice_prescribing_records)

@dataclass
class ChemicalSubstance:
    """A BNF chemical substance and the drugs that have it as their active ingredient.

    (3) in the exercise: a chemical substance carries its name
    (CHEMICAL_SUBSTANCE_BNF_DESCR) and code (BNF_CHEMICAL_SUBSTANCE), plus the
    list of drugs which have this substance as their active ingredient.
    """

    substance_name: str   # e.g. "Amoxicillin"
    substance_code: str   # e.g. "0501013B0"

    # Defaults to a fresh empty list per instance, so a substance can be created
    # before its drugs are known without needing a None placeholder (which would
    # make total_cost() fail).
    # repr=False keeps the drug list out of the printed representation.
    drugs: List[Drug] = field(default_factory=list, repr=False)

    def total_cost(self) -> float:
        """Return the sum of ``Drug.total_cost()`` over this substance's drugs.

        Returns 0 when the substance has no drugs.
        """
        return sum(drug.total_cost() for drug in self.drugs)
    
@dataclass
class Formulary:
    """The top-level model: a named formulary holding its chemical substances.

    (4) in the exercise: a formulary models the overall system. It has a name,
    a list of substances, and a method returning the overall actual cost of
    all practice prescribing records reachable through those substances.
    """

    formulary_name: str

    # Defaults to a fresh empty list per instance, so a formulary can be created
    # before its substances are known.
    # repr=False keeps the substance list out of the printed representation.
    chemical_substances: List[ChemicalSubstance] = field(default_factory=list, repr=False)

    def total_cost(self) -> float:
        """Return the sum of ``ChemicalSubstance.total_cost()`` over all substances.

        Returns 0 when the formulary has no substances.
        """
        return sum(substance.total_cost() for substance in self.chemical_substances)

In [7]:
# Fetch some data

# Which EPD table to read and how many rows to sample. Kept as named constants so
# the sample can be changed in one place (the name looks like YYYYMM, i.e.
# presumably January 2023 -- confirm against the dataset page).
RESOURCE_ID = "EPD_202301"
ROW_LIMIT = 20000

sql = f"SELECT * from `{RESOURCE_ID}` limit {ROW_LIMIT}"
url = "https://opendata.nhsbsa.net/api/3/action/datastore_search_sql"

# Let requests build and percent-encode the query string (the SQL contains spaces
# and backticks), and never wait forever on a stalled connection.
response = requests.get(
    url,
    params={"resource_id": RESOURCE_ID, "sql": sql},
    timeout=60,
)

# Stop here with a clear HTTP error instead of a confusing KeyError / JSON
# decoding error on the lines below.
response.raise_for_status()

# The row dicts sit under result -> result -> records in the JSON payload.
records = response.json()['result']['result']['records']
data = pd.DataFrame(records)

In [8]:
def make_objects_from_dataframe(df: pd.DataFrame):
    """Build the Formulary object graph from EPD rows in a single pass.

    Every row of ``df`` becomes one ``PracticePrescribingRecord``. Records are
    grouped under a ``Drug`` keyed by ``BNF_CODE``, and drugs are grouped under
    a ``ChemicalSubstance`` keyed by ``BNF_CHEMICAL_SUBSTANCE``. Substances,
    drugs, and the drugs within each substance are all kept in order of first
    appearance in ``df``.

    Parameters
    ----------
    df
        DataFrame whose rows provide the columns ``PRACTICE_NAME``,
        ``PRACTICE_CODE``, ``TOTAL_QUANTITY``, ``ACTUAL_COST``,
        ``BNF_CHEMICAL_SUBSTANCE``, ``CHEMICAL_SUBSTANCE_BNF_DESCR``,
        ``BNF_CODE`` and ``BNF_DESCRIPTION``. An empty frame yields an empty
        formulary.

    Returns
    -------
    tuple
        ``(formulary, chemical_substance_lookup, drug_lookup)``: the single
        ``Formulary`` named ``'BNF'``, a dict mapping substance code to
        ``ChemicalSubstance``, and a dict mapping drug code to ``Drug``.

    Raises
    ------
    KeyError
        If a row is missing one of the columns listed above.
    """
    drug_lookup = {}                  # drug code -> Drug instance
    chemical_substance_lookup = {}    # substance code -> ChemicalSubstance instance

    # (substance code, drug code) pairs already linked, so each drug is attached
    # to a substance exactly once no matter how many rows mention it.
    linked_pairs = set()

    for row in df.to_dict(orient="records"):

        practice_prescribing_record = PracticePrescribingRecord(
            practice_name=row['PRACTICE_NAME'],
            practice_code=row['PRACTICE_CODE'],
            total_quantity=row['TOTAL_QUANTITY'],
            actual_cost=row['ACTUAL_COST'])

        chemical_substance_code = row['BNF_CHEMICAL_SUBSTANCE']
        chemical_substance_description = row['CHEMICAL_SUBSTANCE_BNF_DESCR']

        # create an entry for the chemical substance if we haven't seen it before
        substance = chemical_substance_lookup.get(chemical_substance_code)
        if substance is None:
            substance = ChemicalSubstance(
                substance_name=chemical_substance_description,
                substance_code=chemical_substance_code,
                drugs=[])
            chemical_substance_lookup[chemical_substance_code] = substance

        drug_code = row['BNF_CODE']
        drug_name = row['BNF_DESCRIPTION']

        # likewise for the drug
        drug = drug_lookup.get(drug_code)
        if drug is None:
            drug = Drug(drug_code=drug_code, drug_name=drug_name, practice_prescribing_records=[])
            drug_lookup[drug_code] = drug

        # Link drug -> substance on first sight of the pair. Doing it here avoids
        # re-filtering the whole DataFrame once per substance afterwards.
        pair = (chemical_substance_code, drug_code)
        if pair not in linked_pairs:
            linked_pairs.add(pair)
            substance.drugs.append(drug)

        drug.practice_prescribing_records.append(practice_prescribing_record)

    formulary = Formulary(formulary_name='BNF', chemical_substances=list(chemical_substance_lookup.values()))

    return formulary, chemical_substance_lookup, drug_lookup

In [9]:
# Build the object graph once. The two lookup dicts are kept alongside the formulary
# so individual substances and drugs can be inspected by code in the cells below.
formulary, chemical_substance_lookup, drug_lookup = make_objects_from_dataframe(data)

In [10]:
# Peek at the first three (code, ChemicalSubstance) pairs; the repr omits each substance's drug list.
list(chemical_substance_lookup.items())[:3]

[('1304000V0',
  ChemicalSubstance(substance_name='Hydrocortisone', substance_code='1304000V0')),
 ('0304010E0',
  ChemicalSubstance(substance_name='Fexofenadine hydrochloride', substance_code='0304010E0')),
 ('2122',
  ChemicalSubstance(substance_name='Emollients', substance_code='2122'))]

In [11]:
# First three drugs recorded under substance code 0103050P0.
# NOTE: raises KeyError if that code is absent from the fetched sample (the query has no ORDER BY).
chemical_substance_lookup['0103050P0'].drugs[:3]

[Drug(drug_name='Omeprazole 20mg gastro-resistant capsules', drug_code='0103050P0AAAAAA'),
 Drug(drug_name='Omeprazole 20mg dispersible gastro-resistant tablets', drug_code='0103050P0AAANAN'),
 Drug(drug_name='Omeprazole 10mg gastro-resistant capsules', drug_code='0103050P0AAAFAF')]

In [12]:
# First three prescribing records for drug code 0103050P0BBAEAN.
# NOTE: raises KeyError if that code is absent from the fetched sample (the query has no ORDER BY).
drug_lookup['0103050P0BBAEAN'].practice_prescribing_records[:3]

[PracticePrescribingRecord(practice_name='CLARENCE MEDICAL CENTRE', practice_code='K81074', total_quantity=56.0, actual_cost=26.09602),
 PracticePrescribingRecord(practice_name='FARNHAM DENE MEDICAL PRACTICE', practice_code='H81615', total_quantity=168.0, actual_cost=78.27567),
 PracticePrescribingRecord(practice_name='SEVERNSIDE MEDICAL PRACTICE', practice_code='L84052', total_quantity=28.0, actual_cost=13.05421)]

In [13]:
# Headline summary: how many substances the formulary holds and what they cost in total.
substance_count = len(formulary.chemical_substances)
# locale.currency() formats according to the locale configured earlier in the notebook.
formatted_total = locale.currency(formulary.total_cost(), grouping=True)

print(f'The {formulary.formulary_name} formulary, '
      f'contains {substance_count} substances '
      f'and a total cost of {formatted_total}')

The BNF formulary, contains 921 substances and a total cost of £972,027.85
