In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import pathlib as path
import matplotlib.pyplot as plt
import requests

from src import parameters as params

## Read data

In [None]:
# Load the task dataset from the project's configured data directory.
df = pd.read_csv(filepath_or_buffer=params.DATA_DIR / 'task_dataset.csv')

There should be exactly one record per GP in England; the check below confirms that no `gp_code` appears more than once:

In [None]:
# True when no gp_code value is repeated, i.e. one row per GP code.
df['gp_code'].is_unique

## Derive performance metrics

### Workforce

In [None]:
# Registered patients divided by the combined count of qualified and training GPs.
gp_headcount = df['qualified_gp'] + df['training_gp']

# A zero headcount would make the division return +inf, which is not treated as
# missing by isna()/fillna() and would distort any later min/max based scaling.
# Masking zeros to NaN marks the metric as unavailable for that record instead.
df['patients_per_gp'] = df['numberofpatients'] / gp_headcount.where(gp_headcount != 0)

workforce_metrics = ['patients_per_gp']

### Waiting times (Same day appointments)

In [None]:
# Same-day bookings as a fraction of appointments whose outcome is
# "Attended" or "Unknown". Despite the name, no factor of 100 is applied.
df['same_day_appointment_percentage'] = df['BookingtoApptGap_SameDay'].div(
    df['AttendanceOutcome_Attended'].add(df['AttendanceOutcome_Unknown'])
)

waiting_times_metrics = ['same_day_appointment_percentage']

### Digital Access

In [None]:
# Telephone plus video/online appointments as a fraction of appointments whose
# outcome is "Attended" or "Unknown". Despite the name, no factor of 100 is applied.
df['digital_access_percentage'] = (
    df['ApptModality_Telephone']
    .add(df['ApptModality_VideoConferenceOnline'])
    .div(df['AttendanceOutcome_Attended'].add(df['AttendanceOutcome_Unknown']))
)

digital_access_metrics = ['digital_access_percentage']

### Attendance rate

In [None]:
# Attended appointments as a fraction of all recorded outcomes
# (Attended + Unknown + DNA).
df['attendance_rate'] = df['AttendanceOutcome_Attended'].div(
    df['AttendanceOutcome_Attended']
    .add(df['AttendanceOutcome_Unknown'])
    .add(df['AttendanceOutcome_DNA'])
)

attendance_metrics = ['attendance_rate']

### Quality and Outcomes Framework (QOF)

In [None]:
# Divide the two QOF columns by 100 (presumably converting percentages to
# fractions - confirm against the data dictionary).
df['qof_total'] = df['Total_QoF'].div(100)
df['qof_hypertension'] = df['Hypertension'].div(100)

# EmergencyPresentationsCancer is included as-is, without rescaling.
qof_metrics = [
    'qof_total',
    'qof_hypertension',
    'EmergencyPresentationsCancer',
]

### Patient experience / satisfaction (GP Survey)

In [None]:
# GP Survey columns used directly as metrics (no derived columns are created here).
gp_survey_metrics = [
    'overallexp',
    'lastgpapptneeds',
    'lastgpapptwait',
    'localgpservicesreception',
    'gpcontactoverall',
]

### CQC Ratings

In [None]:
# Ordinal encoding of the CQC rating labels: a larger number is a better rating.
cqc_rating_encoding = {'Outstanding': 4, 'Good': 3, 'Requires improvement': 2, 'Inadequate': 1}

cqc_rating_columns = ['responsive', 'overall', 'wellled', 'effective', 'caring', 'safe']

# Each rating column gets a numeric companion column named '<column>_coded'.
cqc_metrics = [f'{column}_coded' for column in cqc_rating_columns]

for column in cqc_rating_columns:
    # Series.map with a dict is the vectorised form of a per-element lookup:
    # any value that is not a key of the dict (including NaN) becomes NaN.
    df[f'{column}_coded'] = df[column].map(cqc_rating_encoding)

In [None]:
# Full list of metric columns, in section order; the CQC codes come last.
performance_metrics = [
    *workforce_metrics,
    *waiting_times_metrics,
    *digital_access_metrics,
    *attendance_metrics,
    *qof_metrics,
    *gp_survey_metrics,
    *cqc_metrics,
]
# Display everything except the final six entries (the six CQC '_coded' columns).
performance_metrics[:-6]

## Filter to just North Central London (NCL) ICB

Restricting the data to GPs in NCL leaves 175 records.

In [None]:
# Keep only the rows whose ICB code matches the configured NCL ICB code.
df_ncl = df[df['icb_code'].eq(params.ncl_icb)]
# (rows, columns) of the filtered frame.
df_ncl.shape

In [None]:
# Summary statistics for every performance metric within NCL.
df_ncl.loc[:, performance_metrics].describe()

## Multi-criteria Decision Analysis (MCDA)


We wish to derive a composite score for each GP in NCL based on the performance metrics listed above.

### Missing Data

First, we must deal with missing values for these performance metrics. We will impute the NCL median when a performance metric is unavailable.

In [None]:
# Number of missing values per performance metric within NCL.
df_ncl.loc[:, performance_metrics].isna().sum()

In [None]:
# Preview rows that are missing at least one performance metric.
# The mask is built from the metric columns only, so rows whose only gaps are
# in unrelated columns are not shown.
df_ncl.loc[df_ncl[performance_metrics].isna().any(axis=1), performance_metrics].head()

In [None]:
# Replace each missing metric value with that metric's median across NCL.
df_ncl_median_imputed = df_ncl[performance_metrics].pipe(
    lambda metrics: metrics.fillna(metrics.median())
)

In [None]:
# Preview the imputed values for rows that had at least one missing performance
# metric before imputation. The mask is built from the metric columns only, so
# rows whose only gaps were in unrelated columns are not shown.
# df_ncl_median_imputed already contains just the metric columns.
df_ncl_median_imputed.loc[df_ncl[performance_metrics].isna().any(axis=1)].head()

### Scaling / Normalisation of Metrics

Min-max normalisation rescales each metric to the range $[0, 1]$: $x' = \dfrac{x - \min(x)}{\max(x) - \min(x)}$

In [None]:
def min_max_normalisation(series):
    """Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Parameters
    ----------
    series : pandas.Series or pandas.DataFrame
        Numeric values to rescale. A DataFrame is rescaled column by column.

    Returns
    -------
    Same type as the input, holding ``(x - min) / (max - min)``. Missing values
    stay missing. When all values are equal (``max == min``) the result is 0
    for every value rather than NaN from a 0/0 division.
    """
    lowest = series.min()
    value_range = series.max() - lowest

    # A zero range means every value equals the minimum. Dividing by 1 instead
    # of 0 yields 0 for those values and avoids NaN.
    if np.ndim(value_range) == 0:
        if value_range == 0:
            value_range = 1
    else:
        value_range = value_range.where(value_range != 0, 1)

    return (series - lowest) / value_range