In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import pathlib as path
import matplotlib.pyplot as plt
import requests

import geopandas as gpd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import contextily as ctx

from src import parameters as params
from src import utils

## Read data

In [None]:
# Load the task dataset from the project's data directory.
dataset_path = params.DATA_DIR / 'task_dataset.csv'
df = pd.read_csv(dataset_path)

There is one record for each GP in England:

In [None]:
# True when no gp_code value is repeated, i.e. one row per GP code.
df['gp_code'].is_unique

## Derive performance metrics

### Efficiency

Here we calculate the number of appointments per GP.

In some cases the number of GPs is less than 1. Dividing by such a value makes the number of appointments per GP greater than the total number of appointments.
In these cases we simply take the total number of appointments (attended or unknown outcome) to be the number of appointments per GP.

In [None]:
# GP headcount with each of the two counts rounded up to a whole number
# before they are added together.
gp_headcount = np.ceil(df['qualified_gp']) + np.ceil(df['training_gp'])
# Appointments whose attendance outcome is "attended" or "unknown".
appts_attended_or_unknown = df['AttendanceOutcome_Attended'] + df['AttendanceOutcome_Unknown']

df['patients_per_gp'] = df['numberofpatients'] / gp_headcount
df['appts_per_gp'] = appts_attended_or_unknown / gp_headcount

params.column_display_names['patients_per_gp'] = 'Patients per GP'
params.column_display_names['appts_per_gp'] = 'Appointments per GP'

# NOTE(review): the following cell reassigns 'patients_per_gp' and
# 'appts_per_gp' using the un-rounded GP counts, so the values computed here
# are overwritten when the notebook is run top to bottom.
efficiency_metrics = ['appts_per_gp']

In [None]:
# Combined GP count (qualified + training), used as the denominator for both
# efficiency ratios below.
gp_count = df['qualified_gp'] + df['training_gp']
# Appointments whose attendance outcome is "attended" or "unknown".
appointments = df['AttendanceOutcome_Attended'] + df['AttendanceOutcome_Unknown']

df['patients_per_gp'] = df['numberofpatients'] / gp_count
params.column_display_names['patients_per_gp'] = 'Patients per GP'

# Appointments per GP, capped at the total number of appointments: when the GP
# count is below 1 the plain ratio would exceed the appointment total.
# np.minimum is the vectorised equivalent of the former row-wise
# min(ratio, total): NaN ratios stay NaN, and a zero GP count yields inf,
# which the cap turns into the appointment total instead of raising.
df['appts_per_gp'] = np.minimum(appointments / gp_count, appointments)
params.column_display_names['appts_per_gp'] = 'Appointments per GP'

efficiency_metrics = ['appts_per_gp']

In [None]:
# One boxplot per column on a 1x4 grid. Each pair is (column, panel index);
# the pairs are listed in the order the panels are drawn.
fig, ax = plt.subplots(1, 4)
panel_layout = [
    ('patients_per_gp', 3),
    ('appts_per_gp', 0),
    ('training_gp', 1),
    ('qualified_gp', 2),
]
for col_name, panel_index in panel_layout:
    # Rename the series so the panel is labelled with the display name.
    labelled_series = df[col_name].rename(params.column_display_names[col_name])
    sns.boxplot(labelled_series, ax=ax[panel_index])

plt.tight_layout()

### Waiting times (Same day appointments)

In [None]:
# Denominator: appointments whose attendance outcome is "attended" or "unknown".
attended_or_unknown = df['AttendanceOutcome_Attended'] + df['AttendanceOutcome_Unknown']

# Same-day bookings relative to that denominator. The value is stored as a
# plain ratio (no multiplication by 100) even though the display name reads "(%)".
df['same_day_appointment_percentage'] = df['BookingtoApptGap_SameDay'].div(attended_or_unknown)
params.column_display_names['same_day_appointment_percentage'] = 'Same-day Appointments (%)'

waiting_times_metrics = ['same_day_appointment_percentage']

### Digital Access

In [None]:
# Numerator: telephone plus video/online appointments.
remote_appointments = df['ApptModality_Telephone'] + df['ApptModality_VideoConferenceOnline']
# Denominator: appointments whose attendance outcome is "attended" or "unknown".
attended_or_unknown = df['AttendanceOutcome_Attended'] + df['AttendanceOutcome_Unknown']

# Stored as a plain ratio (no multiplication by 100) even though the display
# name reads "(%)".
df['digital_access_percentage'] = remote_appointments.div(attended_or_unknown)
params.column_display_names['digital_access_percentage'] = 'Digital Access (%)'

digital_access_metrics = ['digital_access_percentage']

### Attendance rate

In [None]:
# Denominator: all recorded attendance outcomes (attended, unknown, DNA),
# summed in the same order as before.
all_outcomes = (
    df['AttendanceOutcome_Attended']
    + df['AttendanceOutcome_Unknown']
    + df['AttendanceOutcome_DNA']
)

# Share of appointments recorded as attended; stored as a plain ratio even
# though the display name reads "(%)".
df['attendance_rate'] = df['AttendanceOutcome_Attended'].div(all_outcomes)
params.column_display_names['attendance_rate'] = 'Attendance Rate (%)'

attendance_metrics = ['attendance_rate']

### Quality and Outcomes Framework (QOF)

In [None]:
# Derived QoF column -> (source column, display name). Each source column is
# divided by 100 to produce the derived column.
qof_sources = {
    'qof_total': ('Total_QoF', 'QoF Total (%)'),
    'qof_hypertension': ('Hypertension', 'QoF Hypertension (%)'),
    'qof_child_vaccination': ('ChildVaccination', 'QoF Child Vaccination (%)'),
}
for derived_column, (source_column, display_name) in qof_sources.items():
    df[derived_column] = df[source_column] / 100
    params.column_display_names[derived_column] = display_name

# The last two metrics are used as they appear in the dataset (no rescaling here).
qof_metrics = [*qof_sources, 'EmergencyPresentationsCancer', 'AntibioticPrescribing']

### Patient experience / satisfaction (GP Survey)

In [None]:
# GP Survey columns used as patient-experience metrics.
gp_survey_metrics = [
    'overallexp',
    'lastgpapptneeds',
    'lastgpapptwait',
    'localgpservicesreception',
    'gpcontactoverall',
]

### CQC Ratings

In [None]:
# Ordinal encoding of the CQC rating labels: a higher number is a better rating.
cqc_rating_encoding = {'Outstanding': 4, 'Good': 3, 'Requires improvement': 2, 'Inadequate': 1}

cqc_rating_columns = ['responsive', 'overall', 'wellled', 'effective', 'caring', 'safe']

# Name of the numeric column derived from each rating column.
cqc_metrics = [f'{column}_coded' for column in cqc_rating_columns]

for column, coded_column in zip(cqc_rating_columns, cqc_metrics):
    # Series.map performs the dictionary lookup directly; any value that is not
    # a key of cqc_rating_encoding (including missing values) becomes NaN.
    df[coded_column] = df[column].map(cqc_rating_encoding)
    # The coded column reuses the display name of its source column.
    params.column_display_names[coded_column] = params.column_display_names[column]

In [None]:
# Full list of performance metrics, grouped by theme in the order in which
# the groups are defined above. The coded CQC ratings come last.
performance_metrics = [
    *efficiency_metrics,
    *waiting_times_metrics,
    *digital_access_metrics,
    *attendance_metrics,
    *qof_metrics,
    *gp_survey_metrics,
    *cqc_metrics,
]

## Filter to just North Central London (NCL) ICB

Filtering to just looking at GPs in NCL leaves us with 175 records, as specified in the Task Description.

In [None]:
# Keep only the rows whose ICB code matches the NCL code held in params.
in_ncl = df['icb_code'] == params.ncl_icb
df_ncl = df.loc[in_ncl]

# Check the row count against the 175 records quoted in the text above.
df_ncl.shape[0] == 175

## Multi-criteria Decision Analysis (MCDA)


We wish to derive a composite score for each GP in NCL based on the performance metrics listed above.

### Missing Data

First, we must deal with missing values for these performance metrics. We will impute the NCL median when a performance metric is unavailable.

In [None]:
# Number of missing values per performance metric within NCL.
metric_columns = list(params.performance_metrics.metric)
df_ncl[metric_columns].isna().sum()

In [None]:
# Per-metric medians computed over the NCL rows only.
imputed_columns = params.performance_metrics.metric
ncl_medians = df_ncl[imputed_columns].median()

# Work on a copy so df_ncl keeps its original (un-imputed) values.
df_ncl_median_imputed = df_ncl.copy()
df_ncl_median_imputed[imputed_columns] = df_ncl[imputed_columns].fillna(ncl_medians)

### Scaling / Normalisation of Metrics

We normalise the performance metrics to all have the same scale. This is so that when we aggregate them to obtain a composite score, they all carry the same weight.

Min-max normalisation is used.

We want a higher score to be a good thing, so some metrics will need 'inverting'.

In [None]:
# Build one '<metric>_norm' column per performance metric and record the new
# column names, in the same order as params.performance_metrics.
normalised_metrics = []
for metric, invert in zip(params.performance_metrics.metric, params.performance_metrics.invert):
    normalised_column = f'{metric}_norm'
    # utils.min_max_normalisation is defined in src/utils (not shown here);
    # presumably it rescales the series to the [0, 1] range -- TODO confirm.
    scaled = utils.min_max_normalisation(df_ncl_median_imputed[metric])
    # Metrics flagged 'invert' are flipped so that a higher value is always better.
    df_ncl_median_imputed[normalised_column] = 1 - scaled if invert else scaled
    normalised_metrics.append(normalised_column)

In [None]:
# Summary statistics of the normalised metric columns.
normalised_summary = df_ncl_median_imputed.loc[:, normalised_metrics].describe()
normalised_summary

### Assign weights to each performance metric

This is where I would seek stakeholder engagement to understand which performance measures we are most interested in. These would then be assigned more weight.

Please see the `parameters.py` file for the parameters used in this analysis.

### Obtain Composite Score

In [None]:
# Rescale the weights so they sum to 1. They are passed as a plain list, so
# they are matched to the normalised columns by position.
relative_weights = list(params.performance_metrics.weight / params.performance_metrics.weight.sum())

# Composite score: weighted sum of the normalised metrics for each GP.
weighted_metrics = df_ncl_median_imputed[normalised_metrics].mul(relative_weights, axis=1)
df_ncl_median_imputed['performance_score'] = weighted_metrics.sum(axis=1)

df_ncl_median_imputed['performance_score'].describe()

In [None]:
# Rank GPs from lowest to highest composite score (sort_values is ascending by default).
df_ncl_gps_ranked = df_ncl_median_imputed.sort_values('performance_score')

### Analysis of Composite Score

In [None]:
# Bar chart of the composite score for every GP, in ranked order.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(df_ncl_gps_ranked.gp_code, df_ncl_gps_ranked.performance_score)

# Title and axis labels so the figure can be read on its own.
ax.set_title('Composite Performance Score by GP')
ax.set_xlabel('GP code')
ax.set_ylabel('Performance score')
# One tick per GP code: rotate the labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)

Lowest performing GPs in NCL

In [None]:
# First seven rows of the ranked frame, restricted to the GP code, the
# composite score and the underlying performance metrics.
summary_columns = ['gp_code', 'performance_score', *performance_metrics]
df_ncl_gps_ranked.head(7)[summary_columns]

In [None]:
# Histogram of the composite score, drawn on the current axes with the
# x-axis fixed to the 0-1 range.
score_ax = plt.gca()
score_ax.hist(df_ncl_median_imputed.performance_score, bins=20)
score_ax.set_xlim(0, 1)

score_ax.figure.savefig(params.OUTPUTS_DIR / 'performance-score-distribution.png')

In [None]:
# Metrics to plot: the composite score followed by every performance metric
# except the coded CQC ratings. Excluding them by name (rather than slicing
# off a fixed number of trailing entries) keeps this correct if the number of
# CQC columns changes.
boxplot_metrics = ['performance_score'] + [m for m in performance_metrics if m not in cqc_metrics]

# Grid size is derived from the number of metrics: three panels per row.
n_cols = 3
n_rows = -(-len(boxplot_metrics) // n_cols)  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(10, 3 * n_rows), squeeze=False)

for i, metric in enumerate(boxplot_metrics):
    # The composite score has no entry in params.column_display_names.
    if metric == 'performance_score':
        panel_label = 'Composite Performance Score'
    else:
        panel_label = params.column_display_names[metric]
    sns.boxplot(
        df_ncl_median_imputed[metric].rename(panel_label),
        ax=ax[i // n_cols, i % n_cols],
        width=0.2,
    )

# Hide panels left empty when the metric count is not a multiple of n_cols.
for unused_ax in ax.flat[len(boxplot_metrics):]:
    unused_ax.set_visible(False)

plt.tight_layout()

plt.savefig(params.OUTPUTS_DIR / 'performance-boxplot.png')

#### Map Plot

In [None]:
### WARNING: This cell may take a long time to run (approx. 3 minutes)

# Geocode the postcodes. The rate limiter spaces calls at least one second apart.
geolocator = Nominatim(user_agent="gp_performance_mapper")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

df_ncl_median_imputed['location'] = df_ncl_median_imputed['postcode'].apply(geocode)
# 'point' is None for postcodes that could not be geocoded.
df_ncl_median_imputed['point'] = df_ncl_median_imputed['location'].apply(lambda loc: tuple(loc.point) if loc else None)

# Drop rows with missing geocoded data. The result goes into a separate frame
# so that df_ncl_median_imputed keeps every GP: overwriting it here would
# silently shrink the population seen by any cell re-run after this one.
df_ncl_geocoded = df_ncl_median_imputed.dropna(subset=['point'])

# Create a GeoDataFrame. Element [1] of each point is used as x and element
# [0] as y (plotted as longitude and latitude respectively in the map cell).
gdf = gpd.GeoDataFrame(
    df_ncl_geocoded,
    geometry=gpd.points_from_xy(
        df_ncl_geocoded['point'].apply(lambda pt: pt[1]),
        df_ncl_geocoded['point'].apply(lambda pt: pt[0]),
    ),
)
gdf.crs = "EPSG:4326"

In [None]:
# Plot each geocoded GP as a point coloured by its composite score.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
gdf.plot(column='performance_score', ax=ax, legend=False, cmap='coolwarm', markersize=50)

# Add basemap
ctx.add_basemap(ax, crs=gdf.crs, source=ctx.providers.CartoDB.Positron)

# Add colorbar with label. The colour scale spans the observed score range
# and uses the same colormap as the points.
score_norm = plt.Normalize(vmin=gdf['performance_score'].min(), vmax=gdf['performance_score'].max())
sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=score_norm)
# Public setter instead of assigning the private `_A` attribute.
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Performance Score')

# Add title and labels on the map axes explicitly, rather than relying on
# pyplot's notion of the current axes.
ax.set_title('GP Performance Map')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')

fig.savefig(params.OUTPUTS_DIR / 'performance-map.png')

# Show plot
plt.show()

#### Correlation Matrix

In [None]:
# Diverging colour palette for the heatmap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Pairwise correlations between IMD2019, patients per GP and the performance
# metrics, computed on the un-imputed NCL frame (df_ncl).
correlation_columns = ['IMD2019', 'patients_per_gp', *performance_metrics]
correlation_matrix = df_ncl[correlation_columns].corr()

# Figure size and layout are left at the matplotlib defaults.
sns.heatmap(correlation_matrix, cmap=cmap)

plt.savefig(params.OUTPUTS_DIR / 'correlation-heatmap.pdf')