In [2]:
# required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [3]:
# Read the raw case workbook into a DataFrame
df_metric_score_by_provider_monthly = pd.read_excel(
    "../data/Rula Insights Manager Case Data.xlsx"
)

### Clean 01. Clean provider_id

This is not required, but it makes it easier to order providers by their ID number.

In [4]:
# Clean Provider ID: strip the literal 'Provider' prefix, cast what remains to int,
# and rename the column to provider_id.
# The step is guarded on the raw column name so that re-running this cell is a no-op
# rather than a KeyError (after the first run the 'Provider' column no longer exists).
if 'Provider' in df_metric_score_by_provider_monthly.columns:
    df_metric_score_by_provider_monthly = (
        df_metric_score_by_provider_monthly
        .assign(
            Provider=lambda frame: (
                frame['Provider']
                .str.replace('Provider', '', regex=False)  # literal match, not a regex
                .astype(int)
            )
        )
        .rename(columns={'Provider': 'provider_id'})
    )

### Clean 02. Datetime Spine
Check #3 confirmed that there are gaps in the date range, which makes it difficult to assess underperformance. To fix this, we build a complete monthly spine for every provider and metric, running from each provider's earliest to latest observed month_year.

In [5]:
def expand_date_range_with_metric_source(df, metric_sources):
    """
    Build a complete provider x month x metric_source spine.

    For every row of `df`, generates every month-start date between that row's
    'min_month_year' and 'max_month_year' (inclusive), and emits one output row
    per generated date and per entry of `metric_sources`.

    The input frame is not modified.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'provider_id', 'min_month_year', and 'max_month_year'.
                       The two date columns may hold datetimes or anything pd.to_datetime can parse.
    metric_sources (iterable): Metric source values to repeat for every provider and month.

    Returns:
    pd.DataFrame: DataFrame with columns 'provider_id', 'month_year', and 'metric_source'.
                  Rows follow the input row order, then month, then the order of `metric_sources`.
                  The three columns are present even when no rows are generated.
    """
    output_columns = ['provider_id', 'month_year', 'metric_source']

    # Parse into local Series instead of assigning back, so the caller's frame is left untouched
    range_starts = pd.to_datetime(df['min_month_year'])
    range_ends = pd.to_datetime(df['max_month_year'])

    # One record per (provider, month start, metric source)
    expanded_rows = [
        {'provider_id': provider_id, 'month_year': month_start, 'metric_source': metric_source}
        for provider_id, range_start, range_end in zip(df['provider_id'], range_starts, range_ends)
        for month_start in pd.date_range(start=range_start, end=range_end, freq='MS')
        for metric_source in metric_sources
    ]

    # Passing columns explicitly keeps the schema stable when expanded_rows is empty
    return pd.DataFrame(expanded_rows, columns=output_columns)

In [6]:
# First and last observed month for each provider (input to the spine builder)
df_metric_score_date_range = (
    df_metric_score_by_provider_monthly
    .groupby('provider_id')
    .agg(
        min_month_year=('month_year', 'min'),
        max_month_year=('month_year', 'max'),
    )
    .reset_index()
)

# Every distinct metric source present in the data
metric_sources = df_metric_score_by_provider_monthly['metric_source'].unique()

# Expand to one row per provider, month and metric source
df_metric_score_date_range = expand_date_range_with_metric_source(
    df_metric_score_date_range,
    metric_sources,
)

# Left-join the observed scores onto the spine; months with no observation keep a missing metric_score
spine_join_keys = ['provider_id', 'month_year', 'metric_source']
df_metric_score_by_provider_monthly_missing_date_filled = (
    df_metric_score_date_range
    .merge(df_metric_score_by_provider_monthly, how='left', on=spine_join_keys)
    .sort_values(by=['provider_id', 'metric_source', 'month_year'])
    .reset_index(drop=True)
)

Re-checking the provider #3 example, where we observed missing months: the gaps now appear as rows with an empty metric_score.

In [8]:
# Show provider 3's chart-review rows after the date spine was applied
(
    df_metric_score_by_provider_monthly_missing_date_filled
    .query("(provider_id == 3) & (metric_source == 'chart_review_months')")
)

Unnamed: 0,provider_id,month_year,metric_source,metric_score
144,3,2023-09-01,chart_review_months,
145,3,2023-10-01,chart_review_months,
146,3,2023-11-01,chart_review_months,
147,3,2023-12-01,chart_review_months,1.0
148,3,2024-01-01,chart_review_months,
149,3,2024-02-01,chart_review_months,
150,3,2024-03-01,chart_review_months,1.0
151,3,2024-04-01,chart_review_months,
152,3,2024-05-01,chart_review_months,
153,3,2024-06-01,chart_review_months,


### Clean 03. Missing data imputation
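If a metric_score is missing, impute it. The function below supports zero, median (the default), or average imputation; the median and average are computed per provider and metric source.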

In [9]:
def impute_missing_values(df, method='median'):
    """
    Fill missing 'metric_score' values using the chosen strategy.

    The median and average are computed within each ('provider_id', 'metric_source')
    group; 'zero' fills with 0. The input frame is merged into a new frame, so the
    result is returned rather than written back to `df`.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'provider_id', 'metric_source', and 'metric_score'.
    method (str): Imputation strategy. One of 'zero', 'median', 'average'.

    Returns:
    pd.DataFrame: The input columns plus
                  'metric_score_imputed' - the original score, or the fill value where it was missing;
                  'is_imputed' - True where the score was missing and a fill value was available.

    Raises:
    ValueError: If `method` is not one of the supported options.
    """
    # Maps each strategy to the helper column that holds its fill values
    fill_column_by_method = {
        'zero': 'zero_metric_score',
        'median': 'median_metric_score',
        'average': 'mean_metric_score',
    }
    if method not in fill_column_by_method:
        raise ValueError("Method must be one of 'zero', 'median', or 'average'")

    group_keys = ['provider_id', 'metric_source']

    # Per-group median and mean of the observed scores
    group_stats = (
        df.groupby(group_keys)
        .agg(
            median_metric_score=('metric_score', 'median'),
            mean_metric_score=('metric_score', 'mean'),
        )
        .reset_index()
    )

    # Attach the group statistics (and a constant zero column) to every row
    filled = df.merge(group_stats, on=group_keys, how='left')
    filled['zero_metric_score'] = 0

    # Keep the observed score where present, otherwise take the strategy's fill value
    fill_values = filled[fill_column_by_method[method]]
    filled['metric_score_imputed'] = filled['metric_score'].combine_first(fill_values)

    # A row counts as imputed only if it was missing and actually received a value
    score_was_missing = filled['metric_score'].isna()
    filled['is_imputed'] = score_was_missing & filled['metric_score_imputed'].notna()

    # Remove the helper columns before returning
    return filled.drop(columns=list(fill_column_by_method.values()))

# Build one imputed dataset per strategy from the same date-filled frame
df_metric_score_by_provider_median_impute = impute_missing_values(
    df_metric_score_by_provider_monthly_missing_date_filled,
    method='median',
)
df_metric_score_by_provider_zero_impute = impute_missing_values(
    df_metric_score_by_provider_monthly_missing_date_filled,
    method='zero',
)
df_metric_score_by_provider_average_impute = impute_missing_values(
    df_metric_score_by_provider_monthly_missing_date_filled,
    method='average',
)

In [10]:
# Show provider 3's chart-review rows after zero imputation
(
    df_metric_score_by_provider_zero_impute
    .query("(provider_id == 3) & (metric_source == 'chart_review_months')")
)

Unnamed: 0,provider_id,month_year,metric_source,metric_score,metric_score_imputed,is_imputed
144,3,2023-09-01,chart_review_months,,0.0,True
145,3,2023-10-01,chart_review_months,,0.0,True
146,3,2023-11-01,chart_review_months,,0.0,True
147,3,2023-12-01,chart_review_months,1.0,1.0,False
148,3,2024-01-01,chart_review_months,,0.0,True
149,3,2024-02-01,chart_review_months,,0.0,True
150,3,2024-03-01,chart_review_months,1.0,1.0,False
151,3,2024-04-01,chart_review_months,,0.0,True
152,3,2024-05-01,chart_review_months,,0.0,True
153,3,2024-06-01,chart_review_months,,0.0,True


Let's save the zero-imputed dataset as a pickle file so that we can continue working with it in the next steps.

In [None]:
# Persist the zero-imputed dataset for the next steps
(
    df_metric_score_by_provider_zero_impute
    .to_pickle("../data/case_data_cleaned.pkl")
)