In [1]:
import pandas as pd
import numpy as np
# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

## Set up data

In [2]:
# Column labels for the CSV export, in file order.
columns = [
    'Database Name',
    'Patient Number',
    'Client Number',
    'Consultation Number',
    'Item Name',
    'Units',
    'Item Label',
    'Clinic Code',
    'Consulting Vet',
    'Patient Desexed',
    'Deceased Date',
    'Visit Date',
    'Consultation Date',
    'Examination Text',
    'Patient Species',
    'Patient Breed',
    'Patient Colour',
    'Patient Sex',
    'Patient Date of Birth',
    'Insured',
    # The remaining columns are the vital-sign measurements.
    'weight',
    'temperature',
    'HeartRate',
    'BodyScore',
    'DentalGrade',
    'RespRate',
    'PainScore',
    'BP',
    'CRT',
    'MMColour',
]

In [3]:
data_dir = '../Data/vet_compass/'

# `names=columns` supplies the column labels for the file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning and keeps
# the values inside each object column consistently typed.
data = pd.read_csv(
    data_dir + 'STR024A03 20190719 sample x10,000.csv',
    names=columns,
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Database Name,Patient Number,Client Number,Consultation Number,Item Name,Units,Item Label,Clinic Code,Consulting Vet,Patient Desexed,Deceased Date,Visit Date,Consultation Date,Examination Text,Patient Species,Patient Breed,Patient Colour,Patient Sex,Patient Date of Birth,Insured,weight,temperature,HeartRate,BodyScore,DentalGrade,RespRate,PainScore,BP,CRT,MMColour
0,RxNSW,7138614,,13869415,,,,1,WA1,1,,2012-07-05 17:09:03.000,2012-07-05 17:09:03.000,-------------------- Fees Charged: -----------...,Cat,Persian,White,Female,2010-07-05 00:00:00.000,,,,,,,,,,,
1,RxNSW,7737653,,22925726,,,,1,ZOU,1,,2003-04-25 00:00:00.000,2003-04-25 00:00:00.000,,Cat,Tonkinese,Lilac,Male,2002-11-26 00:00:00.000,,,,,,,,,,,
2,RxNSW,7738120,,22934344,,,,1,ZOU,1,,2004-01-10 00:00:00.000,2004-01-10 00:00:00.000,,Cat,Domestic Medium Hair,Tortoiseshell,Female,2003-02-14 00:00:00.000,,,,,,,,,,,
3,RxNSW,7745004,,23043653,,,,1,ZOU,1,,2013-02-18 16:47:52.000,2013-02-18 16:47:52.000,,Cat,Siamese,Lilac Point,Male,2008-07-20 00:00:00.000,,,,,,,,,,,
4,RxNSW,7742894,,23043708,,,,1,ZOU,1,2013-11-05 10:00:34.000,2013-02-20 08:40:38.000,2013-02-20 08:40:38.000,Dental; Renal Dz,Cat,Domestic Long Hair,Ginger & White,Male,1994-03-16 00:00:00.000,,,,,,,,,,,


## Explore vital signs columns

In [5]:
# Names of the vital-sign columns explored in this section.
vitalsigns = [
    'weight', 'temperature', 'HeartRate', 'BodyScore', 'DentalGrade',
    'RespRate', 'PainScore', 'BP', 'CRT', 'MMColour',
]

# Print the dtype pandas assigned to each vital-sign column, one per line.
for column_dtype in data[vitalsigns].dtypes:
    print(column_dtype)

object
object
object
object
object
object
object
object
object
object


In [6]:
# Summary statistics (count / unique / top / freq) for the vital-sign columns.
data.loc[:, vitalsigns].describe()

Unnamed: 0,weight,temperature,HeartRate,BodyScore,DentalGrade,RespRate,PainScore,BP,CRT,MMColour
count,318629.0,112138.0,117336,99396,94292,70632,22699,3089,102755,112096
unique,1920.0,340.0,406,187,308,283,113,123,202,358
top,4.5,38.6,180,5/9,0,24,0,-,<2,pink
freq,5908.0,8547.0,20327,20524,27480,9030,13815,1566,31601,70763


In [7]:
# Non-null entries per vital sign, most populated first.
non_null_per_sign = data[vitalsigns].count()
data_count = (
    non_null_per_sign
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)

# Share of all rows in `data` that carry a value for each vital sign.
data_count['proportion'] = data_count['count'] / len(data)

data_count

Unnamed: 0,index,count,proportion
0,weight,318629,0.440918
2,HeartRate,117336,0.162369
1,temperature,112138,0.155176
9,MMColour,112096,0.155118
8,CRT,102755,0.142192
3,BodyScore,99396,0.137544
4,DentalGrade,94292,0.130481
5,RespRate,70632,0.09774
6,PainScore,22699,0.031411
7,BP,3089,0.004275


Weight is the vital sign with the largest number of non-NaN entries (44% of all rows), followed by heart rate (16%).

## Clean weight data

In [8]:
def clean_weights(df, grams_to_kg=20):
    """Parse the free-text ``weight`` column into a numeric ``weight_kg`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``weight`` column. It is not modified; the
        cleaning is done on a copy.
    grams_to_kg : float, default 20
        Threshold above which a parsed value is treated as grams and divided
        by 1000. NOTE: any value that genuinely exceeds this threshold in kg
        is rescaled as well.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``weight`` dropped and a float ``weight_kg``
        column appended. Entries that mention a cage or carrier, or from
        which no number can be extracted, are NaN.
    """
    # Work on a copy so the caller's frame does not gain a 'weight_kg' column.
    df = df.copy()

    # Cast to str first so the .str accessor also works on non-object columns.
    raw_weights = df['weight'].astype(str)

    # Weights recorded in a cage or carrier are discarded.
    in_container = raw_weights.str.contains('cage|carrier', na=False)

    # Extract the first number-like token.
    # patterns: digits.,digits OR .,digits
    number_text = raw_weights.str.extract(r'(\d+[\.,]*\d*|[\.,]\d+)', expand=False)

    # Clean mistaken decimal points ('..', ',.', '.,' or ',' -> '.').
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would silently skip this step.
    number_text = number_text.str.replace(r'\.\.|,\.|\.,|,', '.', regex=True)

    # Coerce to numeric; anything still unparseable becomes NaN.
    weight_kg = pd.to_numeric(number_text, errors="coerce")
    weight_kg = weight_kg.mask(in_container)

    # Values above the threshold are likely in grams rather than kg.
    likely_grams = weight_kg > grams_to_kg
    weight_kg = weight_kg.mask(likely_grams, weight_kg / 1000.0)

    df['weight_kg'] = weight_kg

    # Drop the original free-text weights column.
    return df.drop('weight', axis=1)

In [9]:
# Take an independent copy of the vital-sign columns, then clean the weights.
vitals = data.loc[:, vitalsigns].copy()
vitals_cleaned = clean_weights(df=vitals)

In [10]:
# Compare how many weight entries are non-null before and after cleaning.
original_count = vitals['weight'].notna().sum()
cleaned_count = vitals_cleaned['weight_kg'].notna().sum()

print(f"Original weight count = {original_count}")
print(f"Cleaned weight count = {cleaned_count}")
print(f"Difference = {original_count - cleaned_count}")

Original weight count = 318629
Cleaned weight count = 318541
Difference = 88
