In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Read in the biochem, haem and urine data and do some initial processing

In [2]:
data_dir = '../Data/big_cats/Access_DB_table_exports/'
biochem = pd.read_excel(data_dir + 'Biochemistry ReportsTZ_ZV_WA_SA_MG_DM.xlsx')
biochem['Date'] = pd.to_datetime(biochem['Date'])
biochem['ARKS No'] = biochem['ARKS Number']
biochem.drop('ARKS Number', axis = 1, inplace=True)
biochem['biochem id'] = biochem.index # add an id column

haem = pd.read_excel(data_dir + 'Haematology ReportsTZ_ZV_MG.xlsx')
haem['Date'] = pd.to_datetime(haem['Date'])
haem['haem id'] = haem.index  # add an id column

urine = pd.read_excel(data_dir + 'UrinalysisTZ_ZV_WA_SA_MG_DM.xlsx')
urine['Date of Sample Analysis'] = pd.to_datetime(urine['Date of Sample Analysis'])
urine['Date'] = urine['Date of Sample Analysis']
urine.drop('Date of Sample Analysis', axis = 1, inplace=True)
urine['urine id'] = urine.index  # add an id column

In [3]:
print('Biochemistry data has ' + str(len(biochem)) + ' records for ' + str(len(biochem['ARKS No'].unique())) + ' cats')
print('Haematology data has ' + str(len(haem)) + ' records for ' + str(len(haem['ARKS No'].unique())) + ' cats')
print('Urine data has ' + str(len(urine)) + ' records for ' + str(len(urine['ARKS No'].unique())) + ' cats')

Biochemistry data has 1658 records for 330 cats
Haematology data has 727 records for 151 cats
Urine data has 1240 records for 241 cats


In [4]:
# Drop records where ARKS Number or Date is nan
biochem.dropna(subset=['ARKS No', 'Date'], inplace = True)
haem.dropna(subset=['ARKS No', 'Date'], inplace = True)
urine.dropna(subset=['ARKS No', 'Date'], inplace = True)

print('After removing rows records where ARKS Number or Date were nan:')
print('Biochemistry data has ' + str(len(biochem)) + ' records for ' + str(len(biochem['ARKS No'].unique())) + ' cats')
print('Haematology data has ' + str(len(haem)) + ' records for ' + str(len(haem['ARKS No'].unique())) + ' cats')
print('Urine data has ' + str(len(urine)) + ' records for ' + str(len(urine['ARKS No'].unique())) + ' cats')

After removing rows records where ARKS Number or Date were nan:
Biochemistry data has 1658 records for 330 cats
Haematology data has 724 records for 150 cats
Urine data has 1238 records for 240 cats


## add a prefix to each column name to identify which sheet it belongs to
## BUT NOT Date , ARKS No or id bc this is used for the merge

In [5]:
biochem_cols = list(biochem.columns)
biochem_cols.remove('Date')
biochem_cols.remove('ARKS No')
biochem_cols.remove('biochem id')
biochem_dict= {}
for col in biochem_cols:
    biochem_dict[col] = 'biochem: ' + col
# use dictionary to rename columns
biochem.rename(columns = biochem_dict, inplace=True)


haem_cols = list(haem.columns)
haem_cols.remove('Date')
haem_cols.remove('ARKS No')
haem_cols.remove('haem id')
haem_dict= {}
for col in haem_cols:
    haem_dict[col] = 'haem: ' + col
# use dictionary to rename columns
haem.rename(columns = haem_dict, inplace=True)

urine_cols = list(urine.columns)
urine_cols.remove('Date')
urine_cols.remove('ARKS No')
urine_cols.remove('urine id')
urine_dict= {}
for col in urine_cols:
    urine_dict[col] = 'urine: ' + col
# use dictionary to rename columns
urine.rename(columns = urine_dict, inplace=True)

### order is important for the merging

In [6]:
# do some important stuff
haem = haem.sort_values(by=['Date'])
biochem = biochem.sort_values(by=['Date'])
urine = urine.sort_values(by=['Date'])
# add additional cols to keep track of dates from each table
biochem['biochem date'] = biochem['Date']
haem['haem date'] = haem['Date']
urine['urine date'] = urine['Date']

# throw away duplicate dates - keep only the last rrecord
haem.drop_duplicates(subset=['ARKS No', 'Date'], keep='last', inplace=True)
biochem.drop_duplicates(subset=['ARKS No', 'Date'], keep='last', inplace=True)
urine.drop_duplicates(subset=['ARKS No', 'Date'], keep='last', inplace=True)

# Begin merge

## First find the triples

In [7]:
tol = pd.Timedelta('14 day')

# first join haem and biochem
df1 = pd.merge_asof(by=['ARKS No'], left=haem,right=biochem, left_on='Date', right_on='Date', direction='nearest',tolerance=tol)
# then join this to urine
df2 = pd.merge_asof(by=['ARKS No'], left=df1,right=urine, left_on='Date', right_on='Date', direction='nearest',tolerance=tol)

# take only triple matches
triples = df2.dropna(subset=['haem date', 'biochem date', 'urine date'], how='any')
print('there are ' +str(len(triples)) + ' triple matches' )
triples

there are 244 triple matches


Unnamed: 0,haem: ID,ARKS No,haem: Animal Name,Date,haem: Laboratory Report No,haem: Laboratory,haem: Laboratory text only,haem: RBC (x 10^12/L),haem: haemoglobin (g/L),haem: HCT (%),haem: Total Protein (refractometer),haem: leukocyte count (x10^9/L),haem: neutrophils (%),haem: neutrophils (band),haem: neutrophils(seg) (x10^9/L),haem: lymphocytes (x10^9/L),haem: lymphocytes (%),haem: monocytes (x10^9/L),haem: monocytes (%),haem: eosinophils (x10^9/L),haem: eosinophils (%),haem: estimated platelets x 10^9/L (Hi),haem: estimated platelets /HPF (Low),haem: plasma sample appearance,haem: MCV (fL),haem: MCHC (g/L),haem: MCH (pg),haem: RBC morphology,haem: notes:,haem: validated,haem id,haem date,biochem: ID,biochem: Biochem Report No,biochem: Animal Name,biochem: Low USG?,biochem: BUN (mmol/L),biochem: Hypercreatininaemia?,biochem: Creatinine (umol/L),biochem: Total Plasma Protein (g/L),biochem: Serum Albumin (g/L),biochem: Globulins (g/L),biochem: Anaemic? <25,biochem: Haematocrit (%),biochem: Potassium (mmol/L),biochem: Calcium (mmol/L),biochem: Hyperphosphataemia? >2_9 to 3,biochem: Phosphate (mmol/L),biochem: Sodium (mmol/L),biochem: Chloride (mmol/L),biochem: Bicarbonate,biochem: Glucose,biochem: CK (U/L),biochem: ALP (U/L),biochem: ALT (U/L),biochem: AST,biochem: Amylase,biochem: Cholesterol (mmol/L),biochem: Bilirubin,biochem: Serum/plasma appearance,biochem: Comments:,biochem: Validated,biochem id,biochem date,urine: ID,urine: Laboratory Report Number,urine: Laboratory,urine: Animal name,urine: Sample Collection Method,urine: Urine Appearance,urine: Urine Specific Gravity,urine: Low USG?,urine: Azotaemia?,urine: Urine protein (g/L),urine: urine creatinine (umol/l),urine: Protein:creatinine ratio,urine: Proteinuric UPC greater 0:4,urine: Heska Microalbuminuria test,urine: glucose (Multistix),urine: bilirubin (Multistix),urine: ketones (Multistix),urine: blood (Hb -Multistix),urine: pH,urine: protein (Multistix),urine: protein (sulfanilic acid),urine: Positive urine culture,urine: Positive urine culture comments,urine: Culture results,urine: leukocytes (/cumm)x,urine: blood (RBC/ ul) (haematuriax,urine: Fat droplets,urine: Lipid,urine: Casts,urine: Epith cells,urine: Crystals,urine: Organisms,urine: Contaminants?,urine: Comments:,urine: Fractional Excretion,urine: Validated,urine: Date validated by RD,urine: Feline specific USG,urine: Refractive Index nR,urine: Fractional Excretion1,urine: Validated1,urine: Date validated by RD1,urine: Fe Sodium (RI 0-0_0-07),urine: Fe Potassium (RI 0-0_11-7),urine: Fe Chloride (RI 0-0_0-20),urine: FE Calcium (RI 0-0_0-13),urine: FE Phosphorus (RI 0_16),urine id,urine date
15,464,670005,Quintus,1988-05-08,,Taronga Zoo,,,85.0,22.0,83.0,13.2,98.0,,,,2.0,,,,,,,,,,,,,False,12,1988-05-08,515.0,,Quintus,False,9.8,False,314.0,75.0,31.0,44.0,True,22.0,3.9,2.11,False,1.25,145.0,113.0,15.0,11.7,184.0,5.0,35.0,17.0,,5.5,,,,True,1157.0,1988-05-09,472.0,,1,Quintus,,,1.025,True,False,,,,False,,,,,,6.0,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,809.0,1988-05-13
16,463,670005,Quintus,1988-05-20,,Taronga Zoo,,4.4,87.0,28.3,,,89.0,,,,5.0,,4.0,,2.0,,,,,,,,,False,41,1988-05-20,512.0,,Quintus,True,23.5,True,485.0,71.0,24.0,47.0,False,28.0,3.9,2.07,False,2.28,155.0,127.0,19.0,17.8,102.0,5.0,32.0,22.0,,4.0,4,,,True,1158.0,1988-05-19,425.0,215,1,Quintus,,,1.019,True,True,,,,False,,nil,nil,nil,3+,6.0,2+,,False,,,14,8,,False,True,False,True,False,False,"scanty hyaline casts, amorphous urates 2+",,True,,,,,False,,,,,,,808.0,1988-05-19
17,462,670005,Quintus,1988-05-25,,Taronga Zoo,,3.3,75.0,19.0,80.0,25.0,99.0,,,,,,1.0,,,,,,,,,,,False,7,1988-05-25,512.0,,Quintus,True,23.5,True,485.0,71.0,24.0,47.0,False,28.0,3.9,2.07,False,2.28,155.0,127.0,19.0,17.8,102.0,5.0,32.0,22.0,,4.0,4,,,True,1158.0,1988-05-19,471.0,,1,Quintus,Cystocentesis,,,False,False,,,,False,,nil,nil,nil,1+,6.0,1+,,False,,,,50,,True,False,False,False,False,False,3+ fat globules,,True,,,,,False,,,,,,,807.0,1988-05-25
19,388,750011,X-Ray,1989-03-07,,Taronga Zoo,,6.0,,30.0,92.0,5.3,90.0,,,,7.0,,1.0,,2.0,,10,,,,,,,False,53,1989-03-07,424.0,,X-Ray,True,13.0,False,,92.0,,,False,30.0,,,False,,,,,15.0,,,,,,,,,"concomitant USG 1.015, inappetant and depresse...",True,1629.0,1989-03-07,374.0,,1,X-Ray,,,1.016,True,False,,,,False,,nil,nil,nil,nil,5.0,1+,,False,,,,,,False,False,False,False,False,False,,,True,,,,,False,,,,,,,1226.0,1989-03-07
25,551,A20046,Ovambo,1989-09-29,,Orana Pathology Service,,7.37,129.0,43.0,75.0,12.5,66.0,,,,21.0,,1.0,,12.0,352,,,,,,,,False,421,1989-09-29,771.0,,Ovambo,False,6.6,False,131.0,75.0,31.0,44.0,False,,4.5,2.48,False,,161.0,,,,44.0,,31.0,17.0,2679.0,,3,,,True,1116.0,1989-09-30,20.0,,WPZ,Ovambo,,,>1.045,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,776.0,1989-09-29
26,547,A20047,Nakuru,1989-09-29,,Orana Pathology Service,,6.7,110.0,39.0,70.0,10.6,69.0,,,,28.0,,,,3.0,357,,,,,,,,False,281,1989-09-29,770.0,,Nakuru,False,8.4,False,118.0,70.0,31.0,39.0,False,,4.5,2.53,False,,160.0,,,,52.0,,36.0,16.0,2750.0,,3,,,False,992.0,1989-09-30,18.0,,WPZ,Nakuru,,,>1.045,False,False,,,,False,,,,,,6.0,1+,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,725.0,1989-09-29
28,470,880014,Claudius,1989-11-24,,,,8.24,139.0,48.0,,24.6,72.0,,,,22.0,,,,6.0,277,,,,,,,,True,624,1989-11-24,575.0,,Claudius,False,19.0,False,141.0,68.0,40.0,28.0,False,34.0,4.2,2.71,False,,165.0,,,,1153.0,26.0,165.0,103.0,2327.0,,1,,,True,216.0,1989-11-28,475.0,,,Claudius,,,>1.050,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,227.0,1989-11-23
29,135,880014,Claudius,1989-11-28,,,,6.13,10.0,34.3,,26.1,62.0,1.0,16.2,3.9,15.0,0.5,2.0,5.2,20.0,,,,56.0,,,,"anisocytosis +, macrocytosis +, polychromasia +",True,144,1989-11-28,575.0,,Claudius,False,19.0,False,141.0,68.0,40.0,28.0,False,34.0,4.2,2.71,False,,165.0,,,,1153.0,26.0,165.0,103.0,2327.0,,1,,,True,216.0,1989-11-28,475.0,,,Claudius,,,>1.050,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,227.0,1989-11-23
30,389,750011,X-Ray,1990-05-31,,Taronga Zoo,,,,30.0,,7.7,,,,,,,,,,,,,,,,,,False,52,1990-05-31,425.0,,X-Ray,True,22.6,True,432.0,73.0,36.0,37.0,False,30.0,3.7,2.36,False,,154.0,118.0,15.0,6.2,99.0,13.0,39.0,20.0,,8.7,2,,,True,1630.0,1990-05-31,375.0,617,1,X-Ray,,,1.014,True,True,,,,False,,nil,nil,nil,1+,5.5,1+,,False,,,10-100,<10,,False,False,False,False,False,False,"3+ amorphous urates, occasional spermatozoa",,True,,,,,False,,,,,,,1225.0,1990-05-31
31,390,750011,X-Ray,1990-07-24,,Taronga Zoo,,,,26.0,83.0,9.13,93.0,2.0,8.49,0.36,4.0,,,0.09,1.0,,,,,,,,,True,23,1990-07-24,426.0,,X-Ray,True,32.0,False,296.0,83.0,,,False,26.0,,,False,,,,,5.6,75.0,,,66.0,,,,,,True,1631.0,1990-07-24,376.0,901,1,X-Ray,Manually expressed,,1.013,True,False,,,,False,,nil,nil,nil,1+,5.0,trace,,False,,,>100,<10,,True,False,False,False,False,False,,,True,,,,,False,,,,,,,1224.0,1990-07-24


In [8]:
# get the bichem, haem and urine ids form the triple matches so that they can be removed
triple_haem_ids = list(triples['haem id'])
triple_biochem_ids = list(triples['biochem id'])
triple_urine_ids = list(triples['urine id'])

# drop these ids from original data frames
haem_drop_triple = haem[~haem['haem id'].isin(triple_haem_ids)]
biochem_drop_triple = biochem[~biochem['biochem id'].isin(triple_biochem_ids)]
urine_drop_triple = urine[~urine['urine id'].isin(triple_urine_ids)]

## Now find the doubles

In [9]:
# perform 3 merge combinations
df_hb = pd.merge_asof(by=['ARKS No'], left=haem_drop_triple,right=biochem_drop_triple, left_on='Date', right_on='Date', direction='nearest',tolerance=tol)

df_hu = pd.merge_asof(by=['ARKS No'], left=haem_drop_triple,right=urine_drop_triple, left_on='Date', right_on='Date', direction='nearest',tolerance=tol)

df_ub = pd.merge_asof(by=['ARKS No'], left=urine_drop_triple,right=biochem_drop_triple, left_on='Date', right_on='Date', direction='nearest',tolerance=tol)                 

In [10]:
# take only double matches
double_hb = df_hb.dropna(subset=['haem date', 'biochem date'], how='any')
double_hu = df_hu.dropna(subset=['haem date', 'urine date'], how='any')
double_ub = df_ub.dropna(subset=['urine date', 'biochem date'], how='any')

# get ids for all combos
double_hb_h_ids = list(double_hb['haem id'])
double_hb_b_ids = list(double_hb['biochem id'])

double_hu_h_ids = list(double_hu['haem id'])
double_hu_u_ids = list(double_hu['urine id'])

double_ub_u_ids = list(double_ub['urine id'])
double_ub_b_ids = list(double_ub['biochem id'])

# drop these ids from data frames where triples have already been dropped
haem_singles = haem_drop_triple[~haem_drop_triple['haem id'].isin(double_hb_h_ids + double_hu_h_ids)]
biochem_singles = biochem_drop_triple[~biochem_drop_triple['biochem id'].isin(double_hb_b_ids + double_ub_b_ids)]
urine_singles = urine_drop_triple[~urine_drop_triple['urine id'].isin(double_hu_u_ids + double_ub_u_ids)]

In [11]:
# now concat everything
everything = pd.concat([triples, double_hb, double_hu, double_ub, haem_singles, biochem_singles, urine_singles], sort=False)

In [12]:
everything

Unnamed: 0,haem: ID,ARKS No,haem: Animal Name,Date,haem: Laboratory Report No,haem: Laboratory,haem: Laboratory text only,haem: RBC (x 10^12/L),haem: haemoglobin (g/L),haem: HCT (%),haem: Total Protein (refractometer),haem: leukocyte count (x10^9/L),haem: neutrophils (%),haem: neutrophils (band),haem: neutrophils(seg) (x10^9/L),haem: lymphocytes (x10^9/L),haem: lymphocytes (%),haem: monocytes (x10^9/L),haem: monocytes (%),haem: eosinophils (x10^9/L),haem: eosinophils (%),haem: estimated platelets x 10^9/L (Hi),haem: estimated platelets /HPF (Low),haem: plasma sample appearance,haem: MCV (fL),haem: MCHC (g/L),haem: MCH (pg),haem: RBC morphology,haem: notes:,haem: validated,haem id,haem date,biochem: ID,biochem: Biochem Report No,biochem: Animal Name,biochem: Low USG?,biochem: BUN (mmol/L),biochem: Hypercreatininaemia?,biochem: Creatinine (umol/L),biochem: Total Plasma Protein (g/L),biochem: Serum Albumin (g/L),biochem: Globulins (g/L),biochem: Anaemic? <25,biochem: Haematocrit (%),biochem: Potassium (mmol/L),biochem: Calcium (mmol/L),biochem: Hyperphosphataemia? >2_9 to 3,biochem: Phosphate (mmol/L),biochem: Sodium (mmol/L),biochem: Chloride (mmol/L),biochem: Bicarbonate,biochem: Glucose,biochem: CK (U/L),biochem: ALP (U/L),biochem: ALT (U/L),biochem: AST,biochem: Amylase,biochem: Cholesterol (mmol/L),biochem: Bilirubin,biochem: Serum/plasma appearance,biochem: Comments:,biochem: Validated,biochem id,biochem date,urine: ID,urine: Laboratory Report Number,urine: Laboratory,urine: Animal name,urine: Sample Collection Method,urine: Urine Appearance,urine: Urine Specific Gravity,urine: Low USG?,urine: Azotaemia?,urine: Urine protein (g/L),urine: urine creatinine (umol/l),urine: Protein:creatinine ratio,urine: Proteinuric UPC greater 0:4,urine: Heska Microalbuminuria test,urine: glucose (Multistix),urine: bilirubin (Multistix),urine: ketones (Multistix),urine: blood (Hb -Multistix),urine: pH,urine: protein (Multistix),urine: protein (sulfanilic acid),urine: Positive urine culture,urine: Positive urine culture comments,urine: Culture results,urine: leukocytes (/cumm)x,urine: blood (RBC/ ul) (haematuriax,urine: Fat droplets,urine: Lipid,urine: Casts,urine: Epith cells,urine: Crystals,urine: Organisms,urine: Contaminants?,urine: Comments:,urine: Fractional Excretion,urine: Validated,urine: Date validated by RD,urine: Feline specific USG,urine: Refractive Index nR,urine: Fractional Excretion1,urine: Validated1,urine: Date validated by RD1,urine: Fe Sodium (RI 0-0_0-07),urine: Fe Potassium (RI 0-0_11-7),urine: Fe Chloride (RI 0-0_0-20),urine: FE Calcium (RI 0-0_0-13),urine: FE Phosphorus (RI 0_16),urine id,urine date
15,464.0,670005,Quintus,1988-05-08,,Taronga Zoo,,,85,22.0,83.0,13.20,98.0,,,,2.0,,,,,,,,,,,,,False,12.0,1988-05-08,515.0,,Quintus,False,9.80,False,314.0,75.0,31.0,44.0,True,22.0,3.90,2.110,False,1.2500,145.0,113.0,15.0,11.7000,184.0,5.0,35.0,17.0,,5.5000,,,,True,1157.0,1988-05-09,472.0,,1,Quintus,,,1.025,True,False,,,,False,,,,,,6,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,809.0,1988-05-13
16,463.0,670005,Quintus,1988-05-20,,Taronga Zoo,,4.40,87,28.3,,,89.0,,,,5.0,,4.0,,2.0,,,,,,,,,False,41.0,1988-05-20,512.0,,Quintus,True,23.50,True,485.0,71.0,24.0,47.0,False,28.0,3.90,2.070,False,2.2800,155.0,127.0,19.0,17.8000,102.0,5.0,32.0,22.0,,4.0000,4,,,True,1158.0,1988-05-19,425.0,215,1,Quintus,,,1.019,True,True,,,,False,,nil,nil,nil,3+,6,2+,,False,,,14,8,,False,True,False,True,False,False,"scanty hyaline casts, amorphous urates 2+",,True,,,,,False,,,,,,,808.0,1988-05-19
17,462.0,670005,Quintus,1988-05-25,,Taronga Zoo,,3.30,75,19.0,80.0,25.00,99.0,,,,,,1.0,,,,,,,,,,,False,7.0,1988-05-25,512.0,,Quintus,True,23.50,True,485.0,71.0,24.0,47.0,False,28.0,3.90,2.070,False,2.2800,155.0,127.0,19.0,17.8000,102.0,5.0,32.0,22.0,,4.0000,4,,,True,1158.0,1988-05-19,471.0,,1,Quintus,Cystocentesis,,,False,False,,,,False,,nil,nil,nil,1+,6,1+,,False,,,,50,,True,False,False,False,False,False,3+ fat globules,,True,,,,,False,,,,,,,807.0,1988-05-25
19,388.0,750011,X-Ray,1989-03-07,,Taronga Zoo,,6.00,,30.0,92.0,5.30,90.0,,,,7.0,,1.0,,2.0,,10,,,,,,,False,53.0,1989-03-07,424.0,,X-Ray,True,13.00,False,,92.0,,,False,30.0,,,False,,,,,15.0000,,,,,,,,,"concomitant USG 1.015, inappetant and depresse...",True,1629.0,1989-03-07,374.0,,1,X-Ray,,,1.016,True,False,,,,False,,nil,nil,nil,nil,5.0,1+,,False,,,,,,False,False,False,False,False,False,,,True,,,,,False,,,,,,,1226.0,1989-03-07
25,551.0,A20046,Ovambo,1989-09-29,,Orana Pathology Service,,7.37,129,43.0,75.0,12.50,66.0,,,,21.0,,1.0,,12.0,352,,,,,,,,False,421.0,1989-09-29,771.0,,Ovambo,False,6.60,False,131.0,75.0,31.0,44.0,False,,4.50,2.480,False,,161.0,,,,44.0,,31.0,17.0,2679.0,,3,,,True,1116.0,1989-09-30,20.0,,WPZ,Ovambo,,,>1.045,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,776.0,1989-09-29
26,547.0,A20047,Nakuru,1989-09-29,,Orana Pathology Service,,6.70,110,39.0,70.0,10.60,69.0,,,,28.0,,,,3.0,357,,,,,,,,False,281.0,1989-09-29,770.0,,Nakuru,False,8.40,False,118.0,70.0,31.0,39.0,False,,4.50,2.530,False,,160.0,,,,52.0,,36.0,16.0,2750.0,,3,,,False,992.0,1989-09-30,18.0,,WPZ,Nakuru,,,>1.045,False,False,,,,False,,,,,,6.0,1+,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,725.0,1989-09-29
28,470.0,880014,Claudius,1989-11-24,,,,8.24,139,48.0,,24.60,72.0,,,,22.0,,,,6.0,277,,,,,,,,True,624.0,1989-11-24,575.0,,Claudius,False,19.00,False,141.0,68.0,40.0,28.0,False,34.0,4.20,2.710,False,,165.0,,,,1153.0,26.0,165.0,103.0,2327.0,,1,,,True,216.0,1989-11-28,475.0,,,Claudius,,,>1.050,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,227.0,1989-11-23
29,135.0,880014,Claudius,1989-11-28,,,,6.13,10,34.3,,26.10,62.0,1.00,16.20,3.90,15.0,0.50,2.0,5.20,20.0,,,,56.0,,,,"anisocytosis +, macrocytosis +, polychromasia +",True,144.0,1989-11-28,575.0,,Claudius,False,19.00,False,141.0,68.0,40.0,28.0,False,34.0,4.20,2.710,False,,165.0,,,,1153.0,26.0,165.0,103.0,2327.0,,1,,,True,216.0,1989-11-28,475.0,,,Claudius,,,>1.050,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,227.0,1989-11-23
30,389.0,750011,X-Ray,1990-05-31,,Taronga Zoo,,,,30.0,,7.70,,,,,,,,,,,,,,,,,,False,52.0,1990-05-31,425.0,,X-Ray,True,22.60,True,432.0,73.0,36.0,37.0,False,30.0,3.70,2.360,False,,154.0,118.0,15.0,6.2000,99.0,13.0,39.0,20.0,,8.7000,2,,,True,1630.0,1990-05-31,375.0,617,1,X-Ray,,,1.014,True,True,,,,False,,nil,nil,nil,1+,5.5,1+,,False,,,10-100,<10,,False,False,False,False,False,False,"3+ amorphous urates, occasional spermatozoa",,True,,,,,False,,,,,,,1225.0,1990-05-31
31,390.0,750011,X-Ray,1990-07-24,,Taronga Zoo,,,,26.0,83.0,9.13,93.0,2.00,8.49,0.36,4.0,,,0.09,1.0,,,,,,,,,True,23.0,1990-07-24,426.0,,X-Ray,True,32.00,False,296.0,83.0,,,False,26.0,,,False,,,,,5.6000,75.0,,,66.0,,,,,,True,1631.0,1990-07-24,376.0,901,1,X-Ray,Manually expressed,,1.013,True,False,,,,False,,nil,nil,nil,1+,5,trace,,False,,,>100,<10,,True,False,False,False,False,False,,,True,,,,,False,,,,,,,1224.0,1990-07-24


In [17]:
# add colums that show the date differences
everything['haem biochem date diff'] = abs(everything['haem date'] - everything['biochem date'])
everything['haem urine date diff'] = abs(everything['haem date'] - everything['urine date'])
everything['urine biochem date diff'] = abs(everything['urine date'] - everything['biochem date'])
everything.head()

Unnamed: 0,haem: ID,ARKS No,haem: Animal Name,Date,haem: Laboratory Report No,haem: Laboratory,haem: Laboratory text only,haem: RBC (x 10^12/L),haem: haemoglobin (g/L),haem: HCT (%),haem: Total Protein (refractometer),haem: leukocyte count (x10^9/L),haem: neutrophils (%),haem: neutrophils (band),haem: neutrophils(seg) (x10^9/L),haem: lymphocytes (x10^9/L),haem: lymphocytes (%),haem: monocytes (x10^9/L),haem: monocytes (%),haem: eosinophils (x10^9/L),haem: eosinophils (%),haem: estimated platelets x 10^9/L (Hi),haem: estimated platelets /HPF (Low),haem: plasma sample appearance,haem: MCV (fL),haem: MCHC (g/L),haem: MCH (pg),haem: RBC morphology,haem: notes:,haem: validated,haem id,haem date,biochem: ID,biochem: Biochem Report No,biochem: Animal Name,biochem: Low USG?,biochem: BUN (mmol/L),biochem: Hypercreatininaemia?,biochem: Creatinine (umol/L),biochem: Total Plasma Protein (g/L),biochem: Serum Albumin (g/L),biochem: Globulins (g/L),biochem: Anaemic? <25,biochem: Haematocrit (%),biochem: Potassium (mmol/L),biochem: Calcium (mmol/L),biochem: Hyperphosphataemia? >2_9 to 3,biochem: Phosphate (mmol/L),biochem: Sodium (mmol/L),biochem: Chloride (mmol/L),biochem: Bicarbonate,biochem: Glucose,biochem: CK (U/L),biochem: ALP (U/L),biochem: ALT (U/L),biochem: AST,biochem: Amylase,biochem: Cholesterol (mmol/L),biochem: Bilirubin,biochem: Serum/plasma appearance,biochem: Comments:,biochem: Validated,biochem id,biochem date,urine: ID,urine: Laboratory Report Number,urine: Laboratory,urine: Animal name,urine: Sample Collection Method,urine: Urine Appearance,urine: Urine Specific Gravity,urine: Low USG?,urine: Azotaemia?,urine: Urine protein (g/L),urine: urine creatinine (umol/l),urine: Protein:creatinine ratio,urine: Proteinuric UPC greater 0:4,urine: Heska Microalbuminuria test,urine: glucose (Multistix),urine: bilirubin (Multistix),urine: ketones (Multistix),urine: blood (Hb -Multistix),urine: pH,urine: protein (Multistix),urine: protein (sulfanilic acid),urine: Positive urine culture,urine: Positive urine culture comments,urine: Culture results,urine: leukocytes (/cumm)x,urine: blood (RBC/ ul) (haematuriax,urine: Fat droplets,urine: Lipid,urine: Casts,urine: Epith cells,urine: Crystals,urine: Organisms,urine: Contaminants?,urine: Comments:,urine: Fractional Excretion,urine: Validated,urine: Date validated by RD,urine: Feline specific USG,urine: Refractive Index nR,urine: Fractional Excretion1,urine: Validated1,urine: Date validated by RD1,urine: Fe Sodium (RI 0-0_0-07),urine: Fe Potassium (RI 0-0_11-7),urine: Fe Chloride (RI 0-0_0-20),urine: FE Calcium (RI 0-0_0-13),urine: FE Phosphorus (RI 0_16),urine id,urine date,haem biochem date diff,haem urine date diff,urine biochem date diff
15,464.0,670005,Quintus,1988-05-08,,Taronga Zoo,,,85.0,22.0,83.0,13.2,98.0,,,,2.0,,,,,,,,,,,,,False,12.0,1988-05-08,515.0,,Quintus,False,9.8,False,314.0,75.0,31.0,44.0,True,22.0,3.9,2.11,False,1.25,145.0,113.0,15.0,11.7,184.0,5.0,35.0,17.0,,5.5,,,,True,1157.0,1988-05-09,472.0,,1,Quintus,,,1.025,True,False,,,,False,,,,,,6.0,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,809.0,1988-05-13,1 days,5 days,4 days
16,463.0,670005,Quintus,1988-05-20,,Taronga Zoo,,4.4,87.0,28.3,,,89.0,,,,5.0,,4.0,,2.0,,,,,,,,,False,41.0,1988-05-20,512.0,,Quintus,True,23.5,True,485.0,71.0,24.0,47.0,False,28.0,3.9,2.07,False,2.28,155.0,127.0,19.0,17.8,102.0,5.0,32.0,22.0,,4.0,4.0,,,True,1158.0,1988-05-19,425.0,215.0,1,Quintus,,,1.019,True,True,,,,False,,nil,nil,nil,3+,6.0,2+,,False,,,14.0,8.0,,False,True,False,True,False,False,"scanty hyaline casts, amorphous urates 2+",,True,,,,,False,,,,,,,808.0,1988-05-19,1 days,1 days,0 days
17,462.0,670005,Quintus,1988-05-25,,Taronga Zoo,,3.3,75.0,19.0,80.0,25.0,99.0,,,,,,1.0,,,,,,,,,,,False,7.0,1988-05-25,512.0,,Quintus,True,23.5,True,485.0,71.0,24.0,47.0,False,28.0,3.9,2.07,False,2.28,155.0,127.0,19.0,17.8,102.0,5.0,32.0,22.0,,4.0,4.0,,,True,1158.0,1988-05-19,471.0,,1,Quintus,Cystocentesis,,,False,False,,,,False,,nil,nil,nil,1+,6.0,1+,,False,,,,50.0,,True,False,False,False,False,False,3+ fat globules,,True,,,,,False,,,,,,,807.0,1988-05-25,6 days,0 days,6 days
19,388.0,750011,X-Ray,1989-03-07,,Taronga Zoo,,6.0,,30.0,92.0,5.3,90.0,,,,7.0,,1.0,,2.0,,10.0,,,,,,,False,53.0,1989-03-07,424.0,,X-Ray,True,13.0,False,,92.0,,,False,30.0,,,False,,,,,15.0,,,,,,,,,"concomitant USG 1.015, inappetant and depresse...",True,1629.0,1989-03-07,374.0,,1,X-Ray,,,1.016,True,False,,,,False,,nil,nil,nil,nil,5.0,1+,,False,,,,,,False,False,False,False,False,False,,,True,,,,,False,,,,,,,1226.0,1989-03-07,0 days,0 days,0 days
25,551.0,A20046,Ovambo,1989-09-29,,Orana Pathology Service,,7.37,129.0,43.0,75.0,12.5,66.0,,,,21.0,,1.0,,12.0,352.0,,,,,,,,False,421.0,1989-09-29,771.0,,Ovambo,False,6.6,False,131.0,75.0,31.0,44.0,False,,4.5,2.48,False,,161.0,,,,44.0,,31.0,17.0,2679.0,,3.0,,,True,1116.0,1989-09-30,20.0,,WPZ,Ovambo,,,>1.045,False,False,,,,False,,,,,,,,,False,,,,,,False,False,False,False,False,False,,,False,,,,,False,,,,,,,776.0,1989-09-29,1 days,0 days,1 days


In [14]:
# write to file
from datetime import datetime
today = datetime.today().strftime('%Y-%m-%d')
out_file = '../Data/big_cats/processed_data/merged_tables_tol_' +str(tol.days) +'days_' + today + '.csv'
everything.to_csv(out_file, index = False)

In [15]:
print('using a tolerance of: ' + str(tol))
print('there are total records: ' + str(len(everything)))
print('there are ' +str(len(triples)) + ' triple matches' )
print('there are ' +str(len(double_hb)) + ' double hb matches' )
print('there are ' +str(len(double_hu)) + ' double hu matches' )
print('there are ' +str(len(double_ub)) + ' double ub matches' )
print('there are ' +str(len(haem_singles)) + ' single haem records' )
print('there are ' +str(len(biochem_singles)) + ' single biochem records' )
print('there are ' +str(len(urine_singles)) + ' single urine records' )

using a tolerance of: 14 days 00:00:00
there are total records: 2355
there are 244 triple matches
there are 413 double hb matches
there are 3 double hu matches
there are 378 double ub matches
there are 60 single haem records
there are 673 single biochem records
there are 584 single urine records


using a tolerance of: 3 days 00:00:00
there are total records: 2388
there are 219 triple matches
there are 431 double hb matches
there are 5 double hu matches
there are 305 double ub matches
there are 65 single haem records
there are 695 single biochem records
there are 668 single urine records

using a tolerance of: 7 days 00:00:00
there are total records: 2369
there are 234 triple matches
there are 420 double hb matches
there are 3 double hu matches
there are 343 double ub matches
there are 63 single haem records
there are 683 single biochem records
there are 623 single urine records

using a tolerance of: 14 days 00:00:00
there are total records: 2355
there are 244 triple matches
there are 413 double hb matches
there are 3 double hu matches
there are 378 double ub matches
there are 60 single haem records
there are 673 single biochem records
there are 584 single urine records