In [503]:
import pandas as pd
import numpy as np
import re

In [504]:
columns = ['DatabaseName', 'PatientNumber', 'ClientNumber', 'ConsultationNumber', 'ItemName', 'Units',
           'ItemLabel', 'ClinicCode', 'ConsultingVet', 'PatientDesexed', 'DeceasedDate',
           'VisitDate', 'ConsultationDate', 'ExaminationText' ,'PatientSpecies', 'PatientBreed',
           'PatientColour', 'PatientSex', 'PatientDateOfBirth', 'Insured', 'weight', 'temperature',
           'HeartRate', 'BodyScore', 'DentalGrade', 'RespRate', 'PainScore', 'BP', 'CRT', 'MMColour']

In [505]:
vetcompass_df = pd.read_csv("../Data/vet_compass/STR024A03 20190719 sample x10,000.csv", header=None, names = columns)

  interactivity=interactivity, compiler=compiler, result=result)


### make everything lowercase

In [506]:
# vetcompass_df['ExaminationText'] = vetcompass_df['ExaminationText'].str.lower()

### functions to create regex and add columns 

In [507]:
def make_regex(values):
    return f'({values})[\s\:\=\(\-]*(\d*\.?\/?\d*)'

def add_column(df, values, colname):
    df[colname] = df.ExaminationText.str.extract(make_regex(values), expand=True, flags=re.IGNORECASE)[1]
    return df

### function to find patters (from nushki)

In [508]:
def get_patterns(df, pattern):
    note_captions_lists = [re.findall(string=text, pattern=pattern) for text in df['ExaminationText'] if isinstance(text, str)]
    note_captions = [text for text_list in note_captions_lists for text in text_list]
    note_captions_set = set(note_captions)
    return note_captions_set

### find strings that precede values to find possible lab value keys (maybe pointless)

In [513]:
possible_keys = set(get_patterns(vetcompass_df, pattern="(.{0,4})[\s\:\=\(\-]*\d*\.?\/?\d*"))

In [516]:
ending_in_colon = [re.findall(string=text, pattern='.{0,3}:') for text in possible_keys]

In [523]:
set(x[0] for x in ending_in_colon if len(x)>0)

{'eed:',
 'le#:',
 'On:',
 'SFC:',
 ' TV:',
 'GW:',
 'pet:',
 'ste:',
 'WS6:',
 'Try:',
 'T 4:',
 'BST:',
 '?:',
 'MA0:',
 'LI2:',
 'DSP:',
 ':21:',
 ' Ga:',
 'BG1:',
 'WK4:',
 'AS8:',
 'ady:',
 'hed:',
 'Z :',
 '-15:',
 'Csx:',
 'day:',
 'HS2:',
 'des:',
 'ink:',
 'D T:',
 'MJP:',
 'UTI:',
 'BE5:',
 'LAN:',
 'V1 :',
 'PO:',
 'n::',
 't4:',
 'BF2:',
 'pic:',
 'IBS:',
 'NJ:',
 'NL :',
 's 2:',
 'lse:',
 'n E:',
 'CDP:',
 'CN1:',
 'BAL:',
 'Wt:',
 'nl :',
 'o O:',
 'ILE:',
 'nl:',
 ' AT:',
 'x b:',
 ':01:',
 'VWJ:',
 'BI1:',
 'WC :',
 'RA6:',
 'HK1:',
 'Mcv:',
 'M1 :',
 'BSA:',
 'ks :',
 'dx :',
 'rk:',
 'AL2:',
 'mg:',
 '@10:',
 'BRG:',
 'ky:',
 'B4:',
 'KN5:',
 ':58:',
 'BAR:',
 'try:',
 ' JN:',
 'Dd:',
 '~15:',
 'WI1:',
 ' LJ:',
 'JE2:',
 'L2:',
 'hts:',
 'HKA:',
 '? P:',
 'PSF:',
 'PK3:',
 'JK1:',
 'TAL:',
 'pm :',
 'tmt:',
 'DG1:',
 'ni :',
 'ni:',
 'ac):',
 'ms:',
 'neg:',
 'PJ:',
 'r?:',
 'N :',
 'ath:',
 'ss:',
 ' NL:',
 'n 1:',
 'lu:',
 'OUR:',
 'go:',
 '+RR:',
 'SWO:',
 'Ass:',

### column names and search phrases for them 

In [525]:
columns_to_add = {
    'creatinine': 'creatinine|creat|crea|cre', 
    'usg': 'usg', 
    'phosphate': 'phosphate|phosp|phos|ph', 
    'protein': 'protein|prot', 
    'calcium': 'calcium|calc|cal|ca', 
    'calcium:phosphate': 'ca:p ratio',
    'bun': 'bun', 
    'sap': 'sap',
    'salt': 'salt',
    'glucose': 'glucose|gluc|glu',
    'tp': 't.p.|tp|tpp', 
    'pcv': 'pcv', 
    'alp': 'alp', 
    'alt': 'alt',
    'acp': 'acp', 
    'alkp': 'alkp', 
    'bg': 'bg',
    'wbc': 'wbc', 
    'eos': 'eos', 
    'baso': 'baso', 
    'lymph': 'lymph', 
    'sodium': 'na|sodium|sod', 
    'potassium': 'k\+|k|potassium|pot', 
    'urea': 'urea', 
    'ggt': 'ggt', 
    'ket': 'ket', 
    'nit': 'nit', 
    'leu': 'leu'
}

In [526]:
for column_name in columns_to_add.keys():
    print("processing: "+ column_name + " with patter " + columns_to_add[column_name])
    vetcompass_df = add_column(vetcompass_df, columns_to_add[column_name], column_name)
    print('\x1b[32m' + str(len(vetcompass_df[column_name].unique())) + " unique values found for " + column_name + '\x1b[0m')

processing: creatinine with patter creatinine|creat|crea|cre
[32m404 unique values found for creatinine[0m
processing: usg with patter usg
[32m124 unique values found for usg[0m
processing: phosphate with patter phosphate|phosp|phos|ph
[32m188 unique values found for phosphate[0m
processing: protein with patter protein|prot
[32m97 unique values found for protein[0m
processing: calcium with patter calcium|calc|cal|ca
[32m77 unique values found for calcium[0m
processing: calcium:phosphate with patter ca:p ratio
[32m15 unique values found for calcium:phosphate[0m
processing: bun with patter bun
[32m237 unique values found for bun[0m
processing: sap with patter sap
[32m33 unique values found for sap[0m
processing: salt with patter salt
[32m10 unique values found for salt[0m
processing: glucose with patter glucose|gluc|glu
[32m396 unique values found for glucose[0m
processing: tp with patter t.p.|tp|tpp
[32m175 unique values found for tp[0m
processing: pcv with patter 