# Characters and Strings

> with reference to EpiRHandbook Chapter 10

In [None]:
#|hide
from nbdev import *
from fastcore.test import *
from fastcore.utils import *

In [None]:
import pandas as pd
# import numpy as np
# from scipy import stats
# import matplotlib.pyplot as plt
import matplotlib.style as style
from datetime import datetime

In [None]:
#|hide
# To use a colourblind-friendly colour scheme (optional). For the colours in this style, see https://github.com/matplotlib/matplotlib/blob/main/lib/matplotlib/mpl-data/stylelib/tableau-colorblind10.mplstyle

In [None]:
#|hide
style.use('tableau-colorblind10')

In [None]:
#|hide
# set display options in pandas
pd.set_option('display.max_columns', 100)  
pd.set_option('display.max_rows',100)
pd.set_option('display.width', 1000)

## Data

Import data from https://github.com/appliedepi/epiRhandbook_eng/blob/master/data/linelist_cleaned.xlsx and then save under "epiRhandbook_data" folder.  
Note: Installation of "openpyxl" is required.

In [None]:
linelist = pd.read_excel('../epiRhandbook_data/linelist_cleaned.xlsx')
linelist.head(3)

Unnamed: 0,case_id,generation,date_infection,date_onset,date_hospitalisation,date_outcome,outcome,gender,age,age_unit,age_years,age_cat,age_cat5,hospital,lon,lat,infector,source,wt_kg,ht_cm,ct_blood,fever,chills,cough,aches,vomit,temp,time_admission,bmi,days_onset_hosp
0,5fe599,4,2014-05-08,2014-05-13,2014-05-15,NaT,,m,2.0,years,2.0,0-4,0-4,Other,-13.215735,8.468973,f547d6,other,27,48,22,no,no,yes,no,yes,36.8,,117.1875,2.0
1,8689b7,4,NaT,2014-05-13,2014-05-14,2014-05-18,Recover,f,3.0,years,3.0,0-4,0-4,Missing,-13.215234,8.451719,,,25,59,22,,,,,,36.9,09:36,71.818443,1.0
2,11f8ea,2,NaT,2014-05-16,2014-05-18,2014-05-30,Recover,m,56.0,years,56.0,50-69,55-59,St. Mark's Maternity Hospital (SMMH),-13.212911,8.464817,,,91,238,21,,,,,,36.9,16:48,16.06525,2.0


# Unite, Split, and Arrange

In [None]:
str_1 = "String1"
str_2 = "String2" 
str_3 = "String3"

## Combine Strings

In [None]:
str_c = str_1 + str_2 + str_3
str_c

'String1String2String3'

In [None]:
combined_string = " ".join([str_1, str_2, str_3])
combined_string

'String1 String2 String3'

In [None]:
first_names = ("abdul", "fahruk", "janice") 
last_names  = ("hussein", "akinleye", "okeke")
names = list(zip(first_names, last_names))
print(names)
combined_names = [f"{first} {last}" for first, last in names]
";  ".join(combined_names)

[('abdul', 'hussein'), ('fahruk', 'akinleye'), ('janice', 'okeke')]


'abdul hussein;  fahruk akinleye;  janice okeke'

In [None]:
print("; \n".join(combined_names))

abdul hussein; 
fahruk akinleye; 
janice okeke


## Dynamic Strings

In [None]:
print(f"Data include {len(linelist)} cases and are current to {format(datetime.now(), '%d %b %Y')}.")

Data include 5888 cases and are current to 09 Jan 2023.


In [None]:
# Pre-format the dates once so they can be dropped straight into the summary text.
current_date  = format(datetime.now(), '%d %b %Y')
# Series.max() skips missing dates (NaT). The built-in max() compares element by
# element and can return NaT when a missing value happens to come first.
last_hospital = format(linelist['date_hospitalisation'].max(), '%d %B %Y')

In [None]:
last_hospital

'30 April 2015'

In [None]:
n_missing_onset = linelist['date_onset'].isnull().sum()
n_missing_onset

256

In [None]:
print(f"Linelist as of {current_date}.\nLast case hospitalized on {last_hospital}.\n{n_missing_onset} cases are missing date of onset and not shown.")

Linelist as of 09 Jan 2023.
Last case hospitalized on 30 April 2015.
256 cases are missing date of onset and not shown.


## Pulling from a Dataframe 

In [None]:
zone        = ("Zone 1", "Zone 2", "Zone 3", "Zone 4", "Zone 5")
new_cases   = (3, 0, 7, 0, 15)
total_cases = (40, 4, 25, 10, 103)
df = pd.DataFrame(list(zip(zone, new_cases, total_cases)), columns=['zone', 'new_cases', 'total_cases'])
df

Unnamed: 0,zone,new_cases,total_cases
0,Zone 1,3,40
1,Zone 2,0,4
2,Zone 3,7,25
3,Zone 4,0,10
4,Zone 5,15,103


In [None]:
for index, row in df.iterrows():
    print(f"{row['zone']}: {row['new_cases']} ({row['total_cases']} total cases)")

Zone 1: 3 (40 total cases)
Zone 2: 0 (4 total cases)
Zone 3: 7 (25 total cases)
Zone 4: 0 (10 total cases)
Zone 5: 15 (103 total cases)


## Data frame to one line

In [None]:
for index, row in df.iterrows():
    print(f"{row['zone']}= {row['new_cases']}")

Zone 1= 3
Zone 2= 0
Zone 3= 7
Zone 4= 0
Zone 5= 15


In [None]:
results = [f"{row['zone']}= {row['new_cases']}" for _, row in df.iterrows()]
results

['Zone 1= 3', 'Zone 2= 0', 'Zone 3= 7', 'Zone 4= 0', 'Zone 5= 15']

## Unite Columns

In [None]:
case_ID = range(1,7)
symptoms = ("jaundice, fever, chills",  # patient 1
         "chills, aches, pains",        # patient 2 
         "fever",                       # patient 3
         "vomiting, diarrhoea",         # patient 4
         "bleeding from gums, fever",   # patient 5
         "rapid pulse, headache")       # patient 6
outcome = ("Recover", "Death", "Death", "Recover", "Recover", "Recover")

In [None]:
df = pd.DataFrame(list(zip(case_ID, symptoms, outcome)), columns=['case_ID', 'symtons', 'outcome'])
df

Unnamed: 0,case_ID,symtons,outcome
0,1,"jaundice, fever, chills",Recover
1,2,"chills, aches, pains",Death
2,3,fever,Death
3,4,"vomiting, diarrhoea",Recover
4,5,"bleeding from gums, fever",Recover
5,6,"rapid pulse, headache",Recover


In [None]:
df_split = pd.concat([df[['case_ID']], df['symtons'].str.split(', ', expand=True), df['outcome']], axis=1)
df_split = df_split.rename({0: 'sym1', 1: 'sym2', 2:'sym3'}, axis='columns')
df_split

Unnamed: 0,case_ID,sym1,sym2,sym3,outcome
0,1,jaundice,fever,chills,Recover
1,2,chills,aches,pains,Death
2,3,fever,,,Death
3,4,vomiting,diarrhoea,,Recover
4,5,bleeding from gums,fever,,Recover
5,6,rapid pulse,headache,,Recover


## Combine Columns Using the `+` Operator (does not work if there are empty cells)

In [None]:
df_split['all_symptons'] = df_split['sym1'] + ", " + df_split['sym2'] + ", " + df_split['sym3']
df_split

Unnamed: 0,case_ID,sym1,sym2,sym3,outcome,all_symptons
0,1,jaundice,fever,chills,Recover,"jaundice, fever, chills"
1,2,chills,aches,pains,Death,"chills, aches, pains"
2,3,fever,,,Death,
3,4,vomiting,diarrhoea,,Recover,
4,5,bleeding from gums,fever,,Recover,
5,6,rapid pulse,headache,,Recover,


### Using `.apply()` Method to Combine String Columns

Important: fill `None` with an empty string `""` first, otherwise `" ".join()` fails on the missing values.

In [None]:
df_all = df_split.copy(deep=True)
df_all_1 = df_all.fillna("")

In [None]:
# Join only the non-empty symptoms. After fillna(""), joining every cell would leave
# stray trailing/double spaces (e.g. "fever  " for a patient with a single symptom).
df_all_1['all_symptons'] = df_all_1[['sym1', 'sym2', 'sym3']].apply(
    lambda row: " ".join(s for s in row if s), axis=1
)
df_all_1

Unnamed: 0,case_ID,sym1,sym2,sym3,outcome,all_symptons
0,1,jaundice,fever,chills,Recover,jaundice fever chills
1,2,chills,aches,pains,Death,chills aches pains
2,3,fever,,,Death,fever
3,4,vomiting,diarrhoea,,Recover,vomiting diarrhoea
4,5,bleeding from gums,fever,,Recover,bleeding from gums fever
5,6,rapid pulse,headache,,Recover,rapid pulse headache


In [None]:
df_all_1[['case_ID', 'all_symptons', 'outcome']]

Unnamed: 0,case_ID,all_symptons,outcome
0,1,jaundice fever chills,Recover
1,2,chills aches pains,Death
2,3,fever,Death
3,4,vomiting diarrhoea,Recover
4,5,bleeding from gums fever,Recover
5,6,rapid pulse headache,Recover


## Split

In [None]:
string = "jaundice, fever, chills"
string.split(',')

['jaundice', ' fever', ' chills']

In [None]:
string.split(', ')

['jaundice', 'fever', 'chills']

In [None]:
pt1_symptoms = string.split(', ')
pt1_symptoms[1]

'fever'

In [None]:
symptoms =   ["jaundice, fever, chills",     # patient 1
              "chills, aches, pains",        # patient 2 
              "fever",                       # patient 3
              "vomiting, diarrhoea",         # patient 4
              "bleeding from gums, fever",   # patient 5
              "rapid pulse, headache"]       # patient 6

In [None]:
symptoms[0]   # Python indexing starts from zero

'jaundice, fever, chills'

In [None]:
symptoms[1]

'chills, aches, pains'

In [None]:
symptoms[2]

'fever'

In [None]:
symptoms[3]

'vomiting, diarrhoea'

In [None]:
symptoms[4]

'bleeding from gums, fever'

In [None]:
symptoms[5]

'rapid pulse, headache'

In [None]:
df

Unnamed: 0,case_ID,symtons,outcome
0,1,"jaundice, fever, chills",Recover
1,2,"chills, aches, pains",Death
2,3,fever,Death
3,4,"vomiting, diarrhoea",Recover
4,5,"bleeding from gums, fever",Recover
5,6,"rapid pulse, headache",Recover


Note: `None` for empty cell

In [None]:
symt_all = df['symtons'].str.split(', ', expand=True)

In [None]:
symt_all = symt_all.rename({0: 'sym1', 1: 'sym2', 2: 'sym3'}, axis='columns')
symt_all

Unnamed: 0,sym1,sym2,sym3
0,jaundice,fever,chills
1,chills,aches,pains
2,fever,,
3,vomiting,diarrhoea,
4,bleeding from gums,fever,
5,rapid pulse,headache,


In [None]:
symt_all = symt_all.fillna("")

In [None]:
# Merge the 2nd and 3rd symptom into one column, skipping empty cells so that
# patients with fewer symptoms get "" or "diarrhoea" rather than " " or "diarrhoea ".
symt_all['sym_2'] = symt_all[['sym2', 'sym3']].apply(
    lambda row: " ".join(s for s in row if s), axis=1
)
symt_all

Unnamed: 0,sym1,sym2,sym3,sym_2
0,jaundice,fever,chills,fever chills
1,chills,aches,pains,aches pains
2,fever,,,
3,vomiting,diarrhoea,,diarrhoea
4,bleeding from gums,fever,,fever
5,rapid pulse,headache,,headache


In [None]:
df_split1 = pd.concat([df[['case_ID']], symt_all['sym1'], symt_all['sym_2'], df['outcome']], axis=1)
df_split1

Unnamed: 0,case_ID,sym1,sym_2,outcome
0,1,jaundice,fever chills,Recover
1,2,chills,aches pains,Death
2,3,fever,,Death
3,4,vomiting,diarrhoea,Recover
4,5,bleeding from gums,fever,Recover
5,6,rapid pulse,headache,Recover


Warning: If you limit the number of expanded columns, your data may be truncated.

In [None]:
symt2 = symt_all[symt_all.columns[0:2]]
symt2

Unnamed: 0,sym1,sym2
0,jaundice,fever
1,chills,aches
2,fever,
3,vomiting,diarrhoea
4,bleeding from gums,fever
5,rapid pulse,headache


In [None]:
df_split2 = pd.concat([df[['case_ID']], symt2, df['outcome']], axis=1)
df_split2

Unnamed: 0,case_ID,sym1,sym2,outcome
0,1,jaundice,fever,Recover
1,2,chills,aches,Death
2,3,fever,,Death
3,4,vomiting,diarrhoea,Recover
4,5,bleeding from gums,fever,Recover
5,6,rapid pulse,headache,Recover


## Arrange Alphabetically

In [None]:
health_zones = ["Alba", "Takota", "Delta"]
health_zones.sort()
health_zones

['Alba', 'Delta', 'Takota']

In [None]:
n_beds = 10
n_masks = 20

In [None]:
# print() joins its arguments with a single space; no f-string is needed here
# because the literal contains no {placeholders}.
print("Regional hospital needs", n_beds, "beds and", n_masks, "masks.")

Regional hospital needs 10 beds and 20 masks.


In [None]:
print(f"Regional hospital needs {n_beds} beds and {n_masks} masks.")

Regional hospital needs 10 beds and 20 masks.


# Clean and Standardise

## Change Case

In [None]:
str1 = "California"
str1.upper()

'CALIFORNIA'

In [None]:
str1.lower()

'california'

In [None]:
str1.title()

'California'

## Title Case

In [None]:
str2 = "go to the US state of california "
str2.title()

'Go To The Us State Of California '

In [None]:
str3 = "the patient must be transported"
str3.capitalize()

'The patient must be transported'

## Pad Length

In [None]:
ICD_codes = ["R10.13", "R10.819", "R17"]
[i.ljust(7) for i in ICD_codes]

['R10.13 ', 'R10.819', 'R17    ']

In [None]:
[i.ljust(7, '.') for i in ICD_codes]

['R10.13.', 'R10.819', 'R17....']

In [None]:
str(4).rjust(2, '0')

'04'

In [None]:
[str(i).rjust(2, '0') for i in range(0, 4)]

['00', '01', '02', '03']

## Truncate

In [None]:
original = "Symptom onset on 4/3/2020 with vomiting"

In [None]:
original[:4] + "..." + original[-3:]

'Symp...ing'

In [None]:
import textwrap
# Shorten the start of `original` to at most 10 characters (placeholder included),
# then append its last three characters. Reuses `original` instead of repeating the literal.
textwrap.shorten(original, width=10, placeholder="...") + original[-3:]

'Symptom...ing'

## Standardize Length

Set as length of 6

In [None]:
[i.ljust(6, ".")[:6] for i in ICD_codes]

['R10.13', 'R10.81', 'R17...']

## Remove leading/trailing whitespace

In [None]:
IDs =   ["provA_1852  ", # two excess spaces
         "provA_2345",   # zero excess spaces
         "provA_9460 "]  # one excess space

In [None]:
[i.strip() for i in IDs]

['provA_1852', 'provA_2345', 'provA_9460']

## Remove Repeated Whitespace within

In [None]:
str_squish = "  Pt requires   IV saline\n"
str_squish

'  Pt requires   IV saline\n'

In [None]:
' '.join(str_squish.split())

'Pt requires IV saline'

## Wrap into Paragraphs

In [None]:
pt_course = "Symptom onset 1/4/2020 vomiting chills fever. Pt saw traditional healer in home village on 2/4/2020. On 5/4/2020 pt symptoms worsened and was admitted to Lumta clinic. Sample was taken and pt was transported to regional hospital on 6/4/2020. Pt died at regional hospital on 7/4/2020."
pt_course

'Symptom onset 1/4/2020 vomiting chills fever. Pt saw traditional healer in home village on 2/4/2020. On 5/4/2020 pt symptoms worsened and was admitted to Lumta clinic. Sample was taken and pt was transported to regional hospital on 6/4/2020. Pt died at regional hospital on 7/4/2020.'

In [None]:
textwrap.wrap(pt_course, 40)

['Symptom onset 1/4/2020 vomiting chills',
 'fever. Pt saw traditional healer in home',
 'village on 2/4/2020. On 5/4/2020 pt',
 'symptoms worsened and was admitted to',
 'Lumta clinic. Sample was taken and pt',
 'was transported to regional hospital on',
 '6/4/2020. Pt died at regional hospital',
 'on 7/4/2020.']

# Handle by Position

In [None]:
str_sub = "pneumonia"
str_sub[2]   # 3rd letter from left

'e'

In [None]:
str_sub[0]   # Index zero is available in Python

'p'

In [None]:
str_sub[-4:]  # 6th from left, to the 1st from right

'onia'

In [None]:
str_sub[4:-1]  # 5th from left, up to and including the 2nd from right (the stop index -1 is excluded)

'moni'

In [None]:
str_sub[3:]  # 4th from left to a position outside the string

'umonia'

## Extract by Word Position

In [None]:
chief_complaints =   ["I just got out of the hospital 2 days ago, but still can barely breathe.",
                      "My stomach hurts",
                      "Severe ear pain"]

In [None]:
[" ".join(x.split()[:3]) for x in chief_complaints]  # extract 1st to 3rd words of each string

['I just got', 'My stomach hurts', 'Severe ear pain']

## Replace by Character Position

In [None]:
word = "pneumonia"

In [None]:
word[2:4] 

'eu'

In [None]:
new_ele = "XX"
word[:2] + new_ele + word[4:]  # convert the third and fourth characters to X 

'pnXXmonia'

In [None]:
words = "pneumonia", "tubercolosis", "HIV"
[word[:2] + new_ele + word[4:] for word in words]

['pnXXmonia', 'tuXXrcolosis', 'HIXX']

# Patterns

## Detect a pattern

In [None]:
string = "primary school teacher"
pattern = 'teach'

Note: If a non-negative number is returned, it is the index at which the pattern starts in the string. If -1 is returned, the pattern was not found.

In [None]:
string.find(pattern) 

15

Note: `re.fullmatch` tries to match the pattern against the whole string; it returns a Match object, or `None` if the entire string does not match.

In [None]:
import re
re.fullmatch(pattern, string)

In [None]:
if not re.fullmatch(pattern, string):
    print("FALSE")

FALSE


In [None]:
occupations = ["field laborer",
                 "university professor",
                 "primary school teacher & tutor",
                 "tutor",
                 "nurse at regional hospital",
                 "lineworker at Amberdeen Fish Factory",
                 "physican",
                 "cardiologist",
                 "office worker",
                 "food service"]

In [None]:
[pattern in o for o in occupations]

[False, False, True, False, False, False, False, False, False, False]

Using in a for loop:

In [None]:
n = 0
for o in occupations:
    if pattern in o:
        n+=1
n

1

Or, using list comprehension

In [None]:
sum([1 if pattern in o else 0 for o in occupations])

1

## Detect Multiple Terms, separated by OR bars `|`

In [None]:
pattern = r"\b(teach|professor|tutor)\b"  # Use raw string and word boundary anchors

In [None]:
import re

def count_occurrences(strings, pattern):
    """Count how many of the strings contain at least one match of ``pattern``.

    Each string is counted at most once, however many times the pattern
    matches inside it. For example, "nurse at regional hospital" counts as one
    hit for the pattern ``nurse|hospital``, not two.

    Parameters
    ----------
    strings : iterable of str
        The texts to search.
    pattern : str
        Regular expression, applied case-insensitively.

    Returns
    -------
    int
        Number of strings in which the pattern is found (0 for empty input).
    """
    # re.search stops at the first hit, so a string with several matches is
    # still counted only once.
    return sum(1 for string in strings if re.search(pattern, string, re.IGNORECASE))

result = count_occurrences(strings=occupations, pattern=pattern)
print(result)  # Expected output: 3

3


In [None]:
occupation_med_frontline =    ["medical", "medicine", "hcw", "healthcare", "home care", "home health",
                                "surgeon", "doctor", "doc", "physician", "surgery", "peds", "pediatrician",
                               "intensivist", "cardiologist", "coroner", "nurse", "nursing", "rn", "lpn",
                               "cna", "pa", "physician assistant", "mental health",
                               "emergency department technician", "resp therapist", "respiratory",
                                "phlebotomist", "pharmacy", "pharmacist", "hospital", "snf", "rehabilitation",
                               "rehab", "activity", "elderly", "subacute", "sub acute",
                                "clinic", "post acute", "therapist", "extended care",
                                "dental", "dential", "dentist"]

In [None]:
med_pattern = str("|".join(occupation_med_frontline))
med_pattern

'medical|medicine|hcw|healthcare|home care|home health|surgeon|doctor|doc|physician|surgery|peds|pediatrician|intensivist|cardiologist|coroner|nurse|nursing|rn|lpn|cna|pa|physician assistant|mental health|emergency department technician|resp therapist|respiratory|phlebotomist|pharmacy|pharmacist|hospital|snf|rehabilitation|rehab|activity|elderly|subacute|sub acute|clinic|post acute|therapist|extended care|dental|dential|dentist'

In [None]:
med_pattern = r"\b(medical|medicine|hcw|healthcare|home care|home health|surgeon|doctor|doc|physician|surgery|peds|pediatrician|intensivist|cardiologist|coroner|nurse|nursing|rn|lpn|cna|pa|physician assistant|mental health|emergency department technician|resp therapist|respiratory|phlebotomist|pharmacy|pharmacist|hospital|snf|rehabilitation|rehab|activity|elderly|subacute|sub acute|clinic|post acute|therapist|extended care|dental|dential|dentist)\b"

In [None]:
result = count_occurrences(strings=occupations, pattern=med_pattern)
# Two strings match med_pattern: "nurse at regional hospital" and "cardiologist".
# Counting every individual regex match instead gives 3, because
# "nurse at regional hospital" contains both "nurse" and "hospital".
print(result)  # expected answer is 2

3


In [None]:
import re

def count_occurrences(strings, pattern):
    """Count how many of the strings contain at least one match of ``pattern``.

    Each string is counted at most once, however many times the pattern
    matches inside it. The pattern is applied case-insensitively.

    Parameters
    ----------
    strings : iterable of str
        The texts to search.
    pattern : str
        Regular expression to look for.

    Returns
    -------
    int
        Number of strings in which the pattern is found (0 for empty input).
    """
    # re.search stops at the first hit; counting every re.finditer match would
    # count "nurse at regional hospital" twice ("nurse" and "hospital").
    return sum(1 for string in strings if re.search(pattern, string, re.IGNORECASE))

occupations = ["field laborer",
                 "university professor",
                 "primary school teacher & tutor",
                 "tutor",
                 "nurse at regional hospital",
                 "lineworker at Amberdeen Fish Factory",
                 "physican",
                 "cardiologist",
                 "office worker",
                 "food service"]
med_pattern = r"\b(medical|medicine|hcw|healthcare|home care|home health|surgeon|doctor|doc|physician|surgery|peds|pediatrician|intensivist|cardiologist|coroner|nurse|nursing|rn|lpn|cna|pa|physician assistant|mental health|emergency department technician|resp therapist|respiratory|phlebotomist|pharmacy|pharmacist|hospital|snf|rehabilitation|rehab|activity|elderly|subacute|sub acute|clinic|post acute|therapist|extended care|dental|dential|dentist)\b"

result = count_occurrences(strings=occupations, pattern=med_pattern)
# Matching strings: "nurse at regional hospital" and "cardiologist".
# ("physican" is misspelled in the data, so it does not match "physician".)
print(result)  # Expected output: 2


3


## Convert Commas `,` to Period `.`

In [None]:
lengths = ["2.454,56", "1,2", "6.096,5"]

# The numbers use "." as thousands separator and "," as decimal mark.
# Drop the thousands separators first, then turn the decimal comma into a
# period so that float() can parse the text.
y = []
for text in lengths:
    without_thousands = text.replace(".", "")
    y.append(float(without_thousands.replace(",", ".")))
y

[2454.56, 1.2, 6096.5]

## Replace All

In [None]:
outcome =  ["Karl: dead",
            "Samantha: dead",
            "Marco: not dead"]

In [None]:
[o.replace('dead', 'deceased') for o in outcome]

['Karl: deceased', 'Samantha: deceased', 'Marco: not deceased']

## Detect with Logic

see above `Detect Multiple Terms, separated by OR bars |`

## Locate Pattern Position

In [None]:
string = 'I wish'
pattern = 'sh'
string.find(pattern)  # return start position (Note: Python indexing starts from zero)

4

To find end position

In [None]:
string.find(pattern) + len(pattern) - 1

5

In [None]:
phrases = ["I wish", "I hope", "he hopes", "He hopes"]
pattern = 'h'

In [None]:
# Position of the first occurrence of `pattern` in each phrase (-1 when absent).
start = [phrase.find(pattern) for phrase in phrases]
# The last character of the match sits len(pattern) - 1 positions after its start.
end = [first + len(pattern) - 1 for first in start]
print(start)
print(end)

[5, 2, 0, 3]
[5, 2, 0, 3]


In [None]:
list(zip(start, end))

[(5, 5), (2, 2), (0, 0), (3, 3)]

## Extract a Match

In [None]:
occupations

['field laborer',
 'university professor',
 'primary school teacher & tutor',
 'tutor',
 'nurse at regional hospital',
 'lineworker at Amberdeen Fish Factory',
 'physican',
 'cardiologist',
 'office worker',
 'food service']

In [None]:
for i, o in enumerate(occupations):
    if 'teach' in o or 'prof' in o or 'tutor' in o:
        print(i, o)
    else:
        print(i, 0)

0 0
1 university professor
2 primary school teacher & tutor
3 tutor
4 0
5 0
6 0
7 0
8 0
9 0


In [None]:
[o if 'teach' in o or 'prof' in o or 'tutor' in o else 'NA' for o in occupations]

['NA',
 'university professor',
 'primary school teacher & tutor',
 'tutor',
 'NA',
 'NA',
 'NA',
 'NA',
 'NA',
 'NA']

## Subset and Count 

In [None]:
[o for o in occupations if 'teach' in o or 'prof' in o or 'tutor' in o]  # ignore the else 'NA' to get a shorter answer

['university professor', 'primary school teacher & tutor', 'tutor']

In [None]:
[(i, o) for i, o in enumerate(occupations) if 'teach' in o or 'prof' in o or 'tutor' in o]

[(1, 'university professor'),
 (2, 'primary school teacher & tutor'),
 (3, 'tutor')]

In [None]:
# For every occupation, print how many of the three keywords it contains
# (case-insensitive substring check).
keywords = ('teach', 'prof', 'tutor')
for occupation in occupations:
    lowered = occupation.lower()
    print(sum(1 for keyword in keywords if keyword in lowered))

0
1
2
1
0
0
0
0
0
0


# Special Characters

The backslash \ is used to “escape” the meaning of the next character. For example, `\n` represents a new line.

quotes

In [None]:
"'"

"'"

In [None]:
'"'

'"'

# Regular Expression and Special Characters

In [None]:
test = "A-AA-AAA-AAAA"

Use `re.findall` to find all possible matches

## Quantifiers `{}`

In [None]:
result = re.findall(r'A{2}', test)
result

['AA', 'AA', 'AA', 'AA']

Alternatively, use `re.match` to find the first match. It only matches at the beginning of a string, so the text must be split into separate strings first.

In [None]:
test = test.replace('-', ' ')
test = test.split()
test

['A', 'AA', 'AAA', 'AAAA']

In [None]:
for t in test:
    result = re.match(r'A{2}', t)
    print(result)

None
<re.Match object; span=(0, 2), match='AA'>
<re.Match object; span=(0, 2), match='AA'>
<re.Match object; span=(0, 2), match='AA'>


When a quantifier of {2,4} is used, groups of consecutive A’s that are two to four in length are returned.

In [None]:
test = "A-AA-AAA-AAAA"
result = re.findall(r'A{2,4}', test)
result

['AA', 'AAA', 'AAAA']

With the quantifier `+`, groups of one or more are returned:

In [None]:
result = re.findall(r'A+', test)
result

['A', 'AA', 'AAA', 'AAAA']

## Relative Position

In [None]:
result = re.findall(r'', test)
result

['', '', '', '', '', '', '', '', '', '', '', '', '', '']

## RegEx Examples

In [None]:
pt_note = "Patient arrived at Broward Hospital emergency ward at 18:00 on 6/12/2005. Patient presented with radiating abdominal pain from LR quadrant. Patient skin was pale, cool, and clammy. Patient temperature was 99.8 degrees farinheit. Patient pulse rate was 100 bpm and thready. Respiratory rate was 29 per minute."
pt_note

'Patient arrived at Broward Hospital emergency ward at 18:00 on 6/12/2005. Patient presented with radiating abdominal pain from LR quadrant. Patient skin was pale, cool, and clammy. Patient temperature was 99.8 degrees farinheit. Patient pulse rate was 100 bpm and thready. Respiratory rate was 29 per minute.'

In [None]:
pattern = r"[A-Za-z]+"
result = re.findall(pattern, pt_note)
result

['Patient',
 'arrived',
 'at',
 'Broward',
 'Hospital',
 'emergency',
 'ward',
 'at',
 'on',
 'Patient',
 'presented',
 'with',
 'radiating',
 'abdominal',
 'pain',
 'from',
 'LR',
 'quadrant',
 'Patient',
 'skin',
 'was',
 'pale',
 'cool',
 'and',
 'clammy',
 'Patient',
 'temperature',
 'was',
 'degrees',
 'farinheit',
 'Patient',
 'pulse',
 'rate',
 'was',
 'bpm',
 'and',
 'thready',
 'Respiratory',
 'rate',
 'was',
 'per',
 'minute']

In [None]:
pattern = r"[0-9]{1,2}"
result = re.findall(pattern, pt_note)
result

['18', '00', '6', '12', '20', '05', '99', '8', '10', '0', '29']

Tips: Go to https://regexr.com/, where you can paste your text and pattern to visualise the output. Note: you can also apply different flags to achieve the results you want.

In [None]:
#| hide
nbdev_export()