# NLP - spaCy (Pre-Trained Models)


> pip install setuptools wheel


> pip install spacy

> python -m spacy download en_core_web_sm

In [21]:
import spacy
from spacy import displacy
import pandas as  pd

# Load the pre-trained model

In [2]:
# Load the en_core_web_sm pipeline once (it must already be downloaded, see the
# install notes at the top); every later cell reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

# Create a spaCy Doc from Text

In [3]:
# Calling the pipeline on a string returns a spaCy Doc (see the printed type).
doc = nlp("You are a great person")

print(doc, type(doc))

You are a great person <class 'spacy.tokens.doc.Doc'>


# Tokenization

In [8]:
# Index into the Doc to pull out its five tokens one position at a time.
for position in range(5):
    print(doc[position])

You
are
a
great
person


In [11]:
# A Doc is iterable: emit the text of each token on its own line.
print(*(token.text for token in doc), sep="\n")

You
are
a
great
person


In [12]:
# Gather each token's text into a plain Python list and show it.
tokens = [tok.text for tok in doc]
tokens

['You', 'are', 'a', 'great', 'person']

# Lemmatization

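Lemmatization reduces each word to its dictionary root form (its *lemma*), e.g. "has" and "have" both become "have".

This reduces the number of unique words the rest of the analysis has to deal with.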

In [13]:
doc = nlp("She has a nice hair")
doc2 = nlp("They have a nice hair")

# Print the lemmas of both sentences; "has" and "have" end up as the same lemma.
for sentence_doc in (doc, doc2):
    lemma = [tok.lemma_ for tok in sentence_doc]
    print(lemma)

['she', 'have', 'a', 'nice', 'hair']
['they', 'have', 'a', 'nice', 'hair']


# Part of Speech Tagging (POS)

In [14]:
# Part-of-speech tag of every token in `doc` (the sentence parsed in the
# lemmatization cell). Stored under its own name so it is not confused with
# the lemma list built earlier.
pos_tags = [token.pos_ for token in doc]
print(pos_tags)

['PRON', 'VERB', 'DET', 'ADJ', 'NOUN']


# Named Entity Recognition (NER)

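NER locates spans of text (single words or whole phrases) that name real-world things and classifies each one into a category such as MONEY, DATE or PERSON.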

In [15]:
doc = nlp("Net income was $9.4 million compared to the prior year of $2.7 million.")

# doc.ents holds the recognised entities; each one can span several tokens
# and carries a category label.
for ent in doc.ents:
    print(ent.text, ent.label_)

$9.4 million MONEY
the prior year DATE
$2.7 million MONEY


In [17]:
# Highlight the recognised entities inline in the notebook output.
displacy.render(doc, style="ent", jupyter=True)

# Pipeline

<img src="./pipeline.svg">

In [19]:
# Example with several sentences

texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

# Only the entities are needed here, so every other component is switched off
# for this call. The names must match the pipeline's component names exactly;
# check `nlp.pipe_names` if unsure.
unused_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]

# nlp.pipe processes the texts as a batch and yields one Doc per text.
for doc in nlp.pipe(texts, disable=unused_components):
    print([(ent.text, ent.label_) for ent in doc.ents])
    displacy.render(doc, style="ent", jupyter=True)

[('$9.4 million', 'MONEY'), ('the prior year', 'DATE'), ('$2.7 million', 'MONEY')]


[('twelve billion dollars', 'MONEY'), ('1b', 'MONEY')]


# Example Project (Medical / Healthcare)

In [20]:
# Hard-coded example notes keyed by patient label. Each value is a
# triple-quoted string, so its leading newline and line indentation are part
# of the text that is later passed to spaCy.
clinical_notes = {
    "Patient 1": """
        The patient is a 45-year-old male with a history of hypertension and diabetes. 
        He is currently prescribed metformin and losartan. Last visit: 2023-04-15. 
        The patient reports occasional headaches.
    """,
    "Patient 2": """
        A 60-year-old female presents with chest pain and shortness of breath. 
        She has a history of coronary artery disease and is on aspirin and atorvastatin. 
        Last visit: 2023-03-22. Family history of stroke.
    """,
    "Patient 3": """
        The patient is a 30-year-old female with no significant medical history. 
        She complains of frequent migraines and is taking ibuprofen as needed. 
        Last visit: 2023-05-10.
    """,
    "Patient 4": """
        A 70-year-old male with a history of congestive heart failure. 
        Currently on furosemide and lisinopril. He also has chronic kidney disease. 
        Last visit: 2023-04-30. Recent symptoms include fatigue and dizziness.
    """
}

# Extract medical Info using Spacy NER


In [22]:

def extract_medical_info(notes, model=None):
    """Run NER over each clinical note and tabulate the entities found.

    Args:
        notes: Mapping of patient label -> free-text clinical note.
        model: Optional callable that turns a note into an object exposing
            ``.ents``, where every entity has ``.text`` and ``.label_``.
            Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        pandas.DataFrame with one row per patient and the columns
        'Patient', 'Clinical Notes' (the note, unmodified) and 'Entities'
        (a list of ``(text, label)`` tuples). The three columns are present
        even when ``notes`` is empty.
    """
    columns = ['Patient', 'Clinical Notes', 'Entities']
    # Fall back to the global pipeline only when no model is supplied, so the
    # function can also be used (and tested) with any other pipeline.
    pipeline = nlp if model is None else model

    patient_data = []
    for patient, note in notes.items():
        doc = pipeline(note)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        patient_data.append({
            'Patient': patient,
            'Clinical Notes': note,
            'Entities': entities
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(patient_data, columns=columns)

In [23]:
# Build the entity table for all sample notes.
medical_data_df = extract_medical_info(clinical_notes)

# Print a title line followed by the plain-text rendering of the table.
print("Medical Information Extraction", medical_data_df, sep="\n")

Medical Information Extraction
     Patient                                     Clinical Notes  \
0  Patient 1  \n        The patient is a 45-year-old male wi...   
1  Patient 2  \n        A 60-year-old female presents with c...   
2  Patient 3  \n        The patient is a 30-year-old female ...   
3  Patient 4  \n        A 70-year-old male with a history of...   

                                        Entities  
0  [(45-year-old, DATE), (2023-04-15, CARDINAL)]  
1  [(60-year-old, DATE), (2023-03-22, CARDINAL)]  
2  [(30-year-old, DATE), (2023-05-10, CARDINAL)]  
3  [(70-year-old, DATE), (2023-04-30, CARDINAL)]  


In [24]:
# Visualise the entities found in the first patient's note.
doc1 = nlp(clinical_notes["Patient 1"])
displacy.render(doc1, style="ent", jupyter=True)

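# Example with Matchers

The general-purpose NER model above only picks up ages and dates in the clinical notes; it does not recognise medication names or dosages. spaCy's rule-based `Matcher` lets us add token patterns for those.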

In [25]:
from spacy.matcher import Matcher

In [26]:
# Free-text note used for the Matcher demo; it mentions two medications
# (Metformin, Lisinopril) and two dosages (500mg, 1000mg).
text = """
Patient John Doe, a 45-year-old male, was diagnosed with Type 2 diabetes last year.
He is currently prescribed Metformin 500mg once daily. Recently, he complained of headaches
and fatigue. The doctor recommended increasing the dosage of Metformin to 1000mg and 
starting on a new medication called Lisinopril for blood pressure control.
"""


In [27]:
# Parse the note; this `doc` is what both the NER and the Matcher cells below read.
doc = nlp(text)


In [28]:

# Token patterns for the Matcher: every dict in a pattern describes one token.

# A medication is a single token whose lowercase form is a known drug name.
pattern_medication = [
    {"LOWER": {"IN": ["metformin", "lisinopril"]}},
]

# A dosage is a number-like token immediately followed by a unit token.
pattern_dosage = [
    {"LIKE_NUM": True},
    {"LOWER": {"IN": ["mg", "milligrams"]}},
]


In [29]:
# The matcher shares the pipeline's vocabulary so rule names can be looked up
# again when the matches are printed.
matcher = Matcher(nlp.vocab)

# Register each pattern under the label it should be reported with.
for rule_name, rule_pattern in (
    ("MEDICATION", pattern_medication),
    ("DOSAGE", pattern_dosage),
):
    matcher.add(rule_name, [rule_pattern])

In [30]:
# Apply the matcher to the document.
# The result is iterated below as (match_id, start, end) triples, where
# start/end are token positions used to slice `doc`.
matches = matcher(doc)

In [31]:
# Entities produced by spaCy's built-in NER component
print("Named Entities (NER):")
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

# Spans produced by the custom rules (medications, dosages); the rule label
# is recovered from the vocabulary's string store via the match id.
print("\nCustom Pattern Matches (Medications and Dosages):")
for rule_id, first, stop in matches:
    print(f"{nlp.vocab.strings[rule_id]}: {doc[first:stop].text}")

Named Entities (NER):
John Doe (PERSON)
45-year-old (DATE)
2 (CARDINAL)
last year (DATE)
Metformin 500 (PERSON)
daily (DATE)
Metformin (PERSON)
Lisinopril (PERSON)

Custom Pattern Matches (Medications and Dosages):
MEDICATION: Metformin
DOSAGE: 500mg
MEDICATION: Metformin
DOSAGE: 1000mg
MEDICATION: Lisinopril
