<a href="https://colab.research.google.com/github/sohv/NLP-Lab/blob/main/Lab_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# POS Tagging

## Implement POS tagging on given text

In [None]:
import spacy
from spacy import displacy

def analyze_text(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    print("POS Tagging:")
    for token in doc:
        print(f"{token.text}: {token.pos_} ({token.dep_})")
    print("\n" + "-"*50 + "\n")

    print("Noun Phrases:")
    for chunk in doc.noun_chunks:
        print(chunk.text)
    print("\n" + "-"*50 + "\n")

    print("Named Entities:")
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")
    print("\n" + "-"*50 + "\n")

    displacy.serve(doc, style="dep")

if __name__ == "__main__":
    text_input = "Apple was founded by Steve Jobs in California in 1976."
    analyze_text(text_input)

POS Tagging:
Apple: PROPN (nsubjpass)
was: AUX (auxpass)
founded: VERB (ROOT)
by: ADP (agent)
Steve: PROPN (compound)
Jobs: PROPN (pobj)
in: ADP (prep)
California: PROPN (pobj)
in: ADP (prep)
1976: NUM (pobj)
.: PUNCT (punct)

--------------------------------------------------

Noun Phrases:
Apple
Steve Jobs
California

--------------------------------------------------

Named Entities:
Apple (ORG)
Steve Jobs (PERSON)
California (GPE)
1976 (DATE)

--------------------------------------------------






Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Improve Noun phrase extraction by chunking

In [None]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import spacy
import nltk
from nltk.chunk import RegexpParser
from nltk import pos_tag, word_tokenize

def extract_noun_phrases(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    spacy_noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    grammar = r"NP: {<DT>?<JJ>*<NN.*>+}"
    chunk_parser = RegexpParser(grammar)
    tagged_words = pos_tag(word_tokenize(text))
    chunk_tree = chunk_parser.parse(tagged_words)
    regex_noun_phrases = [" ".join(leaf[0] for leaf in subtree.leaves())
                          for subtree in chunk_tree.subtrees() if subtree.label() == 'NP']

    combined_noun_phrases = list(set(spacy_noun_phrases + regex_noun_phrases))
    return combined_noun_phrases

if __name__ == "__main__":
    sample_text = "The quick brown fox jumps over the lazy dog near the river bank."
    noun_phrases = extract_noun_phrases(sample_text)
    print("Extracted Noun Phrases:", noun_phrases)

Extracted Noun Phrases: ['The quick brown fox', 'the river bank', 'the lazy dog']


## Multilingual support using Spacy

In [1]:
%pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install -U spacy pydantic

Collecting spacy
  Downloading spacy-3.8.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting pydantic
  Using cached pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting pydantic-core==2.33.1 (from pydantic)
  Using cached pydantic_core-2.33.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting typing-extensions>=4.12.2 (from pydantic)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting typing-inspection>=0.4.0 (from pydantic)
  Using cached typing_inspection-0.4.0-py3-none-any.whl.metadata (2.6 kB)
Collecting blis<1.4.0,>=1.3.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Downloading blis-1.3.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.4 kB)
Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.2.4-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading spacy-3.8.5-cp312-cp312-macosx_11_0_arm64

In [7]:
import spacy  
from langdetect import detect  

lang_models = {  
    "en": "en_core_web_sm", "es": "es_core_news_sm",  
    "fr": "fr_core_news_sm", "de": "de_core_news_sm"  
}  

def analyze(text):  
    try:
        lang = detect(text)  
        nlp = spacy.load(lang_models.get(lang, "en_core_web_sm"))  
        doc = nlp(text)  
        return {  
            "POS": [(w.text, w.pos_) for w in doc],  
            "Nouns": [w.text for w in doc if w.pos_ in {"NOUN", "PROPN"}],  
            "Entities": [(e.text, e.label_) for e in doc.ents]  
        }  
    except Exception as e:
        return {"error": str(e)}

print(analyze("Elon Musk fundó SpaceX en 2002."))


TypeError: ForwardRef._evaluate() missing 1 required keyword-only argument: 'recursive_guard'

In [8]:
import spacy
from spacy import displacy

def load_multilingual_model(lang):
    models = {
        "en": "en_core_web_sm",
        "fr": "fr_core_news_sm",
        "de": "de_core_news_sm",
        "es": "es_core_news_sm"
    }
    return spacy.load(models.get(lang, "en_core_web_sm"))

def analyze_text(text, lang="en"):
    nlp = load_multilingual_model(lang)
    doc = nlp(text)

    # pos tagging
    print("POS Tagging:")
    for token in doc:
        print(f"{token.text}: {token.pos_} ({token.dep_})")
    print("\n" + "-"*50 + "\n")

    # named entity recognition
    print("Named Entities:")
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")
    print("\n" + "-"*50 + "\n")

if __name__ == "__main__":
    text_input = "Apple a été fondée par Steve Jobs en Californie en 1976."
    analyze_text(text_input, lang="fr")

TypeError: ForwardRef._evaluate() missing 1 required keyword-only argument: 'recursive_guard'