<a href="https://colab.research.google.com/github/sohv/NLP-Lab/blob/main/Lab_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# POS Tagging

## Implement POS tagging on a given text

In [None]:
import spacy
from spacy import displacy

def analyze_text(text):
    """Print a linguistic breakdown of ``text`` and show its dependency parse.

    The text is processed with the ``en_core_web_sm`` spaCy pipeline and
    three sections are printed to stdout, each followed by a dashed
    separator: one line per token with its coarse POS tag and dependency
    label, the noun chunks, and the named entities with their labels.
    Finally the dependency parse is rendered inline with displaCy.

    Args:
        text: The English text to analyse.

    Returns:
        None. All results are printed or displayed.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    separator = "\n" + "-"*50 + "\n"

    print("POS Tagging:")
    for token in doc:
        print(f"{token.text}: {token.pos_} ({token.dep_})")
    print(separator)

    print("Noun Phrases:")
    for chunk in doc.noun_chunks:
        print(chunk.text)
    print(separator)

    print("Named Entities:")
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")
    print(separator)

    # displacy.serve() starts a web server and blocks the cell until it is
    # interrupted, which is not useful inside a notebook. render() with
    # jupyter=True displays the visualisation directly in the cell output.
    displacy.render(doc, style="dep", jupyter=True)

# Demo driver: run the analysis on one sample sentence. The guard keeps the
# demo from running if this code is ever imported as a module instead of
# being executed directly.
if __name__ == "__main__":
    text_input = "Apple was founded by Steve Jobs in California in 1976."
    analyze_text(text_input)

POS Tagging:
Apple: PROPN (nsubjpass)
was: AUX (auxpass)
founded: VERB (ROOT)
by: ADP (agent)
Steve: PROPN (compound)
Jobs: PROPN (pobj)
in: ADP (prep)
California: PROPN (pobj)
in: ADP (prep)
1976: NUM (pobj)
.: PUNCT (punct)

--------------------------------------------------

Noun Phrases:
Apple
Steve Jobs
California

--------------------------------------------------

Named Entities:
Apple (ORG)
Steve Jobs (PERSON)
California (GPE)
1976 (DATE)

--------------------------------------------------






Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Improve noun phrase extraction with chunking

In [None]:
import nltk
# Fetch the NLTK data packages for the tokenizer and POS tagger that the
# chunking cell below relies on (`word_tokenize` and `pos_tag`).
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
import spacy
import nltk
from nltk.chunk import RegexpParser
from nltk import pos_tag, word_tokenize

def extract_noun_phrases(text):
    """Extract noun phrases from ``text`` with spaCy and an NLTK regex chunker.

    Two extractors are run over the same text and their results merged:

    * spaCy's ``noun_chunks`` from the ``en_core_web_sm`` pipeline;
    * an NLTK ``RegexpParser`` whose ``NP`` rule matches an optional
      determiner, any number of adjectives, and one or more noun tags.

    Args:
        text: The English text to analyse.

    Returns:
        list[str]: The unique phrases, spaCy's chunks first and then any
        additional regex chunks, each group in order of appearance.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    spacy_noun_phrases = [chunk.text for chunk in doc.noun_chunks]

    grammar = r"NP: {<DT>?<JJ>*<NN.*>+}"
    chunk_parser = RegexpParser(grammar)
    tagged_words = pos_tag(word_tokenize(text))
    chunk_tree = chunk_parser.parse(tagged_words)
    regex_noun_phrases = [
        " ".join(leaf[0] for leaf in subtree.leaves())
        for subtree in chunk_tree.subtrees()
        if subtree.label() == 'NP'
    ]

    # De-duplicate with dict.fromkeys rather than set(): a set of strings is
    # iterated in hash order, which varies between interpreter runs, so the
    # result was not reproducible. dict.fromkeys keeps first-seen order.
    combined_noun_phrases = list(
        dict.fromkeys(spacy_noun_phrases + regex_noun_phrases)
    )
    return combined_noun_phrases

# Demo driver: extract and print the noun phrases of one sample sentence.
# The guard keeps the demo from running if this code is imported as a module.
if __name__ == "__main__":
    sample_text = "The quick brown fox jumps over the lazy dog near the river bank."
    noun_phrases = extract_noun_phrases(sample_text)
    print("Extracted Noun Phrases:", noun_phrases)

Extracted Noun Phrases: ['The quick brown fox', 'the river bank', 'the lazy dog']


## Multilingual support using spaCy

In [None]:
# Download the French spaCy pipeline used by the multilingual example below.
# This cell fetches only `fr_core_news_sm`; the German and Spanish pipelines
# named in `load_multilingual_model` are not downloaded here.
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
from spacy import displacy

def load_multilingual_model(lang):
    """Load the spaCy pipeline registered for a language code.

    Args:
        lang: Language code. ``"en"``, ``"fr"``, ``"de"`` and ``"es"`` are
            mapped explicitly; any other value selects the English pipeline.

    Returns:
        Whatever ``spacy.load`` returns for the selected package name.
    """
    default_package = "en_core_web_sm"
    package_by_lang = {
        "en": default_package,
        "fr": "fr_core_news_sm",
        "de": "de_core_news_sm",
        "es": "es_core_news_sm",
    }

    # Unrecognised codes fall back to the English pipeline.
    if lang in package_by_lang:
        package = package_by_lang[lang]
    else:
        package = default_package
    return spacy.load(package)

def analyze_text(text, lang="en"):
    """Print POS tags and named entities for ``text`` in the given language.

    The pipeline is obtained from ``load_multilingual_model(lang)``. Two
    sections are printed to stdout, each followed by a dashed separator:
    one line per token with its coarse POS tag and dependency label, then
    one line per named entity with its label.

    Args:
        text: The text to analyse.
        lang: Language code passed to ``load_multilingual_model``.

    Returns:
        None. All results are printed.
    """
    divider = "\n" + "-"*50 + "\n"
    parsed = load_multilingual_model(lang)(text)

    # Each section is a heading plus a lazily formatted stream of lines.
    sections = (
        ("POS Tagging:",
         (f"{token.text}: {token.pos_} ({token.dep_})" for token in parsed)),
        ("Named Entities:",
         (f"{ent.text} ({ent.label_})" for ent in parsed.ents)),
    )

    for heading, lines in sections:
        print(heading)
        for line in lines:
            print(line)
        print(divider)

# Demo driver: analyse a French sentence with the French pipeline.
# The guard keeps the demo from running if this code is imported as a module.
if __name__ == "__main__":
    text_input = "Apple a été fondée par Steve Jobs en Californie en 1976."
    analyze_text(text_input, lang="fr")


POS Tagging:
Apple: NOUN (nsubj:pass)
a: AUX (aux:tense)
été: AUX (aux:pass)
fondée: VERB (ROOT)
par: ADP (case)
Steve: NOUN (obl:agent)
Jobs: PROPN (flat:name)
en: ADP (case)
Californie: NOUN (obl:mod)
en: ADP (case)
1976: NUM (nmod)
.: PUNCT (punct)

--------------------------------------------------

Named Entities:
Apple (ORG)
Steve Jobs (PER)
Californie (LOC)

--------------------------------------------------

