In [2]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the English model from spaCy (medium model, which includes word vectors).
print("Loading spaCy model...")
try:
    nlp = spacy.load('en_core_web_md')
except OSError as exc:
    # spacy.load raises OSError when the model package is not installed.
    # Re-raise with the exact remedy so a fresh kernel fails with a clear message.
    raise OSError(
        "spaCy model 'en_core_web_md' is not installed. Run "
        "`python -m spacy download en_core_web_md`, restart the kernel, "
        "and re-run this cell."
    ) from exc

def perform_pos_tagging(text):
    """
    Tag every token of `text` with its part-of-speech label.

    The text is run through the module-level spaCy pipeline (`nlp`) and each
    token's surface form is paired with its `pos_` attribute.

    Returns a list of (token text, POS tag) tuples in document order.
    """
    pairs = []
    for tok in nlp(text):
        pairs.append((tok.text, tok.pos_))
    return pairs

def get_word_embeddings(text):
    """
    Collect the vector of every token in `text` that has one.

    Tokens for which `has_vector` is false are skipped. Keys are the exact
    token strings, so a token that occurs more than once keeps only the
    vector from its last occurrence, while differently-cased forms
    (e.g. "The" and "the") are stored under separate keys.

    Returns a dictionary mapping token text to its vector.
    """
    return {tok.text: tok.vector for tok in nlp(text) if tok.has_vector}

def find_similar_words(word, embeddings_dict, top_n=5):
    """
    Rank the other words in `embeddings_dict` by cosine similarity to `word`.

    Args:
        word: Key of `embeddings_dict` whose vector is the comparison target.
        embeddings_dict: Mapping of word -> vector. All vectors must have the
            same length so they can be compared in one batch.
        top_n: Maximum number of results to return. `None` returns every
            candidate; values <= 0 return an empty list.

    Returns:
        A list of (word, similarity score) tuples sorted by descending score,
        at most `top_n` long. The list is empty when `word` is not a key,
        when there is no other word to compare against, or when `top_n` <= 0.
    """
    if word not in embeddings_dict:
        return []
    # A negative top_n would otherwise slice off the *least* similar tail
    # (similar_words[:-k]), which is never what a caller asking for "top N" wants.
    if top_n is not None and top_n <= 0:
        return []

    other_words = [other for other in embeddings_dict if other != word]
    if not other_words:
        # Nothing to compare with; also avoids handing sklearn an empty matrix.
        return []

    other_vectors = [embeddings_dict[other] for other in other_words]

    # One batched call: the target is validated/normalised once instead of
    # once per candidate. Row 0 holds the target's similarity to each candidate.
    scores = cosine_similarity([embeddings_dict[word]], other_vectors)[0]

    # Sort by similarity score (descending); sorted() is stable, so ties keep
    # the dictionary's insertion order.
    ranked = sorted(zip(other_words, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

def analyze_text(text):
    """
    Print a POS-tagging and word-embedding report for `text`.

    The report lists every token with its POS tag, states how many words
    received an embedding, and then, for the first two embedded words,
    shows the first five vector components and the most similar other
    words found in the same text. Nothing is returned.
    """
    print("\n===== TEXT ANALYSIS =====")
    print(f"Input text: \"{text}\"")

    # Part-of-speech section: one "token: TAG" line per token
    print("\n=== PART-OF-SPEECH TAGGING ===")
    for token_text, tag in perform_pos_tagging(text):
        print(f"{token_text}: {tag}")

    # Embedding section
    print("\n=== WORD EMBEDDINGS ===")
    vectors_by_word = get_word_embeddings(text)
    print(f"Generated embeddings for {len(vectors_by_word)} words")

    # Showcase only the first two embedded words to keep the output short
    for shown_word in list(vectors_by_word)[:2]:
        print(f"\nEmbedding for '{shown_word}' (showing first 5 dimensions):")
        print(vectors_by_word[shown_word][:5])

        # Nearest neighbours are drawn from the words of this same text
        print(f"\nWords similar to '{shown_word}':")
        neighbours = find_similar_words(shown_word, vectors_by_word)
        for neighbour, score in neighbours:
            print(f"  {neighbour}: {score:.4f}")

# Example usage: analyze a few canned sentences, then accept user input.
if __name__ == "__main__":
    # Example sentences to analyze
    examples = [
        "The quick brown fox jumps over the lazy dog.",
        "Natural language processing helps computers understand human language.",
        "Machine learning models require large amounts of training data."
    ]

    print("NLP Analysis Tool: POS Tagging and Word Embeddings\n")
    print("This program analyzes text using spaCy to perform:")
    print("1. Part-of-Speech tagging")
    print("2. Word embedding generation")
    print("3. Similar word finding based on vector similarity\n")

    # Analyze each example
    for example in examples:
        analyze_text(example)
        print("\n" + "-"*50)

    # Interactive mode: keep analyzing until the user types 'exit'.
    # (The earlier check was inverted and only analyzed the literal text 'exit'.)
    print("\nEnter your own text to analyze (or type 'exit' to quit):")
    while True:
        user_text = input("> ").strip()
        if user_text.lower() == 'exit':
            break
        if not user_text:
            # Blank line: nothing to analyze, prompt again.
            continue
        analyze_text(user_text)

Loading spaCy model...
NLP Analysis Tool: POS Tagging and Word Embeddings

This program analyzes text using spaCy to perform:
1. Part-of-Speech tagging
2. Word embedding generation
3. Similar word finding based on vector similarity


===== TEXT ANALYSIS =====
Input text: "The quick brown fox jumps over the lazy dog."

=== PART-OF-SPEECH TAGGING ===
The: DET
quick: ADJ
brown: ADJ
fox: NOUN
jumps: VERB
over: ADP
the: DET
lazy: ADJ
dog: NOUN
.: PUNCT

=== WORD EMBEDDINGS ===
Generated embeddings for 10 words

Embedding for 'The' (showing first 5 dimensions):
[-0.65276  0.23873 -0.23325  0.18608  0.37674]

Words similar to 'The':
  the: 1.0000
  .: 0.3697
  quick: 0.3498
  over: 0.3326
  lazy: 0.1891

Embedding for 'quick' (showing first 5 dimensions):
[-0.60053   0.18838  -0.40993   0.3225    0.070322]

Words similar to 'quick':
  .: 0.5222
  lazy: 0.4098
  The: 0.3498
  the: 0.3498
  over: 0.3182

--------------------------------------------------

===== TEXT ANALYSIS =====
Input text: "

In [5]:
import nltk
# Download two NLTK resources. Nothing else in the visible notebook calls
# NLTK (POS tagging is done with spaCy), so these look unused here —
# TODO confirm before removing.
nltk.download('punkt_tab')
# NOTE(review): which tagger resource name NLTK expects (with or without an
# "_eng" suffix) may depend on the installed NLTK version — confirm.
# As the cell's last expression, this call's return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')  # resource name without the "_eng" suffix

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
# Install the en_core_web_md package that spacy.load('en_core_web_md') needs.
# NOTE(review): on a fresh kernel this has to run (and the kernel be restarted)
# before the cell that loads the model; consider moving this cell to the top.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
'''✅ What the Code Does (Summary)
This Python script is an NLP analysis tool built with spaCy. It performs:

Part-of-Speech (POS) tagging

Word embedding generation

Cosine similarity-based similar word detection

📚 Dataset Used
No external dataset is used.

It analyzes manually input or hardcoded text samples.

📦 Libraries Used
spaCy: For NLP operations (POS tagging and word embeddings)

numpy: Imported, but never called directly in this code; spaCy already returns each token vector as a numpy array

sklearn.metrics.pairwise.cosine_similarity: For finding similar words via vector comparison

(Optional/unused): the nltk.download() calls sit in a separate cell after the main code; they are not needed here because POS tagging is handled by spaCy, not NLTK.

🧠 Possible Viva Questions and Answers
🔹 Section 1: POS Tagging
❓ Q1. What is POS tagging?
A:
Part-of-Speech tagging assigns each word in a sentence its grammatical role, such as noun, verb, adjective, etc.

❓ Q2. How is POS tagging performed in this code?
A:
Using spaCy’s nlp() function, which processes text into a Doc object. We then extract the POS tags using token.pos_.

🔹 Section 2: Word Embeddings
❓ Q3. What are word embeddings?
A:
They are dense vector representations of words in a high-dimensional space. Words with similar meanings are close together in this space.

❓ Q4. What model does the code use for embeddings?
A:
It loads en_core_web_md, a medium-sized spaCy model that includes pretrained word vectors for English.

❓ Q5. How are embeddings retrieved in the code?
A:
For each token in the text, token.vector gives the embedding if available (token.has_vector ensures this).

🔹 Section 3: Cosine Similarity
❓ Q6. Why is cosine similarity used?
A:
To measure similarity between word vectors. A higher cosine value (close to 1) means the words are more semantically similar.

❓ Q7. How is cosine similarity calculated?
A:
Using cosine_similarity() from sklearn. It compares the vector of the target word with other vectors in the dictionary.

🔹 Section 4: Design and Execution
❓ Q8. What is the structure of the program?
A:

The main function analyze_text() performs all tasks on input text.

It is called for both predefined examples and interactive user input.

❓ Q9. What's wrong with the NLTK lines in the separate download cell?
A:
They are unnecessary. This script uses spaCy for tagging, not NLTK, so the downloads for punkt_tab and averaged_perceptron_tagger are never used by the code.

❓ Q10. What are some limitations of this code?
A:

It only compares words within the input sentence, not with a larger vocabulary.

It doesn’t use context-aware embeddings like BERT.

It doesn’t preprocess text (e.g., stopword removal, lemmatization).'''