<a href="https://colab.research.google.com/github/satwikakallem/Home_Assignment/blob/main/Home_Assignment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def preprocess_nlp(sentence):
    """
    Performs NLP preprocessing on a sentence.

    Args:
        sentence (str): The input sentence to preprocess.

    Returns:
        None: Prints the original tokens, tokens without stopwords, and stemmed words.
    """
    # 1. Tokenization
    tokens = word_tokenize(sentence)
    print("Original Tokens:", tokens)

    # 2. Stopword Removal
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    print("Tokens Without Stopwords:", filtered_tokens)

    # 3. Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
    print("Stemmed Words:", stemmed_words)

if __name__ == "__main__":
    # Download necessary NLTK data (if not already downloaded)
    try:
        # Check if stopwords are already downloaded
        set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
    try:
        # Check if punkt (tokenizer model) is downloaded
        word_tokenize("hello")
    except LookupError:
        nltk.download('punkt')
    try:
        # Check if punkt_tab is downloaded.  This was the source of the error.
        nltk.word_tokenize("hello")
    except LookupError:
        nltk.download('punkt_tab')

    sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
    preprocess_nlp(sentence)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['nlp', 'techniques', 'used', 'virtual', 'assistants', 'like', 'alexa', 'siri', '.']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.']


In [4]:
import spacy

def extract_named_entities(sentence):
    """
    Extracts named entities from a sentence using spaCy.

    Args:
        sentence (str): The input sentence.

    Returns:
        None: Prints the entity text, label, and character positions for each entity.
    """
    # Load the spaCy English model
    nlp = spacy.load("en_core_web_sm")  # You might need to download this model

    # Process the sentence with the spaCy model
    doc = nlp(sentence)

    # Iterate over the entities and print their details
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}, Start: {ent.start_char}, End: {ent.end_char}")

if __name__ == "__main__":
    # Download the spaCy model (if not already downloaded)
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        import subprocess
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])

    sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."
    extract_named_entities(sentence)


Entity: Barack Obama, Label: PERSON, Start: 0, End: 12
Entity: 44th, Label: ORDINAL, Start: 27, End: 31
Entity: the United States, Label: GPE, Start: 45, End: 62
Entity: the Nobel Peace Prize, Label: WORK_OF_ART, Start: 71, End: 92
Entity: 2009, Label: DATE, Start: 96, End: 100


In [9]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    # Step 1: Dot product of Q and K^T
    matmul_qk = np.dot(Q, K.T)

    # Step 2: Scale by sqrt(d)
    d = K.shape[1]
    scaled_attention_logits = matmul_qk / np.sqrt(d)

    # Step 3: Apply softmax to get attention weights
    attention_weights = softmax(scaled_attention_logits)

    # Step 4: Multiply attention weights with V
    output = np.dot(attention_weights, V)

    return attention_weights, output

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # stability improvement
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# Test inputs
Q = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
K = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
V = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# Run attention
attention_weights, output = scaled_dot_product_attention(Q, K, V)

# Print results
print("Attention Weights:\n", attention_weights)
print("Output:\n", output)


Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]
Output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


In [10]:
from transformers import pipeline

def analyze_sentiment(sentence):
    """
    Analyzes the sentiment of a sentence using Hugging Face transformers.

    Args:
        sentence (str): The input sentence to analyze.

    Returns:
        None: Prints the sentiment label and confidence score.
    """
    # Load the sentiment analysis pipeline
    sentiment_pipeline = pipeline("sentiment-analysis")

    # Analyze the sentiment of the sentence
    result = sentiment_pipeline(sentence)[0]

    # Print the results
    print(f"Sentiment: {result['label']}")
    print(f"Confidence Score: {result['score']:.4f}")

if __name__ == "__main__":
    sentence = "Despite the high price, the performance of the new MacBook is outstanding."
    analyze_sentiment(sentence)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998
