In [1]:
#Q1
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import random
import os
import requests

# Step 1: Loading text dataset (e.g., The Little Prince)
url = 'https://www.gutenberg.org/files/147/147-0.txt'
response = requests.get(url)
text = response.text.lower()

# Step 2: Creating character vocabulary
chars = sorted(set(text))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}
vocab_size = len(chars)

# Step 3: Creating training sequences
seq_length = 100
step = 1
sentences = []
next_chars = []

for i in range(0, len(text) - seq_length, step):
    sentences.append(text[i: i + seq_length])
    next_chars.append(text[i + seq_length])

# One-hot encoding input and output
x = np.zeros((len(sentences), seq_length, vocab_size), dtype=np.bool_)
y = np.zeros((len(sentences), vocab_size), dtype=np.bool_)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1

# Step 4: Defining the LSTM model
model = Sequential([
    LSTM(128, input_shape=(seq_length, vocab_size)),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# Step 5: Training the model
model.fit(x, y, batch_size=128, epochs=5)

# Step 6: Texting generation function
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_text(seed, length=300, temperature=1.0):
    generated = seed
    sentence = seed[-seq_length:]
    for i in range(length):
        x_pred = np.zeros((1, seq_length, vocab_size))
        for t, char in enumerate(sentence):
            if char in char_to_idx:
                x_pred[0, t, char_to_idx[char]] = 1
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = idx_to_char[next_index]

        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

# Step 7: Generate and print sample text
seed_text = "once upon a time in a far away land"
print("Generated Text:\n")
print(generate_text(seed_text.lower()))


  super().__init__(**kwargs)


Epoch 1/5
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 287ms/step - loss: 2.8761
Epoch 2/5
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 286ms/step - loss: 2.2139
Epoch 3/5
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 287ms/step - loss: 2.0702
Epoch 4/5
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 287ms/step - loss: 1.9866
Epoch 5/5
[1m1006/1006[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m289s[0m 288ms/step - loss: 1.9216
Generated Text:

bue wltlesni'rsitttavry,selyide,e  jssttu s,,m,tupoiueiioitliewatuaoiisinanru&!soe'iiantroeantmhbsm,eiiuetitrlvonue


In [None]:
# Temperature controls randomness in text generation:

Low temperature (e.g., 0.2): Generates more predictable and repetitive text.

High temperature (e.g., 1.0 or more): Produces more diverse and creative text but may be less coherent.

In [3]:
#Q2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

def preprocess_nlp(sentence):
    # Step 1: Tokenization
    tokens = word_tokenize(sentence)
    print("Original Tokens:", tokens)

    # Step 2: Remove Stopwords
    stop_words = set(stopwords.words('english'))
    tokens_no_stop = [word for word in tokens if word.lower() not in stop_words]
    print("Tokens Without Stopwords:", tokens_no_stop)

    # Step 3: Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens_no_stop]
    print("Stemmed Words:", stemmed_tokens)

# Run the function with the given sentence
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
preprocess_nlp(sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri', '.']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.']


In [None]:
# Difference between Stemming and Lemmatization (with example: “running”)
Stemming and lemmatization are both used to reduce words to their root form in NLP.
Stemming simply chops off word endings, so it may produce non-real words.
For example, “running” becomes “run” using a stemmer, but sometimes it might become “runn” too.
Lemmatization is smarter—it uses grammar rules and a dictionary, so it always returns a real word. For “running”, a lemmatizer also gives “run”,
but in a more accurate and meaningful way. So, stemming is faster but less accurate, while lemmatization is slower but more correct.

In [4]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

word = "running"
print("Stemming:", stemmer.stem(word))            # Output: run
print("Lemmatization:", lemmatizer.lemmatize(word, pos='v'))  # Output: run


[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemming: run
Lemmatization: run


In [None]:
#Removing stop words is useful in NLP tasks like search engines or topic modeling because words like “the”, “is”, “in”, etc.,
appear very often but don’t add much meaning. Removing them helps focus on important words and reduces noise in the data.

However, it can be harmful in tasks like sentiment analysis, question answering, or machine translation,
where stop words may carry meaning or affect sentence structure. For example, the sentence “I do not like it” loses its
negative meaning if “not” is removed. So, it’s important to decide based on the task whether to keep or remove stop words.

In [5]:
#Q3
import spacy

# Load SpaCy's English model
nlp = spacy.load("en_core_web_sm")

# Input sentence
text = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."

# Apply NLP pipeline
doc = nlp(text)

# Print named entities with their details
for ent in doc.ents:
    print(f"Entity: {ent.text}")
    print(f"Label: {ent.label_}")
    print(f"Start Char: {ent.start_char}, End Char: {ent.end_char}")
    print("---")


Entity: Barack Obama
Label: PERSON
Start Char: 0, End Char: 12
---
Entity: 44th
Label: ORDINAL
Start Char: 27, End Char: 31
---
Entity: the United States
Label: GPE
Start Char: 45, End Char: 62
---
Entity: the Nobel Peace Prize
Label: WORK_OF_ART
Start Char: 71, End Char: 92
---
Entity: 2009
Label: DATE
Start Char: 96, End Char: 100
---


In [None]:
#
NER (Named Entity Recognition) and POS (Part-of-Speech) tagging are both important tasks in NLP,
but they serve different purposes. POS tagging identifies the grammatical role of each word in a sentence, such as noun, verb, adjective, etc.,
helping to understand sentence structure. In contrast, NER focuses on spotting and classifying specific real-world entities in text,
such as names of people, places, organizations, dates, or monetary values. For example, in the sentence "Barack Obama visited New York in 2010,"
POS tagging tells us "Barack" is a proper noun and "visited" is a verb, while NER identifies "Barack Obama" as a person, "New York" as a location,
and "2010" as a date.

Two real-world applications of NER:

Financial News Monitoring: NER is used to extract company names, stock tickers, and financial figures from articles, helping investors and
analysts track relevant business events and market movements automatically.

Search Engines: NER helps search engines understand user queries better by recognizing named entities like movie titles, product names, or locations,
thereby improving search relevance and results accuracy.











In [6]:
#Q4
import numpy as np
import math
import tensorflow as tf

def scaled_dot_product_attention(Q, K, V):
    # Step 1: Dot product of Q and K^T
    scores = np.dot(Q, K.T)

    # Step 2: Scale the scores by sqrt(d)
    d = K.shape[-1]  # Key dimension
    scaled_scores = scores / math.sqrt(d)

    # Step 3: Apply softmax to get attention weights
    attention_weights = tf.nn.softmax(scaled_scores, axis=-1).numpy()

    # Step 4: Multiply attention weights with V
    output = np.dot(attention_weights, V)

    return attention_weights, output

# Test input
Q = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1]])

K = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1]])

V = np.array([[1, 2, 3, 4],
              [5, 6, 7, 8]])

# Run the function
attn_weights, output = scaled_dot_product_attention(Q, K, V)

# Print results
print("Attention Weights:\n", attn_weights)
print("Final Output:\n", output)


Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]
Final Output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


In [None]:
#
Why do we divide the attention score by √d?
In scaled dot-product attention, we divide the attention score by the square root of the key dimension (√d)
to avoid very large values. When the dimension d is large, the dot product between queries and keys can produce
large numbers. This can push the softmax function into regions where it becomes very flat, making the model harder to train.
Dividing by √d keeps the values in a manageable range and improves training stability.

How does self-attention help the model understand relationships between words in a sentence?
Self-attention helps the model figure out which words in a sentence are related to each other,
no matter where they appear. For example, in the sentence "The cat that chased the mouse was hungry,"
self-attention allows the model to learn that "cat" and "was hungry" are connected, even though they are far apart.
It works by assigning different attention scores to each word pair, letting the model focus more on the
important words when understanding the meaning of each part of the sentence.

In [8]:
#Q5
# Q5: Sentiment Analysis using HuggingFace Transformers

from transformers import pipeline

# Load the sentiment analysis pipeline
classifier = pipeline("sentiment-analysis")

# Input sentence
sentence = "Despite the high price, the performance of the new MacBook is outstanding."

# Perform sentiment analysis
result = classifier(sentence)[0]

# Print the sentiment and confidence score
print(f"Sentiment: {result['label']}")
print(f"Confidence Score: {result['score']:.4f}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998


In [None]:
#
Main Architectural Difference Between BERT and GPT:
BERT (Bidirectional Encoder Representations from Transformers) uses only the encoder part
of the Transformer architecture. It reads the entire text bidirectionally,
meaning it looks at both the left and right context of a word at the same time.
This helps it understand the meaning of words more deeply.

GPT (Generative Pre-trained Transformer) uses only the decoder part of the Transformer.
It is a unidirectional model, meaning it reads the text from left to right and predicts the
next word one by one, which makes it good for text generation.

Why Using Pre-Trained Models Is Beneficial:
Using pre-trained models like BERT or GPT saves time, computation, and data.
These models are already trained on massive amounts of text data,
so they understand language well. When we use them, we just fine-tune on our specific task
 (like sentiment analysis or question answering), which requires much less data and training.
 It gives high accuracy and performance, especially when we don’t have huge datasets or resources to train from scratch.









