In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd


In [27]:
#1 Implementing an RNN for Text Generation
import tensorflow as tf
import numpy as np
import os

# === Step 1: Load and preprocess data ===
path_to_file = tf.keras.utils.get_file("shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")

text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
print(f"Length of text: {len(text)} characters")

# Character-level tokenization
vocab = sorted(set(text))
char2idx = {char: i for i, char in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])

# Create sequences
seq_length = 100
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

# Batch and shuffle
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# === Step 2: Build the model ===
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    return tf.keras.Sequential([
        tf.keras.layers.Input(batch_shape=(batch_size, None)),  # ✅ Keras 3 compatible
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform'
        ),
        tf.keras.layers.Dense(vocab_size)
    ])

model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

# === Step 3: Compile and train ===
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

# === Step 4: Checkpointing (Keras 3 fix: use .weights.h5) ===
checkpoint_dir = './training_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt.weights.h5")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

# === Step 5: Train ===
EPOCHS = 10
model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Length of text: 98939 characters
Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 8s/step - loss: 3.7119
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 8s/step - loss: 3.0415
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 7s/step - loss: 2.6128
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 7s/step - loss: 2.3443
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 8s/step - loss: 2.1866
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 7s/step - loss: 2.0656
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 7s/step - loss: 1.9950
Epoch 8/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 8s/step - loss: 1.9227
Epoch 9/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 7s/step - loss: 1.8687
Epoch 10/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x7853a0b02750>

In [29]:
# === Step 6: Load model for generation (batch_size = 1) ===
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(checkpoint_prefix)  # ✅ Use full .weights.h5 path
model.build(tf.TensorShape([1, None]))

# === Step 7: Text generation function ===
def generate_text(model, start_string, temperature=1.0, num_generate=1000):
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    # 🔄 Reset only the stateful RNN layers
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


# === Step 8: Try generating text ===
print("\nGenerated text with temperature = 1.0:\n")
print(generate_text(model, start_string="ROMEO:", temperature=1.0))

print("\nGenerated text with temperature = 0.5:\n")
print(generate_text(model, start_string="ROMEO:", temperature=0.5))



Generated text with temperature = 1.0:

ROMEO:

The verir solm, ast glanth his and meys
So  le leive the dost nome ture aruvenst fall farne steem fil ste till for of for thin the deev.

Cokt des, be bome’s faot wolds perais:
Sak ipie drencoun oftingece:
But if eyes dear’d be the ottreis ger?
And ane haring thou sweet re bynenery wall lovelaty be,
The pance to llove my comE,
Thy in suplancs mif to me:
    Buk no ly dayy-or ferp-decutes,
Than sheey and your lives with fae hast,
Whin bes paze of baking shaye hin toum’dt exved thy trom sould.

CV

  And my ptorn my write endrencore exfion amon ey-be’cece;
Thenet,
Or bot thee and arawht my hmapk?
And fornich exsuraines to of stiel,
T  love y wiTh wall eyed dleave lokss the commalion.
    WiYing be cam see,
Theme or f it, you weethoun wind love steeps grisw
Thin sull e sown the thon mower unceafse rave?
Alven is sillfouss?
What im delf tise youl of and d,
   Huther cance’d swell rive bustong swrings;
Wheve stroul’s doth withy sweether the m

In [None]:
"""Temperature Scaling Explanation
Temperature scaling controls randomness in text generation:
- High temperature (>1.0) introduces more randomness, making outputs creative but potentially incoherent.
- Low temperature (<1.0) leads to deterministic predictions, producing repetitive and predictable text.
- Moderate temperature (≈0.7) strikes a balance, allowing diverse yet meaningful sequences."""


In [None]:
#2.NLP Preprocessing Pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
# Download the specific resource suggested by the error message
nltk.download('punkt_tab')

def nlp_preprocessing(sentence):
    # Step 1: Tokenize the sentence
    tokens = word_tokenize(sentence)

    # Step 2: Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens_without_stopwords = [word for word in tokens if word.lower() not in stop_words]

    # Step 3: Apply stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokens_without_stopwords]

    # Print outputs
    print("Original Tokens:", tokens)
    print("Tokens Without Stopwords:", tokens_without_stopwords)
    print("Stemmed Words:", stemmed_words)

# Example sentence
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
nlp_preprocessing(sentence)

Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri', '.']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
"""### **1. Difference Between Stemming and Lemmatization**
Both **stemming** and **lemmatization** reduce words to their root forms, but they do so differently:
- **Stemming** is a **rule-based approach** that simply chops off word suffixes. It doesn’t always produce valid words.
- **Lemmatization** uses **linguistic analysis** to find a word's **dictionary base form (lemma)**, ensuring proper meaning.

#### **Example with "Running"**
- **Stemming (Porter Stemmer)** → `"running"` → `"run"`
- **Lemmatization (WordNet Lemmatizer)** → `"running"` → `"run"` (verb) / `"running"` (noun, unchanged)

Lemmatization is **more accurate** because it considers the word's context, whereas stemming may create non-existent words like `"play" → "playe"`.

### **2. Stop Word Removal: When It Helps vs. When It Harms**
Advantages:
- **Search engines** – Removing stopwords like `"the"`, `"is"` can improve indexing efficiency.
- **Sentiment analysis** – Focuses on meaningful words like `"excellent"`, `"terrible"` rather than `"the"`, `"and"`.
- **Topic modeling** – Helps extract **key themes** from text by eliminating common filler words.

Disadvantages
- **Legal or medical text processing** – Words like `"not"` or `"no"` are critical for **meaning**, e.g., `"Patient has no allergies"` vs. `"Patient has allergies"`.
- **Machine translation** – Stopwords often impact **grammatical correctness**, making sentence structure incomplete.
- **Question answering systems** – `"What is the capital of France?"` loses meaning if `"What"` is removed."""




In [None]:
#3.Named Entity Recognition with SpaCy
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_named_entities(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Print detected entities
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}, Start: {ent.start_char}, End: {ent.end_char}")

# Example sentence
sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."
extract_named_entities(sentence)

Entity: Barack Obama, Label: PERSON, Start: 0, End: 12
Entity: 44th, Label: ORDINAL, Start: 27, End: 31
Entity: the United States, Label: GPE, Start: 45, End: 62
Entity: the Nobel Peace Prize, Label: WORK_OF_ART, Start: 71, End: 92
Entity: 2009, Label: DATE, Start: 96, End: 100


In [None]:
"""### **1. Difference Between NER and POS Tagging**
Named Entity Recognition (**NER**) and Part-of-Speech (**POS**) tagging are both NLP techniques, but they serve different purposes:
- **NER** identifies and classifies named entities such as **persons, locations, organizations, dates, and monetary values** within text. Example: In *"Microsoft was founded in 1975 by Bill Gates,"* NER would detect **"Microsoft" (ORG), "1975" (DATE), "Bill Gates" (PERSON).**
- **POS Tagging** assigns grammatical roles to words (e.g., **noun, verb, adjective, adverb**). Example: *"The quick brown fox jumps over the lazy dog."* POS tagging would label **"fox" (noun), "jumps" (verb), "quick" (adjective).**

Essentially, **NER focuses on meaning and identifying named entities, while POS tagging focuses on grammatical function.**

### **2. Real-World Applications of NER**
 **Financial News & Sentiment Analysis:**
Banks and financial analysts use **NER to detect company names, stock symbols, and dates** in news articles. For example, if an article states *"Tesla’s stock rose by 5% on Tuesday,"* NER can extract **"Tesla" (ORG), "5%" (PERCENT), "Tuesday" (DATE)** to predict market trends.

**Search Engines & AI Chatbots:**
Search engines use **NER to improve query understanding and deliver relevant results.** If a user searches *"Restaurants in New York open past midnight,"* NER helps categorize **"New York" (GPE - Geopolitical Entity), "Restaurants" (ORG), "midnight" (TIME)** so the search algorithm can refine location-based results."""


In [None]:
#4.Scaled Dot-Product Attention
import numpy as np

def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Stability trick for softmax
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]  # Key dimension
    attention_scores = np.dot(Q, K.T) / np.sqrt(d_k)  # Dot product and scaling

    attention_weights = softmax(attention_scores)  # Softmax to get attention weights

    output = np.dot(attention_weights, V)  # Weighted sum with Value matrix

    return attention_weights, output

# Test inputs
Q = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
K = np.array([[1, 0, 1, 0], [0, 1, 0, 1]])
V = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

# Run scaled dot-product attention
attention_weights, output_matrix = scaled_dot_product_attention(Q, K, V)

# Print results
print("Attention Weights:\n", attention_weights)
print("Final Output:\n", output_matrix)

Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]
Final Output:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


In [None]:
""" **1. Why Divide the Attention Score by √d?**
In **scaled dot-product attention**, the attention score is computed as the dot product of **Query (Q)** and **Key (K)**. Since dot products tend to produce large values when working with high-dimensional vectors, dividing by **√d** (where `d` is the key dimension) helps to:
- **Stabilize gradients** – Without scaling, large dot products can cause the softmax function to produce very small probabilities, leading to vanishing gradients.
- **Prevent extreme softmax outputs** – Without scaling, softmax might assign overly dominant attention weights, reducing the model's ability to focus on multiple relevant tokens.
This ensures the model maintains balanced attention across different words.

**2. How Does Self-Attention Help Understand Word Relationships?**
Self-attention enables models to **capture contextual dependencies** between words within a sentence by:
- **Assigning attention scores to all words** – Instead of looking at words sequentially, self-attention weighs relationships between all tokens in the input, considering long-range dependencies.
- **Handling polysemy (multiple meanings)** – For example, in "The bank on the river" vs. "She went to the bank," self-attention helps determine whether "bank" refers to a financial institution or a riverbank based on surrounding words.
- **Enhancing word interactions** – In complex sentences, self-attention allows words to form deeper connections, making it possible for transformers like BERT and GPT to understand meaning effectively."""



In [None]:
#Q5: Sentiment Analysis using HuggingFace Transformers
from transformers import pipeline

# Load the pre-trained sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis")

# Input sentence
sentence = "Despite the high price, the performance of the new MacBook is outstanding."

# Analyze sentiment
result = sentiment_pipeline(sentence)[0]

# Print results
print(f"Sentiment: {result['label']}")
print(f"Confidence Score: {result['score']:.4f}")




No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentiment: POSITIVE
Confidence Score: 0.9998


In [None]:
""" **1. Main Architectural Difference Between BERT and GPT**
The key architectural difference lies in their transformer design:
- **BERT (Bidirectional Encoder Representations from Transformers)** uses an **encoder-only** architecture. It **processes input in both directions** (left to right and right to left), allowing for deep contextual understanding.
- **GPT (Generative Pre-trained Transformer)** is **decoder-based**, meaning it **processes input sequentially (left to right)** and is optimized for **generative tasks**, like text completion and conversational AI."""

""" **2. Why Use Pre-Trained Models Like BERT or GPT?**
Using pre-trained models instead of training from scratch has several advantages:
- **Reduces Computational Cost** – Training large models from scratch requires immense resources, whereas pre-trained models come optimized.
- **Leverages Transfer Learning** – Since BERT and GPT are pre-trained on massive corpora, they provide **rich language understanding** without needing large datasets for fine-tuning.
- **Faster Development** – Pre-trained models allow for quick adaptation to tasks like sentiment analysis, named entity recognition, and translation with minimal labeled data.
- **Better Accuracy** – They learn complex language patterns, improving accuracy even in nuanced tasks like sarcasm detection or intent recognition."""
