<a href="https://colab.research.google.com/github/smrutipunto/DNN/blob/main/Practical_7_10_DNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Practical 7 - Perform machine translation using custom sentences to translate from English to Hindi
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Attention, Concatenate
from tensorflow.keras.models import Model
import numpy as np


# Tiny parallel corpus: English source sentences and their Hindi references.
source_texts = ["hello world", "how are you", "good morning"]
raw_target_texts = ["नमस्ते दुनिया", "आप कैसे हैं", "सुप्रभात"]

# Decoder input is the reference prefixed with <sos>; the decoder target is
# the same reference suffixed with <eos>.
target_texts = ["<sos> " + text for text in raw_target_texts]
target_texts_output = [text + " <eos>" for text in raw_target_texts]

# Word ids are assigned in first-seen order starting at 1, so id 0 is never
# given to a word. dict.fromkeys() de-duplicates while keeping that order.
source_words = [word for sentence in source_texts for word in sentence.split(' ')]
source_vocab = {
    word: index for index, word in enumerate(dict.fromkeys(source_words), start=1)
}

# The two special tokens come first so they receive ids 1 and 2.
target_words = ["<sos>", "<eos>"] + [
    word for sentence in raw_target_texts for word in sentence.split(' ')
]
target_vocab = {
    word: index for index, word in enumerate(dict.fromkeys(target_words), start=1)
}

# id -> word lookup used when decoding predictions.
reverse_target_vocab = dict(zip(target_vocab.values(), target_vocab.keys()))

# One extra slot accounts for the unassigned id 0.
source_vocab_size = 1 + len(source_vocab)
target_vocab_size = 1 + len(target_vocab)


def text_to_sequence(texts, vocab):
    """Convert sentences into lists of integer word ids.

    Args:
        texts: Iterable of whitespace-separated sentences.
        vocab: Mapping from word to integer id.

    Returns:
        One list of ids per sentence, in input order. Words missing from
        ``vocab`` are encoded as 0.
    """
    def encode(sentence):
        return [vocab.get(token, 0) for token in sentence.split()]

    return [encode(sentence) for sentence in texts]


# --- Data preparation -------------------------------------------------------
# Encode every sentence list as integer id sequences.
encoder_input_data = text_to_sequence(source_texts, source_vocab)
decoder_input_data = text_to_sequence(target_texts, target_vocab)
decoder_target_data = text_to_sequence(target_texts_output, target_vocab)

# Longest sequence on each side fixes the padded length.
max_encoder_seq_length = max(map(len, encoder_input_data))
max_decoder_seq_length = max(map(len, decoder_input_data))

# Right-pad all sequences to a common length.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
encoder_input_data = pad_sequences(
    encoder_input_data, maxlen=max_encoder_seq_length, padding='post'
)
decoder_input_data = pad_sequences(
    decoder_input_data, maxlen=max_decoder_seq_length, padding='post'
)
decoder_target_data = pad_sequences(
    decoder_target_data, maxlen=max_decoder_seq_length, padding='post'
)

# --- Model ------------------------------------------------------------------
embedding_dim = 64
units = 128

# Encoder: embedding -> LSTM that returns the full sequence and final states.
encoder_inputs = Input(shape=(max_encoder_seq_length,), name="encoder_inputs")
encoder_embedding_layer = Embedding(
    source_vocab_size, embedding_dim, name="encoder_embedding"
)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(
    units, return_sequences=True, return_state=True, name="encoder_lstm"
)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embedding -> LSTM started from the encoder's final states.
decoder_inputs = Input(shape=(max_decoder_seq_length,), name="decoder_inputs")
decoder_embedding_layer = Embedding(
    target_vocab_size, embedding_dim, name="decoder_embedding"
)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(
    units, return_sequences=True, return_state=True, name="decoder_lstm"
)
decoder_lstm_outputs, _, _ = decoder_lstm(
    decoder_embedding, initial_state=encoder_states
)

# Attention with decoder outputs as the query and encoder outputs as the
# value; the result is concatenated with the decoder outputs.
attention_layer = Attention(name="attention_layer")
attention_output = attention_layer([decoder_lstm_outputs, encoder_outputs])
concatenate_layer = Concatenate(axis=-1, name="concatenate_layer")
decoder_combined_context = concatenate_layer(
    [decoder_lstm_outputs, attention_output]
)

# Per-position softmax over the target vocabulary.
output_layer = Dense(target_vocab_size, activation='softmax', name="output_layer")
decoder_outputs = output_layer(decoder_combined_context)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# --- Training ---------------------------------------------------------------
print("\n--- Starting Training ---")
model.fit(
    x=[encoder_input_data, decoder_input_data],
    # Targets get a trailing axis of size 1.
    y=np.expand_dims(decoder_target_data, axis=-1),
    epochs=150,
    batch_size=2,
    verbose=2,
)
print("--- Training Complete ---\n")


def translate(input_sentence):
    """Greedily decode a translation for one source sentence.

    The sentence is encoded with ``source_vocab`` (unknown words become 0)
    and padded to ``max_encoder_seq_length``. Decoding starts from ``<sos>``
    and, at each step, re-runs the full model on the decoder input built so
    far, takes the arg-max token at the current position and feeds it back
    as the next decoder input.

    Args:
        input_sentence: Whitespace-separated source sentence.

    Returns:
        The predicted words joined by single spaces. Decoding stops at
        ``<eos>`` or after ``max_decoder_seq_length - 1`` words, whichever
        comes first.
    """
    input_seq = text_to_sequence([input_sentence], source_vocab)
    input_seq = tf.keras.preprocessing.sequence.pad_sequences(
        input_seq, maxlen=max_encoder_seq_length, padding='post'
    )

    # Decoder input starts as all zeros with <sos> in the first slot.
    decoder_input_seq = np.zeros((1, max_decoder_seq_length))
    decoder_input_seq[0, 0] = target_vocab['<sos>']

    translated_sentence = []

    # Slot 0 holds <sos>, so at most max_decoder_seq_length - 1 words can be
    # fed back. Bounding the loop here avoids a final predict() call whose
    # result would be thrown away.
    for step in range(max_decoder_seq_length - 1):
        output_tokens = model.predict([input_seq, decoder_input_seq], verbose=0)

        # Most likely token at the position currently being generated.
        sampled_token_index = int(np.argmax(output_tokens[0, step, :]))
        sampled_word = reverse_target_vocab.get(sampled_token_index, '<unk>')

        if sampled_word == '<eos>':
            break

        translated_sentence.append(sampled_word)
        # Feed the chosen token back as the next decoder input.
        decoder_input_seq[0, step + 1] = sampled_token_index

    return " ".join(translated_sentence)


# Sanity check: translate each training sentence and show the prediction.
for sentence in source_texts:
    translation = translate(sentence)
    print(
        f'Input: "{sentence}"',
        f'Predicted Translation: "{translation}"\n',
        sep="\n",
    )


--- Starting Training ---
Epoch 1/150


KeyboardInterrupt: 

In [None]:
# Practical 8 - Create a RAG model which fine-tunes an LLM with any external knowledge source and create an app which can answer questions from that knowledge source

# Install required libraries
!pip install -q sentence-transformers transformers


from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import codecs


# Load dataset: one knowledge chunk per non-blank line of the file.
# Built-in open() is used instead of codecs.open(); errors='ignore' drops
# undecodable bytes rather than raising. Blank lines are skipped so that
# empty chunks are never embedded or retrieved.
with open('cat-facts.txt', 'r', encoding='utf-8', errors='ignore') as fdata:
    dataset = [line for line in fdata if line.strip()]
print(f'Loaded {len(dataset)} entries')


# Embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # lightweight and fast


# Language model (text generation)
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint; confirm it is actually required for this tokenizer.
chat_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True)
chat_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map='auto', torch_dtype=torch.float16)
chat_pipeline = pipeline("text-generation", model=chat_model, tokenizer=chat_tokenizer, max_new_tokens=200)


# Vector DB setup: an in-memory list of (chunk, embedding) pairs.
VECTOR_DB = []


def add_chunk_to_database(chunk):
    """Embed ``chunk`` and append the ``(chunk, embedding)`` pair to ``VECTOR_DB``."""
    entry = (chunk, embedding_model.encode(chunk))
    VECTOR_DB.append(entry)


# Index every chunk, reporting progress with a 1-based counter.
total_chunks = len(dataset)
for position, chunk in enumerate(dataset, start=1):
    add_chunk_to_database(chunk)
    print(f'Added chunk {position}/{total_chunks} to the database')


# Cosine similarity function
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Args:
        a: First vector (list, NumPy array or torch tensor).
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``.
    """
    # as_tensor() reuses existing tensors/arrays instead of copying them, and
    # avoids the copy-construct warning torch.tensor() emits for tensor inputs.
    a = torch.as_tensor(a)
    b = torch.as_tensor(b)
    # Integer or boolean inputs are promoted so that plain int lists work;
    # floating-point inputs keep their dtype.
    if not a.is_floating_point():
        a = a.float()
    if not b.is_floating_point():
        b = b.float()
    return torch.nn.functional.cosine_similarity(a, b, dim=0).item()


# Retrieval
def retrieve(query, top_n=3):
    """Return the ``top_n`` stored chunks most similar to ``query``.

    Args:
        query: Text to embed and compare against every entry in ``VECTOR_DB``.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(chunk, similarity)`` tuples sorted by descending
        cosine similarity.
    """
    query_embedding = embedding_model.encode(query)
    scored_chunks = [
        (chunk, cosine_similarity(query_embedding, embedding))
        for chunk, embedding in VECTOR_DB
    ]
    ranked = sorted(scored_chunks, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


# Single question/answer round: ask, retrieve supporting chunks, then generate.
input_query = input("Ask me a question: ")
retrieved_knowledge = retrieve(input_query)

print('\nRetrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
    print(f' - (similarity: {similarity:.2f}) {chunk.strip()}')

# Build instruction prompt: each retrieved chunk becomes one " - " bullet.
context_block = ''.join(' - ' + chunk for chunk, _ in retrieved_knowledge)
instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question.
Don't make up any new information:\n
{context_block}\n
Question: {input_query}
Answer:'''

# Generate response. The generated text starts with the prompt itself, so
# the prompt prefix is sliced off before printing.
print("\nChatbot response:")
generations = chat_pipeline(instruction_prompt)
response = generations[0]['generated_text']
answer = response[len(instruction_prompt):]
print(answer.strip())


In [None]:
# Practical 9 - Perform tokenization and next sentence prediction with BERT
!pip install numpy==1.26.4
import numpy as np
import pandas as pd
import os


from transformers import BertTokenizer

# Cased tokenizer, used only for the tokenization demo below.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

text = """Artificial Intelligence is transforming industries at a rapid pace.
From healthcare to finance, AI systems are streamlining operations, improving outcomes,
and opening up new possibilities.
The core of modern AI is machine learning, where algorithms learn patterns from data to make predictions or decisions.
With the rise of big data, cloud computing, and powerful hardware, the capabilities of AI continue to expand."""

# Tokenization demo: plain space split vs. BERT token ids and tokens.
word_text = text.split(" ")
encoding = tokenizer.encode(text)
print("Token IDs:", encoding)
tokens = tokenizer.convert_ids_to_tokens(encoding)
print(tokens[:11])

# (first sentence, second sentence, label) triples.
# NOTE(review): label 1 appears to mark related pairs and 0 unrelated ones.
sentences = [
    ("Machine learning is a subset of AI.", "AI encompasses many fields, including machine learning.", 1),
    ("Python is a popular programming language.", "Pizza is a delicious Italian dish.", 0),
    ("Cloud computing enables remote data access.", "Data can be accessed from anywhere via the cloud.", 1),
    ("Quantum computing is still experimental.", "Dogs love playing fetch.", 0),
    ("Cybersecurity is essential for data protection.", "Hackers exploit vulnerabilities in software.", 1),
    ("Tech companies are investing in AI.", "Water boils at 100 degrees Celsius.", 0)
]

from datasets import Dataset

# Transpose the triples into three parallel columns.
first_sentences, second_sentences, pair_labels = (
    list(column) for column in zip(*sentences)
)
dataset = Dataset.from_dict({
    "sentence1": first_sentences,
    "sentence2": second_sentences,
    "label": pair_labels,
})

# Hold out 20% of the pairs for evaluation.
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# From here on the uncased tokenizer is used.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``sentence1``/``sentence2`` columns as sentence pairs.

    Sequences are truncated and padded to the tokenizer's maximum length.
    """
    first, second = examples['sentence1'], examples['sentence2']
    return tokenizer(first, second, truncation=True, padding="max_length")


# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose the model inputs and label as torch tensors on both splits.
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])


from transformers import BertForSequenceClassification
# Binary classifier head on top of the pretrained uncased BERT encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="/working/results",
    # Fine-tuning pretrained BERT needs a small step size; the previous value
    # of 0.01 is orders of magnitude above the usual 2e-5..5e-5 range and
    # would wreck the pretrained weights.
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/working/logs",
    logging_steps=10,
    save_strategy="epoch",
)


def compute_metrics(pred):
    """Compute classification accuracy for a Trainer evaluation.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, classes on the last axis).

    Returns:
        ``{"accuracy": fraction of examples whose arg-max class equals the label}``.
    """
    labels = np.asarray(pred.label_ids)
    preds = np.asarray(pred.predictions).argmax(-1)
    # Mean of the element-wise match is the accuracy; this avoids importing
    # sklearn on every evaluation call for a one-line computation.
    return {"accuracy": float(np.mean(preds == labels))}


# Wire the model, data splits, arguments and metric function into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune, then evaluate on the held-out split.
trainer.train()
eval_results = trainer.evaluate()
print(f"\nEvaluation Results: {eval_results}")


In [None]:
# Practical 10 - Generate a sine wave pattern with a GAN
!pip install torch


import torch
from torch import nn
import math
import matplotlib.pyplot as plt
import numpy as np


# Real data: 1024 points (x, sin(x)) with x drawn uniformly from [0, 2*pi).
train_data_length = 1024
angles = 2 * math.pi * torch.rand(train_data_length)
train_data = torch.stack((angles, torch.sin(angles)), dim=1)
# Placeholder labels; the training loop ignores them.
train_labels = torch.zeros(train_data_length)
train_set = list(zip(train_data, train_labels))

print("Visualizing the real data (a sine wave):")
plt.plot(train_data[:, 0], train_data[:, 1], ".")
plt.title("Real Data")
plt.show()

# Shuffled mini-batches of real samples.
batch_size = 32
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
)


class Discriminator(nn.Module):
    """MLP that maps a 2-D sample to a single sigmoid score in (0, 1).

    Three hidden blocks (Linear -> ReLU -> Dropout(0.3)) of widths 256, 128
    and 64 are followed by a Linear(64, 1) + Sigmoid output.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Hidden blocks are created in the same order as a literal listing,
        # so module indices inside the Sequential are unchanged.
        for in_features, out_features in ((2, 256), (256, 128), (128, 64)):
            layers += [
                nn.Linear(in_features, out_features),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
        layers += [nn.Linear(64, 1), nn.Sigmoid()]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the discriminator score for each row of ``x``."""
        return self.model(x)


class Generator(nn.Module):
    """MLP that maps a 2-D latent vector to a 2-D sample.

    Layout: Linear(2, 16) -> ReLU -> Linear(16, 32) -> ReLU -> Linear(32, 2),
    with no activation on the output.
    """

    def __init__(self):
        super().__init__()
        widths = (2, 16, 32)
        layers = []
        # Hidden layers, each followed by a ReLU.
        for in_features, out_features in zip(widths, widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ReLU()]
        # Unbounded linear output.
        layers.append(nn.Linear(widths[-1], 2))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return generated samples for each latent row of ``x``."""
        return self.model(x)


discriminator = Discriminator()
generator = Generator()

# Hyper-parameters shared by both networks.
lr = 0.001
num_epochs = 300
loss_function = nn.BCELoss()

optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)
optimizer_generator = torch.optim.Adam(generator.parameters(), lr=lr)

print("\nStarting GAN training...")
for epoch in range(num_epochs):
    for n, (real_samples, _) in enumerate(train_loader):
        # Size everything from the batch actually received, so a smaller
        # final batch does not cause a sample/label shape mismatch.
        current_batch_size = real_samples.size(0)
        real_samples_labels = torch.ones((current_batch_size, 1))
        generated_samples_labels = torch.zeros((current_batch_size, 1))

        # --- Discriminator step: real samples -> 1, generated samples -> 0.
        latent_space_samples = torch.randn((current_batch_size, 2))
        # detach() keeps this step's backward pass out of the generator,
        # whose parameters are not updated here.
        generated_samples = generator(latent_space_samples).detach()

        all_samples = torch.cat((real_samples, generated_samples))
        all_samples_labels = torch.cat((real_samples_labels, generated_samples_labels))

        discriminator.zero_grad()
        output_discriminator = discriminator(all_samples)
        loss_discriminator = loss_function(output_discriminator, all_samples_labels)
        loss_discriminator.backward()
        optimizer_discriminator.step()

        # --- Generator step: push the discriminator's score on fresh fakes
        # towards the "real" label.
        latent_space_samples = torch.randn((current_batch_size, 2))
        generator.zero_grad()
        generated_samples = generator(latent_space_samples)
        output_discriminator_generated = discriminator(generated_samples)
        loss_generator = loss_function(output_discriminator_generated, real_samples_labels)
        loss_generator.backward()
        optimizer_generator.step()

        # Report the last batch's losses every 10th epoch.
        if epoch % 10 == 0 and n == len(train_loader) - 1:
            print(f"Epoch: {epoch} Loss D.: {loss_discriminator:.4f} Loss G.: {loss_generator:.4f}")


print("Training finished.")


print("\nGenerating new samples from the trained generator...")
# Draw as many latent points as there are real samples and detach the
# generator output from the autograd graph for plotting.
latent_space_samples = torch.randn(train_data_length, 2)
generated_samples = generator(latent_space_samples).detach()

# Overlay generated points on the real sine-wave samples.
real_x, real_y = train_data[:, 0], train_data[:, 1]
fake_x, fake_y = generated_samples[:, 0], generated_samples[:, 1]

plt.figure(figsize=(8, 6))
plt.plot(real_x, real_y, ".", label="Real Data")
plt.plot(fake_x, fake_y, ".", label="Generated Data", alpha=0.7)
plt.title("GAN: Real vs. Generated Data")
plt.legend()
plt.show()


