<a href="https://colab.research.google.com/github/pranjalraj28/trial/blob/main/cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#1
import numpy as np

def compute_tf_idf(docs, vocab):
    """Build a TF-IDF matrix for ``docs`` over the terms listed in ``vocab``.

    Each document is lower-cased and split on whitespace. Term frequency is
    the count of a term divided by the number of tokens in the document, and
    inverse document frequency is ``log(n_docs / (df + 1))``.

    Note: only the denominator of the IDF is smoothed, so a term found in
    ``n_docs - 1`` documents gets weight 0 and a term found in every document
    gets a negative weight.

    Args:
        docs: Sequence of document strings.
        vocab: Sequence of terms; column ``j`` of the result corresponds to
            ``vocab[j]``. Terms are compared against lower-cased tokens.

    Returns:
        numpy.ndarray of shape ``(len(docs), len(vocab))``. The row of a
        document with no tokens is all zeros.
    """
    n_docs = len(docs)
    n_vocab = len(vocab)

    # Tokenize every document exactly once and reuse the result for TF and DF.
    tokenized = [doc.lower().split() for doc in docs]

    # Map term -> column for O(1) lookup. setdefault keeps the first column
    # of a repeated term, which is what list.index() would have returned.
    column_of = {}
    for j, term in enumerate(vocab):
        column_of.setdefault(term, j)

    # Build term frequency matrix
    tf = np.zeros((n_docs, n_vocab))
    for i, words in enumerate(tokenized):
        for word in words:
            j = column_of.get(word)
            if j is not None:
                tf[i, j] += 1
        if words:
            tf[i] /= len(words)  # Normalize TF
        # A document without tokens keeps its all-zero row instead of 0/0 = NaN.

    # Compute Document Frequency (DF): number of documents containing the term
    token_sets = [set(words) for words in tokenized]
    df = np.zeros(n_vocab)
    for j, term in enumerate(vocab):
        df[j] = sum(1 for tokens in token_sets if term in tokens)

    # Compute Inverse Document Frequency (IDF)
    idf = np.log(n_docs / (df + 1))  # Add 1 to avoid division by zero

    # Element-wise multiplication; idf broadcasts across the document rows
    return tf * idf

# Example usage:
documents = [
    "cat sat on the mat",
    "dog sat on the log",
    "cat and dog played together"
]

# Sort the unique tokens so the column order is identical on every run.
# Iterating a bare set of strings can yield a different order per interpreter
# session, which would shuffle the columns of the printed matrix.
vocabulary = sorted(set(" ".join(documents).lower().split()))
tf_idf_matrix = compute_tf_idf(documents, vocabulary)

print("Vocabulary:", vocabulary)
print("TF-IDF Matrix:\n", tf_idf_matrix)


Vocabulary: ['together', 'dog', 'sat', 'and', 'cat', 'mat', 'played', 'on', 'the', 'log']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.         0.08109302
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.08109302]
 [0.08109302 0.         0.         0.08109302 0.         0.
  0.08109302 0.         0.         0.        ]]


In [None]:
#2
def generate_ngrams(sentence, n):
    """Return every run of ``n`` consecutive tokens in ``sentence``.

    The sentence is lower-cased and split on whitespace; punctuation stays
    attached to its token.

    Args:
        sentence: Text to tokenize.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        List of n-grams in order of appearance, each a list of ``n`` tokens.
        Empty when the sentence has fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # n == 0 used to yield a list of empty lists and negative n produced
        # arbitrary slices; neither is a meaningful n-gram.
        raise ValueError(f"n must be a positive integer, got {n!r}")

    words = sentence.lower().split()
    return [words[i:i + n] for i in range(len(words) - n + 1)]


# Demo: split a sample sentence into n-grams with n = 3 and print one per line.
sentence = "The quick brown fox jumps over the lazy dog."
n = 3
ngrams = generate_ngrams(sentence, n)
# Header line reports which n was used.
print(f"{n}-grams:")
for gram in ngrams:
    print(gram)


3-grams:
['the', 'quick', 'brown']
['quick', 'brown', 'fox']
['brown', 'fox', 'jumps']
['fox', 'jumps', 'over']
['jumps', 'over', 'the']
['over', 'the', 'lazy']
['the', 'lazy', 'dog.']


In [None]:
#3
from collections import Counter
def compute_trigram_language_model(documents):
    """Estimate the relative frequency of every word trigram in ``documents``.

    Each document is lower-cased and split on whitespace. Trigrams never span
    two documents.

    Args:
        documents: Iterable of document strings.

    Returns:
        Dict mapping each 3-tuple of consecutive tokens to its count divided
        by the total number of trigrams, in order of first appearance.
    """
    trigram_counts = Counter()
    for text in documents:
        tokens = text.lower().split()
        # Zipping the token list against itself shifted by one and two
        # positions yields each window of three consecutive tokens.
        trigram_counts.update(zip(tokens, tokens[1:], tokens[2:]))

    total_trigrams = sum(trigram_counts.values())

    return {
        gram: occurrences / total_trigrams
        for gram, occurrences in trigram_counts.items()
    }


# Demo corpus: three short sentences that share several word sequences.
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The quick blue fox jumps over the lazy cat",
    "The lazy dog sleeps under the blue sky"
]

trigram_model = compute_trigram_language_model(documents)

# Print each trigram with its probability formatted to four decimal places.
print("Trigram Probabilities:")
for trigram, prob in trigram_model.items():
    print(f"{trigram}: {prob:.4f}")


Trigram Probabilities:
('the', 'quick', 'brown'): 0.0500
('quick', 'brown', 'fox'): 0.0500
('brown', 'fox', 'jumps'): 0.0500
('fox', 'jumps', 'over'): 0.1000
('jumps', 'over', 'the'): 0.1000
('over', 'the', 'lazy'): 0.1000
('the', 'lazy', 'dog'): 0.1000
('the', 'quick', 'blue'): 0.0500
('quick', 'blue', 'fox'): 0.0500
('blue', 'fox', 'jumps'): 0.0500
('the', 'lazy', 'cat'): 0.0500
('lazy', 'dog', 'sleeps'): 0.0500
('dog', 'sleeps', 'under'): 0.0500
('sleeps', 'under', 'the'): 0.0500
('under', 'the', 'blue'): 0.0500
('the', 'blue', 'sky'): 0.0500


In [None]:
#4
import numpy as np

def create_embedding_matrix(corpus, embedding_dim, seed=None):
    """Build a vocabulary from ``corpus`` and a random embedding matrix for it.

    Sentences are lower-cased and split on whitespace; each new word receives
    the next free row index in order of first appearance.

    Args:
        corpus: Iterable of sentence strings.
        embedding_dim: Number of columns in the embedding matrix.
        seed: Optional seed for a private random generator, giving
            reproducible embeddings. When ``None`` (default) values are drawn
            from NumPy's global random state, as before.

    Returns:
        Tuple ``(E, vocabulary, get_word_vector)`` where ``E`` has shape
        ``(len(vocabulary), embedding_dim)`` with values in ``[0, 1)``,
        ``vocabulary`` maps word -> row index, and ``get_word_vector(word)``
        returns the row for ``word`` (case-insensitive) or a zero vector for
        words outside the vocabulary.
    """
    # Preprocessing: assign row indices in order of first appearance
    vocabulary = {}
    for sentence in corpus:
        for word in sentence.lower().split():
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)

    V = len(vocabulary)
    # Initialize embedding matrix with random values between 0 and 1
    if seed is None:
        E = np.random.rand(V, embedding_dim)
    else:
        E = np.random.default_rng(seed).random((V, embedding_dim))

    def get_word_vector(word):
        """Return the embedding row for ``word``, or zeros if it is unknown."""
        idx = vocabulary.get(word.lower())
        if idx is None:
            return np.zeros(embedding_dim)
        return E[idx]

    return E, vocabulary, get_word_vector

# Example usage:
corpus = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love learning new things"
]
embedding_dim = 3

# Fix the global NumPy seed so the printed embeddings are the same on every
# Restart & Run All, matching the seeded RNN and attention examples.
np.random.seed(0)
E, vocabulary, get_word_vector = create_embedding_matrix(corpus, embedding_dim)

print("Vocabulary:", vocabulary)
print("Embedding Matrix E:\n", E)

# Test get_word_vector
word = "learning"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)

# Test with a word not in the vocabulary
word = "unknown"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)



Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
Embedding Matrix E:
 [[0.31856895 0.66741038 0.13179786]
 [0.7163272  0.28940609 0.18319136]
 [0.58651293 0.02010755 0.82894003]
 [0.00469548 0.67781654 0.27000797]
 [0.73519402 0.96218855 0.24875314]
 [0.57615733 0.59204193 0.57225191]
 [0.22308163 0.95274901 0.44712538]
 [0.84640867 0.69947928 0.29743695]]
Embedding for 'learning': [0.00469548 0.67781654 0.27000797]
Embedding for 'unknown': [0. 0. 0.]


In [None]:
#5
import numpy as np

def create_embedding_matrix_with_pretrained(corpus, pretrained_embeddings, embedding_dim, seed=None):
    """Build an embedding matrix, reusing pretrained vectors where available.

    Sentences are lower-cased and split on whitespace; each new word receives
    the next free row index in order of first appearance. Words present in
    ``pretrained_embeddings`` get that vector; all others keep a random row
    with values in ``[0, 1)``.

    Args:
        corpus: Iterable of sentence strings.
        pretrained_embeddings: Mapping from lower-case word to a vector of
            length ``embedding_dim``.
        embedding_dim: Number of columns in the embedding matrix.
        seed: Optional seed for a private random generator, giving
            reproducible random rows. When ``None`` (default) values are
            drawn from NumPy's global random state.

    Returns:
        Tuple ``(E, vocabulary, get_word_vector)`` where ``E`` has shape
        ``(len(vocabulary), embedding_dim)``, ``vocabulary`` maps word -> row
        index, and ``get_word_vector(word)`` returns the row for ``word``
        (case-insensitive) or a zero vector for unknown words.

    Raises:
        ValueError: If a pretrained vector used by the corpus does not have
            shape ``(embedding_dim,)``.
    """
    # Preprocessing: assign row indices in order of first appearance
    vocabulary = {}
    for sentence in corpus:
        for word in sentence.lower().split():
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)

    V = len(vocabulary)
    # A single random draw covers every row; pretrained rows are overwritten
    # below, so out-of-pretrained words need no second draw.
    if seed is None:
        E = np.random.rand(V, embedding_dim)
    else:
        E = np.random.default_rng(seed).random((V, embedding_dim))

    # Assign pretrained embeddings
    for word, idx in vocabulary.items():
        if word in pretrained_embeddings:
            vector = np.asarray(pretrained_embeddings[word], dtype=float)
            # Without this check a length-1 (or scalar) vector would be
            # silently broadcast across the whole row.
            if vector.shape != (embedding_dim,):
                raise ValueError(
                    f"Pretrained embedding for '{word}' has shape {vector.shape}, "
                    f"expected ({embedding_dim},)"
                )
            E[idx] = vector

    def get_word_vector(word):
        """Return the embedding row for ``word``, or zeros if it is unknown."""
        idx = vocabulary.get(word.lower())
        if idx is None:
            return np.zeros(embedding_dim)
        return E[idx]

    return E, vocabulary, get_word_vector

# Example usage:
corpus = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love learning new things"
]

pretrained_embeddings = {
    "machine": [0.1, 0.2, 0.3],
    "learning": [0.2, 0.3, 0.4],
    "amazing": [0.3, 0.4, 0.5],
    "love": [0.4, 0.5, 0.6]
}

embedding_dim = 3

# Fix the global NumPy seed so the randomly initialized rows are the same on
# every Restart & Run All, matching the seeded RNN and attention examples.
np.random.seed(0)
E, vocabulary, get_word_vector = create_embedding_matrix_with_pretrained(
    corpus, pretrained_embeddings, embedding_dim)

print("Vocabulary:", vocabulary)
print("Embedding Matrix E:\n", E)

# Test get_word_vector
word = "machine"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)

word = "i"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)  # Randomly initialized

word = "unknown"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)  # Returns zeros


Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}
Embedding Matrix E:
 [[0.2961402  0.11872772 0.31798318]
 [0.4        0.5        0.6       ]
 [0.1        0.2        0.3       ]
 [0.2        0.3        0.4       ]
 [0.41426299 0.0641475  0.69247212]
 [0.3        0.4        0.5       ]
 [0.56660145 0.26538949 0.52324805]
 [0.09394051 0.5759465  0.9292962 ]]
Embedding for 'machine': [0.1 0.2 0.3]
Embedding for 'i': [0.2961402  0.11872772 0.31798318]
Embedding for 'unknown': [0. 0. 0.]


In [None]:
#6
import numpy as np

def create_one_hot_encodings(corpus):
    """Map every distinct word of ``corpus`` to a one-hot vector.

    Sentences are lower-cased and split on whitespace; words are indexed in
    order of first appearance.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        Tuple ``(vocabulary, one_hot_encodings)``: ``vocabulary`` maps
        word -> index and ``one_hot_encodings`` maps word -> float vector of
        length ``len(vocabulary)`` with a single 1 at the word's index.
    """
    vocabulary = {}
    for line in corpus:
        for token in line.lower().split():
            # setdefault leaves already-seen tokens untouched, so the size of
            # the dict is always the next unused index.
            vocabulary.setdefault(token, len(vocabulary))

    size = len(vocabulary)
    identity = np.eye(size)

    # Row i of the identity matrix is the one-hot vector for index i; copy it
    # so every word owns an independent array.
    one_hot_encodings = {
        token: identity[position].copy()
        for token, position in vocabulary.items()
    }

    return vocabulary, one_hot_encodings

# Example usage: build one-hot vectors for a three-sentence corpus.
corpus = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love learning new things"
]

vocabulary, one_hot_encodings = create_one_hot_encodings(corpus)

print("Vocabulary:", vocabulary)
# List each word next to its one-hot vector.
print("\nOne-Hot Encodings:")
for word, one_hot_vector in one_hot_encodings.items():
    print(f"Word: '{word}' - One-Hot Vector: {one_hot_vector}")



Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}

One-Hot Encodings:
Word: 'i' - One-Hot Vector: [1. 0. 0. 0. 0. 0. 0. 0.]
Word: 'love' - One-Hot Vector: [0. 1. 0. 0. 0. 0. 0. 0.]
Word: 'machine' - One-Hot Vector: [0. 0. 1. 0. 0. 0. 0. 0.]
Word: 'learning' - One-Hot Vector: [0. 0. 0. 1. 0. 0. 0. 0.]
Word: 'is' - One-Hot Vector: [0. 0. 0. 0. 1. 0. 0. 0.]
Word: 'amazing' - One-Hot Vector: [0. 0. 0. 0. 0. 1. 0. 0.]
Word: 'new' - One-Hot Vector: [0. 0. 0. 0. 0. 0. 1. 0.]
Word: 'things' - One-Hot Vector: [0. 0. 0. 0. 0. 0. 0. 1.]


In [None]:
#7
def generate_skip_gram_pairs(sentences, window_size):
    """Produce (target, context) word pairs for skip-gram training.

    Sentences are lower-cased and split on whitespace. For every word, each
    other word at most ``window_size`` positions away in the same sentence
    yields one pair.

    Args:
        sentences: Collection of sentence strings (iterated twice).
        window_size: Maximum distance between target and context word.

    Returns:
        Tuple ``(vocabulary, training_pairs)``: ``vocabulary`` maps word ->
        index in order of first appearance and ``training_pairs`` is a list
        of ``(target_word, context_word)`` tuples.
    """
    # First pass: index every distinct token
    vocabulary = {}
    for line in sentences:
        for token in line.lower().split():
            vocabulary.setdefault(token, len(vocabulary))

    # Second pass: pair each token with its neighbours inside the window
    training_pairs = []
    for line in sentences:
        tokens = line.lower().split()
        length = len(tokens)
        for center, target in enumerate(tokens):
            lo = max(0, center - window_size)
            hi = min(length, center + window_size + 1)
            training_pairs.extend(
                (target, tokens[pos])
                for pos in range(lo, hi)
                if pos != center
            )

    return vocabulary, training_pairs

# Example usage: skip-gram pairs for a three-sentence corpus.
sentences = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love learning new things"
]

# Context reaches up to two words on either side of the target.
window_size = 2

vocabulary, training_pairs = generate_skip_gram_pairs(sentences, window_size)

print("Vocabulary:", vocabulary)
# One (target, context) tuple per line.
print("\nSkip-Gram Training Pairs:")
for pair in training_pairs:
    print(pair)



Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}

Skip-Gram Training Pairs:
('i', 'love')
('i', 'machine')
('love', 'i')
('love', 'machine')
('love', 'learning')
('machine', 'i')
('machine', 'love')
('machine', 'learning')
('learning', 'love')
('learning', 'machine')
('machine', 'learning')
('machine', 'is')
('learning', 'machine')
('learning', 'is')
('learning', 'amazing')
('is', 'machine')
('is', 'learning')
('is', 'amazing')
('amazing', 'learning')
('amazing', 'is')
('i', 'love')
('i', 'learning')
('love', 'i')
('love', 'learning')
('love', 'new')
('learning', 'i')
('learning', 'love')
('learning', 'new')
('learning', 'things')
('new', 'love')
('new', 'learning')
('new', 'things')
('things', 'learning')
('things', 'new')


In [None]:
#8
def generate_cbow_pairs(sentences, window_size):
    """Produce (context, target) pairs for CBOW training.

    Sentences are lower-cased and split on whitespace. For every word, the
    other words at most ``window_size`` positions away in the same sentence
    form its context; words with an empty context are skipped.

    Args:
        sentences: Collection of sentence strings (iterated twice).
        window_size: Maximum distance between target and context word.

    Returns:
        Tuple ``(vocabulary, training_pairs)``: ``vocabulary`` maps word ->
        index in order of first appearance and ``training_pairs`` is a list
        of ``(context_words_tuple, target_word)`` entries.
    """
    # First pass: index every distinct token
    vocabulary = {}
    for line in sentences:
        for token in line.lower().split():
            vocabulary.setdefault(token, len(vocabulary))

    # Second pass: gather the surrounding tokens of each position
    training_pairs = []
    for line in sentences:
        tokens = line.lower().split()
        length = len(tokens)
        for center, target in enumerate(tokens):
            lo = max(0, center - window_size)
            hi = min(length, center + window_size + 1)
            context = tuple(
                tokens[pos] for pos in range(lo, hi) if pos != center
            )
            if not context:
                continue
            training_pairs.append((context, target))

    return vocabulary, training_pairs

# Example usage: CBOW pairs for a three-sentence corpus.
sentences = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love learning new things"
]

# Context reaches up to two words on either side of the target.
window_size = 2

vocabulary, training_pairs = generate_cbow_pairs(sentences, window_size)

print("Vocabulary:", vocabulary)
# pair[0] is the tuple of context words, pair[1] the target word.
print("\nCBOW Training Pairs:")
for pair in training_pairs:
    print(f"Context: {pair[0]}, Target: {pair[1]}")



Vocabulary: {'i': 0, 'love': 1, 'machine': 2, 'learning': 3, 'is': 4, 'amazing': 5, 'new': 6, 'things': 7}

CBOW Training Pairs:
Context: ('love', 'machine'), Target: i
Context: ('i', 'machine', 'learning'), Target: love
Context: ('i', 'love', 'learning'), Target: machine
Context: ('love', 'machine'), Target: learning
Context: ('learning', 'is'), Target: machine
Context: ('machine', 'is', 'amazing'), Target: learning
Context: ('machine', 'learning', 'amazing'), Target: is
Context: ('learning', 'is'), Target: amazing
Context: ('love', 'learning'), Target: i
Context: ('i', 'learning', 'new'), Target: love
Context: ('i', 'love', 'new', 'things'), Target: learning
Context: ('love', 'learning', 'things'), Target: new
Context: ('learning', 'new'), Target: things


In [None]:
#9
import numpy as np

def rnn_forward(x, Wxh, Whh, Why, bh, by, h0):
    """Run a vanilla tanh RNN over the sequence ``x``.

    At each step ``h = tanh(Whh @ h + Wxh @ x_t + bh)`` and
    ``y = Why @ h + by``.

    Args:
        x: Sequence of inputs, one per time step. Each item may be a scalar
            (input size 1) or a 1-D vector of length ``input_size``.
        Wxh: Input-to-hidden weights, shape ``(hidden_size, input_size)``.
        Whh: Hidden-to-hidden weights, shape ``(hidden_size, hidden_size)``.
        Why: Hidden-to-output weights, shape ``(output_size, hidden_size)``.
        bh: Hidden bias, shape ``(hidden_size, 1)``.
        by: Output bias, shape ``(output_size, 1)``.
        h0: Initial hidden state, shape ``(hidden_size, 1)``.

    Returns:
        Tuple ``(ys, hs)``: lists with one output column vector and one
        hidden-state column vector per time step.
    """
    h = h0
    hs, ys = [], []
    for xt in x:
        # Turn the step input into a column vector. A scalar becomes (1, 1)
        # exactly as before; a length-k vector becomes (k, 1), so input
        # sizes larger than 1 are supported as well.
        xt = np.asarray(xt).reshape(-1, 1)
        h = np.tanh(np.dot(Whh, h) + np.dot(Wxh, xt) + bh)  # Hidden state update
        y = np.dot(Why, h) + by  # Output at time step
        hs.append(h)
        ys.append(y)
    return ys, hs

# Example usage:
x = [1, 2, 3]  # Input sequence: one scalar per time step
input_size, hidden_size, output_size = 1, 4, 1  # Sizes for input, hidden, output

# Weights are small Gaussian draws (scaled by 0.01) from a fixed seed;
# both biases and the initial hidden state start at zero.
np.random.seed(0)
Wxh = np.random.randn(hidden_size, input_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(output_size, hidden_size) * 0.01
bh = np.zeros((hidden_size, 1))
by = np.zeros((output_size, 1))
h0 = np.zeros((hidden_size, 1))  # Initial hidden state

# Run the RNN
ys, hs = rnn_forward(x, Wxh, Whh, Why, bh, by, h0)

# Output results; time steps are reported 1-based
print("Outputs at each time step:")
for t, y in enumerate(ys):
    print(f"Time step {t+1}: y = {y.flatten()}")


Outputs at each time step:
Time step 1: y = [-0.00050584]
Time step 2: y = [-0.00101643]
Time step 3: y = [-0.00152624]


In [None]:
#10
import numpy as np

def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: Array-like of scores.
        axis: Axis along which the result sums to 1. ``None`` (default)
            normalizes over all elements of ``x`` at once.

    Returns:
        Array with the same shape as ``x``.
    """
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large scores.
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

def self_attention(X, Wq, Wk, Wv):
    """Single-head dot-product self-attention (no ``1/sqrt(d_k)`` scaling).

    Args:
        X: Input matrix, shape ``(seq_len, d_model)``, one row per position.
        Wq: Query projection, shape ``(d_model, d_k)``.
        Wk: Key projection, shape ``(d_model, d_k)``.
        Wv: Value projection, shape ``(d_model, d_v)``.

    Returns:
        Array of shape ``(seq_len, d_v)``; each row is a weighted average of
        the value rows.
    """
    # Compute Queries (Q), Keys (K), and Values (V)
    Q = np.dot(X, Wq)
    K = np.dot(X, Wk)
    V = np.dot(X, Wv)

    # Entry (i, j) scores how strongly query i matches key j
    attention_scores = np.dot(Q, K.T)

    # Normalize each row separately so every query's weights sum to 1.
    # A softmax over the whole matrix would make all weights sum to 1
    # together, shrinking every output row.
    attention_weights = softmax(attention_scores, axis=-1)

    # Compute final output: Attention weights * V
    output = np.dot(attention_weights, V)

    return output


# Demo: fixed seed, a 4x3 input matrix and three 3x2 projection matrices,
# all drawn uniformly from [0, 1).
np.random.seed(0)
X = np.random.rand(4, 3)
Wq = np.random.rand(3,2)
Wk = np.random.rand(3,2)
Wv = np.random.rand(3,2)

output = self_attention(X, Wq, Wk, Wv)

# Print the inputs first so the output can be checked by hand.
print("Input Matrix X:")
print(X)
print("\nWeight Matrix Wq:")
print(Wq)
print("\nWeight Matrix Wk:")
print(Wk)
print("\nWeight Matrix Wv:")
print(Wv)
print("\nSelf-Attention Output:")
print(output)


Input Matrix X:
[[0.5488135  0.71518937 0.60276338]
 [0.54488318 0.4236548  0.64589411]
 [0.43758721 0.891773   0.96366276]
 [0.38344152 0.79172504 0.52889492]]

Weight Matrix Wq:
[[0.56804456 0.92559664]
 [0.07103606 0.0871293 ]
 [0.0202184  0.83261985]]

Weight Matrix Wk:
[[0.77815675 0.87001215]
 [0.97861834 0.79915856]
 [0.46147936 0.78052918]]

Weight Matrix Wv:
[[0.11827443 0.63992102]
 [0.14335329 0.94466892]
 [0.52184832 0.41466194]]

Self-Attention Output:
[[0.13865641 0.33354634]
 [0.13573272 0.32654002]
 [0.18386379 0.44127869]
 [0.08689064 0.20968136]]
