1. Data cleaning and preparing the dataset


In [None]:
import json
import random

# List of computer science-related keywords
cs_keywords = [
    "algorithm", "data structure", "programming", "computer", "software", 
    "network", "database", "machine learning", "AI", "artificial intelligence",
    "computing", "code", "coding", "neural network", "deep learning", "blockchain",
    "runtime", "compiler", "operating system", "API", "application", "hardware",
    "CPU", "GPU", "RAM", "disk", "cache", "cloud computing", "cybersecurity", "encryption"
    # ... add more keywords as needed
]

# Punctuation removed from the edges of a token before it is compared to an acronym.
_TOKEN_EDGE_PUNCTUATION = ".,;:!?()[]{}\"'"


def is_cs_related(text):
    """Return True when ``text`` mentions at least one entry of ``cs_keywords``.

    Lowercase keywords are matched as case-insensitive substrings (so
    "algorithm" also matches "Algorithms"). All-uppercase keywords are treated
    as acronyms and must appear as a whole, case-sensitive token; comparing
    them against the lowercased text could never match, and lowercasing them
    instead would match inside ordinary words ("ai" in "said").

    Args:
        text: The string to inspect.

    Returns:
        bool: True if any keyword matches, False otherwise.
    """
    lowered = text.lower()
    if any(keyword in lowered for keyword in cs_keywords if not keyword.isupper()):
        return True

    acronyms = {keyword for keyword in cs_keywords if keyword.isupper()}
    tokens = {token.strip(_TOKEN_EDGE_PUNCTUATION) for token in text.split()}
    return not acronyms.isdisjoint(tokens)


selected_data = []

# Read the JSON-lines file. The encoding is explicit so decoding does not depend
# on the platform's default code page.
with open("s2ag.valid.0", "r", encoding="utf-8") as json_file:
    for line in json_file:
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            print("Invalid JSON data:", line)
            continue
        # Keep only JSON objects that have an 'id' and a string 'text' that is
        # CS-related; other shapes are skipped instead of crashing the loop.
        if (
            isinstance(data, dict)
            and 'id' in data
            and isinstance(data.get('text'), str)
            and is_cs_related(data['text'])
        ):
            selected_data.append(data)

# Shuffle with a fixed seed so the train/test split is reproducible between runs.
random.seed(42)
random.shuffle(selected_data)

# Split the data into train and test sets (first 3000 / next 1000 entries).
train_data = selected_data[:3000]
test_data = selected_data[3000:4000]

# Now you have 'train_data' and 'test_data' containing computer science-related entries
# Each entry is a dictionary with 'id' and 'text' keys

In [None]:
# Assume train_data is a list of dictionaries with keys 'id' and 'text'

# Loop through and create a formatted string for each entry
# Render each training entry as an "ID:" line, a "Text:" line and a blank line,
# then concatenate all records into one string.
output_string = "".join(
    "ID: " + str(record["id"]) + "\n" + "Text: " + record["text"] + "\n\n"
    for record in train_data
)

# Persist the records as UTF-8 text.
with open("output.txt", "w", encoding="utf-8") as out_file:
    out_file.write(output_string)

In [None]:
# Dump every training entry to the cell output, one blank line between entries.
for record in train_data:
    print("ID:", record["id"])
    print("Text:", record["text"], end="\n\n")

In [None]:
# Read the txt file
with open("output.txt", "r", encoding="utf-8") as source:
    lines = source.readlines()

# Append a period to every "Text:" line that does not already end with
# '.', '!' or '?'. Only rewritten lines are stripped; all others stay verbatim.
for position, raw_line in enumerate(lines):
    stripped = raw_line.strip()
    if stripped.startswith("Text:") and not stripped.endswith(('.', '!', '?')):
        lines[position] = stripped + '.\n'

# Write the result to a separate file so output.txt is left untouched.
with open("output_modified.txt", "w", encoding="utf-8") as target:
    target.writelines(lines)


In [None]:
# Read the txt file
with open("output_modified.txt", "r", encoding="utf-8") as source:
    lines = source.readlines()

formatted_lines = []  # lines of the output file, in order
text_buffer = []      # text fragments collected since the last "ID:" line

for raw_line in lines:
    stripped = raw_line.strip()
    if not stripped.startswith("ID:"):
        # Any non-ID line (including blank ones) belongs to the current entry;
        # every "Text:" marker it contains is removed.
        text_buffer.append(stripped.replace("Text:", "").strip())
        continue
    # A new "ID:" line closes the previous entry: emit its text as one paragraph.
    if text_buffer:
        formatted_lines.append(" ".join(text_buffer))
        text_buffer = []
    formatted_lines.append(stripped)

# Flush the text of the last entry.
if text_buffer:
    formatted_lines.append(" ".join(text_buffer))

with open("output_paragraphed.txt", "w", encoding="utf-8") as target:
    target.write("\n".join(formatted_lines))


In [None]:
# Read the txt file
with open("output_paragraphed.txt", "r", encoding="utf-8") as source:
    lines = source.readlines()

# Keep only the paragraph lines; every line beginning with "ID:" is dropped.
filtered_lines = list(filter(lambda text_line: not text_line.startswith("ID:"), lines))

with open("output_paragraphed2.txt", "w", encoding="utf-8") as target:
    target.writelines(filtered_lines)


In [None]:
! pip install spacy

In [None]:
import spacy
from spacy.lang.en import English

# Read the text from the file
with open("output_paragraphed2.txt", "r", encoding="utf-8") as file:
    whole_text = file.read()

nlp = English()  # Blank English pipeline (tokenizer only)
nlp.add_pipe("sentencizer")  # Rule-based sentence boundary detection

# Feed the pipeline one paragraph (one input line) at a time. Cutting the whole
# text into fixed-size character chunks would split words and sentences at
# arbitrary chunk boundaries. chunk_size now only acts as a safety limit for a
# single over-long paragraph.
chunk_size = 500000
text_chunks = []
for paragraph in whole_text.splitlines():
    paragraph = paragraph.strip()
    if not paragraph:
        continue
    text_chunks.extend(
        paragraph[start:start + chunk_size]
        for start in range(0, len(paragraph), chunk_size)
    )

all_sentences = []
for chunk in text_chunks:
    doc = nlp(chunk)
    # Collect the non-empty sentence strings of this paragraph.
    all_sentences.extend(sent.text.strip() for sent in doc.sents if sent.text.strip())

# Save the sentences, each followed by a blank line.
with open("sentences_output.txt", "w", encoding="utf-8") as file:
    for sentence in all_sentences:
        file.write(sentence + "\n\n")


2. Filtering and cleaning

In [None]:
import re

In [None]:
def remove_special_characters(text):
    """Replace bullets, apostrophes and dashes with spaces and drop enumerators.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with each bullet, curly or straight apostrophe, hyphen
        and en dash replaced by a single space (so words are not merged) and
        with "<digits>." enumerators removed.
    """
    text = re.sub(r'[\•\’\'\-\–]', ' ', text)  # Including the apostrophe ' now
    # Remove enumerators such as "1." but leave decimals and dotted versions
    # ("3.14", "1.2.3") intact: the digits must not follow a digit or dot,
    # and the dot must not be followed by a digit.
    text = re.sub(r'(?<![\d.])\d+\.(?!\d)', '', text)
    return text

In [None]:
import re
from word2number import w2n

def remove_urls(text):
    """Return ``text`` with every http:// or https:// URL deleted.

    A URL is the scheme followed by the longest run of characters accepted by
    the pattern below; the match is removed without inserting a replacement.
    """
    return re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        '',
        text,
    )

def remove_special_characters(text):
    """Replace bullets, curly apostrophes and dashes with spaces and drop enumerators.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` with each bullet, curly apostrophe, hyphen and en dash
        replaced by a single space (so words are not merged) and with
        "<digits>." enumerators removed. Straight apostrophes are kept.
    """
    text = re.sub(r'[\•\’\-\–]', ' ', text) # Replacing with space
    # Remove enumerators such as "1." but leave decimals and dotted versions
    # ("3.14", "1.2.3") intact: the digits must not follow a digit or dot,
    # and the dot must not be followed by a digit.
    text = re.sub(r'(?<![\d.])\d+\.(?!\d)', '', text)
    return text

def process_numericals(text):
    """Blank out number-only lines and pass remaining digit runs through word2number.

    Returns an empty string when the stripped text is a bare integer or
    decimal. Otherwise every standalone digit run is replaced by
    ``str(w2n.word_to_num(run))``.

    NOTE(review): ``word_to_num`` converts words to numbers, so applied to a
    digit string this presumably just round-trips it through ``int`` and does
    not spell the number out as text. Confirm the intended behaviour.
    """
    if re.fullmatch(r'\d+(\.\d+)?', text.strip()) is not None:
        return ''

    def _normalise(match):
        return str(w2n.word_to_num(match.group()))

    return re.sub(r'\b\d+\b', _normalise, text)


with open("sentences_output.txt", "r", encoding="utf-8") as source:
    lines = source.readlines()

# Apply the three cleaning passes to every line, in this order:
# URL removal, special-character removal, numeral handling.
processed_lines = [
    process_numericals(remove_special_characters(remove_urls(raw_line)))
    for raw_line in lines
]

with open("processed_file.txt", "w", encoding="utf-8") as target:
    target.writelines(processed_lines)


In [None]:
import nltk
from nltk.corpus import words as nltk_words

nltk.download('words')

def is_valid_word(word, word_list):
    """Return True if the lowercased ``word`` is a member of ``word_list``."""
    normalised = word.lower()
    return normalised in word_list

def process_file(filename):
    """Keep only dictionary words of each line, save the result and print a summary.

    Every line of ``filename`` is lowercased and split on whitespace, trailing
    periods are stripped from each token, and tokens that fail
    ``is_valid_word`` against the NLTK word list are dropped. Lines with at
    least one surviving token are written to 'processed_output2.txt'. The
    sorted unique sentences, the sorted vocabulary and both counts are printed.
    """
    english_words = set(nltk_words.words())

    with open(filename, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    processed_sentences = []
    all_words = set()

    for raw_line in lines:
        tokens = (token.rstrip('.') for token in raw_line.strip().lower().split())
        valid_words = [token for token in tokens if is_valid_word(token, english_words)]
        if not valid_words:
            continue  # nothing recognisable on this line
        processed_sentences.append(' '.join(valid_words))
        all_words.update(valid_words)

    # One filtered sentence per line.
    with open('processed_output2.txt', 'w', encoding='utf-8') as target:
        target.writelines(sentence + '\n' for sentence in processed_sentences)

    unique_sentences = sorted(set(processed_sentences))
    unique_words = sorted(all_words)

    print("Ordered Unique Sentences:")
    for sentence in unique_sentences:
        print(sentence)
    print("\nSet of Words:")
    for vocabulary_word in unique_words:
        print(vocabulary_word)
    print("\nLength of Unique Sentences:", len(unique_sentences))
    print("Length of Set of Words:", len(unique_words))

process_file('processed_file.txt')


3. Designing the input matrix, TF-IDF matrix, and output matrix

In [2]:
from scipy.sparse import lil_matrix

def process_file(filename):
    """Read a sentence-per-line file and return its unique sentences and vocabulary.

    Args:
        filename: Path of a UTF-8 text file.

    Returns:
        tuple: ``(unique_sentences, unique_words)``. ``unique_sentences`` holds
        the file's lines with duplicates removed (first occurrence kept,
        original order and trailing newlines preserved). ``unique_words`` is
        the sorted list of lowercased whitespace-separated tokens.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # dict.fromkeys de-duplicates in O(n) and keeps first-seen order, giving
    # the same result as the quadratic sorted(set(lines), key=lines.index).
    unique_sentences = list(dict.fromkeys(lines))

    # Vocabulary: every token of every unique sentence, lowercased.
    all_words = {word.lower() for sentence in unique_sentences for word in sentence.split()}

    return unique_sentences, sorted(all_words)

# NOTE(review): 'new.txt' is written by the stop-word removal cell further down
# the notebook, so that cell has to be executed before this one.
unique_sentences, unique_words = process_file('new.txt')
word_to_index = {word: idx for idx, word in enumerate(unique_words)}

num_sentences = len(unique_sentences)
num_words = len(unique_words)

# One row per sentence, one column per vocabulary word.
input_matrix = lil_matrix((num_sentences, num_words), dtype=int)
output_matrix = lil_matrix((num_sentences, num_words), dtype=int)

context_window = 3
half_window = context_window // 2

# Vocabulary column indices of every sentence, computed once up front.
sentence_columns = [
    [word_to_index[token.lower()] for token in sentence.split()]
    for sentence in unique_sentences
]

for row, columns in enumerate(sentence_columns):
    # Input row: 1 for every word of the sentence itself.
    for column in columns:
        input_matrix[row, column] = 1

    # Output row: 1 for every word of the neighbouring sentences in the window.
    first = max(0, row - half_window)
    stop = min(num_sentences, row + half_window + 1)
    for neighbour in range(first, stop):
        if neighbour == row:
            continue
        for column in sentence_columns[neighbour]:
            output_matrix[row, column] = 1

# CSR is the efficient format for the row-wise operations that follow.
input_matrix = input_matrix.tocsr()
output_matrix = output_matrix.tocsr()

# print("Input Matrix - First Row:")
# print(input_matrix[0].toarray())
# print("\nOutput Matrix - First Row:")
# print(output_matrix[0].toarray())

# # Non-zero entries for the first row
# non_zero_input = input_matrix[0].nonzero()[1]
# non_zero_output = output_matrix[0].nonzero()[1]

# print("\nNon-Zero Entries for First Row of Input Matrix:", non_zero_input)
# print("Words corresponding to non-zero entries in the Input Matrix:", [unique_words[idx] for idx in non_zero_input])

# print("\nNon-Zero Entries for First Row of Output Matrix:", non_zero_output)
# print("Words corresponding to non-zero entries in the Output Matrix:", [unique_words[idx] for idx in non_zero_output])


In [None]:
# print(len(unique_sentences))
# print(len(unique_words))  

In [None]:
from collections import defaultdict
import math
from scipy.sparse import lil_matrix

# Step 1: Calculate IDF for each word in the vocabulary

# For each lowercased word, the set of sentence indices in which it occurs.
word_sentence_indices = defaultdict(set)

for idx, sentence in enumerate(unique_sentences):
    for word in set(sentence.split()):  # set: count a word once per sentence
        word_sentence_indices[word.lower()].add(idx)

# IDF = log(number of sentences / number of sentences containing the word).
idf = {
    word: math.log(len(unique_sentences) / len(sentence_indices))
    for word, sentence_indices in word_sentence_indices.items()
}

# Step 2: Calculate TF for each word in each sentence based on the context window and compute TF-IDF

tf_idf_matrix = lil_matrix((len(unique_sentences), len(unique_words)), dtype=float)

context_window_size = 3  # -1, 0, 1

# Column lookup built once; list.index() would scan the whole vocabulary for
# every matrix entry.
column_of_word = {word: column for column, word in enumerate(unique_words)}

for idx in range(len(unique_sentences)):
    word_counts = defaultdict(int)

    # The term frequency is computed over the sentence and its window neighbours.
    context_start = max(0, idx - (context_window_size // 2))
    context_end = min(len(unique_sentences), idx + (context_window_size // 2) + 1)

    total_words_in_context = 0
    for i in range(context_start, context_end):
        context_words = unique_sentences[i].split()
        total_words_in_context += len(context_words)
        for word in context_words:
            word_counts[word.lower()] += 1

    for word, count in word_counts.items():
        # Named term_frequency rather than `tf` so the notebook-wide
        # TensorFlow alias `tf` is not rebound to a float.
        term_frequency = count / total_words_in_context
        tf_idf_matrix[idx, column_of_word[word]] = term_frequency * idf[word]




Method 1: Take the input matrix and the output matrix, where the input matrix contains 1 wherever a word of the target sentence is present.

In [3]:
import tensorflow as tf
def softmax(X):
    """Row-wise softmax of a 2-D tensor.

    Args:
        X: Tensor whose axis 1 holds the scores to normalise.

    Returns:
        Tensor of the same shape as ``X`` whose rows sum to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing on large scores.
    shifted = X - tf.reduce_max(X, axis=1, keepdims=True)
    X_exp = tf.exp(shifted)
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition

In [72]:
from scipy.sparse import lil_matrix

def process_file(filename):
    """Read a sentence-per-line file and return its unique sentences and vocabulary.

    Args:
        filename: Path of a UTF-8 text file.

    Returns:
        tuple: ``(unique_sentences, unique_words)``. ``unique_sentences`` holds
        the file's lines with duplicates removed (first occurrence kept,
        original order and trailing newlines preserved). ``unique_words`` is
        the sorted list of lowercased whitespace-separated tokens.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # dict.fromkeys de-duplicates in O(n) and keeps first-seen order, giving
    # the same result as the quadratic sorted(set(lines), key=lines.index).
    unique_sentences = list(dict.fromkeys(lines))

    # Vocabulary: every token of every unique sentence, lowercased.
    all_words = {word.lower() for sentence in unique_sentences for word in sentence.split()}

    return unique_sentences, sorted(all_words)

# NOTE(review): 'new.txt' is written by the stop-word removal cell further down
# the notebook, so that cell has to be executed before this one.
unique_sentences, unique_words = process_file('new.txt')
word_to_index = {word: idx for idx, word in enumerate(unique_words)}

num_sentences = len(unique_sentences)
num_words = len(unique_words)

# One row per sentence, one column per vocabulary word.
input_matrix = lil_matrix((num_sentences, num_words), dtype=int)
output_matrix = lil_matrix((num_sentences, num_words), dtype=int)

context_window = 3
half_window = context_window // 2

# Vocabulary column indices of every sentence, computed once up front.
sentence_columns = [
    [word_to_index[token.lower()] for token in sentence.split()]
    for sentence in unique_sentences
]

for row, columns in enumerate(sentence_columns):
    # Input row: 1 for every word of the sentence itself.
    for column in columns:
        input_matrix[row, column] = 1

    # Output row: 1 for every word of the neighbouring sentences in the window.
    first = max(0, row - half_window)
    stop = min(num_sentences, row + half_window + 1)
    for neighbour in range(first, stop):
        if neighbour == row:
            continue
        for column in sentence_columns[neighbour]:
            output_matrix[row, column] = 1

# CSR is the efficient format for the row-wise operations that follow.
input_matrix = input_matrix.tocsr()
output_matrix = output_matrix.tocsr()




In [4]:
import tensorflow as tf

vocab_size = input_matrix.shape[1]
num_hiddens = 25

# W1 has shape (vocab_size, num_hiddens) and W2 has shape (num_hiddens, vocab_size);
# both are drawn from a normal distribution with mean 0 and stddev 0.01, W1 first.
W1, W2 = (
    tf.Variable(tf.random.normal(shape=weight_shape, mean=0, stddev=0.01))
    for weight_shape in ((vocab_size, num_hiddens), (num_hiddens, vocab_size))
)

# Trainable parameters, in the order expected by the gradient step.
params = [W1, W2]

In [5]:
import tensorflow as tf
from scipy.sparse import lil_matrix
import random

# ...[Your previous function and data preparation definitions]

def net(X):
    """Forward pass: cast to float32, reshape to (-1, vocab_size), apply W1 then W2, softmax.

    Uses the notebook-level ``vocab_size``, ``W1``, ``W2`` and ``softmax``.
    There is no non-linearity between the two matrix products.
    """
    inputs = tf.cast(X, dtype=tf.float32)
    inputs = tf.reshape(inputs, (-1, vocab_size))
    hidden = tf.matmul(inputs, W1)
    logits = tf.matmul(hidden, W2)
    return softmax(logits)

def negative_log_likelihood(y_hat, y):
    """Mean negative log of the predicted probabilities selected by the mask ``y``.

    ``y`` is passed to ``tf.boolean_mask`` as the mask over ``y_hat``; 1e-10 is
    added before the logarithm to avoid log(0).
    """
    selected = tf.boolean_mask(y_hat, y)
    log_probabilities = tf.math.log(selected + 1e-10)
    return -tf.reduce_mean(log_probabilities)

def sgd(params, grads, lr):
    """Apply one gradient-descent step in place: ``param -= lr * grad`` for each pair.

    ``params`` and ``grads`` are walked in parallel; every parameter must
    provide ``assign_sub``.
    """
    for pair in zip(params, grads):
        variable, gradient = pair
        variable.assign_sub(lr * gradient)

def softmax(X):
    """Row-wise softmax of a 2-D tensor.

    Args:
        X: Tensor whose axis 1 holds the scores to normalise.

    Returns:
        Tensor of the same shape as ``X`` whose rows sum to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing on large scores.
    shifted = X - tf.reduce_max(X, axis=1, keepdims=True)
    X_exp = tf.exp(shifted)
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition

# ...[Your previous data loading and processing definitions]

# Converting sparse matrices to dense tensors for TensorFlow operations
# The whole data set is materialised as two dense float32 tensors
# (one full-batch gradient step per epoch).
input_tensor = tf.convert_to_tensor(input_matrix.toarray(), dtype=tf.float32)
output_tensor = tf.convert_to_tensor(output_matrix.toarray(), dtype=tf.float32)

lr = 0.1
num_epochs = 10

# NOTE(review): the recorded run of this cell printed the same loss (9.398483)
# for all 10 epochs, i.e. these settings did not reduce the loss. Revisit the
# learning rate, initialisation or number of epochs.
for epoch in range(num_epochs):
    with tf.GradientTape() as recorder:
        y_hat = net(input_tensor)
        l = negative_log_likelihood(y_hat, output_tensor)
    print(l)  # loss before this epoch's update
    grads = recorder.gradient(l, params)
    sgd(params, grads, lr)


tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)
tf.Tensor(9.398483, shape=(), dtype=float32)


In [6]:
class Embeddings(tf.Module):
    """Container module holding copies of the two weight matrices for export.

    Args:
        W1: Initial value of the first weight matrix.
        W2: Initial value of the second weight matrix.
    """

    def __init__(self, W1, W2):
        # Initialise tf.Module's own state (name, name scope) before adding
        # attributes to the module.
        super().__init__()
        self.W1 = tf.Variable(W1)
        self.W2 = tf.Variable(W2)


In [7]:
# Wrap the current weights in a module and export it as a SavedModel.
saved_model_path = "./saved_embeddings"
embedding_module = Embeddings(W1, W2)
tf.saved_model.save(embedding_module, saved_model_path)


INFO:tensorflow:Assets written to: ./saved_embeddings\assets


In [8]:
# Reload the SavedModel and pull out both weight matrices.
loaded_module = tf.saved_model.load(saved_model_path)
W1_loaded, W2_loaded = loaded_module.W1, loaded_module.W2


In [9]:
def cosine_similarity(vector_a, vector_b):
    """Cosine similarity: dot product divided by the product of the two norms."""
    numerator = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    denominator = tf.norm(vector_a) * tf.norm(vector_b)
    return numerator / denominator

# Example: similarity between rows 1 and 5 of W1_loaded (two word embeddings).
row_a, row_b = W1_loaded[1], W1_loaded[5]
similarity_score = cosine_similarity(row_a, row_b)
print(similarity_score.numpy())


-0.3510478


In [11]:
def cosine_similarity(vector_a, vector_b):
    """Cosine similarity: dot product divided by the product of the two norms."""
    numerator = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    denominator = tf.norm(vector_a) * tf.norm(vector_b)
    return numerator / denominator

# Compare two word embeddings (rows of W1_loaded) and report the words they
# belong to; row i of W1_loaded is labelled with unique_words[i].
index_a = 5
index_b = 10

word_a = unique_words[index_a]
word_b = unique_words[index_b]
similarity_score = cosine_similarity(W1_loaded[index_a], W1_loaded[index_b])

print(f"Cosine Similarity between '{word_a}' and '{word_b}': {similarity_score.numpy()}")


Cosine Similarity between 'abdominal' and 'ability': 0.1905229389667511


In [12]:
def cosine_similarity(vector_a, vector_b):
    """Cosine similarity: dot product divided by the product of the two norms."""
    numerator = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    denominator = tf.norm(vector_a) * tf.norm(vector_b)
    return numerator / denominator

# Print the similarity of every unordered pair of word embeddings with
# vocabulary indices 30..50 (inclusive); a word is never paired with itself.
first_index, last_index = 30, 50
for index_a in range(first_index, last_index + 1):
    for index_b in range(index_a + 1, last_index + 1):
        similarity_score = cosine_similarity(W1_loaded[index_a], W1_loaded[index_b])

        word_a, word_b = unique_words[index_a], unique_words[index_b]

        print(f"Cosine Similarity between '{word_a}' and '{word_b}': {similarity_score.numpy()}")


Cosine Similarity between 'absolutely' and 'absorb': 0.038716256618499756
Cosine Similarity between 'absolutely' and 'absorbable': 0.34109070897102356
Cosine Similarity between 'absolutely' and 'absorbed': -0.36461934447288513
Cosine Similarity between 'absolutely' and 'absorber': 0.10269894450902939
Cosine Similarity between 'absolutely' and 'absorbing': 0.14536257088184357
Cosine Similarity between 'absolutely' and 'absorption': -0.12070897221565247
Cosine Similarity between 'absolutely' and 'absorptive': 0.0826815515756607
Cosine Similarity between 'absolutely' and 'abstain': 0.27033525705337524
Cosine Similarity between 'absolutely' and 'abstract': -0.20421965420246124
Cosine Similarity between 'absolutely' and 'abstracted': 0.05708847939968109
Cosine Similarity between 'absolutely' and 'abstraction': 0.010739210061728954
Cosine Similarity between 'absolutely' and 'abstractly': -0.12247917056083679
Cosine Similarity between 'absolutely' and 'abu': -0.010429462417960167
Cosine Simil

In [14]:
def cosine_similarity(vector_a, vector_b):
    """Cosine similarity: dot product divided by the product of the two norms."""
    numerator = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    denominator = tf.norm(vector_a) * tf.norm(vector_b)
    return numerator / denominator

# Print the similarity of every unordered pair of word embeddings with
# vocabulary indices 30..50 (inclusive); a word is never paired with itself.
first_index, last_index = 30, 50
for index_a in range(first_index, last_index + 1):
    for index_b in range(index_a + 1, last_index + 1):
        similarity_score = cosine_similarity(W1_loaded[index_a], W1_loaded[index_b])

        word_a, word_b = unique_words[index_a], unique_words[index_b]

        print(f"Cosine Similarity between '{word_a}' and '{word_b}': {similarity_score.numpy()}")


Cosine Similarity between 'absolutely' and 'absorb': 0.038716256618499756
Cosine Similarity between 'absolutely' and 'absorbable': 0.34109070897102356
Cosine Similarity between 'absolutely' and 'absorbed': -0.36461934447288513
Cosine Similarity between 'absolutely' and 'absorber': 0.10269894450902939
Cosine Similarity between 'absolutely' and 'absorbing': 0.14536257088184357
Cosine Similarity between 'absolutely' and 'absorption': -0.12070897221565247
Cosine Similarity between 'absolutely' and 'absorptive': 0.0826815515756607
Cosine Similarity between 'absolutely' and 'abstain': 0.27033525705337524
Cosine Similarity between 'absolutely' and 'abstract': -0.20421965420246124
Cosine Similarity between 'absolutely' and 'abstracted': 0.05708847939968109
Cosine Similarity between 'absolutely' and 'abstraction': 0.010739210061728954
Cosine Similarity between 'absolutely' and 'abstractly': -0.12247917056083679
Cosine Similarity between 'absolutely' and 'abu': -0.010429462417960167
Cosine Simil

Method 2: The second method takes the target matrix as the input matrix.

In [23]:
from nltk.corpus import stopwords

# Assuming you have NLTK's stopwords dataset downloaded
# If not, you can download using:
# import nltk
# nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords_from_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` with English stop words removed.

    Each line is split on whitespace, tokens whose lowercase form is in
    ``stop_words`` are dropped, and the remaining tokens are re-joined with
    single spaces. Every input line yields one output line, which is empty
    when all of its tokens were stop words.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    def _strip_stopwords(text_line):
        kept = (token for token in text_line.split() if token.lower() not in stop_words)
        return ' '.join(kept)

    cleaned_lines = list(map(_strip_stopwords, lines))

    with open(output_file, 'w', encoding='utf-8') as target:
        target.writelines(cleaned + '\n' for cleaned in cleaned_lines)

# Usage: produce 'new.txt', the file the matrix-building cells read.
input_filename = 'processed_output2.txt'
output_filename = 'new.txt'
remove_stopwords_from_file(input_filename, output_filename)


In [24]:
from scipy.sparse import lil_matrix

def process_file(filename):
    """Read a sentence-per-line file and return its unique sentences and vocabulary.

    Args:
        filename: Path of a UTF-8 text file.

    Returns:
        tuple: ``(unique_sentences, unique_words)``. ``unique_sentences`` holds
        the file's lines with duplicates removed (first occurrence kept,
        original order and trailing newlines preserved). ``unique_words`` is
        the sorted list of lowercased whitespace-separated tokens.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # dict.fromkeys de-duplicates in O(n) and keeps first-seen order, giving
    # the same result as the quadratic sorted(set(lines), key=lines.index).
    unique_sentences = list(dict.fromkeys(lines))

    # Vocabulary: every token of every unique sentence, lowercased.
    all_words = {word.lower() for sentence in unique_sentences for word in sentence.split()}

    return unique_sentences, sorted(all_words)

# Load the stop-word-free sentences and derive the vocabulary lookup and sizes.
unique_sentences, unique_words = process_file('new.txt')
num_sentences, num_words = len(unique_sentences), len(unique_words)

# Maps each vocabulary word to its position in the sorted word list.
word_to_index = dict(zip(unique_words, range(num_words)))

In [25]:
# Number of unique sentences, then vocabulary size, one per line.
print(num_sentences, num_words, sep="\n")

29442
12070


In [26]:
from collections import defaultdict
import math
from scipy.sparse import lil_matrix

# Step 1: Calculate IDF for each word in the vocabulary

# For each lowercased word, the set of sentence indices in which it occurs.
word_sentence_indices = defaultdict(set)

for idx, sentence in enumerate(unique_sentences):
    for word in set(sentence.split()):  # set: count a word once per sentence
        word_sentence_indices[word.lower()].add(idx)

# IDF = log(number of sentences / number of sentences containing the word).
idf = {
    word: math.log(len(unique_sentences) / len(sentence_indices))
    for word, sentence_indices in word_sentence_indices.items()
}

# Step 2: Calculate TF for each word in each sentence based on the context window and compute TF-IDF

tf_idf_matrix = lil_matrix((len(unique_sentences), len(unique_words)), dtype=float)

context_window_size = 3  # -1, 0, 1

# Column lookup built once; list.index() would scan the whole vocabulary for
# every matrix entry.
column_of_word = {word: column for column, word in enumerate(unique_words)}

for idx in range(len(unique_sentences)):
    word_counts = defaultdict(int)

    # The term frequency is computed over the sentence and its window neighbours.
    context_start = max(0, idx - (context_window_size // 2))
    context_end = min(len(unique_sentences), idx + (context_window_size // 2) + 1)

    total_words_in_context = 0
    for i in range(context_start, context_end):
        context_words = unique_sentences[i].split()
        total_words_in_context += len(context_words)
        for word in context_words:
            word_counts[word.lower()] += 1

    for word, count in word_counts.items():
        # Named term_frequency rather than `tf` so the notebook-wide
        # TensorFlow alias `tf` is not rebound to a float.
        term_frequency = count / total_words_in_context
        tf_idf_matrix[idx, column_of_word[word]] = term_frequency * idf[word]




In [28]:
# One row per sentence, one column per vocabulary word.
input_matrix = lil_matrix((num_sentences, num_words), dtype=int)
output_matrix = lil_matrix((num_sentences, num_words), dtype=int)

context_window = 3
half_window = context_window // 2

# Vocabulary column indices of every sentence, computed once up front.
sentence_columns = [
    [word_to_index[token.lower()] for token in sentence.split()]
    for sentence in unique_sentences
]

for row, columns in enumerate(sentence_columns):
    # Input row: 1 for every word of the sentence itself.
    for column in columns:
        input_matrix[row, column] = 1

    # Output row: 1 for every word of the neighbouring sentences in the window.
    first = max(0, row - half_window)
    stop = min(num_sentences, row + half_window + 1)
    for neighbour in range(first, stop):
        if neighbour == row:
            continue
        for column in sentence_columns[neighbour]:
            output_matrix[row, column] = 1

# CSR is the efficient format for the row-wise operations that follow.
input_matrix = input_matrix.tocsr()
output_matrix = output_matrix.tocsr()

In [33]:
# Sanity check: the three matrices are expected to share one shape.
for matrix in (input_matrix, tf_idf_matrix, output_matrix):
    print(matrix.shape)

(29442, 12070)
(29442, 12070)
(29442, 12070)


In [36]:
# Vocabulary size = number of columns of the input matrix.
vocab_size = input_matrix.shape[1]
print(vocab_size)

12070


In [38]:
import tensorflow as tf

vocab_size = input_matrix.shape[1]
num_hiddens = 25

# W1 has shape (vocab_size, num_hiddens) and W2 has shape (num_hiddens, vocab_size);
# both are drawn from a normal distribution with mean 0 and stddev 0.01, W1 first.
W1, W2 = (
    tf.Variable(tf.random.normal(shape=weight_shape, mean=0, stddev=0.01))
    for weight_shape in ((vocab_size, num_hiddens), (num_hiddens, vocab_size))
)

# Trainable parameters, in the order expected by the gradient step.
params = [W1, W2]

In [39]:
# Sanity check of the first weight matrix: (vocab_size, num_hiddens).
print(W1.shape)

(12070, 25)


In [40]:
import tensorflow as tf
from scipy.sparse import lil_matrix
import random

# ...[Your previous function and data preparation definitions]

def net(X):
    """Forward pass: cast to float32, reshape to (-1, vocab_size), apply W1 then W2, softmax.

    Uses the notebook-level ``vocab_size``, ``W1``, ``W2`` and ``softmax``.
    There is no non-linearity between the two matrix products.
    """
    inputs = tf.cast(X, dtype=tf.float32)
    inputs = tf.reshape(inputs, (-1, vocab_size))
    hidden = tf.matmul(inputs, W1)
    logits = tf.matmul(hidden, W2)
    return softmax(logits)

def negative_log_likelihood(y_hat, y):
    """Mean negative log of the predicted probabilities selected by the mask ``y``.

    ``y`` is passed to ``tf.boolean_mask`` as the mask over ``y_hat``; 1e-10 is
    added before the logarithm to avoid log(0).
    """
    selected = tf.boolean_mask(y_hat, y)
    log_probabilities = tf.math.log(selected + 1e-10)
    return -tf.reduce_mean(log_probabilities)

def sgd(params, grads, lr):
    """Apply one gradient-descent step in place: ``param -= lr * grad`` for each pair.

    ``params`` and ``grads`` are walked in parallel; every parameter must
    provide ``assign_sub``.
    """
    for pair in zip(params, grads):
        variable, gradient = pair
        variable.assign_sub(lr * gradient)

def softmax(X):
    """Row-wise softmax of a 2-D tensor.

    Args:
        X: Tensor whose axis 1 holds the scores to normalise.

    Returns:
        Tensor of the same shape as ``X`` whose rows sum to 1.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing on large scores.
    shifted = X - tf.reduce_max(X, axis=1, keepdims=True)
    X_exp = tf.exp(shifted)
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition

# ...[Your previous data loading and processing definitions]

# Converting sparse matrices to dense tensors for TensorFlow operations
# Method 2 feeds the TF-IDF matrix as the network input; the targets are
# unchanged. Both are materialised as dense float32 tensors
# (one full-batch gradient step per epoch).
input_tensor = tf.convert_to_tensor(tf_idf_matrix.toarray(), dtype=tf.float32)
output_tensor = tf.convert_to_tensor(output_matrix.toarray(), dtype=tf.float32)

lr = 0.1
num_epochs = 10

# NOTE(review): the recorded run of this cell printed a loss that only moved
# from 9.398473 to 9.398472 over 10 epochs, i.e. these settings barely train
# the model. Revisit the learning rate, initialisation or number of epochs.
for epoch in range(num_epochs):
    with tf.GradientTape() as recorder:
        y_hat = net(input_tensor)
        l = negative_log_likelihood(y_hat, output_tensor)
    print(l)  # loss before this epoch's update
    grads = recorder.gradient(l, params)
    sgd(params, grads, lr)


tf.Tensor(9.398473, shape=(), dtype=float32)
tf.Tensor(9.398473, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)
tf.Tensor(9.398472, shape=(), dtype=float32)


In [43]:
class Embeddings(tf.Module):
    """Container module holding copies of the two weight matrices for export.

    Args:
        W1: Initial value of the first weight matrix.
        W2: Initial value of the second weight matrix.
    """

    def __init__(self, W1, W2):
        # Initialise tf.Module's own state (name, name scope) before adding
        # attributes to the module.
        super().__init__()
        self.W1 = tf.Variable(W1)
        self.W2 = tf.Variable(W2)

# Export the weights as a SavedModel, then reload them straight away so the
# following cells work with the persisted copies.
saved_model_path2 = "./saved_weight_embeddings"
embedding_module = Embeddings(W1, W2)
tf.saved_model.save(embedding_module, saved_model_path2)

loaded_module = tf.saved_model.load(saved_model_path2)
W1_loaded, W2_loaded = loaded_module.W1, loaded_module.W2



INFO:tensorflow:Assets written to: ./saved_weight_embeddings\assets


INFO:tensorflow:Assets written to: ./saved_weight_embeddings\assets


In [45]:
def sentence_embedding(sentence, W1, word_to_index):
    """Average the embeddings of the in-vocabulary words of ``sentence``.

    Args:
        sentence: Raw sentence; it is split on whitespace.
        W1: Embedding matrix with one row per vocabulary index.
        word_to_index: Mapping from vocabulary word to row index of ``W1``.

    Returns:
        Tensor: the mean of the rows of ``W1`` selected by the sentence.

    Raises:
        ValueError: If no word of the sentence is in the vocabulary.
    """
    word_indices = []
    for word in sentence.split():
        # The vocabulary holds lowercased words without trailing periods, so a
        # token that does not match verbatim is retried in that normal form
        # (otherwise "Deep" or "here." would be silently ignored).
        key = word if word in word_to_index else word.lower().rstrip('.')
        if key in word_to_index:
            word_indices.append(word_to_index[key])

    if not word_indices:
        # Averaging an empty selection has no meaningful result; fail clearly.
        raise ValueError("sentence contains no in-vocabulary words")

    embeddings = tf.gather(W1, word_indices)
    sentence_emb = tf.reduce_mean(embeddings, axis=0)
    return sentence_emb

def cosine_similarity(vector_a, vector_b):
    """Cosine similarity: dot product divided by the product of the two norms."""
    numerator = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    denominator = tf.norm(vector_a) * tf.norm(vector_b)
    return numerator / denominator

# Two placeholder sentences; replace them with the texts to compare.
sentence_1 = "Your first test sentence here."
sentence_2 = "Your second test sentence here."

# Embed both sentences with the reloaded W1 and compare the averaged vectors.
embedding_1, embedding_2 = (
    sentence_embedding(text, W1_loaded, word_to_index)
    for text in (sentence_1, sentence_2)
)
similarity_score = cosine_similarity(embedding_1, embedding_2)

print(f"Cosine Similarity between the sentences: {similarity_score.numpy()}")


Cosine Similarity between the sentences: 0.6596184372901917


In [54]:
from sklearn.metrics.pairwise import cosine_similarity

# Extract sentence vectors from 1 to 5 from the input_matrix
# Rows 1..5 of the input matrix as dense vectors.
sentence_vectors = input_matrix[1:6].toarray()

# Here cosine_similarity is the scikit-learn function imported in this cell
# (it shadows the TensorFlow helper of the same name); it returns the full
# pairwise similarity matrix.
similarity_matrix = cosine_similarity(sentence_vectors)

# Report every unordered pair once (upper triangle of the 5x5 matrix).
pair_indices = [(i, j) for i in range(5) for j in range(i + 1, 5)]
for i, j in pair_indices:
    print(f"Cosine Similarity between Sentence {i+1} and Sentence {j+1}: {similarity_matrix[i][j]:.6f}")


Cosine Similarity between Sentence 1 and Sentence 2: 0.000000
Cosine Similarity between Sentence 1 and Sentence 3: 0.123091
Cosine Similarity between Sentence 1 and Sentence 4: 0.000000
Cosine Similarity between Sentence 1 and Sentence 5: 0.000000
Cosine Similarity between Sentence 2 and Sentence 3: 0.000000
Cosine Similarity between Sentence 2 and Sentence 4: 0.000000
Cosine Similarity between Sentence 2 and Sentence 5: 0.000000
Cosine Similarity between Sentence 3 and Sentence 4: 0.000000
Cosine Similarity between Sentence 3 and Sentence 5: 0.000000
Cosine Similarity between Sentence 4 and Sentence 5: 0.080845


In [44]:
def cosine_similarity(vector_a, vector_b):
    """Cosine similarity: dot product divided by the product of the two norms."""
    numerator = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    denominator = tf.norm(vector_a) * tf.norm(vector_b)
    return numerator / denominator

# Print the similarity of every unordered pair of word embeddings with
# vocabulary indices 30..50 (inclusive); a word is never paired with itself.
first_index, last_index = 30, 50
for index_a in range(first_index, last_index + 1):
    for index_b in range(index_a + 1, last_index + 1):
        similarity_score = cosine_similarity(W1_loaded[index_a], W1_loaded[index_b])

        word_a, word_b = unique_words[index_a], unique_words[index_b]

        print(f"Cosine Similarity between '{word_a}' and '{word_b}': {similarity_score.numpy()}")


Cosine Similarity between 'absolutely' and 'absorb': 0.031356293708086014
Cosine Similarity between 'absolutely' and 'absorbable': -0.15239395201206207
Cosine Similarity between 'absolutely' and 'absorbed': 0.19349133968353271
Cosine Similarity between 'absolutely' and 'absorber': -0.07062120735645294
Cosine Similarity between 'absolutely' and 'absorbing': -0.3145138919353485
Cosine Similarity between 'absolutely' and 'absorption': 0.19709327816963196
Cosine Similarity between 'absolutely' and 'absorptive': 0.040041469037532806
Cosine Similarity between 'absolutely' and 'abstain': -0.2023737132549286
Cosine Similarity between 'absolutely' and 'abstract': -0.11921528726816177
Cosine Similarity between 'absolutely' and 'abstracted': -0.022548381239175797
Cosine Similarity between 'absolutely' and 'abstraction': 0.23533833026885986
Cosine Similarity between 'absolutely' and 'abstractly': 0.07346609979867935
Cosine Similarity between 'absolutely' and 'abu': -0.14542092382907867
Cosine Simi

In [55]:
def process_file(filename):
    """Read a UTF-8 text file and return its distinct lines in first-seen order.

    Lines are compared exactly as read (trailing newline included), so no
    stripping or case folding is applied before de-duplication.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: Every distinct line once, ordered by first occurrence.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # dict keys preserve insertion order, so this removes duplicates in O(n).
    # The earlier sorted(set(lines), key=lines.index) rescanned the list for
    # every unique line, which is quadratic on large corpora.
    unique_sentences2 = list(dict.fromkeys(lines))
    return unique_sentences2

# Load the de-duplicated lines of 'shraddha8.txt'; later cells index into this
# list to pick the sentences whose embeddings are compared.
test_sentences = process_file('shraddha8.txt')


In [56]:
def get_sentence_embedding(sentence, W1, vocab):
    """Average the embeddings of the in-vocabulary words of a sentence.

    Args:
        sentence: Text to embed; it is split on whitespace.
        W1: Embedding matrix; ``W1[i].numpy()`` must yield the vector of word i.
        vocab: Mapping from word to its row index in ``W1``.

    Returns:
        The mean embedding vector of the words found in ``vocab``, or ``None``
        when no word of the sentence is in the vocabulary.

    NOTE(review): the lookup is case-sensitive and tokens are not lowercased
    here -- confirm that matches how ``vocab`` was built.
    """
    embedding_sum = None
    valid_word_count = 0
    for word in sentence.split():
        if word not in vocab:  # only consider words that have an embedding
            continue
        word_embedding = W1[vocab[word]].numpy()
        if embedding_sum is None:
            embedding_sum = word_embedding
        else:
            # Add out of place: the first vector may be a view into the weight
            # matrix, and an in-place += would silently overwrite that row.
            embedding_sum = embedding_sum + word_embedding
        valid_word_count += 1

    # Compute the average embedding
    if valid_word_count == 0:
        return None  # caller must handle sentences with no known words
    return embedding_sum / valid_word_count

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two vectors as a scalar tensor.

    Args:
        vector_a: First vector (anything TensorFlow can convert to a tensor).
        vector_b: Second vector, same shape and dtype as ``vector_a``.

    Returns:
        Scalar ``tf.Tensor`` equal to ``dot(a, b) / (||a|| * ||b||)``.
        When either vector has zero norm the result is 0 instead of NaN.
    """
    dot_product = tf.reduce_sum(tf.multiply(vector_a, vector_b))
    norm_product = tf.norm(vector_a) * tf.norm(vector_b)
    # A zero vector makes the denominator 0; divide_no_nan returns 0 there
    # rather than propagating NaN into later comparisons and printouts.
    return tf.math.divide_no_nan(dot_product, norm_product)


In [57]:
# Compare two example sentences by the cosine similarity of their averaged
# word embeddings.
sentence_1 = test_sentences[1]  # Example sentence index
sentence_2 = test_sentences[5]  # Another example sentence index

embedding_1 = get_sentence_embedding(sentence_1, W1_loaded, word_to_index)
embedding_2 = get_sentence_embedding(sentence_2, W1_loaded, word_to_index)

# get_sentence_embedding returns None when a sentence contains no
# in-vocabulary word; report that instead of crashing inside TensorFlow.
if embedding_1 is None or embedding_2 is None:
    print("Cosine Similarity between sentences: undefined (a sentence has no in-vocabulary words)")
else:
    similarity = cosine_similarity(embedding_1, embedding_2)
    print(f"Cosine Similarity between sentences: {similarity.numpy()}")


Cosine Similarity between sentences: -0.2759794294834137


In [63]:
def process_file(filename):
    """Read a UTF-8 text file and return its distinct lines and its vocabulary.

    Lines are de-duplicated exactly as read (trailing newline included). The
    vocabulary is built from whitespace-separated tokens, lowercased.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple[list[str], list[str]]: ``(unique_sentences, words)`` where
        ``unique_sentences`` keeps first-occurrence order and ``words`` is the
        alphabetically sorted list of distinct lowercase tokens.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # dict keys preserve insertion order, so this removes duplicates in O(n).
    # The earlier sorted(set(lines), key=lines.index) rescanned the list for
    # every unique line, which is quadratic on large corpora.
    unique_sentences = list(dict.fromkeys(lines))

    # Extract all unique words, ensuring lowercase representation
    all_words = {
        word.lower()
        for sentence in unique_sentences
        for word in sentence.split()
    }

    return unique_sentences, sorted(all_words)

# Rebuild the training vocabulary: word -> index in the sorted word list.
unique_sentences, unique_words = process_file('new.txt')
original_vocab = {word: idx for idx, word in enumerate(unique_words)}

# ... [Rest of the code to load the model, W1_loaded, etc.]

test_sentences, _ = process_file('shraddha8.txt')

accuracy_count = 0
threshold = 0.1

# Sample every 10th sentence among the first 100 (indices 0, 10, ..., 90) and
# compare it with the sentence that follows it. Starts whose successor does
# not exist are skipped so a short file cannot raise IndexError.
pair_starts = [i for i in range(0, 100, 10) if i + 1 < len(test_sentences)]
for i in pair_starts:
    sentence_a = test_sentences[i]
    sentence_b = test_sentences[i+1]

    # NOTE(review): sentence_embedding is defined in another cell; it is given
    # a fresh NumPy copy of the weights on every call, as before.
    embedding_a = sentence_embedding(sentence_a, W1_loaded.numpy(), original_vocab)
    embedding_b = sentence_embedding(sentence_b, W1_loaded.numpy(), original_vocab)

    similarity = cosine_similarity(embedding_a, embedding_b)
    if similarity > threshold:
        accuracy_count += 1

# "Accuracy" here is the share of compared pairs whose similarity exceeds the
# threshold; the denominator follows the number of pairs actually compared.
accuracy = accuracy_count / len(pair_starts) if pair_starts else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 30.00%


In [64]:
# Print the similarity of each of the first 100 sentences with its successor.
# The upper bound is capped by the actual number of sentences so that i+1 never
# runs past the end of the list, even when fewer than 100 sentences were loaded.
pair_count = min(99, len(test_sentences) - 1)
for i in range(pair_count):
    sentence_a = test_sentences[i]
    sentence_b = test_sentences[i+1]

    embedding_a = sentence_embedding(sentence_a, W1_loaded.numpy(), original_vocab)
    embedding_b = sentence_embedding(sentence_b, W1_loaded.numpy(), original_vocab)

    similarity = cosine_similarity(embedding_a, embedding_b)
    print(f"Cosine Similarity between sentence {i} and {i+1}: {similarity:.4f}")

Cosine Similarity between sentence 0 and 1: 0.0524
Cosine Similarity between sentence 1 and 2: -0.0010
Cosine Similarity between sentence 2 and 3: 0.0012
Cosine Similarity between sentence 3 and 4: 0.0347
Cosine Similarity between sentence 4 and 5: 0.0345
Cosine Similarity between sentence 5 and 6: -0.2966
Cosine Similarity between sentence 6 and 7: 0.4724
Cosine Similarity between sentence 7 and 8: 0.2890
Cosine Similarity between sentence 8 and 9: 0.0036
Cosine Similarity between sentence 9 and 10: -0.1691
Cosine Similarity between sentence 10 and 11: 0.3095
Cosine Similarity between sentence 11 and 12: 0.3289
Cosine Similarity between sentence 12 and 13: 0.2594
Cosine Similarity between sentence 13 and 14: 0.1393
Cosine Similarity between sentence 14 and 15: 0.1190
Cosine Similarity between sentence 15 and 16: -0.0497
Cosine Similarity between sentence 16 and 17: 0.4294
Cosine Similarity between sentence 17 and 18: 0.5102
Cosine Similarity between sentence 18 and 19: -0.2344
Cosine 

Third method: we can backpropagate and update the weights one vector at a time instead of using the whole matrix. For my training dataset this is computationally expensive, so the code below is only a small example.

In [66]:
import numpy as np

class Word2VecSkipgram:
    """Minimal NumPy skip-gram style model trained one sample at a time.

    The model is two dense layers without biases: an input matrix of shape
    ``(vocab_size, embedding_dim)`` and an output matrix of shape
    ``(embedding_dim, vocab_size)``, followed by a softmax. Weights are
    updated with plain stochastic gradient descent on the cross-entropy loss.
    """

    def __init__(self, embedding_dim, learning_rate):
        """Store the hyperparameters; weights are created by ``train``.

        Args:
            embedding_dim: Size of the hidden (embedding) layer.
            learning_rate: Step size used for every gradient update.
        """
        self.learning_rate = learning_rate
        self.embedding_dim = embedding_dim
        self.W_input = None  # Input (target) matrix
        self.W_output = None  # Output (context) matrix

    def initialize_weights(self, vocab_size):
        """Initialize both weight matrices with uniform random values in [0, 1).

        Args:
            vocab_size: Number of entries in the vocabulary.
        """
        self.W_input = np.random.rand(vocab_size, self.embedding_dim)
        self.W_output = np.random.rand(self.embedding_dim, vocab_size)

    def softmax(self, x):
        """Compute the softmax of vector ``x``.

        The maximum is subtracted before exponentiating so that large scores
        do not overflow; this does not change the result.
        """
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def forward_pass(self, input_vector):
        """Run one forward propagation step.

        Args:
            input_vector: Vector of length ``vocab_size``.

        Returns:
            tuple: ``(y_pred, h, u)`` -- the softmax probabilities, the hidden
            activation of length ``embedding_dim`` and the raw output scores.
        """
        h = np.dot(self.W_input.T, input_vector)
        u = np.dot(self.W_output.T, h)
        y_pred = self.softmax(u)
        return y_pred, h, u

    def backward_pass(self, input_vector, h, u, y_pred, y_true):
        """Backpropagate one sample and update both weight matrices in place.

        Args:
            input_vector: The vector that was fed to ``forward_pass``.
            h: Hidden activation returned by ``forward_pass``.
            u: Raw scores returned by ``forward_pass`` (not used here; kept
                for interface compatibility).
            y_pred: Predicted probabilities returned by ``forward_pass``.
            y_true: Target vector of length ``vocab_size``.
        """
        # Gradient of softmax + cross-entropy with respect to the scores
        e = y_pred - y_true
        dW_output = np.outer(h, e)
        # Computed before W_output is modified so both gradients refer to the
        # same set of weights.
        dW_input = np.outer(input_vector, np.dot(self.W_output, e))

        # Update weights
        self.W_output -= self.learning_rate * dW_output
        self.W_input -= self.learning_rate * dW_input

    def compute_loss(self, y_true, y_pred):
        """Compute the cross-entropy loss ``-sum(y_true * log(y_pred))``.

        Positions where ``y_true`` is 0 contribute exactly 0, following the
        usual ``0 * log(0) = 0`` convention. Without this, a predicted
        probability that underflows to 0 at such a position produced
        ``0 * -inf = NaN`` and poisoned the whole epoch loss.

        Args:
            y_true: Target vector.
            y_pred: Predicted probabilities, broadcastable with ``y_true``.

        Returns:
            The scalar loss (``inf`` if a labelled position has probability 0).
        """
        y_true = np.asarray(y_true)
        # Replace ignored probabilities by 1 so that their log is exactly 0.
        safe_pred = np.where(y_true != 0, y_pred, 1.0)
        return -np.sum(np.log(safe_pred) * y_true)

    def train(self, input_matrix, output_matrix,epochs):
        """Train the model, printing the summed loss after every epoch.

        Weights are re-initialized on every call. Each row of ``input_matrix``
        is one training sample and the row with the same index in
        ``output_matrix`` is its target.

        Args:
            input_matrix: 2-D array of shape ``(n_samples, vocab_size)``.
            output_matrix: 2-D array with one target row per input row.
            epochs: Number of passes over the samples.
        """
        # The vocabulary size is the width of the input rows
        vocab_size = input_matrix.shape[1]
        self.initialize_weights(vocab_size)

        for epoch in range(epochs):
            epoch_loss = 0
            for i, vec in enumerate(input_matrix):
                y_true = output_matrix[i]
                input_vector = vec.T
                y_pred, h, u = self.forward_pass(input_vector)
                self.backward_pass(input_vector, h, u, y_pred, y_true)
                # The loss uses the prediction made before this sample's update
                epoch_loss += self.compute_loss(y_true, y_pred)
            print(f"Epoch {epoch + 1}/{epochs} - Loss: {epoch_loss}")


In [67]:
# Toy example for the Word2VecSkipgram class.

# Each row of input_matrix is one training sample over a 4-entry vocabulary;
# the row with the same index in output_matrix is its target.
input_matrix = np.array([
    [1, 0, 0, 0], 
    [0, 1, 0, 0]
])
output_matrix = np.array([
    [0, 1, 0, 0],
    [1, 0, 0, 0]
])


# train() re-initializes the weights randomly and prints the loss per epoch.
model = Word2VecSkipgram(embedding_dim=3, learning_rate=0.01)
model.train(input_matrix, output_matrix, epochs=1000)



Epoch 1/1000 - Loss: 3.0829096226191
Epoch 2/1000 - Loss: 3.0713532792541844
Epoch 3/1000 - Loss: 3.0599642909220717
Epoch 4/1000 - Loss: 3.0487363190761467
Epoch 5/1000 - Loss: 3.0376632082660366
Epoch 6/1000 - Loss: 3.0267389795814115
Epoch 7/1000 - Loss: 3.0159578243618514
Epoch 8/1000 - Loss: 3.0053140981633684
Epoch 9/1000 - Loss: 2.994802314972519
Epoch 10/1000 - Loss: 2.9844171416593275
Epoch 11/1000 - Loss: 2.9741533926605785
Epoch 12/1000 - Loss: 2.9640060248853204
Epoch 13/1000 - Loss: 2.953970132834721
Epoch 14/1000 - Loss: 2.9440409439287
Epoch 15/1000 - Loss: 2.934213814032044
Epoch 16/1000 - Loss: 2.924484223172986
Epoch 17/1000 - Loss: 2.914847771447467
Epoch 18/1000 - Loss: 2.905300175102594
Epoch 19/1000 - Loss: 2.895837262793013
Epoch 20/1000 - Loss: 2.886454972004173
Epoch 21/1000 - Loss: 2.877149345636693
Epoch 22/1000 - Loss: 2.8679165287462363
Epoch 23/1000 - Loss: 2.8587527654335383
Epoch 24/1000 - Loss: 2.8496543958794205
Epoch 25/1000 - Loss: 2.8406178535198188

Fourth (last) method: use the full matrices to compute the loss and update the weights.

In [69]:
# Densify the sparse TF-IDF matrix once and build both float32 tensors from it;
# calling .toarray() twice materialized the full dense matrix two times.
tf_idf_dense = tf_idf_matrix.toarray()
context_matrix = tf.convert_to_tensor(tf_idf_dense, dtype=tf.float32)
target_matrix = tf.convert_to_tensor(tf_idf_dense, dtype=tf.float32)
del tf_idf_dense  # do not keep the temporary dense copy alive in the kernel

In [74]:
# Sanity check: the three matrices used for training should share one shape.
# NOTE(review): input_matrix is not created in the preceding cell, so this
# relies on state from a cell executed earlier in the kernel -- confirm.
print(f"input_matrix shape: {input_matrix.shape}")
print(f"target_matrix shape: {target_matrix.shape}")
    
print(f"context_matrix shape: {context_matrix.shape}")
        
    
    


input_matrix shape: (29442, 12070)
target_matrix shape: (29442, 12070)
context_matrix shape: (29442, 12070)


In [76]:
import numpy as np
from scipy.sparse import spmatrix
import tensorflow as tf

def softmax(X):
    """Row-wise softmax of a 2-D array or tensor.

    Args:
        X: 2-D scores; axis 1 is normalized.

    Returns:
        ``tf.Tensor`` of the same shape whose rows sum to 1.
    """
    # Subtracting each row's maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing to inf (and the ratio from becoming NaN)
    # when the scores are large.
    X_shifted = X - tf.reduce_max(X, axis=1, keepdims=True)
    X_exp = tf.exp(X_shifted)
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition

# Assuming you have 29442 unique sentences and 12070 unique words
unique_sentences = 29442
unique_words = 12070

# NOTE(review): `output_matrix` (the training targets) is not created in this
# cell; it must already exist in the kernel. Convert it to a dense array once,
# outside the epoch loop, and fail fast if it does not match the model shape.
if isinstance(output_matrix, spmatrix):
    output_matrix_dense = output_matrix.toarray()
else:
    output_matrix_dense = np.asarray(output_matrix)

expected_shape = (unique_sentences, unique_words)
if output_matrix_dense.shape != expected_shape:
    raise ValueError(
        f"output_matrix has shape {output_matrix_dense.shape}, expected {expected_shape}"
    )

# Random initialization of matrices
input_matrix = np.random.rand(unique_sentences, unique_words)
target_matrix = np.random.rand(unique_sentences, unique_words)
context_matrix = np.random.rand(unique_sentences, unique_words)


# Hyperparameters
learning_rate = 0.01
epochs = 100
convergence_threshold = 1e-5

# Initialize previous loss to infinity for convergence checking
previous_loss = float('inf')

# Start the training loop
for epoch in range(epochs):
    # Predictions and hidden layer computation for the entire batch
    hidden_matrix = np.matmul(input_matrix, np.transpose(target_matrix))
    output_matrix_predicted = np.matmul(hidden_matrix, context_matrix)
    # softmax returns a TensorFlow tensor; continue in NumPy from here on
    predicted_probabilities = np.asarray(softmax(output_matrix_predicted))

    # Cross-entropy per sentence: row-wise dot product of -log(p) with the
    # targets. einsum does this without a Python loop or a full-size temporary.
    log_probs = -np.log(predicted_probabilities + 1e-7)
    loss_per_element = np.einsum('ij,ij->i', log_probs, output_matrix_dense)

    total_loss = np.sum(loss_per_element)

    # Gradient computation (uses the dense targets, so sparse input also works)
    dz = predicted_probabilities - output_matrix_dense
    dz_transpose = np.transpose(dz)
    delta_matrix = np.matmul(context_matrix, dz_transpose)
    d_target_matrix = np.matmul(delta_matrix, input_matrix)
    d_context_matrix = np.matmul(np.transpose(hidden_matrix), dz)

    # Update matrices
    target_matrix -= learning_rate * d_target_matrix
    context_matrix -= learning_rate * d_context_matrix

    # Print the average loss per matrix entry (sentences x words) for this epoch
    avg_loss = total_loss / (unique_sentences*unique_words)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")

    # Check for convergence
    if abs(previous_loss - avg_loss) < convergence_threshold:
        print("Convergence reached!")
        break
    previous_loss = avg_loss
