In [4]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
import numpy as np


In [None]:

def extract_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def encode_texts(texts, model_name='all-MiniLM-L6-v2'):
    """Embed ``texts`` with the named SentenceTransformer model.

    A new ``SentenceTransformer(model_name)`` is constructed on every call and
    whatever ``encode(texts, convert_to_tensor=True)`` produces is returned
    unchanged.
    """
    return SentenceTransformer(model_name).encode(texts, convert_to_tensor=True)


In [46]:

def save_to_csv(file_path, texts, embeddings):
    """Write text chunks and their embeddings to a CSV file.

    Args:
        file_path: Destination CSV path.
        texts: Sequence of text chunks.
        embeddings: Iterable with one embedding per chunk; each item must
            provide ``.tolist()``.

    The CSV has the columns ``Index``, ``Text`` and ``Embeddings``. Each
    embedding is held as a Python list, which ``to_csv`` writes out in its
    string form. A summary of the frame is printed before it is written.
    """
    df = pd.DataFrame({
        "Index": list(range(len(texts))),
        "Text": texts,
        "Embeddings": [emb.tolist() for emb in embeddings],
    })
    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()
    df.to_csv(file_path, index=False)


In [49]:
def save_to_csv(file_path, texts, embeddings):
    """Write text chunks and their JSON-serialized embeddings to a CSV file.

    Args:
        file_path: Destination CSV path.
        texts: Sequence of text chunks.
        embeddings: Iterable with one embedding per chunk; each item must
            provide ``.tolist()``.

    The CSV has the columns ``Index``, ``Text`` and ``Embeddings``; every
    embedding is stored as a JSON array string, so it can be read back with
    ``json.loads``. A summary of the frame is printed before it is written.
    """
    # Imported locally: the notebook only imports json in a later cell, so the
    # function would otherwise depend on cell execution order.
    import json

    # Serialize embeddings as JSON
    serialized_embeddings = [json.dumps(emb.tolist()) for emb in embeddings]

    df = pd.DataFrame({
        "Index": list(range(len(texts))),
        "Text": texts,
        "Embeddings": serialized_embeddings,
    })
    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()
    df.to_csv(file_path, index=False)

In [50]:

def process_and_save_text(file_path, output_csv, chunk_size=500, chunk_overlap=200):
    """Split a text file into chunks, embed them and save both to a CSV.

    Args:
        file_path: Path of the text file to read (via ``extract_text``).
        output_csv: Path of the CSV to write (via ``save_to_csv``).
        chunk_size: Target chunk size passed to ``CharacterTextSplitter``.
            Defaults to the previously hard-coded 500.
        chunk_overlap: Overlap passed to ``CharacterTextSplitter``.
            Defaults to the previously hard-coded 200.

    Note:
        The splitter may emit chunks longer than ``chunk_size``; the recorded
        run of this notebook logged chunks of 706 and 2283 characters.
    """
    text = extract_text(file_path)

    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)

    embeddings = encode_texts(chunks)

    save_to_csv(output_csv, chunks, embeddings)
    print(f"Chunks and embeddings saved to {output_csv}")


In [52]:
import json

# Input and output both live under the same per-site directory.
site_dir = "extracted_html_files/saivineethkumar.github.io"

input_file_path = f"{site_dir}/extracted_text.txt"
output_csv_path = f"{site_dir}/chunks_and_embeddings.csv"

# Chunk the extracted text, embed the chunks and write them to the CSV.
process_and_save_text(input_file_path, output_csv_path)

Created a chunk of size 706, which is longer than the specified 500
Created a chunk of size 2283, which is longer than the specified 500


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Index       15 non-null     int64 
 1   Text        15 non-null     object
 2   Embeddings  15 non-null     object
dtypes: int64(1), object(2)
memory usage: 492.0+ bytes
None
Chunks and embeddings saved to extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv


In [53]:
import pandas as pd

# Read the saved chunks/embeddings CSV back into a DataFrame.
file = pd.read_csv('extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv')
# Print the column summary (names, non-null counts, dtypes) of the reloaded frame.
file.info()
# file.shape, file.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Index       15 non-null     int64 
 1   Text        15 non-null     object
 2   Embeddings  15 non-null     object
dtypes: int64(1), object(2)
memory usage: 492.0+ bytes


# FAISS Tests

In [66]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer

import json

csv_file = 'extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv'
df = pd.read_csv(csv_file)
texts = df['Text'].tolist()

# The Embeddings column holds JSON arrays (save_to_csv writes them with
# json.dumps), so parse them with json.loads; eval() would execute whatever
# expression happens to be stored in the CSV.
# FAISS operates on C-contiguous float32 matrices, so build one explicitly.
embeddings = np.ascontiguousarray(
    [json.loads(cell) for cell in df["Embeddings"]], dtype=np.float32
)

model = SentenceTransformer('all-MiniLM-L6-v2')

# Build a FAISS index over the stored chunk embeddings
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)  # Using L2 (Euclidean) distance
index.add(embeddings)  # Add embeddings to the index

# Function to perform search and retrieve original text
def search(query_text, texts, k=3):
    """Return the stored texts nearest to ``query_text`` and their distances.

    Embeds the query with the notebook-level ``model`` and looks up neighbours
    in the notebook-level FAISS ``index``.

    Args:
        query_text: Query string to embed.
        texts: Texts in the same order as the vectors added to ``index``.
        k: Maximum number of neighbours to return.

    Returns:
        ``(matches, distances)``: the matching texts and the distances FAISS
        reported for them, in the order FAISS returned them. Fewer than ``k``
        entries are returned when the index holds fewer than ``k`` vectors.
    """
    query_embedding = model.encode([query_text], convert_to_tensor=False)
    D, I = index.search(query_embedding, k)  # Perform search
    # FAISS pads missing neighbours with label -1; texts[-1] would silently
    # return the last text, so keep only real hits.
    found = I[0] >= 0
    return [texts[i] for i in I[0][found]], D[0][found]

# Example query
query = "Tell me something about me"
similar_texts, distances = search(query, texts)

# One match per group: text, distance, a 20-star rule, then a blank line.
for text, distance in zip(similar_texts, distances):
    print(f"Text: {text}", f"Distance: {distance}", "*"*20, "", sep="\n")
# print("Similar Texts:", similar_texts)
# print("Distances:", distances)



Text: ### PARAGRAPHS ###


       I am a seasoned software developer driven by a passion for creating transformative products that positively impact the world
Distance: 1.4239776134490967
********************

Text: - 


-
Distance: 1.5956523418426514
********************

Text: A seasoned software developer with a knack for crafting innovative solutions, I am currently pursuing a Master's in Computer Science from UMass Amherst to further enhance my expertise in the ever-evolving realm of technology. With a passion for problem-solving and a keen eye for detail, I am eager to transition into a full-time software engineer role starting in May 2024. My academic background has equipped me with a solid foundation in programming languages, algorithms, and data structures, while my practical experience has honed my ability to design, develop, and test complex software systems. I thrive in collaborative environments and am always eager to learn new technologies.
Distance: 1.5979716777801514
**

Euclidean

In [69]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

import json

# Load data
csv_file = 'extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv'
df = pd.read_csv(csv_file)
texts = df['Text'].tolist()

# The Embeddings column holds JSON arrays (save_to_csv writes them with
# json.dumps), so parse them with json.loads; eval() would execute whatever
# expression happens to be stored in the CSV.
# FAISS operates on C-contiguous float32 matrices, so build one explicitly.
embeddings = np.ascontiguousarray(
    [json.loads(cell) for cell in df["Embeddings"]], dtype=np.float32
)

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Build a FAISS Index using Euclidean distance
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)  # Using L2 (Euclidean) distance
index.add(embeddings)  # Add embeddings to the index

# Function to perform search and retrieve original text
def search_euclidean(query_text, texts, k=3):
    """Return the stored texts nearest to ``query_text`` by L2 distance.

    Embeds the query with the notebook-level ``model`` and looks up neighbours
    in the notebook-level FAISS ``index``.

    Args:
        query_text: Query string to embed.
        texts: Texts in the same order as the vectors added to ``index``.
        k: Maximum number of neighbours to return.

    Returns:
        ``(matches, distances)``: the matching texts and the distances FAISS
        reported for them, in the order FAISS returned them. Fewer than ``k``
        entries are returned when the index holds fewer than ``k`` vectors.
    """
    query_embedding = model.encode([query_text], convert_to_tensor=False)
    D, I = index.search(query_embedding, k)  # Perform search
    # FAISS pads missing neighbours with label -1; texts[-1] would silently
    # return the last text, so keep only real hits.
    found = I[0] >= 0
    return [texts[i] for i in I[0][found]], D[0][found]

# Example query
query = "Tell me something about me"
similar_texts, distances = search_euclidean(query, texts)

# One match per group: text, distance, a 40-star rule, then a blank line.
for text, distance in zip(similar_texts, distances):
    print(f"Text: {text}", f"Distance: {distance}", "*"*40, "", sep="\n")


Text: ### PARAGRAPHS ###


       I am a seasoned software developer driven by a passion for creating transformative products that positively impact the world
Distance: 1.4239776134490967
****************************************

Text: - 


-
Distance: 1.5956523418426514
****************************************

Text: A seasoned software developer with a knack for crafting innovative solutions, I am currently pursuing a Master's in Computer Science from UMass Amherst to further enhance my expertise in the ever-evolving realm of technology. With a passion for problem-solving and a keen eye for detail, I am eager to transition into a full-time software engineer role starting in May 2024. My academic background has equipped me with a solid foundation in programming languages, algorithms, and data structures, while my practical experience has honed my ability to design, develop, and test complex software systems. I thrive in collaborative environments and am always eager to learn new techn

Inner product

In [1]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

import json

# Load data
csv_file = 'extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv'
df = pd.read_csv(csv_file)
texts = df['Text'].tolist()

# The Embeddings column holds JSON arrays (save_to_csv writes them with
# json.dumps), so parse them with json.loads; eval() would execute whatever
# expression happens to be stored in the CSV.
# FAISS operates on C-contiguous float32 matrices, so build one explicitly.
embeddings = np.ascontiguousarray(
    [json.loads(cell) for cell in df["Embeddings"]], dtype=np.float32
)

# Normalize embeddings to unit length so that the inner product below is the
# cosine similarity. All-zero rows are left as they are instead of being
# divided by zero (which would fill them with NaN).
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
embeddings /= norms

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Build a FAISS Index using Inner Product (dot product)
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatIP(d)  # Using Inner Product
index.add(embeddings)  # Add embeddings to the index

# Function to perform search and retrieve original text
def search_inner_product(query_text, texts, k=3):
    """Return the stored texts with the highest inner product to the query.

    Embeds the query with the notebook-level ``model``, scales it to unit
    length, and looks up neighbours in the notebook-level FAISS ``index``.

    Args:
        query_text: Query string to embed.
        texts: Texts in the same order as the vectors added to ``index``.
        k: Maximum number of neighbours to return.

    Returns:
        ``(matches, scores)``: the matching texts and the inner-product scores
        FAISS reported for them, in the order FAISS returned them. Fewer than
        ``k`` entries are returned when the index holds fewer than ``k``
        vectors.
    """
    query_embedding = model.encode([query_text], convert_to_tensor=False)
    query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalize the query embedding
    D, I = index.search(query_embedding, k)  # Perform search
    # FAISS pads missing neighbours with label -1; texts[-1] would silently
    # return the last text, so keep only real hits.
    found = I[0] >= 0
    return [texts[i] for i in I[0][found]], D[0][found]

# Example query
query = "Tell me something about me"
similar_texts, distances = search_inner_product(query, texts)

# One match per group: text, score (labelled "Distance"), a 40-star rule, then a blank line.
for text, distance in zip(similar_texts, distances):
    print(f"Text: {text}", f"Distance: {distance}", "*"*40, "", sep="\n")


  from tqdm.autonotebook import tqdm, trange


: 

In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


In [6]:
def load_embeddings(csv_file):
    """Load chunk texts and their embeddings from a CSV file.

    Args:
        csv_file: Path to a CSV with a ``Text`` column and an ``Embeddings``
            column whose cells are JSON arrays of numbers.

    Returns:
        ``(texts, embeddings)``: the list of texts and a C-contiguous float32
        array with one row per text.

    Raises:
        ValueError: If an ``Embeddings`` cell is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Local import so the function does not depend on which cell imported json.
    import json

    df = pd.read_csv(csv_file)
    texts = df["Text"].tolist()
    # json.loads only parses data; eval() would execute any expression stored
    # in the file.
    vectors = [json.loads(cell) for cell in df["Embeddings"]]
    # float32 + contiguous is the layout FAISS consumes.
    return texts, np.ascontiguousarray(vectors, dtype=np.float32)

In [23]:
def encode_query(query, model_name='all-MiniLM-L6-v2'):
    """Embed a single query string and return the result as a NumPy array.

    A new ``SentenceTransformer(model_name)`` is constructed on every call.
    The query is encoded as a one-element batch with ``convert_to_tensor=True``
    and the result is moved to the CPU before being converted with ``.numpy()``.
    """
    encoder = SentenceTransformer(model_name)
    as_tensor = encoder.encode([query], convert_to_tensor=True)
    host_tensor = as_tensor.cpu()
    return host_tensor.numpy()


In [28]:
def find_top_k_similar_texts(query_embedding, texts, embeddings, k=3):
    """Return up to ``k`` texts whose embeddings are nearest to the query (L2).

    Args:
        query_embedding: 2-D array holding the query vector(s); only the
            results for the first row are used.
        texts: Texts in the same order as the rows of ``embeddings``.
        embeddings: 2-D array with one row per text.
        k: Maximum number of neighbours to return.

    Returns:
        List of matching texts in the order FAISS returned them. Fewer than
        ``k`` are returned when ``embeddings`` has fewer than ``k`` rows.

    Side effects:
        Prints the raw distance and label arrays returned by FAISS.
    """
    # Cosine alternative (not used here): L2-normalize both inputs with
    # faiss.normalize_L2 and search a faiss.IndexFlatIP instead.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    D, I = index.search(query_embedding, k)
    print(D, I)
    # FAISS pads missing neighbours with label -1; texts[-1] would silently
    # return the last text, so keep only real hits.
    top_k_texts = [texts[i] for i in I[0] if i >= 0]
    return top_k_texts

In [29]:
csv_file = 'extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv'
texts, embeddings = load_embeddings(csv_file)

# Embed the user's query
query = "Hello tell me something about me"
query_embedding = encode_query(query)

# Retrieve the 3 nearest chunks
top_k_texts = find_top_k_similar_texts(query_embedding, texts, embeddings, k=3)

# Print the matches with 1-based ranks
print("Top 3 similar texts:")
for rank, text in enumerate(top_k_texts, start=1):
    print(f"Rank {rank}: {text}")

[[1.4210975 1.5832176 1.6176374]] [[0 4 1]]
Top 3 similar texts:
Rank 1: ### PARAGRAPHS ###


       I am a seasoned software developer driven by a passion for creating transformative products that positively impact the world
Rank 2: ### HEADINGS ###


      Contact Me
     

       About
      

      Experience
     

       Skills
      

      Projects
     

        Why hire me?  ↓
       

        Hobbies  ↓
       

          Software Engineer, Samsung Research
         

          SWE Intern, Samsung Research
         

          SWE Intern, Tika-Data
         

         Languages
        

         Tools / Frameworks
        

         Machine Learning / Data Science
        

         Miscellaneous
Rank 3: A seasoned software developer with a knack for crafting innovative solutions, I am currently pursuing a Master's in Computer Science from UMass Amherst to further enhance my expertise in the ever-evolving realm of technology. With a passion for problem-solving and a keen ey

In [2]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [3]:


def load_embeddings(csv_file):
    """Load chunk texts and their embeddings from a CSV file.

    Args:
        csv_file: Path to a CSV with a ``Text`` column and an ``Embeddings``
            column whose cells are JSON arrays of numbers.

    Returns:
        ``(texts, embeddings)``: the list of texts and a C-contiguous float32
        array with one row per text.

    Raises:
        ValueError: If an ``Embeddings`` cell is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Local import so the function does not depend on which cell imported json.
    import json

    print("loadembeddings")
    df = pd.read_csv(csv_file)
    texts = df["Text"].tolist()
    # json.loads only parses data; eval() would execute any expression stored
    # in the file.
    vectors = [json.loads(cell) for cell in df["Embeddings"]]
    return texts, np.ascontiguousarray(vectors, dtype=np.float32)

def encode_query(query, model_name='all-MiniLM-L6-v2'):
    """Embed a single query string and return a C-contiguous float32 array.

    A new ``SentenceTransformer(model_name)`` is constructed on every call.
    The query is encoded as a one-element batch with ``convert_to_tensor=True``,
    moved to the CPU and converted with ``.numpy()`` before the final cast.
    """
    print("encode_query")

    encoder = SentenceTransformer(model_name)
    as_tensor = encoder.encode([query], convert_to_tensor=True)
    host_array = as_tensor.cpu().numpy()
    return np.ascontiguousarray(host_array, dtype=np.float32)

def find_top_k_similar_texts(query_embedding, texts, embeddings, k=3, metric='euclidean'):
    """Return up to ``k`` texts nearest to the query, with their scores.

    Args:
        query_embedding: 2-D array holding the query vector(s); only the
            results for the first row are used.
        texts: Texts in the same order as the rows of ``embeddings``.
        embeddings: 2-D array with one row per text.
        k: Maximum number of neighbours to return.
        metric: ``'cosine'`` searches L2-normalized vectors by inner product;
            any other value uses L2 (Euclidean) distance.

    Returns:
        ``(top_k_texts, scores)``: the matching texts and the values FAISS
        reported for them, in the order FAISS returned them. Fewer than ``k``
        entries are returned when ``embeddings`` has fewer than ``k`` rows.
        Neither input array is modified.
    """
    print("find_toop")
    if metric == 'cosine':
        # faiss.normalize_L2 rescales its argument in place. Normalize private
        # copies so the caller's arrays keep their original values; otherwise
        # a later 'euclidean' call would silently run on normalized data.
        embeddings = np.array(embeddings, dtype=np.float32, order='C')
        query_embedding = np.array(query_embedding, dtype=np.float32, order='C')
        faiss.normalize_L2(embeddings)
        faiss.normalize_L2(query_embedding)
        index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner Product (cosine similarity)
    else:
        index = faiss.IndexFlatL2(embeddings.shape[1])  # Euclidean distance

    index.add(embeddings)
    D, I = index.search(query_embedding, k)
    # FAISS pads missing neighbours with label -1; texts[-1] would silently
    # return the last text, so keep only real hits.
    found = I[0] >= 0
    top_k_texts = [texts[i] for i in I[0][found]]
    return top_k_texts, D[0][found]



In [3]:

# Query text; its embedding is computed in a separate cell.
query = "Hello tell me something about me"
# query_embedding = encode_query(query)

# Distance metric for the search: 'cosine' or 'euclidean'
metric = 'cosine'
# metric = 'euclidean'

# Stored chunks and their embeddings
csv_file = 'extracted_html_files/saivineethkumar.github.io/chunks_and_embeddings.csv'
texts, embeddings = load_embeddings(csv_file)

# top_k_texts, distances = find_top_k_similar_texts(query_embedding, texts, embeddings, k=3, metric=metric)



loadembeddings


In [4]:
# Replace the query text and embed it with encode_query.
query = "who am I"
query_embedding = encode_query(query)


encode_query


: 