In [1]:
import pandas as pd
# Toy corpus: one [sentence, category label] pair per row.
data = [
    ['Where are your headquarters located?', 'location'],
    ['Throw my cellphone in the water', 'random'],
    ['Network Access Control?', 'networking'],
    ['Address', 'location'],
]

# Tabular view of the corpus with named columns.
df = pd.DataFrame(data=data, columns=['text', 'category'])


from sentence_transformers import SentenceTransformer
# The corpus sentences (the `text` column of `df`); converted to a list and
# encoded further down in this cell.
text = df['text']
import faiss
import numpy as np


# Already-constructed SentenceTransformer objects, keyed by model name.
# Constructing the model is by far the most expensive step, and this helper is
# called once for the corpus and again for every query.
_ENCODER_CACHE = {}


def generate_encodings(text_list, model_name="paraphrase-mpnet-base-v2"):
    """Encode text with a (cached) SentenceTransformer model.

    Args:
        text_list: The sentence(s) to embed. Passed to
            ``SentenceTransformer.encode`` unchanged, so a single string and a
            list of strings are both accepted.
        model_name: Name of the sentence-transformers model to use. The model
            is constructed on the first call for a given name and reused on
            later calls.

    Returns:
        Whatever ``encode`` returns for ``text_list``.
    """
    encoder = _ENCODER_CACHE.get(model_name)
    if encoder is None:
        encoder = SentenceTransformer(model_name)
        _ENCODER_CACHE[model_name] = encoder
    return encoder.encode(text_list)

def build_index(vectors):
    """Build a flat L2 FAISS index over L2-normalised copies of ``vectors``.

    Args:
        vectors: 2-D array-like of embeddings, one row per item. The argument
            itself is left unmodified.

    Returns:
        A ``faiss.IndexFlatL2`` containing one normalised vector per input
        row, in input order.
    """
    # normalize_L2 rewrites its argument in place, so work on a private
    # float32, C-contiguous copy instead of silently changing the caller's
    # array (and coerce other dtypes to the float32 that FAISS operates on).
    normalized = np.array(vectors, dtype=np.float32, order="C")
    vector_dimension = normalized.shape[1]
    index = faiss.IndexFlatL2(vector_dimension)
    faiss.normalize_L2(normalized)
    index.add(normalized)
    return index
    

# Embed every corpus sentence and build the FAISS index from the embeddings.
vectors = generate_encodings( list(text))
index = build_index(vectors)


# Free-text query to match against the corpus.
search_text = 'where is your office?'
# The query goes through generate_encodings() too, so it is embedded with the
# same default model as the corpus.

search_vector = generate_encodings(search_text)
# Wrapping in a list adds a leading axis, giving a batch containing one query.
_vector = np.array([search_vector])
# Called for its in-place effect on `_vector` (the return value is not used);
# build_index() applies the same normalisation to the corpus vectors.
faiss.normalize_L2(_vector)
# Ask for as many neighbours as the index holds, i.e. rank the whole corpus.
k = index.ntotal
distances, ann = index.search(_vector, k=k)

# Row 0 of each result array belongs to the single query in the batch.
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
labels  = df['category']
# Category of the first returned hit; not read again in this cell.
category = labels[ann[0][0]]
# Attach text/category to each hit by matching `ann` against df's index.
merge = pd.merge(results, df, left_on='ann', right_index=True)


print(merge['text'])

In [2]:
# Display the corpus DataFrame (columns `text` and `category`).
df

Unnamed: 0,text,category
0,Where are your headquarters located?,location
1,Throw my cellphone in the water,random
2,Network Access Control?,networking
3,Address,location


In [4]:
# NOTE(review): this cell repeats the index-and-query statements of the first
# cell; it relies on `text`, `df`, `generate_encodings` and `build_index`
# already being defined in the kernel.
vectors = generate_encodings( list(text))
index = build_index(vectors)


# Free-text query to match against the corpus.
search_text = 'where is your office?'
# The query goes through generate_encodings() too, so it is embedded with the
# same default model as the corpus.

search_vector = generate_encodings(search_text)
# Wrapping in a list adds a leading axis, giving a batch containing one query.
_vector = np.array([search_vector])
# Called for its in-place effect on `_vector` (the return value is not used).
faiss.normalize_L2(_vector)
# Ask for as many neighbours as the index holds, i.e. rank the whole corpus.
k = index.ntotal
distances, ann = index.search(_vector, k=k)

# Row 0 of each result array belongs to the single query in the batch.
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
labels  = df['category']
# Category of the first returned hit; not read again in this cell.
category = labels[ann[0][0]]
# Attach text/category to each hit by matching `ann` against df's index.
merge = pd.merge(results, df, left_on='ann', right_index=True)


print(merge['text'])

In [24]:

 
# Query-only cell: `index`, `df` and generate_encodings() are not defined here
# and must already exist in the kernel from an earlier cell.
search_text = 'where is your office?'
# The query is embedded with generate_encodings() and its default model.

search_vector = generate_encodings(search_text)
# Wrapping in a list adds a leading axis, giving a batch containing one query.
_vector = np.array([search_vector])
# Called for its in-place effect on `_vector` (the return value is not used).
faiss.normalize_L2(_vector)
# Ask for as many neighbours as the index holds, i.e. rank the whole corpus.
k = index.ntotal
distances, ann = index.search(_vector, k=k)

# Row 0 of each result array belongs to the single query in the batch.
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
labels  = df['category']
# Category of the first returned hit; not read again in this cell.
category = labels[ann[0][0]]
# Attach text/category to each hit by matching `ann` against df's index.
merge = pd.merge(results, df, left_on='ann', right_index=True)

In [25]:
# Display the ranked hits: distance, matched row id (`ann`), text and category.
merge

Unnamed: 0,distances,ann,text,category
0,0.584872,0,Where are your headquarters located?,location
1,1.17595,3,Address,location
2,1.644265,2,Network Access Control?,networking
3,1.919767,1,Throw my cellphone in the water,random


In [4]:
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class TextIndexer:
    """Semantic search over a small table of ``[text, category]`` rows.

    Texts are embedded with a SentenceTransformer model, L2-normalised and
    stored in a flat FAISS L2 index. Queries are embedded and normalised the
    same way and matched against that index.
    """

    def __init__(self, data, model_name="paraphrase-mpnet-base-v2"):
        """Store the corpus and load the embedding model.

        Args:
            data: Rows of ``[text, category]``; anything ``pd.DataFrame``
                accepts together with those two column names.
            model_name: Name of the sentence-transformers model to load.
        """
        self.data = pd.DataFrame(data, columns=['text', 'category'])
        self.encoder = SentenceTransformer(model_name)
        # Set by build_index() or load_index().
        self.index = None

    def generate_encodings(self, text_list):
        """Return the encoder's embeddings for ``text_list``."""
        return self.encoder.encode(text_list)

    def build_index(self):
        """Embed every row of ``self.data['text']`` and (re)build the index."""
        # Hand the encoder a plain list of strings rather than a pandas
        # Series: `encode` is documented for a string or a list of strings.
        vectors = self.generate_encodings(self.data['text'].tolist())
        # FAISS operates on float32, C-contiguous arrays.
        vectors = np.ascontiguousarray(vectors, dtype=np.float32)
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        faiss.normalize_L2(vectors)
        self.index.add(vectors)

    def save_index(self, file_path):
        """Write the FAISS index (vectors only, not ``self.data``) to disk."""
        faiss.write_index(self.index, file_path)

    def load_index(self, file_path):
        """Replace ``self.index`` with the index stored at ``file_path``.

        ``self.data`` is left untouched, so it must already hold the rows the
        index was built from: search() maps hit ``i`` to row ``i`` of it.
        """
        self.index = faiss.read_index(file_path)

    def search(self, search_text, top_k=None):
        """Return the corpus texts closest to ``search_text``, best first.

        Args:
            search_text: Query string.
            top_k: Maximum number of hits. ``None`` ranks every indexed
                vector. Values larger than the index size are clamped.

        Returns:
            A pandas Series of matching texts ordered by increasing distance.
        """
        # Private float32 copy, because normalize_L2 works in place.
        query = np.array(self.generate_encodings([search_text]), dtype=np.float32)
        faiss.normalize_L2(query)

        total = self.index.ntotal
        # Clamp so FAISS is never asked for more neighbours than exist (it
        # would pad the result with -1 placeholders).
        k = total if top_k is None else min(top_k, total)
        if k == 0:
            # Nothing to rank: empty index or top_k == 0.
            return self.data['text'].iloc[:0]
        distances, ann = self.index.search(query, k)

        # Row 0 holds the hits for the single query in the batch.
        results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
        # `ann` values are row positions; join them against the corpus index.
        merged = pd.merge(results, self.data, left_on='ann', right_index=True)

        return merged['text']
    
# Usage:
data = [
    ['Where are your headquarters located?', 'location'],
    ['Throw my cellphone in the water', 'random'],
    ['Network Access Control?', 'networking'],
    ['Address', 'location']
]

# Build the index and persist it. These steps must run in this cell: without
# them `text_indexer` is undefined on a fresh kernel and 'text_index.faiss'
# may not exist yet.
text_indexer = TextIndexer(data)
text_indexer.build_index()
text_indexer.save_index('text_index.faiss')

# Load the index from file and perform a search
text_indexer.load_index('text_index.faiss')
search_text = 'where is your office?'
matching_texts = text_indexer.search(search_text)
print(matching_texts)


0    Where are your headquarters located?
1                                 Address
2                 Network Access Control?
3         Throw my cellphone in the water
Name: text, dtype: object


In [31]:
import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class TextIndexer:
    """Semantic search over a list of text passages.

    Passages are embedded with a SentenceTransformer model, L2-normalised and
    stored in a flat FAISS L2 index. The passages themselves are persisted
    next to the index file as JSON so that hits can be mapped back to text.
    """

    def __init__(self, data=None, model_name="paraphrase-mpnet-base-v2"):
        """Store the passages and load the embedding model.

        Args:
            data: Iterable of strings, or ``None`` for an empty corpus (e.g.
                when the texts will come from load_index()).
            model_name: Name of the sentence-transformers model to load.
        """
        # Explicit None check: `data or []` raises for array-likes whose
        # truth value is ambiguous (NumPy arrays, pandas Series).
        self.data = [] if data is None else list(data)  # list of strings
        self.encoder = SentenceTransformer(model_name)
        # Set by build_index() or load_index().
        self.index = None

    def generate_encodings(self, text_list):
        """Return the encoder's embeddings for ``text_list``."""
        return self.encoder.encode(text_list)

    def build_index(self):
        """Embed every passage in ``self.data`` and (re)build the index."""
        vectors = self.generate_encodings(self.data)
        vector_dimension = vectors.shape[1]
        self.index = faiss.IndexFlatL2(vector_dimension)
        faiss.normalize_L2(vectors)
        self.index.add(vectors)

    def save_index(self, file_path):
        """Write the index to ``file_path`` and the passages to a JSON sidecar.

        The sidecar path is ``file_path + '_paragraphs.json'``.
        """
        faiss.write_index(self.index, file_path)
        # Also save the original text data
        data_file_path = file_path + '_paragraphs.json'
        with open(data_file_path, 'w') as f:
            json.dump(self.data, f)

    def load_index(self, file_path):
        """Load the index from ``file_path`` and, if present, its passages.

        When the ``_paragraphs.json`` sidecar is missing, a message is printed
        and ``self.data`` keeps whatever it held before the call.
        """
        self.index = faiss.read_index(file_path)
        # Load the original text data
        data_file_path = file_path + '_paragraphs.json'
        if os.path.exists(data_file_path):
            with open(data_file_path, 'r') as f:
                self.data = json.load(f)
        else:
            print(f"No data file found at {data_file_path}. Cannot query by text.")

    def search(self, search_text, top_k=2):
        """Return the passages closest to ``search_text``, best first.

        Args:
            search_text: Query string.
            top_k: Maximum number of passages to return. ``None`` ranks every
                indexed vector. Values larger than the index size are clamped.

        Returns:
            A list of passages from ``self.data``; empty when the index holds
            no vectors or ``top_k`` is 0.
        """
        total = self.index.ntotal
        # Never ask FAISS for more neighbours than exist: it pads the result
        # with -1 placeholders, which would trip the bounds check below even
        # though nothing is actually wrong.
        k = total if top_k is None else min(top_k, total)
        if k == 0:
            return []
        search_vector = self.generate_encodings([search_text])
        faiss.normalize_L2(search_vector)
        _, indices = self.index.search(search_vector, k)
        # Ensure indices are within bounds (fails when self.data does not
        # match the loaded index, e.g. the sidecar file was missing).
        assert all(0 <= i < len(self.data) for i in indices[0]), f"Invalid indices: {indices}"
        # Return the text corresponding to the indices
        return [self.data[i] for i in indices[0]]


In [32]:
# No corpus is passed, so `siren.data` starts as an empty list; the texts are
# read from disk by the load_index() call in a later cell.
siren = TextIndexer()

In [33]:
# Check the stored passages before anything is loaded (an empty list here).
siren.data

[]

In [34]:
# Reads the FAISS index and, if it exists, the sidecar file
# "tmp.index.index_paragraphs.json" holding the original passages.
siren.load_index("tmp.index.index")

In [35]:
# `top_k` defaults to 2, so the two closest passages are returned.
siren.search("study")

['Many poll respondents also said that they would not participate in a\nprogram requiring a new identification card for cross-border travel',
 "-  A majority of the poll's respondents said that they supported an enhanced driver's license\nthat could combine a normal license with a border crossing ID card, and that such a document\nwould increase the likelihood that they would continue to cross the border"]