In [19]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from scipy.spatial.distance import cdist

class BERTSemanticSearch:
    """Semantic search over material descriptions using transformer embeddings.

    The data file is a JSON document holding a list of records; each record
    must provide a ``'description'`` (the text that is embedded) and a
    ``'material_number'`` (echoed back in search results).

    All descriptions are embedded once in the constructor; ``search`` then
    embeds the queries and ranks the records by cosine distance.

    NOTE(review): the sentence vector is the raw ``[CLS]`` hidden state of a
    model that has not been fine-tuned for similarity. Such vectors are
    commonly reported to separate sentences poorly (scores cluster in a narrow
    band) -- consider mean pooling or a sentence-embedding checkpoint if the
    ranking quality is unsatisfactory.
    """

    def __init__(self, data_file, model_name='bert-base-uncased', batch_size=32):
        """Load the records, the tokenizer/model, and pre-compute embeddings.

        Args:
            data_file: Path to the JSON file with the material records.
            model_name: Hugging Face model identifier passed to
                ``AutoTokenizer`` / ``AutoModel``.
            batch_size: Number of texts encoded per forward pass.
        """
        self.batch_size = batch_size
        self.data = self._load_data(data_file)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Inference only: make sure dropout is disabled so embeddings are
        # deterministic.
        self.model.eval()
        self.embeddings = self._generate_embeddings()

    def _load_data(self, data_file):
        """Load material data from a JSON file and return the parsed object."""
        import json
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(data_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _generate_embeddings(self):
        """Return an array with one embedding row per record description."""
        descriptions = [item['description'] for item in self.data]
        return self._encode_texts(descriptions)

    def _encode_texts(self, texts, batch_size=None):
        """Embed a list of texts and return a 2-D NumPy array (one row each).

        Args:
            texts: Iterable of strings to embed.
            batch_size: Texts per forward pass; defaults to the value given to
                the constructor (32 when the instance has none).

        Raises:
            ValueError: If the effective batch size is smaller than 1.
        """
        texts = list(texts)
        if batch_size is None:
            batch_size = getattr(self, 'batch_size', 32)
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if not texts:
            # Nothing to tokenize; return an empty matrix with the right width
            # so downstream distance computations still line up.
            # Assumes the model config exposes ``hidden_size`` (true for BERT).
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        # Encode in chunks so memory use is bounded by the batch, not by the
        # size of the whole corpus.
        chunks = []
        for start in range(0, len(texts), batch_size):
            inputs = self.tokenizer(
                texts[start:start + batch_size],
                padding=True, truncation=True, return_tensors="pt"
            )
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Use the [CLS] token's embedding as the sentence embedding
            chunks.append(outputs.last_hidden_state[:, 0, :].numpy())
        return np.concatenate(chunks, axis=0)

    def _cosine_similarity_to_percentage(self, cosine_similarity):
        """Convert a cosine *distance* into a similarity percentage.

        Despite the parameter name (kept for compatibility), the value passed
        in by ``search`` is a cosine distance, i.e. ``1 - similarity``. The
        result is ``(1 - distance) * 100`` rounded to two decimals, so a
        distance of 0 maps to 100.0. Distances above 1 (vectors pointing in
        opposing directions) yield negative percentages.

        Returns:
            A plain Python ``float``.
        """
        return float(round((1 - cosine_similarity) * 100, 2))

    def search(self, queries, top_k=5):
        """Perform semantic search for a list of queries.

        Args:
            queries: List of query strings; a single string is treated as a
                one-element list.
            top_k: Maximum number of matches returned per query.

        Returns:
            A list with one ``{"query": ..., "matches": [...]}`` dict per
            query. Each match holds ``material_number``, ``description`` and
            ``score`` (similarity percentage), best match first.

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        if top_k < 0:
            # A negative slice bound would silently drop the *worst* k items
            # instead of limiting the result size.
            raise ValueError("top_k must be non-negative")
        if isinstance(queries, str):
            # Iterating a bare string would yield characters, not queries.
            queries = [queries]
        queries = list(queries)
        if not queries:
            return []

        query_embeddings = self._encode_texts(queries)
        # cdist with metric='cosine' yields distances (1 - cosine similarity).
        distances = cdist(query_embeddings, self.embeddings, metric='cosine')
        results = []

        for i, query in enumerate(queries):
            # Stable sort keeps tied records in their original file order.
            ranked_indices = np.argsort(distances[i], kind='stable')[:top_k]
            matches = [
                {
                    "material_number": self.data[int(idx)]['material_number'],
                    "description": self.data[int(idx)]['description'],
                    "score": self._cosine_similarity_to_percentage(distances[i, idx])
                }
                for idx in ranked_indices
            ]
            results.append({"query": query, "matches": matches})
        return results


In [20]:
# Demo: index the material catalogue and run two sample lookups
if __name__ == "__main__":
    # Constructing the engine loads the records and embeds every description
    search_engine = BERTSemanticSearch(data_file='materials.json')

    # Sample queries, searched together in one call
    queries = ["metal rod", "engine oil"]
    results = search_engine.search(queries)

    # One header line per query, followed by one indented line per match
    for result in results:
        print(
            f"Query: {result['query']}",
            *(f"  - {match}" for match in result['matches']),
            sep="\n",
        )

Query: metal rod
  - {'material_number': '10000000-0475', 'description': 'Office Supplies - stapler', 'score': 86.41}
  - {'material_number': '10000000-0726', 'description': 'Office Supplies - stapler', 'score': 86.41}
  - {'material_number': '10000000-0132', 'description': 'Office Supplies - stapler', 'score': 86.41}
  - {'material_number': '10000000-0951', 'description': 'Office Supplies - stapler', 'score': 86.41}
  - {'material_number': '10000000-0218', 'description': 'Office Supplies - stapler', 'score': 86.41}
Query: engine oil
  - {'material_number': '10000000-0627', 'description': 'Office Supplies - stapler', 'score': 86.64}
  - {'material_number': '10000000-0554', 'description': 'Office Supplies - stapler', 'score': 86.64}
  - {'material_number': '10000000-0951', 'description': 'Office Supplies - stapler', 'score': 86.64}
  - {'material_number': '10000000-0435', 'description': 'Office Supplies - stapler', 'score': 86.64}
  - {'material_number': '10000000-0884', 'description': 