In [1]:
import json

import numpy as np
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer

# Semantic search over material descriptions using sentence embeddings
class SemanticSearch:
    """Embedding-based semantic search over a list of material records.

    Records are read from a JSON file and must form a sequence of mappings
    that each provide a ``'description'`` and a ``'material_number'`` key.
    Every description is embedded once at construction time; queries are
    embedded on demand and ranked against the stored embeddings by cosine
    distance.
    """

    def __init__(self, data_file, model_name='all-MiniLM-L6-v2'):
        """Load the records, load the embedding model, and embed all records.

        Args:
            data_file: Path of the JSON file holding the material records.
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.data = self._load_data(data_file)
        self.model = SentenceTransformer(model_name)
        self.embeddings = self._generate_embeddings()

    def _load_data(self, data_file):
        """Load material data from a JSON file."""
        # JSON text is UTF-8 by specification; do not depend on the locale default.
        with open(data_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _generate_embeddings(self):
        """Generate embeddings for all descriptions, in record order."""
        descriptions = [item['description'] for item in self.data]
        return self.model.encode(descriptions)

    def _cosine_similarity_to_percentage(self, cosine_similarity):
        """Convert a cosine similarity to a percentage rounded to 2 decimals.

        A similarity of 1.0 maps to 100.0 and 0.0 maps to 0.0. Cosine
        similarity can be negative, so the result lies in [-100, 100].

        Args:
            cosine_similarity: The similarity itself, NOT the cosine distance.

        Returns:
            The similarity as a plain Python ``float`` percentage.
        """
        return round(float(cosine_similarity) * 100, 2)

    def search(self, queries, top_k=5):
        """Perform semantic search for one query or a list of queries.

        Args:
            queries: A single query string or an iterable of query strings.
            top_k: Maximum number of matches per query. ``None`` returns every
                record, ranked.

        Returns:
            A list with one ``{"query": ..., "matches": [...]}`` dict per
            query, in input order. Each match holds ``"material_number"``,
            ``"description"`` and ``"score"`` (cosine similarity as a
            percentage), best match first.

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        if top_k is not None and top_k < 0:
            # A negative slice bound would silently drop the *worst* matches.
            raise ValueError("top_k must be non-negative")

        # A bare string would otherwise be encoded as a single 1-D vector
        # and then iterated character by character below.
        if isinstance(queries, str):
            queries = [queries]
        else:
            queries = list(queries)

        if not queries:
            return []
        if len(self.data) == 0:
            # Nothing to rank against; cdist cannot handle an empty corpus.
            return [{"query": query, "matches": []} for query in queries]

        query_embeddings = self.model.encode(queries)
        distances = cdist(query_embeddings, self.embeddings, metric='cosine')
        results = []

        for i, query in enumerate(queries):
            # Stable sort: records with equal distance keep their corpus order,
            # so tied results are reproducible from run to run.
            ranked_indices = np.argsort(distances[i], kind='stable')[:top_k]
            matches = []
            for idx in ranked_indices:
                item = self.data[idx]
                similarity = 1 - distances[i, idx]  # cosine similarity = 1 - distance
                matches.append({
                    "material_number": item['material_number'],
                    "description": item['description'],
                    "score": self._cosine_similarity_to_percentage(similarity),
                })
            results.append({"query": query, "matches": matches})
        return results

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Example usage: build the index once, then run a couple of sample queries.
# `json` comes from the notebook's import cell. There is no `__main__` guard:
# it is always true inside a kernel, and `results` is read unconditionally below.
search_engine = SemanticSearch(data_file='materials.json')

queries = ["hammer", "engine oil"]
results = search_engine.search(queries)

# Serialize the ranked matches to a JSON string and show it.
json_string = json.dumps(results)

print(json_string)

[{"query": "hammer", "matches": [{"material_number": "60000000-0299", "description": "Tools - hammer", "score": 83.02}, {"material_number": "60000000-0976", "description": "Tools - hammer", "score": 83.02}, {"material_number": "60000000-0836", "description": "Tools - hammer", "score": 83.02}, {"material_number": "60000000-0851", "description": "Tools - hammer", "score": 83.02}, {"material_number": "60000000-0008", "description": "Tools - hammer", "score": 83.02}]}, {"query": "engine oil", "matches": [{"material_number": "80000000-0908", "description": "Automotive Parts - engine", "score": 48.13}, {"material_number": "80000000-0779", "description": "Automotive Parts - engine", "score": 48.13}, {"material_number": "80000000-0861", "description": "Automotive Parts - engine", "score": 48.13}, {"material_number": "80000000-0704", "description": "Automotive Parts - engine", "score": 48.13}, {"material_number": "80000000-0905", "description": "Automotive Parts - engine", "score": 48.13}]}]


In [12]:
def _format_match(match):
    """Render one search match as a single bullet line."""
    return f"- Material Number: {match['material_number']}, Description: {match['description']}, Score: {match['score']}"


# Show the ranked matches of the first query, one per line.
first_query = results[0]
matches_for_hammer = first_query['matches']
for match in matches_for_hammer:
    print(_format_match(match))


- Material Number: 60000000-0299, Description: Tools - hammer, Score: 83.02
- Material Number: 60000000-0976, Description: Tools - hammer, Score: 83.02
- Material Number: 60000000-0836, Description: Tools - hammer, Score: 83.02
- Material Number: 60000000-0851, Description: Tools - hammer, Score: 83.02
- Material Number: 60000000-0008, Description: Tools - hammer, Score: 83.02
