# Search Car Engine

In [2]:
from sentence_transformers import SentenceTransformer
from constants import TITLE_TO_VEC_FILE

import numpy as np

In [3]:
class FindCar():
    
    def __init__(self):
        tmp_item = np.load(TITLE_TO_VEC_FILE, allow_pickle=True).item()
        self.title_embd_np_array = tmp_item["title_embd_array"]
        self.title_embd_norm_array = np.linalg.norm(self.title_embd_np_array, axis = 1)
        self.index_to_title_dict = tmp_item["index_to_title_dict"]
        
        self.encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        
    def recommend(self, query, top_k=5):
        query_emb = self.encoder.encode([query]).squeeze()
        output = ((self.title_embd_np_array @ query_emb)/self.title_embd_norm_array)/np.linalg.norm(query_emb)
        output_1 = output.argsort()[:top_k]
        
        return [self.index_to_title_dict[index] for index in output_1]

In [4]:
Obj = FindCar()

In [7]:
query = "show me a good l car."
title_list = Obj.recommend(query)
for title in title_list:
    print(title)

perodua axia 1.0a advance
ssangyong tivoli 1.5a deluxe
mitsubishi fuso fighter fm65 (new 10-yr coe)
maserati granturismo 4.2a (coe till 09/2030)
suzuki swift 1.4a glx (coe till 05/2026)


In [None]:
perodua axia 1.0a advance
perodua viva elite 1.0a ez (coe till 02/2025)
maserati granturismo 4.2a (coe till 09/2030)
ssangyong tivoli 1.5a deluxe
suzuki swift 1.4a glx (coe till 05/2026)

# Code to generate Title Embeddings
This code does not need to run at evaluation or prediction time. It generates title embeddings only.

In [1]:
import pandas as pd
import numpy as np
import json

from sentence_transformers import SentenceTransformer
from constant import TRAIN_PATH, TITLE_TO_VEC_FILE

ModuleNotFoundError: No module named 'constant'

In [None]:
train_file = TRAIN_PATH

train_df = pd.read_csv(train_file)
train_df = train_df.drop("listing_id", axis=1)

In [None]:
def preprocess(entry):
    return str(entry).lower().strip()

In [None]:
train_df.title = train_df.title.apply(preprocess)
group_by_title = train_df.groupby("title")

title_to_data_dict = {}

for key in group_by_title.groups.keys():
    for index in group_by_title.groups.get(key):
        tmp_list = []
        for colm_value in list(train_df.iloc[index].items()):
            tmp_list.append("{} is {} .".format(colm_value[0], colm_value[1]))
            
        if key in title_to_data_dict:
            title_to_data_dict[key].append(" ".join(tmp_list))
        else:
            title_to_data_dict[key] = [" ".join(tmp_list)]
            
with open(TITLE_TO_VEC_FILE, "w") as out_f:
    out_f.write(json.dumps(title_to_data_dict))

In [None]:
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
title_to_mean_dict = {}

for key in title_to_data_dict.keys():
    mean_embd = np.mean(encoder.encode(title_to_data_dict[key]), axis=0)
    title_to_mean_dict[key] = mean_embd
    
    
index_to_title_dict = {}
title_embd_list = []

index = 0
for key in title_to_mean_dict.keys():
    title_embd_list.append(title_to_mean_dict[key])
    index_to_title_dict[index] = key
    index += 1
    
np.save(TITLE_TO_VEC_FILE, {"title_embd_array": np.array(title_embd_list), "index_to_title_dict": index_to_title_dict})