# Car Finder Engine

### Setup

Import packages, read train data and embeddings

In [1]:
from typing import List

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

import cleaner
import constants as const
from generate_sentence_embeddings import preprocess_title

In [2]:
df_train_raw = pd.read_csv(const.TRAIN_PATH)
df_train = cleaner.clean_preliminary(df_train_raw)
df_train['title_lower'] = df_train.title.apply(preprocess_title)

Setup and instantiate `CarFinder` class

In [3]:
class CarFinder():
    def __init__(self):
        embedding_dict = np.load(const.TITLE_EMBEDDING_DICT_PATH, allow_pickle=True).item()
        self.title_embeddings = embedding_dict['title_embeddings']
        self.title_norm_array = np.linalg.norm(self.title_embeddings, axis=1)
        self.index_to_title_dict = embedding_dict['index_to_title_dict']
        self.encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def query_titles(self, query: str, k: int = 5) -> List[str]:
        """
        Takes a query string as input and returns a list of the `k` titles with the
        most similar sentence embedding
        """
        query_embedding = self.encoder.encode([query]).squeeze()
        raw_sim_score = self.title_embeddings @ query_embedding
        normalizer = self.title_norm_array * np.linalg.norm(query_embedding)
        sim_scores = raw_sim_score / normalizer
        title_indices = sim_scores.argsort()[:k]
        titles = [self.index_to_title_dict[idx] for idx in title_indices]
        return titles

    def query_listings(self, query: str, k: int = 5) -> pd.DataFrame:
        """
        Takes a query string as input and returns a DataFrame of the `k` listings
        with the most similar sentence embedding
        """
        titles = self.query_titles(query, k)
        df = df_train[df_train.title_lower.isin(titles)]
        ordering = {title: i for i, title in enumerate(titles)}
        df = df.sort_values(by=['title_lower'], key=lambda title: title.map(ordering))
        return df.head(k)


car_finder = CarFinder()

## Running Queries

Query the data. You can search for either just titles with `query_titles` or use `query_listings` to get listings in the form of a DataFrame

In [4]:
search_str = 'auto retractable side mirrors, multi-function steering controls'

Raw titles

In [5]:
for title in car_finder.query_titles(search_str, k=5):
    print(title)

maserati granturismo cambiocorsa (coe till 09/2029)
yutong zk6100h (coe till 05/2029)
maserati granturismo s 4.7a (coe till 09/2030)
maserati granturismo mc stradale 4.7a (coe till 04/2031)
isuzu ftr34p (coe till 06/2023)


Listings

In [6]:
COLS_TO_SHOW = [
    'listing_id',
    'title',
    'description',
    'features',
    'accessories',
    'price'
]

df = car_finder.query_listings(search_str, k=5)
df[COLS_TO_SHOW]

Unnamed: 0,listing_id,title,description,features,accessories,price
9470,1031009,Maserati GranTurismo Cambiocorsa (COE till 09/...,high loan available. please call for viewing! ...,view specs of the maserati granturismo,UNKNOWN,226500
802,987627,Yutong ZK6100H (COE till 05/2029),UNKNOWN,UNKNOWN,UNKNOWN,67100
11997,1000199,Maserati GranTurismo S 4.7A (COE till 09/2030),8000,UNKNOWN,20,179300
8274,1012503,Maserati GranTurismo MC Stradale 4.7A (COE til...,2,UNKNOWN,UNKNOWN,251400
3876,866523,Isuzu FTR34P (COE till 06/2023),"0% downpayment, laden 16000kg hooklift, excell...",UNKNOWN,UNKNOWN,64700


Quickly check out the details for a given row returned

In [7]:
# View the content in full for one of the listings
row_idx = 9470

row = df.loc[row_idx]
print(
    f'{row.title}\n',
    f'Description: {row.description}\n',
    f'Features: {row.features}\n',
    f'Acc: {row.accessories}'
)

Maserati GranTurismo Cambiocorsa (COE till 09/2029)
 Description: high loan available. please call for viewing! do contact our friendly sales representative and will attend to you shortly!
 Features: view specs of the maserati granturismo
 Acc: UNKNOWN
