# Car Finder Engine

### Setup

Import packages, read train data and embeddings

In [1]:
from typing import List

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

import cleaner
import constants as const
from generate_sentence_embeddings import preprocess_title

In [2]:
# Load the raw training listings and run the project's preliminary cleaning.
df_train_raw = pd.read_csv(const.TRAIN_PATH)
# `assign` returns a new frame instead of writing a column into whatever
# `clean_preliminary` handed back, so the cell can be re-run safely and cannot
# trigger SettingWithCopyWarning or modify `df_train_raw`.
# `title_lower` is the key later matched against the titles returned by the
# embedding search (presumably a lower-cased title -- confirm in
# `generate_sentence_embeddings.preprocess_title`).
df_train = cleaner.clean_preliminary(df_train_raw).assign(
    title_lower=lambda cleaned: cleaned.title.apply(preprocess_title)
)

Set up and instantiate the `CarFinder` class

In [3]:
class CarFinder():
    """
    Semantic search over car listing titles.

    Precomputed title embeddings are loaded from disk. At query time the query
    string is encoded with a sentence-transformer model and compared with every
    title embedding by cosine similarity.

    NOTE(review): the encoder is presumably the same model that produced the
    stored embeddings -- confirm against `generate_sentence_embeddings`.
    """

    def __init__(self, listings: pd.DataFrame = None):
        """
        Load the title embeddings, their index-to-title mapping and the encoder.

        Parameters
        ----------
        listings : pd.DataFrame, optional
            Listings searched by `query_listings`; must contain a `title_lower`
            column. When omitted, the notebook-level `df_train` is looked up at
            query time, which is the original behaviour.
        """
        # NOTE(review): allow_pickle=True can execute arbitrary code while
        # unpickling, so only load embedding files from a trusted source.
        embedding_dict = np.load(const.TITLE_EMBEDDING_DICT_PATH, allow_pickle=True).item()
        self.title_embeddings = embedding_dict['title_embeddings']
        # Row-wise L2 norms, computed once so each query only norms itself.
        self.title_norm_array = np.linalg.norm(self.title_embeddings, axis=1)
        self.index_to_title_dict = embedding_dict['index_to_title_dict']
        self.encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        self.listings = listings

    def query_titles(self, query: str, k: int = 5) -> List[str]:
        """
        Takes a query string as input and returns a list of the `k` titles with the
        most similar sentence embedding, most similar first.

        Returns an empty list when `k <= 0`, and fewer than `k` titles when fewer
        are indexed. Titles whose similarity is undefined (zero-norm embedding or
        zero-norm query) are ranked last.
        """
        # Without this guard `[-0:]` would slice the whole array and return
        # every title; negative k would return an arbitrary tail.
        if k <= 0:
            return []

        query_embedding = self.encoder.encode([query]).squeeze()
        raw_sim_score = self.title_embeddings @ query_embedding
        normalizer = self.title_norm_array * np.linalg.norm(query_embedding)
        # 0/0 yields NaN, which NumPy sorts *after* every real number, so an
        # unguarded argsort would rank degenerate rows as the best matches.
        with np.errstate(divide='ignore', invalid='ignore'):
            sim_scores = raw_sim_score / normalizer
        sim_scores = np.where(np.isfinite(sim_scores), sim_scores, -np.inf)
        # Negate for descending order; a stable sort keeps ties deterministic
        # (lower index first).
        title_indices = np.argsort(-sim_scores, kind='stable')[:k]
        titles = [self.index_to_title_dict[idx] for idx in title_indices]
        return titles

    def query_listings(self, query: str, k: int = 5) -> pd.DataFrame:
        """
        Takes a query string as input and returns a DataFrame of the `k` listings
        with the most similar sentence embedding.

        Listings are ordered by the rank of their title; listings sharing a title
        keep their original relative order. Several listings can share one title,
        so the `k` rows returned may cover fewer than `k` distinct titles.
        """
        titles = self.query_titles(query, k)
        # Resolve the fallback lazily so re-running the data cell is picked up
        # without re-instantiating the finder.
        listings = df_train if self.listings is None else self.listings
        matches = listings[listings.title_lower.isin(titles)]
        # If a title appears more than once, its best (first) rank wins.
        ordering = {}
        for rank, title in enumerate(titles):
            ordering.setdefault(title, rank)
        matches = matches.sort_values(
            by=['title_lower'],
            key=lambda title: title.map(ordering),
            kind='stable',
        )
        return matches.head(max(k, 0))


# Build the finder once and reuse it for every query below: the constructor
# loads the title embeddings from disk and instantiates the sentence encoder.
car_finder = CarFinder()

## Running Queries

Query the data. Use `query_titles` to retrieve only the matching titles, or `query_listings` to get the matching listings as a DataFrame.

In [4]:
# Free-text query; it is encoded and compared against the listing title embeddings.
search_str = 'auto retractable side mirrors, multi-function steering controls'

Raw titles

In [5]:
# Show the five best-matching titles, one per line, most similar first.
top_titles = car_finder.query_titles(search_str, k=5)
for matched_title in top_titles:
    print(matched_title)

toyota wish 1.8a x (coe till 07/2024)
audi a3 cabriolet 1.4a tfsi s-tronic attraction
honda fit 1.3a g skyroof (coe till 07/2025)
volkswagen scirocco 1.4a tsi (coe till 04/2026)
volkswagen scirocco 1.4a tsi (coe till 03/2024)


Listings

In [6]:
# Columns shown when skimming the matched listings.
COLS_TO_SHOW = ['listing_id', 'title', 'features', 'accessories']

# Full listings for the same query; kept in `df` so later cells can drill into a row.
df = car_finder.query_listings(search_str, k=5)
df.loc[:, COLS_TO_SHOW]

Unnamed: 0,listing_id,title,features,accessories
1706,1024611,Toyota Wish 1.8A X (COE till 07/2024),"1.8l dual vvti engine, 140 bhp, 7 speed cvt au...",touchscreen audio system. auto climate control...
9143,1024200,Audi A3 Cabriolet 1.4A TFSI S-tronic Attraction,"1.4l inline 4 cylinder turbocharged engine, 12...","audi mmi radio, bluetooth connectivity, usb. d..."
9088,1016920,Honda Fit 1.3A G Skyroof (COE till 07/2025),1.3l fuel saver yet powerful 4 cylinders inlin...,"leather seats, sports rims, audio player, reve..."
15034,999823,Volkswagen Scirocco 1.4A TSI (COE till 04/2026),powerful and responsive 1.4l twin charged engi...,"18"" sports rims, leather seats. factory fitted..."
6158,1024514,Volkswagen Scirocco 1.4A TSI (COE till 03/2024),"powerful 1.4l twin charged turbo engine, respo...",re-tractable side mirrors. electric driver sea...


Quickly check out the details for a given row returned

In [10]:
# View the content in full for one of the listings.
# Defaults to the first row of the results so the cell keeps working when the
# data or the query changes; replace with any index label shown in the table
# above to inspect a different listing.
row_idx = df.index[0]

row = df.loc[row_idx]
# Build one newline-joined string: passing several arguments to print() would
# insert its default ' ' separator at the start of every line after the first.
print(
    f"{row['title']}\n"
    f"Description: {row['description']}\n"
    f"Features: {row['features']}\n"
    f"Acc: {row['accessories']}"
)

Toyota Wish 1.8A X (COE till 07/2024)
 Description: best buy, best deal, 100% loan + 0 driveaway available. lowest depreciation in the market. beautiful paint. well maintained, view to believe! flexible loan and high trade-in available! call or whatsapp our sales person for viewing now
 Features: 1.8l dual vvti engine, 140 bhp, 7 speed cvt automatic with steptronic, airbags, traction control, abs, keyless entry/start/stop, knockdown rear seats. view specs of the toyota wish
 Acc: touchscreen audio system. auto climate control aircon. reverse sensors. leather seats. knockdown rear seats. retractable side mirrors with indicators.
