# Car Finder Engine

### Setup

Import packages, read train data and embeddings

In [1]:
from typing import List

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

import cleaner
import constants as const

In [2]:
df_train_raw = pd.read_csv(const.TRAIN_PATH)
df_train = cleaner.clean_preliminary(df_train_raw)
df_train['title_lower'] = df_train.title.apply(lambda x: x.lower())

Setup and instantiate `CarFinder` class

In [3]:
class CarFinder():
    def __init__(self):
        vec_file = np.load(const.TITLE_TO_VEC_FILE, allow_pickle=True).item()
        self.title_embd_np_array = vec_file['title_embd_array']
        self.title_embd_norm_array = np.linalg.norm(self.title_embd_np_array, axis=1)
        self.index_to_title_dict = vec_file['index_to_title_dict']
        self.encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def query_titles(self, query: str, k: int = 5) -> List[str]:
        """
        Takes a query string as input and returns a list of the `k` titles with the
        most similar sentence embedding
        """
        query_embedding = self.encoder.encode([query]).squeeze()
        raw_sim_score = self.title_embd_np_array @ query_embedding
        normalizer = self.title_embd_norm_array * np.linalg.norm(query_embedding)
        sim_scores = raw_sim_score / normalizer
        title_indices = sim_scores.argsort()[:k]
        titles = [self.index_to_title_dict[idx] for idx in title_indices]
        return titles

    def query_listings(self, query: str, k: int = 5) -> pd.DataFrame:
        """
        Takes a query string as input and returns a DataFrame of the `k` listings
        with the most similar sentence embedding
        """
        titles = self.query_titles(query, k)
        df = df_train[df_train.title_lower.isin(titles)]
        ordering = {title: i for i, title in enumerate(titles)}
        df = df.sort_values(by=['title_lower'], key=lambda title: title.map(ordering))
        return df.head(k)


car_finder = CarFinder()

## Running Queries

Query the data. You can search for either just titles with `query_titles` or use `query_listings` to get listings in the form of a DataFrame

In [4]:
search_str = 'auto retractable side mirrors, multi-function steering controls'

Raw titles

In [5]:
for title in car_finder.query_titles(search_str, k=5):
    print(title)

maserati granturismo 4.2a (coe till 09/2030)
nissan leaf electric g
isuzu cyz52k tipper (coe till 12/2030)
isuzu cyz52k
kia cerato forte 1.6a sx (coe till 03/2030)


Listings

In [6]:
COLS_TO_SHOW = [
    'listing_id',
    'title',
    'description',
    'features',
    'accessories',
    'price'
]

df = car_finder.query_listings(search_str, k=5)
df[COLS_TO_SHOW]

Unnamed: 0,listing_id,title,description,features,accessories,price
1452,1009066,Maserati GranTurismo 4.2A (COE till 09/2030),pristine condition and best value in the marke...,"4.2l v8 engine, 399bhp with 460nm of torque, 6...","multi-function steering, electric seats, auto ...",170500
14604,1000923,Nissan Leaf Electric G,new ev battery 1 year warranty 24kw upgrade ev...,view specs of the nissan leaf electric,UNKNOWN,61600
1614,945905,Isuzu CYZ52K Tipper (COE till 12/2030),rare gem! lowest depreciation! 2011! 10yrs coe...,UNKNOWN,UNKNOWN,94400
6514,1015358,Isuzu CYZ52K,UNKNOWN,UNKNOWN,UNKNOWN,68200
1859,1028741,Kia Cerato Forte 1.6A SX (COE till 03/2030),view to believe and passing through do not mis...,"1.6l dohc 16v cvvt engine with 122bhp, 4 speed...","original rims, premium leather upholstery seat...",54800


Quickly check out the details for a given row returned

In [7]:
# View the content in full for one of the listings
row_idx = 1452

row = df.loc[row_idx]
print(
    f'{row.title}\n',
    f'Description: {row.description}\n',
    f'Features: {row.features}\n',
    f'Acc: {row.accessories}'
)

Maserati GranTurismo 4.2A (COE till 09/2030)
 Description: pristine condition and best value in the market! meticulous owner and serviced at maserati specialist eag! accident-free! bank/in house finance available at attractive interest rates! trade-in welcome! book an appointment with our friendly staff now for viewing. free servicing/grooming package if the price is right! terms and conditions applied.
 Features: 4.2l v8 engine, 399bhp with 460nm of torque, 6 speed zf gearbox, paddle shifters, rear wheel drive. view specs of the maserati granturismo
 Acc: multi-function steering, electric seats, auto headlights/wipers, navigation, all round sensors, android headunit, no sticky buttons!


# Code to generate title embeddings

This code should not be run at prediction time. It is to generate title embeddings only once.

In [1]:
import pandas as pd
import numpy as np
import json

from sentence_transformers import SentenceTransformer
from constants import TRAIN_PATH, TITLE_TO_VEC_FILE

In [2]:
train_file = TRAIN_PATH

train_df = pd.read_csv(train_file)
train_df = train_df.drop("listing_id", axis=1)

In [3]:
def preprocess(entry):
    return str(entry).lower().strip()

In [None]:
train_df.title = train_df.title.apply(preprocess)
group_by_title = train_df.groupby("title")

title_to_data_dict = {}

for key in group_by_title.groups.keys():
    for index in group_by_title.groups.get(key):
        tmp_list = []
        for colm_value in list(train_df.iloc[index].items()):
            tmp_list.append("{} is {} .".format(colm_value[0], colm_value[1]))
            
        if key in title_to_data_dict:
            title_to_data_dict[key].append(" ".join(tmp_list))
        else:
            title_to_data_dict[key] = [" ".join(tmp_list)]
            
with open(TITLE_TO_VEC_FILE, "w") as out_f:
    out_f.write(json.dumps(title_to_data_dict))

In [None]:
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
title_to_mean_dict = {}

for key in title_to_data_dict.keys():
    mean_embd = np.mean(encoder.encode(title_to_data_dict[key]), axis=0)
    title_to_mean_dict[key] = mean_embd
    
    
index_to_title_dict = {}
title_embd_list = []

index = 0
for key in title_to_mean_dict.keys():
    title_embd_list.append(title_to_mean_dict[key])
    index_to_title_dict[index] = key
    index += 1
    
np.save(TITLE_TO_VEC_FILE, {"title_embd_array": np.array(title_embd_list), "index_to_title_dict": index_to_title_dict})