1. Load all imports

In [30]:
import os
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from dotenv import load_dotenv

from sklearn.manifold import TSNE
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

load_dotenv()

True

2.Load Dataset

In [32]:
df = pd.read_csv("cars-dataset.csv")
df.columns = df.columns.str.strip()
df = df.dropna(subset=["Company", "Model", "Production years"], how="any")
df.head()



Columns (32) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0.1,Unnamed: 0,Model,Serie,Company,Body style,Segment,Production years,Cylinders,Displacement,Power(HP),...,Nominal Capacity,Top speed (electrical),EV Range,High mpg,Extra high mpg,Medium mpg,Low mpg,Total maximum torque,Maximum Capacity,Specification summary
0,0,AC Ace (1993-1996),Ace,AC,"Convertible (spider/spyder, cabrio/cabriolet, ...",Coupe Cabrio,"1993, 1994, 1995, 1996",V8,4942 cm3,260 HP @ 5250 RPM,...,,,,,,,,,,4.9L V8 4AT (260 HP)
1,1,AC Aceca (1998-2000),Aceca,AC,Coupé (two-door),Coupe,"1998, 1999, 2000",V8,3506 cm3,350 HP @ 6500 RPM,...,,,,,,,,,,3.5L V8 32V Turbo 6MT (354 HP)
2,2,AC Aceca (1998-2000),Aceca,AC,Coupé (two-door),Coupe,"1998, 1999, 2000",V8,3506 cm3,350 HP @ 6500 RPM,...,,,,,,,,,,4.6L V8 5MT (326 HP)
3,3,AC Cobra MkIII (1965-1967),Cobra,AC,Coupé (two-door),Roadster & Convertible,"1965, 1966, 1967",V8,4727 cm3,280 HP @ 5800 RPM,...,,,,,,,,,,4.7L V8 4MT (280 HP)
4,4,AC Cobra 378 MkIV (1965-1967),Cobra,AC,Coupé (two-door),Roadster & Convertible,"1965, 1966, 1967",V8,,446.1 HP @ - RPM,...,,,,,,,,,,6.2L V8 6MT


3.Define all Spec Columns that are going to be used

In [33]:
key_columns = [
    "Company", "Model", "Serie", "Production years", "Cylinders", "Displacement", "Power(HP)",
    "Torque(lb-ft)", "Fuel", "Fuel capacity", "Top Speed", "Acceleration 0-62 Mph (0-100 kph)",
    "Drive Type", "Gearbox", "Length", "Width", "Height", "Wheelbase",
    "Unladen Weight", "Cargo Volume", "City mpg", "Highway mpg", "Combined mpg"
]


4.Converting Rows to Natural Language Summaries

In [34]:
documents = []
doc_labels = []
company_list = []
for i, row in df.iterrows():
    parts = [f"{col}: {row[col]}" for col in key_columns if col in row and pd.notnull(row[col])]
    car_name = f"{row['Company']} {row['Model']} ({row['Production years']})"
    summary = car_name + " - " + "; ".join(parts)
    doc = Document(
        page_content=summary,
        metadata={"index": i, "make": row['Company'], "model": row['Model'], "year": row['Production years']}
    )
    documents.append(doc)
    doc_labels.append(car_name)
    company_list.append(row['Company'])

5.Creating Embeddings and Vector Store

In [35]:
embedding_fn = OpenAIEmbeddings(model="text-embedding-3-small")
db_dir = "./car_vector_store"

if not os.path.exists(db_dir):
    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_fn,
        persist_directory=db_dir
    )
    vector_store.persist()
else:
    vector_store = Chroma(
        persist_directory=db_dir,
        embedding_function=embedding_fn
    )

retriever = vector_store.as_retriever(search_kwargs={"k": 5})

6. Visualizations

In [36]:
unique_companies = list(set(company_list))
color_palette = plt.cm.get_cmap("tab20", len(unique_companies))
company_color_map = {
    company: f"rgb({int(r*255)},{int(g*255)},{int(b*255)})"
    for company, (r, g, b, _) in zip(unique_companies, color_palette.colors)
}
colors = [company_color_map[make] for make in company_list[:1000]]

# ---- Embedding and Dimensionality Reduction ----
print("\nGenerating vector visualizations (2D & 3D with t-SNE)...")

sample_docs = documents[:1000]
sample_labels = doc_labels[:1000]
sample_texts = [doc.page_content for doc in sample_docs]

# Safe batch embedding
batch_size = 512
sample_embeddings = []
for i in range(0, len(sample_texts), batch_size):
    batch = sample_texts[i:i + batch_size]
    sample_embeddings.extend(embedding_fn.embed_documents(batch))

X = np.array(sample_embeddings)


The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.




Generating vector visualizations (2D & 3D with t-SNE)...


7. 2D Visual

In [38]:
tsne_2d = TSNE(n_components=2, random_state=42)
X_2d = tsne_2d.fit_transform(X)

fig_2d = go.Figure(data=[go.Scatter(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Car: {label}" for label in sample_labels],
    hoverinfo='text'
)])
fig_2d.update_layout(
    title='2D t-SNE Car Spec Embedding',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)
fig_2d.write_html("car_embedding_2d.html", auto_open=True)

8. 3D Visual

In [40]:
tsne_3d = TSNE(n_components=3, random_state=42)
X_3d = tsne_3d.fit_transform(X)

fig_3d = go.Figure(data=[go.Scatter3d(
    x=X_3d[:, 0],
    y=X_3d[:, 1],
    z=X_3d[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Car: {label}" for label in sample_labels],
    hoverinfo='text'
)])
fig_3d.update_layout(
    title='3D t-SNE Car Spec Embedding',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)
fig_3d.write_html("car_embedding_3d.html", auto_open=True)