In [8]:
pip install sentence-transformers

Collecting sentence-transformersNote: you may need to restart the kernel to use updated packages.





  Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
     -------------------------------------- 275.9/275.9 kB 5.7 MB/s eta 0:00:00
Collecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
     -------------------------------------- 468.0/468.0 kB 7.4 MB/s eta 0:00:00
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
     --------------------------------------- 10.0/10.0 MB 10.5 MB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
     -------------------------------------- 308.9/308.9 kB 9.6 MB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
     ---------------------------------------- 2.4/2.4 MB 3.2 MB/s eta 0:00:00
Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers, sentence-transformers
  Attempting uninstall: safetens

In [9]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Load the breed dataset from a CSV file located via a path relative to the working directory.
data = pd.read_csv('./akc-data-latest.csv')

In [3]:
# Show each column's dtype and non-null count to spot where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Dog_name                     277 non-null    object 
 1   description                  277 non-null    object 
 2   temperament                  276 non-null    object 
 3   popularity                   198 non-null    object 
 4   min_height                   277 non-null    float64
 5   max_height                   277 non-null    float64
 6   min_weight                   275 non-null    float64
 7   max_weight                   275 non-null    float64
 8   min_expectancy               274 non-null    float64
 9   max_expectancy               274 non-null    float64
 10  group                        277 non-null    object 
 11  grooming_frequency_value     270 non-null    float64
 12  grooming_frequency_category  270 non-null    object 
 13  shedding_value      

In [4]:
# Count distinct breed names; comparing this with the row count reveals duplicate names.
data.Dog_name.nunique()

277

In [5]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,grooming_frequency_value,shedding_value,energy_level_value,trainability_value,demeanor_value
count,277.0,277.0,275.0,275.0,274.0,274.0,270.0,257.0,271.0,253.0,252.0
mean,44.225801,52.720588,17.888858,27.291416,11.306569,13.832117,0.425926,0.529183,0.712915,0.624506,0.620635
std,14.238298,15.885454,12.2906,19.061416,1.817949,2.016668,0.198306,0.189068,0.168927,0.247271,0.201713
min,12.7,17.78,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2
25%,33.02,38.1,8.164663,12.927383,10.0,13.0,0.2,0.4,0.6,0.4,0.4
50%,45.085,53.34,15.875733,24.94758,12.0,14.0,0.4,0.6,0.6,0.6,0.6
75%,55.88,66.04,22.679619,34.019428,12.0,15.0,0.6,0.6,0.8,0.8,0.8
max,76.2,88.9,68.038855,108.862169,16.0,19.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# List the column names available in the dataset.
data.columns

Index(['Dog_name', 'description', 'temperament', 'popularity', 'min_height',
       'max_height', 'min_weight', 'max_weight', 'min_expectancy',
       'max_expectancy', 'group', 'grooming_frequency_value',
       'grooming_frequency_category', 'shedding_value', 'shedding_category',
       'energy_level_value', 'energy_level_category', 'trainability_value',
       'trainability_category', 'demeanor_value', 'demeanor_category'],
      dtype='object')

In [12]:
# Drop every row that has a missing value in ANY column; this mutates `data` in place.
# NOTE(review): rows are also discarded when the only gap is in a column that
# create_breed_text never reads (e.g. `popularity`), which shrinks the set of
# breeds that can be recommended. Consider dropna(subset=[...]) limited to the
# text columns actually used - TODO confirm intent.
data.dropna(axis=0, inplace=True)

In [13]:
# Check how many rows and columns `data` holds at this point.
data.shape

(188, 21)

In [14]:
def create_breed_text(row: pd.Series) -> str:
    """
    Build the single text document that represents one breed.

    The descriptive text fields of the row are stringified and joined with a
    single space, in a fixed order. Fields whose value is missing (NaN/None)
    are skipped. 'Dog_name' is deliberately left out of the document.
    """
    # Fields that contribute to the document, in the order they are joined.
    text_fields = (
        'description',
        'temperament',
        'energy_level_category',
        'trainability_category',
        'demeanor_category',
        'grooming_frequency_category',
        'shedding_category',
    )

    parts = []
    for field in text_fields:
        value = row[field]
        # Missing entries would otherwise show up as the literal text "nan".
        if pd.notna(value):
            parts.append(str(value))
    return " ".join(parts)

def create_breed_embeddings(df: pd.DataFrame, model: SentenceTransformer) -> torch.Tensor:
    """
    Encode one aggregated text document per row of `df`.

    Side effect: a 'breed_text' column holding the aggregated text is written
    onto `df` itself (the caller's frame is modified, not a copy).

    Returns the result of `model.encode` called with convert_to_tensor=True,
    with one entry per row of `df`, in row order.
    """
    # Row-wise aggregation of the descriptive fields into a single string.
    aggregated = df.apply(create_breed_text, axis=1)
    df['breed_text'] = aggregated

    # Encode all documents in a single call.
    return model.encode(df['breed_text'].tolist(), convert_to_tensor=True)

In [17]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model, then encode one aggregated
# text per row of `data`. Note that create_breed_embeddings also adds a
# 'breed_text' column to `data` as a side effect.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = create_breed_embeddings(data, model=model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [18]:
# Peek at the embedding tensor.
embeddings

tensor([[-0.0575,  0.0033,  0.0428,  ..., -0.0549,  0.0493,  0.0666],
        [-0.0670,  0.0820,  0.0762,  ..., -0.0179,  0.0767, -0.0050],
        [-0.0092,  0.0266,  0.0282,  ..., -0.0150, -0.0589,  0.0975],
        ...,
        [ 0.0129,  0.0251,  0.0221,  ...,  0.0050,  0.0036,  0.0264],
        [ 0.0508,  0.0530,  0.0181,  ..., -0.0454,  0.0456,  0.0269],
        [ 0.0538, -0.0459,  0.0782,  ..., -0.0160,  0.0280,  0.0713]])

In [19]:
# Number of embedded rows and the length of a single embedding vector.
len(embeddings), len(embeddings[0])

(188, 384)

In [20]:
def get_recommendation(
    query: str,
    df: pd.DataFrame,
    breed_embeddings: torch.Tensor,
    model: SentenceTransformer,
    top_k: int = 1,
) -> pd.DataFrame:
    """
    Rank breeds by cosine similarity to a free-text query.

    The query is encoded with `model`, scored against every entry of
    `breed_embeddings`, and the best-scoring rows of `df` are returned.

    Args:
        query: Free-text description of what the user is looking for.
        df: Breed table. Rows are selected by position, so row i of `df` must
            correspond to breed_embeddings[i].
        breed_embeddings: One embedding per row of `df`, e.g. the tensor
            returned by create_breed_embeddings.
        model: Encoder used for the query. Presumably this should be the same
            model that produced `breed_embeddings` so the vectors are comparable.
        top_k: Maximum number of breeds to return. Values larger than the number
            of available embeddings are capped instead of raising.

    Returns:
        A copy of the selected rows of `df`, best match first, with an added
        'similarity_score' column holding the cosine similarity.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Only one query is scored, so keep the first (only) row of the score matrix.
    cosine_scores = util.cos_sim(query_embedding, breed_embeddings)[0]

    # torch.topk raises when k exceeds the number of scores; cap it so asking
    # for more breeds than exist simply returns all of them, ranked.
    k = min(top_k, cosine_scores.shape[0])
    top_results = torch.topk(cosine_scores, k=k)

    # Copy so that adding the score column never touches the caller's frame.
    recommended_breeds = df.iloc[top_results.indices.cpu().numpy()].copy()
    recommended_breeds['similarity_score'] = top_results.values.cpu().numpy()
    return recommended_breeds

In [26]:
# Example query: a family with young children and little time for grooming.
query = "I have young kids and limited time for grooming. Which breed would suit my family?"
recommendation = get_recommendation(
    query=query,
    df=data,
    breed_embeddings=embeddings,
    model=model,
    top_k=3,
)

In [27]:
# Show the breed name together with the descriptive text fields of each match.
recommendation.loc[:, [
    'Dog_name',
    'description',
    'temperament',
    'energy_level_category',
    'trainability_category',
    'demeanor_category',
    'grooming_frequency_category',
    'shedding_category',
]]

Unnamed: 0,Dog_name,description,temperament,energy_level_category,trainability_category,demeanor_category,grooming_frequency_category,shedding_category
82,Collie,The Collie is a large but lithe herder standin...,"Devoted, Graceful, Proud",Regular Exercise,Easy Training,Friendly,2-3 Times a Week Brushing,Seasonal
34,Bergamasco Sheepdog,"Beneath the one-of-a-kind coat is a large, mus...","Independent, Sociable, Intelligent",Calm,Agreeable,Reserved with Strangers,Occasional Bath/Brush,Infrequent
109,Flat-Coated Retriever,The Flat-Coated Retriever’s eponymous flat-lyi...,"Cheerful, Optimistic, Good-Humored",Needs Lots of Activity,Eager to Please,Friendly,Weekly Brushing,Regularly


In [28]:
# Example query: breeds that are protective yet good with families.
query = "What breeds are known for being both protective and good with families?"
recommendation = get_recommendation(
    query=query,
    df=data,
    breed_embeddings=embeddings,
    model=model,
    top_k=3,
)
# Show the breed name together with the descriptive text fields of each match.
recommendation.loc[:, [
    'Dog_name',
    'description',
    'temperament',
    'energy_level_category',
    'trainability_category',
    'demeanor_category',
    'grooming_frequency_category',
    'shedding_category',
]]

Unnamed: 0,Dog_name,description,temperament,energy_level_category,trainability_category,demeanor_category,grooming_frequency_category,shedding_category
88,Dalmatian,"The Dalmatian’s delightful, eye-catching spots...","Dignified, Smart, Outgoing",Energetic,Agreeable,Alert/Responsive,Weekly Brushing,Frequent
106,Field Spaniel,Field Spaniels bear a family resemblance to Co...,"Sweet, Fun-Loving, Sensitive",Regular Exercise,Agreeable,Reserved with Strangers,Weekly Brushing,Regularly
34,Bergamasco Sheepdog,"Beneath the one-of-a-kind coat is a large, mus...","Independent, Sociable, Intelligent",Calm,Agreeable,Reserved with Strangers,Occasional Bath/Brush,Infrequent


In [30]:
# Read the full (untruncated) description of the first, i.e. best-scoring, recommendation.
recommendation.description.iloc[0]

'The Dalmatian’s delightful, eye-catching spots of black or liver adorn one of the most distinctive coats in the animal kingdom. Beneath the spots is a graceful, elegantly proportioned trotting dog standing between 19 and 23 inches at the shoulder. Dals are muscular, built to go the distance; the powerful hindquarters provide the drive behind the smooth, effortless gait.The Dal was originally bred to guard horses and coaches, and some of the old protective instinct remains. Reserved and dignified, Dals can be aloof with strangers and are dependable watchdogs. With their preferred humans, Dals are bright, loyal, and loving house dogs. They are strong, active athletes with great stamina—a wonderful partner for runners and hikers. The dignified Dalmatian, dogdom\'s citizen of the world, is famed for his spotted coat and unique job description. During their long history, these "coach dogs" have accompanied the horse-drawn rigs of nobles, gypsies, and firefighters.'