In [2]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
%pip install sentence_transformers
from sentence_transformers import SentenceTransformer
import numpy as np

# Load pre-trained model and tokenizer.
# These two objects back the non-default branch of get_contextual_embeddings
# (use_sentence_transformer=False), which tokenizes with `tokenizer` and runs `model`.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)

# Alternatively, use SentenceTransformer for easier sentence embeddings.
# `sentence_model` is what get_contextual_embeddings calls by default.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [3]:
def get_contextual_embeddings(texts, use_sentence_transformer=True, max_length=128):
    """Embed one or more texts, returning one embedding row per text.

    Args:
        texts: A single text or a list of texts. A lone string is wrapped in a
            one-element list. Objects exposing ``tolist()`` (NumPy arrays,
            pandas Series) are converted to a plain list so that every element
            is embedded on its own. Any other non-list value is wrapped in a
            one-element list, as before.
        use_sentence_transformer: When True (default), encode with the
            module-level ``sentence_model``. When False, tokenize with the
            module-level ``tokenizer``, run the module-level ``model`` and use
            the hidden state at position 0 of each sequence.
        max_length: Truncation length handed to the tokenizer on the
            ``use_sentence_transformer=False`` path. Ignored otherwise.

    Returns:
        Whatever ``sentence_model.encode(texts)`` returns on the default path,
        or a NumPy array built from ``last_hidden_state[:, 0, :]`` on the
        tokenizer/model path.
    """
    if isinstance(texts, str):
        texts = [texts]
    elif not isinstance(texts, list):
        # Arrays/Series hold many texts; wrapping them whole would try to embed
        # the container as if it were a single text.
        converted = texts.tolist() if hasattr(texts, "tolist") else texts
        texts = converted if isinstance(converted, list) else [converted]

    if use_sentence_transformer:
        return sentence_model.encode(texts)

    inputs = tokenizer(texts, return_tensors="tf", padding=True, truncation=True, max_length=max_length)
    outputs = model(inputs)
    # Use the [CLS] token embedding (position 0) as the sentence representation
    return outputs.last_hidden_state[:, 0, :].numpy()

def contextual_similarity(emb1, emb2):
    """Return the cosine similarity between two embedding vectors.

    Args:
        emb1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        emb2: Second vector, same length as ``emb1``.

    Returns:
        ``dot(emb1, emb2) / (‖emb1‖ · ‖emb2‖)``. If either vector has zero
        norm the cosine is undefined; 0.0 is returned instead of dividing by
        zero (which would yield nan and emit a RuntimeWarning).
    """
    denominator = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    if denominator == 0:
        # A zero vector has no direction, so treat it as "not similar".
        return 0.0
    return np.dot(emb1, emb2) / denominator

def get_batch_embeddings(texts, batch_size=512):
    """Embed ``texts`` in chunks of ``batch_size`` and stack the results.

    Args:
        texts: The texts to embed. A lone string is treated as a single text;
            any other iterable is materialized into a list so that every
            chunk handed to ``get_contextual_embeddings`` is a real list.
        batch_size: Maximum number of texts per call to
            ``get_contextual_embeddings``. Must be at least 1.

    Returns:
        A 2-D NumPy array with one row per text, produced by ``np.vstack`` over
        the per-chunk results. For empty input an array of shape ``(0, 0)`` is
        returned (the embedding width is unknown without encoding anything).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # np.vstack refuses an empty sequence, so short-circuit here.
        return np.empty((0, 0))

    chunks = [
        get_contextual_embeddings(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(chunks)

# def get_similar_keyword_suggestions(keyword_list, corpus, top_n=25, batch_size=512):
#     keyword_embeddings = get_contextual_embeddings(keyword_list)
#     corpus_embeddings = get_batch_embeddings(corpus, batch_size)
    
#     suggestions = {}
#     for i, keyword in enumerate(keyword_list):
#         similarities = np.dot(corpus_embeddings, keyword_embeddings[i]) / (np.linalg.norm(corpus_embeddings, axis=1) * np.linalg.norm(keyword_embeddings[i]))
#         top_indices = np.argsort(similarities)[-top_n:][::-1]
#         suggestions[keyword] = [corpus[idx] for idx in top_indices]
    
#     return suggestions
def get_similar_keyword_suggestions(keyword_list, corpus, min_similarity=0.2, max_similarity=0.6, batch_size=512, max_suggestions=25):
    """Suggest corpus entries whose cosine similarity to each keyword lies in a band.

    Args:
        keyword_list: Keywords to find suggestions for. A single string is
            treated as a one-keyword list (rather than being iterated
            character by character).
        corpus: Indexable collection of candidate phrases.
        min_similarity: Lower bound (inclusive) on cosine similarity.
        max_similarity: Upper bound (inclusive) on cosine similarity.
        batch_size: Chunk size forwarded to ``get_batch_embeddings``.
        max_suggestions: Maximum number of suggestions kept per keyword.

    Returns:
        Dict mapping each keyword to a list of ``(corpus_entry, similarity)``
        tuples, ordered from most to least similar and limited to
        ``max_suggestions`` entries. An empty ``keyword_list`` yields ``{}``;
        an empty ``corpus`` yields an empty list for every keyword.
    """
    if isinstance(keyword_list, str):
        keyword_list = [keyword_list]
    else:
        keyword_list = list(keyword_list)

    # Nothing to embed: answer directly instead of failing while stacking or
    # multiplying empty embedding arrays.
    if len(keyword_list) == 0:
        return {}
    if len(corpus) == 0:
        return {keyword: [] for keyword in keyword_list}

    keyword_embeddings = get_contextual_embeddings(keyword_list)
    corpus_embeddings = get_batch_embeddings(corpus, batch_size)
    # The corpus norms do not depend on the keyword, so compute them once.
    corpus_norms = np.linalg.norm(corpus_embeddings, axis=1)

    suggestions = {}
    for i, keyword in enumerate(keyword_list):
        keyword_embedding = keyword_embeddings[i]
        similarities = np.dot(corpus_embeddings, keyword_embedding) / (corpus_norms * np.linalg.norm(keyword_embedding))

        # Filter similarities based on thresholds
        valid_indices = np.where((similarities >= min_similarity) & (similarities <= max_similarity))[0]

        # Sort the valid indices by similarity, highest first
        sorted_indices = valid_indices[np.argsort(similarities[valid_indices])[::-1]]

        # Keep the top suggestions (up to max_suggestions), with their scores
        suggestions[keyword] = [
            (corpus[idx], similarities[idx])
            for idx in sorted_indices[:max_suggestions]
        ]

    return suggestions

In [1]:
import pandas as pd
# Load the keyword table; its 'Keywords' column is used as the corpus in the example cell.
df = pd.read_csv('FinalGoogleKeywords.csv')

In [19]:
# Example usage: look up corpus phrases related to a handful of seed keywords.
keyword_list = ["jeans", "necklace", "denim shirt", "cardigan"]
corpus = df['Keywords'].to_list()

# Earlier setting tried: min_similarity=0.3, max_similarity=0.7, max_suggestions=5
suggestions = get_similar_keyword_suggestions(
    keyword_list,
    corpus,
    min_similarity=0.4,
    max_similarity=0.7,
    max_suggestions=20,
)

# Print one section per keyword, followed by a blank separator line.
for keyword, matches in suggestions.items():
    print(f"Suggestions for '{keyword}':")
    for phrase, score in matches:
        print(f"  - {phrase} (similarity: {score:.4f})")
    print()

Suggestions for 'jeans':
  - skinny trousers (similarity: 0.6997)
  - petite wide leg jeans (similarity: 0.6985)
  - petite jeans for women (similarity: 0.6984)
  - distressed jeans women (similarity: 0.6978)
  - black suit pants (similarity: 0.6974)
  - black high waist jeans for women (similarity: 0.6972)
  - old navy mens jeans (similarity: 0.6965)
  - joggers pants (similarity: 0.6959)
  - pants men (similarity: 0.6950)
  - maison margiela jeans (similarity: 0.6945)
  - white jeans outfit summer (similarity: 0.6941)
  - jeans jacket men (similarity: 0.6935)
  - grey jeans women (similarity: 0.6929)
  - adidas pants (similarity: 0.6925)
  - bape pants (similarity: 0.6923)
  - pink trousers (similarity: 0.6918)
  - pleather pants (similarity: 0.6906)
  - outdoor pants (similarity: 0.6905)
  - jeans shirt for men (similarity: 0.6899)
  - ralph lauren pants (similarity: 0.6897)

Suggestions for 'necklace':
  - gorjana necklace (similarity: 0.6991)
  - beloved jewelry (similarity: 0.698

In [8]:
# Display the raw mapping: keyword -> list of (corpus phrase, similarity) tuples.
# NOTE(review): the saved output of this cell lists a 'dress' key that is not in the
# current keyword_list, so it appears to predate the example cell — re-run to refresh.
suggestions

{'jeans': [('doc marten summer outfits', 0.29990375),
  ('swish body faja', 0.2998757),
  ('edwardian undergarments', 0.2998671),
  ('mens sweater vest', 0.29985845),
  ('push up bra lingerie', 0.29983717),
  ("macy's jewelry", 0.2998363),
  ('purple midi dress', 0.29981062),
  ('best girdle for dresses', 0.29979995),
  ('summer disneyland outfits', 0.2997862),
  ('gold long earrings', 0.2997671),
  ('summer dinner outfits mens', 0.2997512),
  ('carhartt shirt jacket', 0.2997137),
  ('nike tennis shoes women', 0.29968315),
  ('best spanx for tummy under dress', 0.2996818),
  ('zara shoes women', 0.29967126),
  ('winter dinner outfits', 0.2996631),
  ('long sleeve lace dress', 0.29962787),
  ('tummy control underwear for low back dress', 0.29960233),
  ('platform sandals for women', 0.29954478),
  ('cute summer dinner outfits', 0.29954335)],
 'dress': [('ladies panti', 0.29999998),
  ('casadei shoes', 0.29999772),
  ('dexter shoes', 0.29999352),
  ('white mountain shoes', 0.29995158),
 