<a href="https://colab.research.google.com/github/sukritis312/perfume_recommendation_with_sentence_bert/blob/main/perfume_recommendation_with_sentence_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers
!pip install scikit-learn==0.23.1

In [None]:
!pip install skillsnetwork

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from numpy.linalg import norm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import skillsnetwork
sns.set_context('notebook')
sns.set_style('white')



In [None]:
#defining helper functions
def plotter(x,y,title):
  """Draw y against x as a line plot, label it, and show the figure.

  Args:
    x: values for the horizontal axis.
    y: values for the vertical axis.
    title: text used as the plot title.

  The axis labels are always the literal strings 'X' and 'Y'.
  """
  plt.plot(x,y)
  # Label through the axes that plt.plot just drew on.
  axes = plt.gca()
  axes.set_xlabel('X')
  axes.set_ylabel('Y')
  axes.set_title(title)
  plt.show()

In [None]:
# Three sample sentences used to demonstrate sentence encoding.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

In [None]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model used for every
# encoding step in this notebook.
model=SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode the sample sentences, asking for a numpy array back, then display
# the array's shape.
embeddings=model.encode(sentences,convert_to_numpy=True)
embeddings.shape

In [None]:
# Preview the first 50 components of the first sentence's embedding.
# (The previous `embeddings[0]:[50]` parsed as a bare annotation statement and
# displayed nothing.)
embeddings[0][:50]

In [None]:
#SBERT for analyzing semantic textual similarity(sts)
# Eight short sentences to compare; `embeddings` is rebound to their encodings.
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]
embeddings = model.encode(sentences, convert_to_numpy=True)

In [None]:
#function for calculating cosine similarity score
def cosine_similarity(a,b):
  """Return the cosine similarity of two 1-D vectors.

  Computed as dot(a, b) / (||a|| * ||b||).

  Args:
    a: first vector (numpy array or plain sequence of numbers).
    b: second vector, same length as ``a``.

  Returns:
    The similarity score. If either vector has zero norm the score is
    defined as 0.0 instead of the NaN a division by zero would produce.
  """
  denominator=norm(a)*norm(b)
  # A zero vector has no direction; avoid 0/0 and report "no similarity".
  if denominator==0:
    return 0.0
  # Divide the dot product (rather than pre-scaling b) so plain Python
  # sequences are accepted as well as numpy arrays.
  score=np.dot(a,b)/denominator
  return score

In [None]:
# Similarity between sentence 0 ('The cat sits outside') and
# sentence 1 ('A man is playing guitar').
cosine_similarity(embeddings[0],embeddings[1])

In [None]:
# Similarity between sentence 3 ('The new movie is awesome') and
# sentence 6 ('The new movie is so great').
cosine_similarity(embeddings[3],embeddings[6])

In [None]:
# Score every sentence embedding against every other one with util.cos_sim,
# then display the shape of the resulting score matrix.
cosine_scores=util.cos_sim(embeddings,embeddings)
cosine_scores.shape

In [None]:
# One record per unordered sentence pair (i < j): the two indices and the
# matching entry of the score matrix. The last expression shows the pair count.
n_sentences = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(n_sentences - 1)
    for j in range(i + 1, n_sentences)
]

len(pairs)

In [None]:
# Order the pairs from highest to lowest score and print the three best.
sorted_pairs = sorted(pairs, key=lambda entry: entry['score'], reverse=True)

top_three = sorted_pairs[:3]
for pair in top_three:
    i, j = pair['index']
    print(f"{sentences[i]} | {sentences[j]} \n Score: {pair['score']:.2f} \n")

In [None]:
# Fit a 2-component PCA on the sentence embeddings and keep the projected
# coordinates for plotting.
pca=PCA(n_components=2)
embeddings_reduced=pca.fit_transform(embeddings)

In [None]:
# Plot each sentence at its two PCA coordinates and label the point with the
# sentence text.
for (pc1, pc2), text in zip(embeddings_reduced, sentences):
  plt.scatter(pc1, pc2)
  plt.annotate(text, (pc1, pc2))

In [None]:
#download dataset
await skillsnetwork.prepare("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-GPXX068IEN/data/perfume_data.tgz", overwrite=True)

In [None]:
# Read the perfume table from the working directory and preview its first rows.
# NOTE(review): the file is decoded with "unicode_escape"; the odd "ï»¿Name"
# header renamed in the next cell looks like a byte-order mark surviving this
# decoding - confirm whether another encoding would be more appropriate.
df=pd.read_csv("./perfume_data.csv", encoding="unicode_escape")
df.head()

In [None]:
# Give the name column a clean header, prefix every name with its brand
# ("Brand - Name"), then remove the Description, Image URL and Brand columns.
df = df.rename(columns={"ï»¿Name": "Name"})
df = df.assign(Name=df['Brand'] + " - " + df['Name'])
df = df.drop(columns=['Description', 'Image URL', 'Brand'])
df.head()

In [None]:
#check for missing values
# Number of rows whose Notes entry is missing.
df['Notes'].isna().sum()

In [None]:
# Remove every row containing a missing value, renumber the index from 0,
# and display the resulting shape.
df = df.dropna().reset_index(drop=True)
df.shape

In [None]:
words = ["Perfume Oil", "Extrait", "Travel", "Hair", "Body", "Hand", "Intense", "Intensivo", "Oil"] # check for these words in perfume names

# Positions of all perfumes whose name contains any of the words above,
# compared case-insensitively as substrings.
index_to_drop = [
    position
    for position, perfume_name in enumerate(df.Name)
    if any(word.lower() in perfume_name.lower() for word in words)
]

In [None]:
# Drop the rows collected in index_to_drop, renumber the index from 0,
# and display the resulting shape.
df = df.drop(index=index_to_drop).reset_index(drop=True)
df.shape

In [None]:
# Convert every Notes value to str, pull the column out as a plain list,
# and display how many entries it holds.
df.Notes = df.Notes.map(str)
notes = df.Notes.tolist()
len(notes)

In [None]:
# Load the 'all-MiniLM-L6-v2' model again and encode every notes string,
# 64 at a time, with a progress bar shown while encoding.
model = SentenceTransformer('all-MiniLM-L6-v2')

note_embeddings = model.encode(notes, show_progress_bar=True, batch_size=64)

In [None]:
# Show the embedding matrix's shape, then the first 50 components of the
# first perfume's embedding.
print(note_embeddings.shape)
print(note_embeddings[0, :50])

In [None]:
#recommending perfumes using cosine similarity
# Score every note embedding against every other one with util.cos_sim,
# then display the shape of the resulting score matrix.
cosine_scores = util.cos_sim(note_embeddings, note_embeddings)
cosine_scores.shape

In [None]:
# One record per unordered perfume pair (i < j): the two row positions and
# their similarity score.
#
# The upper triangle (diagonal excluded) is extracted in a single vectorised
# step instead of indexing the score tensor once per pair inside a Python
# double loop, which is very slow for a whole catalogue. np.triu_indices
# yields the pairs in the same row-by-row order the nested loops produced.
# Scores are stored as plain Python floats, which sort and format ('.2f')
# just like the tensor scalars did.
score_matrix = cosine_scores.cpu().numpy()
row_idx, col_idx = np.triu_indices(score_matrix.shape[0], k=1)
pairs = [
    {"index": [i, j], "score": score}
    for i, j, score in zip(
        row_idx.tolist(), col_idx.tolist(), score_matrix[row_idx, col_idx].tolist()
    )
]

len(pairs)

In [None]:
# Order the perfume pairs from highest to lowest score and print the ten best,
# showing the first column of df for both perfumes of each pair.
sorted_pairs = sorted(pairs, key=lambda entry: entry['score'], reverse=True)

top_ten = sorted_pairs[:10]
for pair in top_ten:
    i, j = pair['index']
    print(f"{df.iloc[i, 0]} | {df.iloc[j, 0]} \n Score: {pair['score']:.2f} \n")

In [None]:
# A hand-written table of perfumes to find matches for. Each row holds two
# strings - a perfume name and its comma-separated notes - and the frame reuses
# the column labels of df.
# NOTE(review): this assumes df has exactly two columns at this point
# (presumably Name and Notes after the earlier drop) - confirm.
# NOTE(review): the last row's name has no "Brand - " separator unlike the
# others; verify whether that is intentional.
my_perfumes = pd.DataFrame([['Jo Malone - English Pear & Freesia', 'Pear, Melon, Freesia, Rose, Musk, Patchouli, Rhuburb, Amber'],
                      ['Jo Malone - Myrrh & Tonka', 'Lavender, Myrrh, Tonka Bean, Vanilla, Almond'],
                      ['Jo Malone - Oud & Bergamot', 'orange, bergamot, lemon, cedar and oud.'],
                      ['Guerlain - Néroli Outrenoir', 'Petitgrain, Bergamot, Tangerine, Lemon, Grapefruit, Tea, Neroli, Orange Blossom, Smoke, Earthy Notes, Myrrh, Vanilla, Benzoin, Ambrette, Oakmoss.'],
                      ['Guerlain - Épices Volées', 'Coriander, Lemon, Artemisia, Bergamot, Clove, Cardamom, Sage, Bulgarian Rose, Sandalwood, Patchouli, Benzoin, Labdanum.'],
                      ['Guerlain - Aqua Allegoria Nerolia Vetiver Eau de Toilette', 'Basil, Vetiver, Fig Accord, Neroli'],
                      ['Chloe Eau de Parfum', 'Peony, Litchi, Freesia, Rose, Lily-of-the-Valley, Magnolia, Virginia Cedar, Amber.']
                     ],
                   columns=df.columns)

# Display the table.
my_perfumes

In [None]:
# Rebind `notes` to the notes of the hand-picked perfumes, load the
# 'all-MiniLM-L6-v2' model again, and encode those notes with a progress bar.
notes = my_perfumes.Notes.tolist()

model = SentenceTransformer('all-MiniLM-L6-v2')
my_embeddings = model.encode(notes, show_progress_bar=True)

In [None]:
# Score each of my perfumes' embeddings against every catalogue note embedding;
# `cosine_scores` is rebound to this new score matrix.
cosine_scores = util.cos_sim(my_embeddings, note_embeddings)

In [None]:
# One record per (my perfume i, catalogue perfume j) combination with its
# similarity score. Rows of cosine_scores follow my_embeddings and columns
# follow note_embeddings, matching the argument order of the cos_sim call.
# (The inner range previously read the undefined name `cosine_score`, which
# raised a NameError.)
n_mine, n_catalogue = cosine_scores.shape
my_pairs = [
    {"index": [i, j], "score": cosine_scores[i][j]}
    for i in range(n_mine)
    for j in range(n_catalogue)
]

# Best matches first.
my_sorted_pairs = sorted(my_pairs, key=lambda x: x['score'], reverse=True)