In [71]:
import pandas as pd
import pickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import re
import gensim
from skimage import io
import matplotlib.pyplot as plt
from gensim.test.utils import get_tmpfile

In [72]:
df = pd.read_pickle('./perfume_data.pkl')

In [73]:
df.head()

Unnamed: 0,title,imgae,description,notes
0,Creation-E Parfum Cologne,https://www.luckyscent.com//images/products/74...,"Roja’s much loved Creation-E, or Enigma as it’...","bergamot, geranium, rose de mai, neroli, jasmi..."
1,Baccarat Rouge 540,https://www.luckyscent.com//images/products/49...,"In the Fall of 2014, Baccarat, the most presti...","Citrus, jasmine, saffron, sage, ambergris, oak..."
2,Bee,https://www.luckyscent.com//images/products/76...,We don’t usually play favorites. We love all o...,"orange, ginger syrup, royal jelly accord, broo..."
3,Pacific Rock Moss,https://www.luckyscent.com//images/products/79...,"A bona fide fragrance sensation, Pacific Rock ...","Australian coastal moss, lemon, sage, geranium..."
4,Ani,https://www.luckyscent.com//images/products/77...,An ancient metropolis now abandoned to the age...,"Bergamot, green notes, blue ginger, pink peppe..."


In [74]:
df['description'][0]

'Roja’s much loved Creation-E, or Enigma as it’s known over the pond, is probably the ne-plus-ultra of private men’s club genre - a heady swirl of cognac, rum, cane sugar, cigar tobacco, vanilla, and gingery aromas that wraps around its wearer like an ermine cloak custom-made in one of London’s Saville Row tailors. But maybe – just maybe – you can’t wear an ermine cloak of a scent every day (though Roja would probably argue with us on that one). \rIf you love the rich warmth of this spicy oriental and just don’t know how to quit it, then take heart – Roja Dove has heard you. Creation-E Parfum Cologne is a much lighter, fresher take on the original eau de parfum that’s perfect for both those early office meetings and stupid hot weather.  \rThe warm booziness of cognac and rum is still there, but cast over a frame that features lighter, fresher qualities of woods, tobacco, and jasmine, you get less of that ‘rum-and-coke’ feel that’s not really appropriate outside of date night or deep wi

#  Clean data

In [75]:
def stem_words(text):
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

In [76]:
df['description'] = df.description.apply(func=make_lower_case)
df['description'] = df.description.apply(func=remove_stop_words)
df['description'] = df.description.apply(func=remove_punctuation)
df['description'] = df.description.apply(func=stem_words)

In [77]:
df['notes'] = df.description.apply(func=make_lower_case)
df['notes'] = df.description.apply(func=remove_punctuation)
df['notes'] = df.description.apply(func=stem_words)

In [78]:
df['description'][0]

'roja s much love creation e enigma it s known pond probabl ne plus ultra privat men s club genr headi swirl cognac rum cane sugar cigar tobacco vanilla gingeri aroma wrap around wearer like ermin cloak custom made one london s savill row tailor mayb mayb can t wear ermin cloak scent everi day though roja would probabl argu us one love rich warmth spici orient don t know quit it take heart roja dove heard you creation e parfum cologn much lighter fresher take origin eau de parfum that s perfect earli offic meet stupid hot weather warm boozi cognac rum still there cast frame featur lighter fresher qualiti wood tobacco jasmin get less rum and coke feel that s realli appropri outsid date night deep winter light spice powder tobacco affair still get complex jasmin layer wood spice leather wood creation e parfum cologn airi genois fruitcak origin iter gorgeous choos accord lifestyl person'

#  TF-IDF model

In [79]:
df['full_document'] = df['description'] + ' ' + df['notes']

In [80]:
#Fit TFIDF 
#Learn vocabulary and tfidf from all style_ids.
tf = TfidfVectorizer(analyzer='word', 
                     min_df=10,
                     ngram_range=(1, 2),
                     #max_features=1000,
                     stop_words='english')
tf.fit(df['full_document'])

#Transform style_id products to document-term matrix.
tfidf_matrix = tf.transform(df['full_document'])
pickle.dump(tf, open("models/tfidf_model.pkl", "wb"))

print(tfidf_matrix.shape)


(2288, 3615)


In [81]:
# Compress with SVD
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500)
latent_matrix = svd.fit_transform(tfidf_matrix)
pickle.dump(svd, open("models/svd_model.pkl", "wb"))

In [82]:
print(latent_matrix.shape)
print(latent_matrix)

(2288, 500)
[[ 2.56066064e-01 -1.60660332e-01 -3.74097281e-03 ... -2.65173908e-04
   1.28291562e-04 -2.13379956e-04]
 [ 2.56277794e-01 -6.13705159e-01  2.52772069e-01 ...  2.88009412e-04
  -3.44728004e-04  1.35107576e-04]
 [ 2.70536656e-01 -1.16768852e-01 -5.34095910e-01 ... -4.50300373e-04
   5.05087917e-04  5.29611378e-05]
 ...
 [ 1.12383416e-01 -1.45280927e-02 -1.68938609e-02 ... -2.87658095e-02
   1.03651861e-02  1.89922949e-02]
 [ 1.40468757e-01 -1.57948640e-02 -7.02591002e-02 ... -2.09660143e-02
   1.14553588e-02  3.55166052e-02]
 [ 1.36563592e-01  6.52840220e-03 -1.00144847e-02 ...  7.11554379e-03
   1.85367905e-02 -2.76941765e-02]]


In [83]:
n = 15 #pick components to reduce model training time

doc_labels = df.title
svd_feature_matrix = pd.DataFrame(latent_matrix[:,0:n] ,index=doc_labels)
print(svd_feature_matrix.shape)
print(svd_feature_matrix.head())

pickle.dump(svd_feature_matrix, open("models/lsa_embeddings.pkl", "wb"))

(2288, 15)
                                  0         1         2         3         4  \
title                                                                         
Creation-E Parfum Cologne  0.256066 -0.160660 -0.003741 -0.525072 -0.047155   
Baccarat Rouge 540         0.256278 -0.613705  0.252772 -0.075099  0.108815   
Bee                        0.270537 -0.116769 -0.534096 -0.185133 -0.382975   
Pacific Rock Moss          0.218100  0.336110 -0.299989  0.079515  0.752934   
Ani                        0.289802  0.067467  0.386150 -0.259278  0.259697   

                                  5         6         7         8         9  \
title                                                                         
Creation-E Parfum Cologne -0.500964 -0.571709 -0.022680  0.226952  0.004193   
Baccarat Rouge 540         0.576175 -0.071931 -0.248579  0.276881  0.000810   
Bee                        0.200613  0.228671  0.527381  0.275221  0.006128   
Pacific Rock Moss          0.195581 -0.2

# Doc2Vec model

In [84]:
#Use  descriptions for vocabulary 
descriptions = df.description.values.tolist()
#notes = df.notes.values.tolist() #not using notes because sematics and order of list is not meaningfull. 

documents = []
for i in range(len(df)):
    mystr = descriptions[i]
    documents.append(re.sub("[^\w]", " ",  mystr).split())

In [85]:
print(len(df))
print(len(documents))

2288
2288


In [86]:
print(descriptions[0])

roja s much love creation e enigma it s known pond probabl ne plus ultra privat men s club genr headi swirl cognac rum cane sugar cigar tobacco vanilla gingeri aroma wrap around wearer like ermin cloak custom made one london s savill row tailor mayb mayb can t wear ermin cloak scent everi day though roja would probabl argu us one love rich warmth spici orient don t know quit it take heart roja dove heard you creation e parfum cologn much lighter fresher take origin eau de parfum that s perfect earli offic meet stupid hot weather warm boozi cognac rum still there cast frame featur lighter fresher qualiti wood tobacco jasmin get less rum and coke feel that s realli appropri outsid date night deep winter light spice powder tobacco affair still get complex jasmin layer wood spice leather wood creation e parfum cologn airi genois fruitcak origin iter gorgeous choos accord lifestyl person


In [87]:
formatted_documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

model = gensim.models.doc2vec.Doc2Vec(vector_size=15, min_count=5, epochs=100, seed=0, dm=0)
model.build_vocab(formatted_documents)

In [88]:
%time model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 14.5 s


In [89]:
fname = get_tmpfile("models/doc2vec_model")
model.save("models/doc2vec_model")
model = gensim.models.doc2vec.Doc2Vec.load("./models/doc2vec_model")

In [90]:
vector = model.infer_vector(doc_words=["this", "is", "a", "test"], epochs=50)
vector

array([-0.7054569 ,  0.4365423 ,  0.41396883, -0.10957432,  0.16441976,
        0.06373617, -0.808805  ,  0.57922715,  0.56320035, -0.08707153,
        0.4541724 , -0.05609159, -0.77222055,  0.2664958 ,  0.8411225 ],
      dtype=float32)

In [91]:
doctovec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df.title)
print(doctovec_feature_matrix.shape)
print(doctovec_feature_matrix.head())
pickle.dump(doctovec_feature_matrix, open("models/doctovec_embeddings.pkl", "wb"))

(2288, 15)
                                  0         1         2         3         4  \
title                                                                         
Creation-E Parfum Cologne  0.836062  1.895215 -0.516446  0.991018 -2.421540   
Baccarat Rouge 540         0.004957  1.053481 -1.103654  2.033489 -4.019248   
Bee                       -1.487729 -0.472993 -1.150733  1.383259 -1.438663   
Pacific Rock Moss         -2.584111  1.564771  2.985204 -0.556560 -2.596961   
Ani                       -0.885993  1.186680 -0.663744  0.905095 -1.787201   

                                  5         6         7         8         9  \
title                                                                         
Creation-E Parfum Cologne -0.713522 -0.152724  3.299130  1.626242  0.975157   
Baccarat Rouge 540        -0.197712 -1.748033 -0.815798  0.903669 -0.794170   
Bee                       -1.446842 -2.462992  0.541889 -1.002507  2.152622   
Pacific Rock Moss         -0.952518 -1.0