In [1]:
from pyemd import emd
from gensim.similarities import WmdSimilarity
import pandas as pd
from collections import defaultdict
from gensim import corpora

In [2]:
whiskyp = pd.read_parquet('data/whisky_processed.parquet')

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


## Build Corpuses

In [3]:
def buildcorpus(column, texts=False):

    # convert to list of lists
    texts = [list(document) for document in column]

    # remove words that only occur once since they won't help find similarities
    # TODO?: if this doesn't give results try tfidf instead here

    frequency = defaultdict(int)
    # calculate frequencies
    for text in texts:
        for token in text:
            frequency[token] += 1

    # filter frequencies
    texts = [
        [token for token in text if frequency[token] > 1]
        for text in texts
    ]

    # build a dictionary for the corpus to know which id is which word
    dictionary = corpora.Dictionary(texts)

    # convert documents to vectors
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    if texts:
        # user requested texts as well
        return dictionary, corpus, texts
    else:
        # dont bother with texts
        return dictionary, corpus

# function to easily save dictionary and corpus files
def savecorpus(dictionary, corpus, name):
    dictpath = 'data/' + name + '.dict'
    corpuspath = 'data/' + name + '.mm'
    dictionary.save(dictpath)
    print('Saved dict to ' + dictpath)
    corpora.MmCorpus.serialize(corpuspath, corpus)
    print('Saved corpus to ' + corpuspath)
    return True

# function to easily load dictionary and corpus files
def loadcorpus(name):
    dictpath = 'data/' + name + '.dict'
    corpuspath = 'data/' + name + '.mm'
    
    dictionary = corpora.Dictionary.load(dictpath)
    corpus = corpora.MmCorpus(corpuspath)
    
    return dictionary, corpus

In [4]:
# generate dictionaries and corpuses for each column
nose_dict  , nose_corpus  , nose_texts   = buildcorpus(whiskyp.nose  )
taste_dict , taste_corpus , taste_texts  = buildcorpus(whiskyp.taste )
finish_dict, finish_corpus, finish_texts = buildcorpus(whiskyp.finish)

# save to file
savecorpus(nose_dict  , nose_corpus  , 'nose'  )
savecorpus(taste_dict , taste_corpus , 'taste' )
savecorpus(finish_dict, finish_corpus, 'finish')

# to load later:
#nose_dict  , nose_corpus   = loadcorpus('nose'  )
#taste_dict , taste_corpus  = loadcorpus('taste' )
#finish_dict, finish_corpus = loadcorpus('finish')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Saved dict to data/nose.dict
Saved corpus to data/nose.mm
Saved dict to data/taste.dict
Saved corpus to data/taste.mm
Saved dict to data/finish.dict
Saved corpus to data/finish.mm


True

In [5]:
#TODO?: Apply TFIDF here if needed
# https://radimrehurek.com/gensim/tut2.html



## Word2Vec

In [62]:
from gensim.models import Word2Vec

# let's train on the full dataset of nose, taste, and finish. This is more data to train on so should yield better results
texts = nose_texts + taste_texts + finish_texts

# Train Word2Vec:
model = Word2Vec(texts, size=100)

# Normalizes the vectors in the word2vec class.
model.init_sims(replace=True)  

In [65]:
# Test model
word = 'toffee'
model.wv.most_similar(positive=word)

[('fudge', 0.7731459140777588),
 ('seaweeod', 0.7609512805938721),
 ('honey', 0.6912168264389038),
 ('almond', 0.6875858306884766),
 ('hazelnut', 0.6726166009902954),
 ('marzipan', 0.6705068349838257),
 ('nougat', 0.6691604852676392),
 ('walnut', 0.6665315628051758),
 ('cream', 0.6619576215744019),
 ('toasty', 0.6591719388961792)]

## Word Mover Distance

In [67]:
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb

In [69]:
# test a query
query = texts[0]
instance = WmdSimilarity(nose_texts, model, num_best=10)
sims = instance[query]

In [94]:
#whiskyp.reset_index()['Wild'.instr('Name')]

whiskyp.reset_index()[whiskyp.reset_index()['Name'].str.contains("WILD")]

Unnamed: 0,Name,itemnumber,RedditWhiskyIDs,reviewIDs,rating_mean,rating_std,style,nose,taste,finish
405,WILD TURKEY 101 KENTUCKY STRAIGHT BOURBON,479949,"[7652, 7653, 7654, 7657, 7658, 7659, 7660, 766...","[30255, 30256, 30257, 30258, 30259, 30260, 302...",82.778947,7.602042,Bourbon,"[fruity, overpower, alcohol, scent, sweet, cre...","[strong, oak, flavor, entire, mouth, initial, ...","[long, warm, burn, spicy, cinnamon, oak, tangy..."
406,WILD TURKEY 81 PROOF KENTUCKY STRAIGHT BOURBON,281824,[7686],"[30417, 30418, 30419, 30420, 30421, 30422, 304...",74.0,10.322086,Bourbon,"[vanilla, toffee, cinnamon, funk, background, ...","[oak, low, heat…, proof, oxidize, decade, expe...","[light, spice, short, clean, honestly, chug, p..."
407,WILD TURKEY LONGBRANCH,575001,[7723],"[30519, 30520, 30521, 30522, 30523]",75.2,10.425929,Bourbon,"[light, easy, smell, like, typical, bourbon, v...","[like, nose, sweet, oak, light, term, heat, su...","[light, warmth, medium, length, mellow, corn, ..."
408,WILD TURKEY RARE BREED KENTUCKY STRAIGHT BOURBON,455329,"[7681, 7682, 7683, 7684, 7685, 7693, 7695, 769...","[30409, 30410, 31487, 30412, 30413, 30414, 304...",84.130137,8.312238,Bourbon,"[cinnamon, nutmeg, honey, molasses, ginger, ol...","[damp, decay, oak, raisin, fig, savory, cream,...","[dry, rich, oak, currantsbalance, little, dry,..."


In [96]:
num = 405

# test a query
query = texts[num]
instance = WmdSimilarity(nose_texts, model, num_best=10)
sims = instance[query]

# Print the query and the retrieved documents, together with their similarities.
print ('Query:')
#print (query)
print(whiskyp.reset_index().Name.iloc[num])


#print 
for i in range(num_best):
    print(sims[i][1])
    #print (nose_texts[sims[i][0]])
    print(whiskyp.reset_index().Name.iloc[sims[i][0]])

Query:
WILD TURKEY 101 KENTUCKY STRAIGHT BOURBON
1.0
WILD TURKEY 101 KENTUCKY STRAIGHT BOURBON
0.8042383562254684
WILD TURKEY RARE BREED KENTUCKY STRAIGHT BOURBON
0.7762301146185003
FOUR ROSES SMALL BATCH BOURBON
0.7696283984608991
FOUR ROSES SINGLE BARREL BOURBON
0.7654284210558634
WELLER ANTIQUE 107 ORIGINAL WHEATED STRAIGHT BOURBON
0.7628704056238182
EVAN WILLIAMS SINGLE BARREL BOURBON
0.7620007955997348
W. L. WELLER 12-YEAR-OLD KENTUCKY STRAIGHT BOURBON
0.7599252339170192
BLANTON'S SINGLE BARREL SPECIAL RESERVE KENTUCKY STRAIGHT BOURBON
0.7590844822987646
OLD FORESTER
0.7589574820605155
COLONEL E.H. TAYLOR SINGLE BARREL KENTUCKY STRAIGHT BOURBON


In [95]:

# Print the query and the retrieved documents, together with their similarities.
print ('Query:')
#print (query)
print(whiskyp.reset_index().Name.iloc[405])
#print 
for i in range(num_best):
    print(sims[i][1])
    #print (nose_texts[sims[i][0]])
    print(whiskyp.reset_index().Name.iloc[sims[i][0]])

Query:
WILD TURKEY 101 KENTUCKY STRAIGHT BOURBON
1.0
12 YO KNAPPOGUE CASTLE IRISH SINGLE MALT WHISKEY
0.6830856045566778
BALBLAIR 2005 HIGHLAND SINGLE MALT SCOTCH WHISKY
0.6792056200964628
THE GLENLIVET 18 YEAR OLD SINGLE MALT SCOTCH WHISKY
0.6770222442160191
ANCNOC 12 YEAR OLD SINGLE MALT SCOTCH WHISKY
0.6759695414511607
DEANSTON 12 YEAR OLD SINGLE MALT
0.6745762423530303
AULTMORE 12 YEAR OLD SPEYSIDE SINGLE MALT SCOTCH WHISKY
0.6730929201080825
THE ARRAN MALT ISLE OF ARRAN 14-YEAR-OLD SINGLE MALT SCOTCH WHISKY
0.6723053109898927
THE BALVENIE 15-YEAR-OLD SINGLE BARREL SPEYSIDE SINGLE MALT SCOTCH WHISKY
0.6719093778709909
THE ARRAN MALT 10-YEAR-OLD SINGLE MALT SCOTCH WHISKY
0.6707176608302594
THE BALVENIE 12 YEAR OLD SINGLE BARREL SCOTCH WHISKY


In [83]:
whiskyp.reset_index().Name.iloc[0]

'12 YO KNAPPOGUE CASTLE IRISH SINGLE MALT WHISKEY'