In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NLP Imports 
# set up spacy
import spacy

# get stopwords from NLTK
from nltk.corpus import stopwords
stopwords = stopwords.words("english")

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import PCA

In [2]:
sake = pd.read_json('prepared_data.json')
sake.head()

Unnamed: 0,url,name,type,price,prefecture,smv,acidity,kw_word,kw_wines,kw_beer,kw_foods,description
0,https://www.truesake.com/collections/all/produ...,"Dassai 45 Junmai Daiginjo ""Otter Festival""",junmai daiginjo,30,yamaguchi,3.0,1.4,chunky,pinot noir chewy whites,ales,mushroom risotto tempura fried chicken caviar ...,this sake has an older brother daiginjo that h...
1,https://www.truesake.com/collections/all/produ...,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",junmai ginjo,33,niigata,1.0,1.6,citrus,sharp whites dry reds,ales ambers,suitable for light and stronger flavored dishe...,true sake owner beau timken recorded kikusui a...
2,https://www.truesake.com/collections/all/produ...,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",junmai daiginjo,49,shizuoka,0.0,1.4,popular,elegant reds complex whites,crisp ales,cuisines that like big white wines,talk about a nice aroma package on this ultrap...
3,https://www.truesake.com/collections/all/produ...,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",tokubetsu junmai,9,yamanashi,3.0,1.6,smooth,pinot noir slippery whites,gentle ales,none,the nose on this awesome looking catch cup is ...
4,https://www.truesake.com/collections/all/produ...,"Kenbishi Kuromatsu Honjozo ""Black Pine""",honjozo,40,hyogo,0.5,1.5,rich,deep reds fat whites,stouts,perfect for pub food salty and savory and grea...,the nose on this extremely famous honjozo from...


In [3]:
df = sake[['name', 'url', 'price', 'smv', 'acidity', 'type', 'prefecture', 'kw_word', 'kw_wines', 'kw_beer', 'kw_foods', 'description']]

df.head()

Unnamed: 0,name,url,price,smv,acidity,type,prefecture,kw_word,kw_wines,kw_beer,kw_foods,description
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",https://www.truesake.com/collections/all/produ...,30,3.0,1.4,junmai daiginjo,yamaguchi,chunky,pinot noir chewy whites,ales,mushroom risotto tempura fried chicken caviar ...,this sake has an older brother daiginjo that h...
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",https://www.truesake.com/collections/all/produ...,33,1.0,1.6,junmai ginjo,niigata,citrus,sharp whites dry reds,ales ambers,suitable for light and stronger flavored dishe...,true sake owner beau timken recorded kikusui a...
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",https://www.truesake.com/collections/all/produ...,49,0.0,1.4,junmai daiginjo,shizuoka,popular,elegant reds complex whites,crisp ales,cuisines that like big white wines,talk about a nice aroma package on this ultrap...
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",https://www.truesake.com/collections/all/produ...,9,3.0,1.6,tokubetsu junmai,yamanashi,smooth,pinot noir slippery whites,gentle ales,none,the nose on this awesome looking catch cup is ...
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",https://www.truesake.com/collections/all/produ...,40,0.5,1.5,honjozo,hyogo,rich,deep reds fat whites,stouts,perfect for pub food salty and savory and grea...,the nose on this extremely famous honjozo from...


In [4]:
disabled_components = ['parser', 'tagger', 'ner']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)

In [5]:
## Lemmatize!

def lemmatize(doc):
    doc = nlp(doc)
    lemma_list = [str(tok.lemma_) for tok in doc if tok.text not in stopwords]
    lem_string = " ".join(lemma_list)
    return lem_string

In [6]:
df['kw_foods'] = df['kw_foods'].map(lemmatize)
df['description'] = df['description'].map(lemmatize)

In [7]:
df['text'] = df['type'] + ' ' + \
            df['prefecture'] + ' ' + \
            df['kw_word'] + ' ' + \
            df['kw_wines'] + ' ' + \
            df['kw_beer'] + ' ' + \
            df['kw_foods'] + ' ' + \
            df['description']

df.drop(columns=['type', 
                 'prefecture', 
                 'kw_word', 
                 'kw_wines', 
                 'kw_beer', 
                 'kw_foods', 
                 'description'], inplace=True)

In [8]:
df.head()

Unnamed: 0,name,url,price,smv,acidity,text
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",https://www.truesake.com/collections/all/produ...,30,3.0,1.4,junmai daiginjo yamaguchi chunky pinot noir c...
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",https://www.truesake.com/collections/all/produ...,33,1.0,1.6,junmai ginjo niigata citrus sharp whites dry r...
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",https://www.truesake.com/collections/all/produ...,49,0.0,1.4,junmai daiginjo shizuoka popular elegant reds ...
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",https://www.truesake.com/collections/all/produ...,9,3.0,1.6,tokubetsu junmai yamanashi smooth pinot noir s...
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",https://www.truesake.com/collections/all/produ...,40,0.5,1.5,honjozo hyogo rich deep reds fat whites stouts...


In [9]:
df.text[1]

'junmai ginjo niigata citrus sharp whites dry reds ales ambers suitable light strong flavor dish dim sum chicken pork mild spice creamy cheese fresh fruit true sake owner beau timken record kikusui \ufeff1 entry personal journal 200 sake would late purchase kikusui \ufeff1 case sake thus kikusui hold special place shelve ginjo slightly fruity nose hint citrus persimmon kikusui confidently smooth begin gradual thicken viscosity end slightly salty aftertaste dry sharp smooth accord ginjo sit good mouth tease sweet fruit tone dry finish kikusui ferment low temperature long ginjos rid sake sometimes heavy characteristic junmai thus give overall light dry complexion gin vodka drinker enjoy'

In [10]:
# Vectorize
v = TfidfVectorizer(max_features=1000)
text_tfidf = v.fit_transform(df['text'])
txt = text_tfidf.toarray()

# Create tfidf_df
tfidf_df = pd.DataFrame(txt, columns=v.get_feature_names())

In [11]:
tfidf_df.head()

Unnamed: 0,10,12,17,18,20,23,40,50,80,achieve,...,yellow,yes,yet,yogurt,young,yup,yuzu,zesty,zins,zone
0,0.0,0.0,0.0,0.0,0.0,0.155144,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.163644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df = pd.concat([df, tfidf_df], axis=1)

df.drop(columns=['text'], inplace=True)

In [13]:
df.head()

Unnamed: 0,name,url,price,smv,acidity,10,12,17,18,20,...,yellow,yes,yet,yogurt,young,yup,yuzu,zesty,zins,zone
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",https://www.truesake.com/collections/all/produ...,30,3.0,1.4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",https://www.truesake.com/collections/all/produ...,33,1.0,1.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",https://www.truesake.com/collections/all/produ...,49,0.0,1.4,0.0,0.0,0.163644,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",https://www.truesake.com/collections/all/produ...,9,3.0,1.6,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",https://www.truesake.com/collections/all/produ...,40,0.5,1.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
pca = PCA(n_components=2)

df_pca = pca.fit_transform(df.iloc[:, 5:])

In [15]:
# exp_variances = pca.explained_variance_ratio_.cumsum()

# # Plot explained variance
# plt.plot(exp_variances);
# plt.xlabel("Number of Principal Components")
# plt.ylabel("Explained Variance")

In [16]:
df = df[['name', 'url', 'price', 'smv', 'acidity']]

In [17]:
df['pca_1'] = df_pca[:, 0]
df['pca_2'] = df_pca[:, 1]

In [18]:
df.head()

Unnamed: 0,name,name.1,url,price,smv,smv.1,acidity,acidity.1,pca_1,pca_2
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",0.0,https://www.truesake.com/collections/all/produ...,30,3.0,0.0,1.4,0.0,-0.059754,0.01232
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",0.0,https://www.truesake.com/collections/all/produ...,33,1.0,0.0,1.6,0.0,-0.030246,-0.052947
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",0.0,https://www.truesake.com/collections/all/produ...,49,0.0,0.0,1.4,0.087142,-0.104865,-0.017802
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",0.0,https://www.truesake.com/collections/all/produ...,9,3.0,0.0,1.6,0.0,0.03832,0.062917
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",0.0,https://www.truesake.com/collections/all/produ...,40,0.5,0.0,1.5,0.0,-0.061922,0.304609


In [19]:
# Getting this weird magic duplicate 'name' column?

df.columns = ['name', 'n2', 'url', 'price', 'smv', 'smv', 'acidity', 'acidity',
       'pca_1', 'pca_2']

df.drop(columns=['n2'], inplace=True)

In [20]:
df.head()

Unnamed: 0,name,url,price,smv,smv.1,acidity,acidity.1,pca_1,pca_2
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",https://www.truesake.com/collections/all/produ...,30,3.0,0.0,1.4,0.0,-0.059754,0.01232
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",https://www.truesake.com/collections/all/produ...,33,1.0,0.0,1.6,0.0,-0.030246,-0.052947
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",https://www.truesake.com/collections/all/produ...,49,0.0,0.0,1.4,0.087142,-0.104865,-0.017802
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",https://www.truesake.com/collections/all/produ...,9,3.0,0.0,1.6,0.0,0.03832,0.062917
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",https://www.truesake.com/collections/all/produ...,40,0.5,0.0,1.5,0.0,-0.061922,0.304609


In [21]:
# import distance methods
from sklearn.metrics import pairwise_distances

dist = pairwise_distances(df.iloc[:, 2:])

In [22]:
dist.shape

(322, 322)

In [23]:
dist = pd.DataFrame(data=dist, index=df['name'], columns=df['name'])

In [24]:
dist.iloc[0:5, 0:5]

name,"Dassai 45 Junmai Daiginjo ""Otter Festival""","Kikusui Junmai Ginjo ""Chrysanthemum Water""","Wakatake Onikoroshi Junmai Daiginjo ""Demon Slayer""","Shunnoten Tokubetsu Junmai ""Fishermen Cup""","Kenbishi Kuromatsu Honjozo ""Black Pine"""
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Dassai 45 Junmai Daiginjo ""Otter Festival""",0.0,3.611804,19.235658,21.001242,10.312392
"Kikusui Junmai Ginjo ""Chrysanthemum Water""",3.611804,0.0,16.032916,24.083565,7.02772
"Wakatake Onikoroshi Junmai Daiginjo ""Demon Slayer""",19.235658,16.032916,0.0,40.113272,9.02072
"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",21.001242,24.083565,40.113272,0.0,31.101904
"Kenbishi Kuromatsu Honjozo ""Black Pine""",10.312392,7.02772,9.02072,31.101904,0.0


In [None]:
# Write func to recommend 3 sakes based on selection
# clean up code into better pipeline