In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NLP Imports 
# set up spacy
import spacy

# get stopwords from NLTK
from nltk.corpus import stopwords
stopwords = stopwords.words("english")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import PCA

In [2]:
raw_sake = pd.read_json('prepared_data.json')
raw_sake.head()

Unnamed: 0,url,name,type,price,prefecture,smv,acidity,kw_word,kw_wines,kw_beer,kw_foods,description
0,https://www.truesake.com/collections/all/produ...,"Dassai 45 Junmai Daiginjo ""Otter Festival""",junmai daiginjo,30,yamaguchi,3.0,1.4,chunky,pinot noir chewy whites,ales,mushroom risotto tempura fried chicken caviar ...,this sake has an older brother daiginjo that h...
1,https://www.truesake.com/collections/all/produ...,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",junmai ginjo,33,niigata,1.0,1.6,citrus,sharp whites dry reds,ales ambers,suitable for light and stronger flavored dishe...,true sake owner beau timken recorded kikusui a...
2,https://www.truesake.com/collections/all/produ...,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",junmai daiginjo,49,shizuoka,0.0,1.4,popular,elegant reds complex whites,crisp ales,cuisines that like big white wines,talk about a nice aroma package on this ultrap...
3,https://www.truesake.com/collections/all/produ...,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",tokubetsu junmai,9,yamanashi,3.0,1.6,smooth,pinot noir slippery whites,gentle ales,none,the nose on this awesome looking catch cup is ...
4,https://www.truesake.com/collections/all/produ...,"Kenbishi Kuromatsu Honjozo ""Black Pine""",honjozo,40,hyogo,0.5,1.5,rich,deep reds fat whites,stouts,perfect for pub food salty and savory and grea...,the nose on this extremely famous honjozo from...


In [16]:
class SakeRecommender:
    
    def __init__(self, preprocessed_sake_data):
        
        self.data = preprocessed_sake_data

        
        
    def recommend_sake(self, liked_sakes: list, num_recs=5, metric='cosine') -> list:
        
        self.sake = self.data[[
                               'name', 
                               'url', 
                               'price', 
                               'smv', 
                               'acidity', 
                               'type', 
                               'prefecture', 
                            ]]

        self.sake = pd.get_dummies(data=self.sake, 
                                   columns=['type', 'prefecture'])
         
        self.pca = PCA(n_components=1)

        self.df_pca = self.pca.fit_transform(self.sake.iloc[:, 5:])
        
        self.df = self.sake[['name', 'url', 'price', 'smv', 'acidity']]
        
        self.df['pca_feats'] = self.df_pca
        
        
        
        self.liked_sakes = liked_sakes
        self.num_recs = num_recs
        self.metric = metric
    
        self.dists = pairwise_distances(self.df.iloc[:, 2:], metric=self.metric)
        self.dists = pd.DataFrame(data=self.dists, index=self.df['name'], columns=self.df['name'])

        self.sake_summed = self.dists[self.liked_sakes].sum(axis=1)
        self.sake_summed = self.sake_summed.sort_values(ascending=True)


        self.ranked_sakes = self.sake_summed.index[~self.sake_summed.index.isin(self.liked_sakes)]
        self.ranked_sakes = self.ranked_sakes.tolist()

        self.recommendations = self.ranked_sakes[:self.num_recs]

        self.df_recs = self.data.copy()
        self.df_recs['rec_label'] = np.where(self.df_recs.name.isin(self.liked_sakes), 'Liked',
                                       np.where(self.df_recs.name.isin(self.recommendations), 'Recommended',
                                               'Other'))

        self.df_recs = self.df_recs[self.df_recs['rec_label'].isin(['Liked', 'Recommended'])]
        
        self.df_recs = self.df_recs[['rec_label', 
                                    'name',
                                    'type',
                                    'price',
                                    'prefecture',
                                    'smv',
                                    'acidity',
                                    'kw_word',
                                    'kw_wines',
                                    'kw_beer',
                                    'description',
                                    'url']]
        
        

        return self.df_recs

In [17]:
s = SakeRecommender(raw_sake)

In [20]:
sake_i_like = [
    'DEN Junmai Nama',
    'Tengumai Yamahai Junmai "Dance of the Demon"'
              ]

s.recommend_sake(sake_i_like, num_recs=3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['pca_feats'] = self.df_pca


Unnamed: 0,rec_label,name,type,price,prefecture,smv,acidity,kw_word,kw_wines,kw_beer,description,url
33,Liked,"Tengumai Yamahai Junmai ""Dance of the Demon""",yamahai junmai,30,ishikawa,4.0,1.8,amber,thick reds chewy whites,ambers honey stouts,tengumai is another of the to do sakes on your...,https://www.truesake.com/collections/all/produ...
63,Liked,DEN Junmai Nama,junmai nama,29,california,3.6,2.4,lively,zesty reds crisp whites,crisp fruity ales,den batch 12the nose on this locally made craf...,https://www.truesake.com/collections/all/produ...
68,Recommended,"Shirataki Jozen Jukusei Junmai Ginjo ""Pink""",,25,niigata,3.0,1.4,smooth,pinot noir white burgundy,ales,the upgrade on one of the most drinkable sakes...,https://www.truesake.com/collections/all/produ...
118,Recommended,"Amabuki Junmai Daiginjo Kimoto ""Rhododendron""",junmai daiginjo kimoto,36,saga,5.0,1.8,complex,fat reds fruity whites,crisp ales,this brewery is famous for making sakes using ...,https://www.truesake.com/collections/all/produ...
140,Recommended,"Kamotsuru Tokusei Kinpaku ""Gold Flake""",daiginjo,18,hiroshima,2.0,1.4,strawberry,fruity reds crisp whites,ales ambers,this gold flake sake has two pieces of gold th...,https://www.truesake.com/collections/all/produ...


In [None]:
s.df.head()

In [3]:
sake = raw_sake[[
           'name', 
           'url', 
           'price', 
           'smv', 
           'acidity', 
           'type', 
           'prefecture', 
           'kw_word', 
           'kw_wines', 
           'kw_beer', 
           'kw_foods', 
           'description'
          ]]

sake.head()

Unnamed: 0,name,url,price,smv,acidity,type,prefecture,kw_word,kw_wines,kw_beer,kw_foods,description
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",https://www.truesake.com/collections/all/produ...,30,3.0,1.4,junmai daiginjo,yamaguchi,chunky,pinot noir chewy whites,ales,mushroom risotto tempura fried chicken caviar ...,this sake has an older brother daiginjo that h...
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",https://www.truesake.com/collections/all/produ...,33,1.0,1.6,junmai ginjo,niigata,citrus,sharp whites dry reds,ales ambers,suitable for light and stronger flavored dishe...,true sake owner beau timken recorded kikusui a...
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",https://www.truesake.com/collections/all/produ...,49,0.0,1.4,junmai daiginjo,shizuoka,popular,elegant reds complex whites,crisp ales,cuisines that like big white wines,talk about a nice aroma package on this ultrap...
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",https://www.truesake.com/collections/all/produ...,9,3.0,1.6,tokubetsu junmai,yamanashi,smooth,pinot noir slippery whites,gentle ales,none,the nose on this awesome looking catch cup is ...
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",https://www.truesake.com/collections/all/produ...,40,0.5,1.5,honjozo,hyogo,rich,deep reds fat whites,stouts,perfect for pub food salty and savory and grea...,the nose on this extremely famous honjozo from...


In [4]:
disabled_components = ['parser', 'tagger', 'ner']
nlp = spacy.load('en_core_web_sm', disable=disabled_components)

In [5]:
## Lemmatize!

def lemmatize(doc: str) -> str:
    
    # Set up spacy 
    disabled_components = ['parser', 'tagger', 'ner']
    nlp = spacy.load('en_core_web_sm', disable=disabled_components)
    
    # Lemmatize doc
    doc = nlp(doc)
    lemma_list = [str(tok.lemma_) for tok in doc if tok.text not in stopwords]
    lem_string = " ".join(lemma_list)
    return lem_string

In [6]:
df = sake.copy()

# df['kw_foods'] = df['kw_foods'].map(lemmatize)
# df['description'] = df['description'].map(lemmatize)

In [7]:
s = 'this is a sample'

lemmatize(s)

'sample'

In [8]:
df['text'] = df['type'] + ' ' + \
            df['prefecture'] + ' ' + \
            df['kw_word'] + ' ' + \
            df['kw_wines'] + ' ' + \
            df['kw_beer'] + ' ' + \
            df['kw_foods'] + ' ' + \
            df['description']

df.drop(columns=['type', 
                 'prefecture', 
                 'kw_word', 
                 'kw_wines', 
                 'kw_beer', 
                 'kw_foods', 
                 'description'], inplace=True)

# df['text'] = df['text'].map(lemmatize)

In [9]:
df.head()

Unnamed: 0,name,url,price,smv,acidity,text
0,"Dassai 45 Junmai Daiginjo ""Otter Festival""",https://www.truesake.com/collections/all/produ...,30,3.0,1.4,junmai daiginjo yamaguchi chunky pinot noir c...
1,"Kikusui Junmai Ginjo ""Chrysanthemum Water""",https://www.truesake.com/collections/all/produ...,33,1.0,1.6,junmai ginjo niigata citrus sharp whites dry r...
2,"Wakatake Onikoroshi Junmai Daiginjo ""Demon Sla...",https://www.truesake.com/collections/all/produ...,49,0.0,1.4,junmai daiginjo shizuoka popular elegant reds ...
3,"Shunnoten Tokubetsu Junmai ""Fishermen Cup""",https://www.truesake.com/collections/all/produ...,9,3.0,1.6,tokubetsu junmai yamanashi smooth pinot noir s...
4,"Kenbishi Kuromatsu Honjozo ""Black Pine""",https://www.truesake.com/collections/all/produ...,40,0.5,1.5,honjozo hyogo rich deep reds fat whites stouts...


In [10]:
df['text'] = df['text'].map(lemmatize)

KeyboardInterrupt: 

In [None]:
df.head()

In [None]:
df.text[1]

In [None]:
# Vectorize
v = TfidfVectorizer(max_features=1000)
text_tfidf = v.fit_transform(df['text'])
txt = text_tfidf.toarray()

# Create tfidf_df
tfidf_df = pd.DataFrame(txt, columns=v.get_feature_names())

In [None]:
tfidf_df.head()

In [None]:
df = pd.concat([df, tfidf_df], axis=1)

df.drop(columns=['text'], inplace=True)

In [None]:
df.head()

In [None]:
pca = PCA(n_components=1)

df_pca = pca.fit_transform(df.iloc[:, 5:])

In [None]:
df = df[[
        'name', 
        'url', 
        'price', 
        'smv', 
        'acidity'
        ]]

In [None]:
# df['pca_1'] = df_pca[:, 0]
# df['pca_2'] = df_pca[:, 1]

df['pca_feats'] = df_pca

In [None]:
df.head()

In [None]:
# Getting this weird magic duplicate 'name' column?

df.columns = ['name', 
              'n2', 
              'url', 
              'price', 
              'smv', 
              'smv', 
              'acidity', 
              'n2',
              'pca_feats'
             ]

df.drop(columns=['n2'], inplace=True)

In [None]:
df.head()

In [None]:
# import distance methods

dist = pairwise_distances(df.iloc[:, 2:])

In [None]:
def recommend_sake(liked_sakes: list, num_recs=5, metric='cosine') -> list:
    
    dists = pairwise_distances(df.iloc[:, 2:], metric=metric)
    dists = pd.DataFrame(data=dist, index=df['name'], columns=df['name'])
    
    sake_summed = dists[liked_sakes].sum(axis=1)
    sake_summed = sake_summed.sort_values(ascending=True)
    
    
    ranked_sakes = sake_summed.index[~sake_summed.index.isin(liked_sakes)]
    ranked_sakes = ranked_sakes.tolist()
    
    recommendations = ranked_sakes[:num_recs]
    
    df_recs = sake.copy()
    df_recs['rec_label'] = np.where(df_recs.name.isin(liked_sakes), 'Liked',
                                   np.where(df_recs.name.isin(recommendations), 'Recommended',
                                           'Other'))
    
    df_recs = df_recs[df_recs.rec_label.isin(['Liked', 'Recommended'])]
    
    return df_recs

In [None]:
sake_i_like = [
               'Dassai 45 Junmai Daiginjo "Otter Festival"', 
               'Kikusui Junmai Ginjo "Chrysanthemum Water"',
               'Shunnoten Tokubetsu Junmai "Fishermen Cup"'
              ]

recommend_sake(sake_i_like, num_recs=3)

In [None]:
recommend_sake(sake_i_like, num_recs=3)