In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import pickle

from IPython.display import display, HTML
import pandas as pd

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
wine_reviews = pd.read_csv('..\datasets\winemag-data-130k-v2.csv').drop(['Unnamed: 0', 'region_2'], axis=1).dropna().drop_duplicates()
wine_reviews.head()
wine_reviews.shape

(47673, 12)

In [4]:
#for memory optimization
wine_reviews_sampled = wine_reviews.sample(25000).reset_index()
wine_reviews_sampled.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,75509,US,"Deep and dense aromas of olallieberry and black plum integrate with vanilla bean and lilac on the nose of this bottling by Steve Martell, formerly of Sextant Winery. A soft mouthfeel unleashes well-integrated flavors, with lavender, pepper, baked blackberry and charcoal simmering to the surface through intense acidity. Great now, it will last many years.",Praying Mantis,93,65.0,California,Paso Robles,Matt Kettmann,@mattkettmann,Kaleidos 2013 Praying Mantis Syrah (Paso Robles),Syrah,Kaleidos
1,115981,France,"Raspberry ripeness and soft tannins lend warmth to this structured wine. It has weight and a smoky wood character, with a juicy finish. Age for 2–3 years.",Domaine de la Créa Les Cent Vignes Premier Cru,88,41.0,Burgundy,Beaune,Roger Voss,@vossroger,Louis Max 2009 Domaine de la Créa Les Cent Vignes Premier Cru (Beaune),Pinot Noir,Louis Max
2,86235,Spain,"A clear step up from the overdone 2003 Propiedad, this vintage is sweet and easy to like. There's a blast of hickory smoke on the new-oak nose, but also black fruit and molasses. The ripe palate deals chewy fruit, medium tannins and moderate length. It's a charming but blunt wine, and all in all it's probably not one to hold for too long. Drink now through 2009.",Propiedad,91,40.0,Northern Spain,Rioja,Michael Schachner,@wineschach,Palacios Remondo 2005 Propiedad (Rioja),Tempranillo,Palacios Remondo
3,37735,France,"This wine is crisp and fruity. With acidity as well as a light toast character, it is already balanced, bringing yellow and melon fruit flavors into the mineral texture. Drink from 2022.",Genevrières Premier Cru,93,160.0,Burgundy,Meursault,Roger Voss,@vossroger,Domaine Buisson Battault 2015 Genevrières Premier Cru (Meursault),Chardonnay,Domaine Buisson Battault
4,20005,Spain,"This is Cáceres' bread-and-butter red Rioja, and the '04 is fruity, light and very approachable. The nose shows some cotton candy and other bouncy red-fruit aromas, while the palate is easygoing, fleshy and balanced by a beam of juicy acidity. Along the way, you'll pull out flavors of cherry, raspberry and a finishing note of bitter coffee.",Crianza,87,15.0,Northern Spain,Rioja,Michael Schachner,@wineschach,Marqués de Cáceres 2004 Crianza (Rioja),Tempranillo,Marqués de Cáceres


In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize the words
    processed_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return processed_text

wine_reviews_sampled['processed_description'] = wine_reviews_sampled['description'].apply(preprocess_text)

In [6]:
max_features = 700  # Adjust this value based on your available memory
tfidf = TfidfVectorizer(max_features=max_features)
description_matrix = tfidf.fit_transform(wine_reviews_sampled['processed_description'])

In [7]:
similarity_matrix = cosine_similarity(description_matrix)

In [8]:
def recommend_wines(wine_title, n_recommendations=5):
    # Find the index of the wine with the given title
    wine_index = wine_reviews_sampled[wine_reviews_sampled['title'] == wine_title].index[0]
    
    # Get similarity scores for the given wine
    similarity_scores = list(enumerate(similarity_matrix[wine_index]))
    # Sort the wines based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the most similar wines (excluding the input wine itself)
    most_similar_indices = [score[0] for score in similarity_scores[1:n_recommendations+1]]
    # Return the names of the most similar wines
    return wine_reviews['title'].iloc[most_similar_indices]

In [9]:
wine_reviews_sampled['title']

0                                        Kaleidos 2013 Praying Mantis Syrah (Paso Robles)
1                 Louis Max 2009 Domaine de la Créa Les Cent Vignes Premier Cru  (Beaune)
2                                                Palacios Remondo 2005 Propiedad  (Rioja)
3                      Domaine Buisson Battault 2015 Genevrières Premier Cru  (Meursault)
4                                                Marqués de Cáceres 2004 Crianza  (Rioja)
                                               ...                                       
24995    Syncline 2012 Underwood Mountain Vineyard Grüner Veltliner (Columbia Gorge (WA))
24996                               Stonestreet 2014 Estate Chardonnay (Alexander Valley)
24997                   Château des Arnauds 2012 Cuvée des Capucins  (Lalande de Pomerol)
24998                                 Cadaretta 2014 Windthrow Red (Columbia Valley (WA))
24999                                                 Mauro Veglio 2013 Gattera  (Barolo)
Name: titl

In [10]:
wine_title = 'Condado de Oriza 2006 Roble  (Ribera del Duero)'  # You can search by wine title
recommendations = recommend_wines(wine_title)
display(recommendations)

43884                         Domaine Dujac 2009 Premier Cru  (Morey-Saint-Denis)
21235    Inkberry 2009 Mountain Estate Shiraz-Cabernet Sauvignon (Central Ranges)
25405                Jada Vineyard & Winery 2011 Jack of Hearts Red (Paso Robles)
21327                                      Antucura 2006 Calcura Red (Uco Valley)
2083      Marchiori & Barraud 2007 Cuartel 2 Marchiori Vineyard Malbec (Perdriel)
Name: title, dtype: object

In [12]:
# import pickle


# with open("../web_app/wine_recommender.pkl", "wb") as f:
#     pickle.dump((wine_reviews_sampled['title'], similarity_matrix), f)