In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import pickle

from IPython.display import display, HTML
import pandas as pd

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
wine_reviews = pd.read_csv('datasets\winemag-data-130k-v2.csv').drop(['Unnamed: 0', 'region_2'], axis=1).dropna().drop_duplicates()
wine_reviews.head()
wine_reviews.shape

(47673, 12)

In [3]:
#for memory optimization
wine_reviews_sampled = wine_reviews.sample(25000).reset_index()
wine_reviews_sampled.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,47648,France,"Concentrated citrus fruit sings on nose and palate. The body reveals a stony, mineral texture of exquisitely clean dryness. There is the slightest edge of pleasant bitterness, just like in dried lemon peel. Linear, concentrated with lots of citric backbone. Drink 2016–2025.",Hatschbourg Grand Cru,92,30.0,Alsace,Alsace,Anne Krebiehl MW,@AnneInVino,Joseph Cattin 2013 Hatschbourg Grand Cru Riesling (Alsace),Riesling,Joseph Cattin
1,42223,US,"Thick tannins obscure some of the richer flavors trying to burst through in this wine, a mix of blackberry, black olive and cedar. Full bodied and generous in every way, it finishes with a slow drag of smoke.",Las Amigas Vineyard,88,60.0,California,Carneros,Virginie Boone,@vboone,Signorello 2012 Las Amigas Vineyard Pinot Noir (Carneros),Pinot Noir,Signorello
2,40661,Australia,"The De Bortoli family does a solid job with Chardonnay—even this entry-level wine shows some complexity and finesse. Hints of smoke and graphite frame bright pineapple and lime flavors, delivered without excessive weight or richness.",DB Family Selection,85,9.0,Australia Other,Australia,Joe Czerwinski,@JoeCz,De Bortoli 2013 DB Family Selection Chardonnay (Australia),Chardonnay,De Bortoli
3,105586,Argentina,"This meaty, ripe Syrah is smoky and deep on the nose, with lusty berry aromas. The palate is tannic but supported by bold, fleshy blackberry flavors backed by maple, vanilla and burnt toast. Caramel richness makes its way onto the finish, which is rocky and tannic but tastes good. This value Syrah was built for grilled meat.",Cosecha Nocturna,88,12.0,Mendoza Province,Mendoza,Michael Schachner,@wineschach,Domiciano de Barrancas 2012 Cosecha Nocturna Syrah (Mendoza),Syrah,Domiciano de Barrancas
4,66393,Italy,"Musclebound with an earthy, spicy character to the nose. The body is fleshy but pure, offering black cherry flavors accented by tar and licorice. It finishes smooth, with just the right touch of vanilla oak. It's a polished Brunello, good for near-term drinking, but also a candidate for the cellar.",Greppone Mazzi,90,60.0,Tuscany,Brunello di Montalcino,Michael Schachner,@wineschach,Ruffino 1997 Greppone Mazzi (Brunello di Montalcino),Sangiovese,Ruffino


In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize the words
    processed_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return processed_text

wine_reviews_sampled['processed_description'] = wine_reviews_sampled['description'].apply(preprocess_text)

In [5]:
max_features = 700  # Adjust this value based on your available memory
tfidf = TfidfVectorizer(max_features=max_features)
description_matrix = tfidf.fit_transform(wine_reviews_sampled['processed_description'])

In [6]:
similarity_matrix = cosine_similarity(description_matrix)

In [7]:
def recommend_wines(wine_title, n_recommendations=5):
    # Find the index of the wine with the given title
    wine_index = wine_reviews_sampled[wine_reviews_sampled['title'] == wine_title].index[0]
    
    # Get similarity scores for the given wine
    similarity_scores = list(enumerate(similarity_matrix[wine_index]))
    # Sort the wines based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the most similar wines (excluding the input wine itself)
    most_similar_indices = [score[0] for score in similarity_scores[1:n_recommendations+1]]
    # Return the names of the most similar wines
    return wine_reviews['title'].iloc[most_similar_indices]

In [8]:
wine_reviews_sampled['title']

0                                     Joseph Cattin 2013 Hatschbourg Grand Cru Riesling (Alsace)
1                                      Signorello 2012 Las Amigas Vineyard Pinot Noir (Carneros)
2                                     De Bortoli 2013 DB Family Selection Chardonnay (Australia)
3                                   Domiciano de Barrancas 2012 Cosecha Nocturna Syrah (Mendoza)
4                                          Ruffino 1997 Greppone Mazzi  (Brunello di Montalcino)
                                                  ...                                           
24995                                                 Manos Negras 2015 Atrevida Malbec (Agrelo)
24996                                   Roblar 2013 Gold Collection Chardonnay (Sta. Rita Hills)
24997                                           Comm. G. B. Burlotto 2013 Aves  (Barbera d'Alba)
24998                           Tondré 2013 Tondre Grapefield Pinot Noir (Santa Lucia Highlands)
24999    Canoe Ridge 2012 The 

In [9]:
wine_title = 'Condado de Oriza 2006 Roble  (Ribera del Duero)'  # You can search by wine title
recommendations = recommend_wines(wine_title)
display(recommendations)

33284                                   Jorge Ordóñez & Co. 2014 Botani Moscato (Málaga)
56545                                 Celler Pasanau 2004 Finca La Planeta Red (Priorat)
1507                              Château de Gaudou 2009 Réserve Caillou Malbec (Cahors)
56156                                        Badia di Morrona 2012 N'Antia Red (Toscana)
554      Mas de Cadenet 2015 Mas Negrel Cadenet Rosé (Côtes de Provence Sainte-Victoire)
Name: title, dtype: object

In [10]:
# import pickle
#

# with open("wine_recommender.pkl", "wb") as f:
#     pickle.dump((wine_reviews_sampled['title'], similarity_matrix), f)