In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import pickle

from IPython.display import display, HTML
import pandas as pd

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
wine_reviews = pd.read_csv('datasets\winemag-data-130k-v2.csv').drop(['Unnamed: 0', 'region_2'], axis=1).dropna().drop_duplicates()
wine_reviews.head()
wine_reviews.shape

(47673, 12)

In [3]:
#for memory optimization
wine_reviews_sampled = wine_reviews.sample(25000).reset_index()
wine_reviews_sampled.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,58815,France,"Attractively perfumed, this wine is still full of youthful apple and citrus acidity. It carries light flavors of almonds and tangy Seville orange zest, with the crispest aftertaste. Drink now.",Les Grenouilles Sec,86,17.0,Loire Valley,Vouvray,Roger Voss,@vossroger,Domaine du Petit Coteau 2012 Les Grenouilles Sec (Vouvray),Chenin Blanc,Domaine du Petit Coteau
1,72808,Italy,"Densely concentrated, this plush wine opens with aromas of black currant, espresso and cedar. The robust, one-dimensional palate delivers raisin, chocolate and licorice alongside velvety tannins that hold the flavors together.",Cuculaia,89,65.0,Tuscany,Cortona,Kerin O’Keefe,@kerinokeefe,Fabrizio Dionisio 2010 Cuculaia Syrah (Cortona),Syrah,Fabrizio Dionisio
2,36362,US,"Bedell continues to raise the bar for Long Island Merlot with this offering. A rich, savory nose of meat, leather spice and dark berry leads into a poised balance of savory spice, rich fruit, acids and tannins. The wine has a friendly brightness about it but overall, is restrained and integrated.",Reserve,87,48.0,New York,North Fork of Long Island,Susan Kostrzewa,@suskostrzewa,Bedell 2005 Reserve Merlot (North Fork of Long Island),Merlot,Bedell
3,60059,Spain,"This is almost exactly like the winery's excellent 2004 Gran Reserva, proof that Olabarri knows how to make this style of Tempranillo. This is smooth and concentrated, with a deep color and ripe plum, blackberry and tobacco aromas. Flavors of mossy berry, fresh prune, chocolate and vanilla come with light herbal accents. This is elegant, with aging potential. Drink now–2020.",Gran Reserva,92,32.0,Northern Spain,Rioja,Michael Schachner,@wineschach,Viña Olabarri 2005 Gran Reserva (Rioja),Tempranillo,Viña Olabarri
4,99338,US,"From a selection of the original vines at this heritage vineyard, which was planted in the early 1970s, this is densely layered with strawberry, raspberry and cherry fruit. Details are well-integrated, bringing light notes of toasted coconut and caramel. The texture and mouthfeel express the little extras that old vines, when handled properly, can deliver.",Maresh Vineyard Red Barn Blocks,95,70.0,Oregon,Dundee Hills,Paul Gregutt,@paulgwine,Kelley Fox 2014 Maresh Vineyard Red Barn Blocks Pinot Noir (Dundee Hills),Pinot Noir,Kelley Fox


In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize the words
    processed_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return processed_text

wine_reviews_sampled['processed_description'] = wine_reviews_sampled['description'].apply(preprocess_text)

In [5]:
max_features = 700  # Adjust this value based on your available memory
tfidf = TfidfVectorizer(max_features=max_features)
description_matrix = tfidf.fit_transform(wine_reviews_sampled['processed_description'])

In [6]:
similarity_matrix = cosine_similarity(description_matrix)

In [7]:
def recommend_wines(wine_title, n_recommendations=5):
    # Find the index of the wine with the given title
    wine_index = wine_reviews_sampled[wine_reviews_sampled['title'] == wine_title].index[0]
    
    # Get similarity scores for the given wine
    similarity_scores = list(enumerate(similarity_matrix[wine_index]))
    # Sort the wines based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the most similar wines (excluding the input wine itself)
    most_similar_indices = [score[0] for score in similarity_scores[1:n_recommendations+1]]
    # Return the names of the most similar wines
    return wine_reviews['title'].iloc[most_similar_indices]

In [8]:
wine_reviews_sampled['title']

0                      Domaine du Petit Coteau 2012 Les Grenouilles Sec  (Vouvray)
1                                  Fabrizio Dionisio 2010 Cuculaia Syrah (Cortona)
2                           Bedell 2005 Reserve Merlot (North Fork of Long Island)
3                                         Viña Olabarri 2005 Gran Reserva  (Rioja)
4        Kelley Fox 2014 Maresh Vineyard Red Barn Blocks Pinot Noir (Dundee Hills)
                                           ...                                    
24995               Janzen 2012 Cloudy's Vineyard Cabernet Sauvignon (Napa Valley)
24996                               Ponzi 2015 Pinot Noir Rosé (Willamette Valley)
24997          Hawks View 2010 Hawks View Vineyard Pinot Noir (Chehalem Mountains)
24998           Lucienne 2012 Lone Oak Vineyard Pinot Noir (Santa Lucia Highlands)
24999               Rideau 2015 La Encantada Vineyard Pinot Noir (Sta. Rita Hills)
Name: title, Length: 25000, dtype: object

In [9]:
wine_title = 'Condado de Oriza 2006 Roble  (Ribera del Duero)'  # You can search by wine title
recommendations = recommend_wines(wine_title)
display(recommendations)

6781                     Georges Vigouroux 2005 Château de Mercuès Red (Cahors)
57703    Château Tour des Gendres 2008 Moulin des Dames Red (Côtes de Bergerac)
65560                               Oak Ridge 2012 Big Bad Zin Zinfandel (Lodi)
17352                              Istine 2013 Vigna Istine  (Chianti Classico)
19933                    Gassier 2012 Château Beaulieu Rosé (Côtes de Provence)
Name: title, dtype: object

In [11]:
# import pickle


# with open("web_app/wine_recommender.pkl", "wb") as f:
#     pickle.dump((wine_reviews_sampled['title'], similarity_matrix), f)