In [29]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

from IPython.display import display, HTML
import pandas as pd

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [30]:
wine_reviews = pd.read_csv('datasets\winemag-data-130k-v2.csv').drop(['Unnamed: 0', 'region_2'], axis=1).dropna().drop_duplicates()
wine_reviews.head()
wine_reviews.shape

(47673, 12)

In [31]:
#for memory optimization
wine_reviews_sampled = wine_reviews.sample(25000).reset_index()
wine_reviews_sampled.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,32914,Spain,"Here's a highly recommended, everyday Ribera del Duero wine that starts with a full, round nose of charcoal, wild berry patch and oak. The oak, however, is tame (just three months of barrel time for this youngster) not forced, thus the cherry and raspberry fruit has vibrancy and balance.",Roble,88,10.0,Northern Spain,Ribera del Duero,Michael Schachner,@wineschach,Condado de Oriza 2006 Roble (Ribera del Duero),Tempranillo,Condado de Oriza
1,99795,Italy,"Made with organically cultivated grapes, this savory white offers alluring aromas of yellow stone fruit, pear and white spring flower. Juicy and delicious, the creamy palate doles out succulent yellow peach, golden apple, nectarine and a savory saline note. Tangy acidity lifts the creamy flavors while a hint of white almond graces the finish.",Selvabianca,91,18.0,Tuscany,Vernaccia di San Gimignano,Kerin O’Keefe,@kerinokeefe,Il Colombaio di Santa Chiara 2015 Selvabianca (Vernaccia di San Gimignano),Vernaccia,Il Colombaio di Santa Chiara
2,125810,US,"Lava Cap has made a decent version of Barbera here, simple and blueberry-toned but the wine is a tad heavy and overly oaked and bitter on the finish.",Reserve,85,20.0,California,El Dorado,Virginie Boone,@vboone,Lava Cap 2008 Reserve Barbera (El Dorado),Barbera,Lava Cap
3,38539,Italy,"Featuring Cabernet, Sangiovese, and Merlot, this New World red is big and powerful, with beautiful aromas of berry fruit, smoke, orange rind and spice. The mouth is fully enveloping and downright delicious. It finishes with a blast of black coffee and toast. Big-boned and aggressive, but still fairly friendly. Drink now or hold for up to six years.",Tassinaia,92,55.0,Tuscany,Toscana,Michael Schachner,@wineschach,Castello del Terriccio 1998 Tassinaia Sangiovese (Toscana),Sangiovese,Castello del Terriccio
4,835,France,"Attractive cherry flavors give a ripe, rounded feel that's balanced by the acidity and tannins. Black cherries burst easily from the glass, delivering a full aftertaste.",Flower Label,86,12.0,Beaujolais,Beaujolais,Roger Voss,@vossroger,Georges Duboeuf 2013 Flower Label (Beaujolais),Gamay,Georges Duboeuf


In [32]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize the words
    processed_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return processed_text

wine_reviews_sampled['processed_description'] = wine_reviews_sampled['description'].apply(preprocess_text)

In [33]:
max_features = 700  # Adjust this value based on your available memory
tfidf = TfidfVectorizer(max_features=max_features)
description_matrix = tfidf.fit_transform(wine_reviews_sampled['processed_description'])

In [34]:
similarity_matrix = cosine_similarity(description_matrix)

In [35]:
def recommend_wines(wine_title, n_recommendations=5):
    # Find the index of the wine with the given title
    wine_index = wine_reviews_sampled[wine_reviews_sampled['title'] == wine_title].index[0]
    
    # Get similarity scores for the given wine
    similarity_scores = list(enumerate(similarity_matrix[wine_index]))
    # Sort the wines based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the most similar wines (excluding the input wine itself)
    most_similar_indices = [score[0] for score in similarity_scores[1:n_recommendations+1]]
    # Return the names of the most similar wines
    return wine_reviews['title'].iloc[most_similar_indices]

In [36]:
wine_reviews_sampled['title']

0                                    Condado de Oriza 2006 Roble  (Ribera del Duero)
1        Il Colombaio di Santa Chiara 2015 Selvabianca  (Vernaccia di San Gimignano)
2                                          Lava Cap 2008 Reserve Barbera (El Dorado)
3                         Castello del Terriccio 1998 Tassinaia Sangiovese (Toscana)
4                                    Georges Duboeuf 2013 Flower Label  (Beaujolais)
                                            ...                                     
24995                                   Lenné Estate 2012 Karen's Pommard Pinot Noir
24996              Left Coast Cellars 2010 Left Bank Pinot Blanc (Willamette Valley)
24997            Katnook Estate 2014 Founder's Block Cabernet Sauvignon (Coonawarra)
24998                                    Drytown 2014 Estate Rosé (Sierra Foothills)
24999                                      Donnafugata 2016 Anthilia White (Sicilia)
Name: title, Length: 25000, dtype: object

In [38]:
wine_title = 'Condado de Oriza 2006 Roble  (Ribera del Duero)'  # You can search by wine title
recommendations = recommend_wines(wine_title)
display(recommendations)

11576     Cadaretta 2014 Southwind Red (Walla Walla Valley (WA))
23648                Cusumano 2014 Benuara Red (Terre Siciliane)
25349    Dunham 2008 Lewis Vineyard Syrah (Columbia Valley (WA))
23301        Zocker 2012 Paragon Vineyard Riesling (Edna Valley)
47236              Muga 2011 Conde de Haro Brut Sparkling (Cava)
Name: title, dtype: object