In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
wine_reviews = pd.read_csv('datasets\winemag-data-130k-v2.csv').drop(['Unnamed: 0', 'region_2'], axis=1).dropna().drop_duplicates()
wine_reviews.head()
wine_reviews.shape

(47673, 12)

In [27]:
#for memory optimization
wine_reviews_sampled = wine_reviews.sample(25000).reset_index()
wine_reviews_sampled.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,47372,US,Drew is single-handedly making a case for the ...,Perli Vineyard,93,40.0,California,Mendocino Ridge,Virginie Boone,@vboone,Drew 2011 Perli Vineyard Syrah (Mendocino Ridge),Syrah,Drew
1,95177,US,Dried sage and a hint of saddle leather highli...,Carlisle Vineyard,91,34.0,California,Russian River Valley,Virginie Boone,@vboone,Novy 2014 Carlisle Vineyard Zinfandel (Russian...,Zinfandel,Novy
2,49479,US,"Earth, mushroom and forest floor intrigue from...",Sangiacomo Vineyard,92,59.0,California,Sonoma Coast,Virginie Boone,@vboone,Sojourn 2015 Sangiacomo Vineyard Pinot Noir (S...,Pinot Noir,Sojourn
3,85848,US,"Lovely delineation here, the berry/cherry frui...",Ciel du Cheval Vineyard,91,35.0,Washington,Red Mountain,Paul Gregutt,@paulgwine,:Nota Bene 2007 Ciel du Cheval Vineyard Syrah ...,Syrah,:Nota Bene
4,81254,US,"Dry and layered in crisp melon, lime and lemon...",Magnolia Lane,89,17.0,California,Sonoma Valley,Virginie Boone,@vboone,Kunde 2013 Magnolia Lane Sauvignon Blanc (Sono...,Sauvignon Blanc,Kunde


In [14]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize the words
    processed_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return processed_text

wine_reviews_sampled['processed_description'] = wine_reviews_sampled['description'].apply(preprocess_text)

In [15]:
max_features = 700  # Adjust this value based on your available memory
tfidf = TfidfVectorizer(max_features=max_features)
description_matrix = tfidf.fit_transform(wine_reviews_sampled['processed_description'])

In [16]:
similarity_matrix = cosine_similarity(description_matrix)

In [17]:
def recommend_wines(wine_title, n_recommendations=5):
    # Find the index of the wine with the given title
    wine_index = wine_reviews_sampled[wine_reviews_sampled['title'] == wine_title].index[0]
    
    # Get similarity scores for the given wine
    similarity_scores = list(enumerate(similarity_matrix[wine_index]))
    # Sort the wines based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the most similar wines (excluding the input wine itself)
    most_similar_indices = [score[0] for score in similarity_scores[1:n_recommendations+1]]
    # Return the names of the most similar wines
    return wine_reviews['title'].iloc[most_similar_indices]

In [26]:
wine_reviews_sampled['title']

0        Les Vins de Vienne 2010 La Sillote Red (Vacque...
1        Jean-Luc Colombo 2015 La Redonne White (Côtes ...
2           Michele Chiarlo 2015 Le Orme  (Barbera d'Asti)
3        Leah Jørgensen Cellars 2013 Clos Rogue Valley ...
4        Greenwood Ridge 2013 Estate Bottled Syrah (Men...
                               ...                        
24995    Monte Tondo 2011 Foscarin Slavinus  (Soave Cla...
24996    MacMurray Estate Vineyards 2013 Winemaker's Bl...
24997    Small Vines 2014 Estate Grown Chardonnay (Sono...
24998    Barton & Guestier 2007 Thomas Barton Réserve  ...
24999                  Artimino 2012 Riserva  (Carmignano)
Name: title, Length: 25000, dtype: object

In [24]:
wine_title = 'Santi 2008 Proemio  (Amarone della Valpolicella Classico)'  # You can search by wine title
recommendations = recommend_wines(wine_title)
print(recommendations)

41804    Firriato 2012 Santagostino Baglio Sorìa Red (T...
65424    AntoLin Cellars 2010 Estate Malbec (Yakima Val...
63616    Côte Bonneville 2011 Estate Bottled DuBrul Vin...
4039                        Fattori 2015 Runcaris  (Soave)
17867                 Durigutti 2012 Reserva Red (Mendoza)
Name: title, dtype: object
