# Experimenting different Models for Document Similarity of wine reviews

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...


## 1. Read the data and split to train and test sets

In [2]:
data = pd.read_csv('wine_data.csv')

split = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=42)
for train_index, test_index in split.split(data, data["variety"]):
    train = data.loc[train_index]
    test = data.loc[test_index]

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

## 2. Function for Preprocessing the data

In [8]:
stopword_list = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def process_description(description):
    description = re.sub('[^a-zA-Z]', ' ', description)
    description = description.lower()
    description_words = description.split()
    description_words = [word for word in description_words if word not in stopword_list]
    description_words = [lemmatizer.lemmatize(word) for word in description_words]
    return ' '.join(description_words)

train['description'] = train['description'].apply(process_description)
test['description'] = test['description'].apply(process_description)

# Experimenting Different Models

### First, lets define the evaluation function to calculate the accuracy metric to compare different models

In [9]:
def calculate_accuracy(train, test, recommendation_indices):
    train_varieties = train['variety'].to_numpy()
    test_variety = test['variety'].to_numpy()
    nearest_varieties = train_varieties[recommendation_indices]
    match = 0
    accuracy = 0
    for i in range(len(test_variety)):
        for j in range(5):
            if test_variety[i] == nearest_varieties[i][j]:
                match += 1
                break
        #match = match/5        
        accuracy += match
        match = 0
    return accuracy/len(test_variety)

## Next, we need to vectorize the data before we can calculate similarity metrics

In [10]:
vectorizer = TfidfVectorizer(max_features=1000, max_df=0.9, min_df=2)
train_vectors = vectorizer.fit_transform(train['description'])
test_vectors = vectorizer.transform(test['description'])

## 1. Nearest Neighbours (using Manhattan Distance)

In [11]:
# Step 1: Use NearestNeighbors based on Manhattan distance
neighbors = NearestNeighbors(n_neighbors=5, metric='manhattan')
neighbors.fit(train_vectors)

# Step 2: Find the 5 nearest neighbors for each wine in the test dataset
distances, indices = neighbors.kneighbors(test_vectors)

accuracy_knn = calculate_accuracy(train, test, indices)

print('Accuracy of recommendations based on KNN:', accuracy_knn)

Accuracy of recommendations based on KNN: 0.26288879420536854


## 2. Euclidean Distance

In [12]:
# Step 1: Calculate Euclidean distance
euclidean_dist_matrix = euclidean_distances(test_vectors, train_vectors)

# Step 2: Find the indices of top 5 recommendations
top_5_indices_euclidean = np.argsort(euclidean_dist_matrix, axis=1)[:, :5]

# Step 3: Find the accuracy of recommendations
accuracy_euclidean = calculate_accuracy(train, test, top_5_indices_euclidean)

print('Accuracy of recommendations based on Euclidean Distance:', accuracy_euclidean)

Accuracy of recommendations based on Euclidean Distance: 0.6254793353216873


## 3. Cosine Similarity

In [13]:
# Step 1: Calculate Cosine Similarity
cosine_sim_matrix = cosine_similarity(test_vectors, train_vectors)

# Step 2: Find the indices of top 5 recommendations
top_5_indices_cosine = np.argsort(-cosine_sim_matrix, axis=1)[:, :5]

# Step 3: Find the accuracy of recommendations
accuracy_cosine = calculate_accuracy(train, test, top_5_indices_cosine)

print('Accuracy of recommendations based on Cosine Similarity:', accuracy_cosine)

Accuracy of recommendations based on Cosine Similarity: 0.8035790370685982


## 4. Vectorization using word2vec

In [14]:
from gensim.models import Word2Vec

# Step1: Data Preprocessing for Word2Vec

model = Word2Vec(sentences=(pd.concat([train['description'], test['description']])), vector_size=1000, window=5, min_count=1, workers=4)

def document_vector_word2vec(doc, model):
    doc = [word for word in doc if word in model.wv.key_to_index]
    if not doc:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

word2vec_train_vectors = np.array([document_vector_word2vec(doc, model) for doc in train['description']])
word2vec_test_vectors = np.array([document_vector_word2vec(doc, model) for doc in test['description']])

# Step 2: Calculate Euclidean distance
cosine_sim_matrix_word2vec = cosine_similarity(word2vec_test_vectors, word2vec_train_vectors)

# Step 3: Find the indices of top 5 recommendations
top_5_indices_cosine_word2vec = np.argsort(-cosine_sim_matrix_word2vec, axis=1)[:, :5]

# Step 4: Find the accuracy of recommendations
accuracy_cosine_word2vec = calculate_accuracy(train, test, top_5_indices_cosine_word2vec)

print('Accuracy of recommendations based on Word2Vec vectors:', accuracy_cosine_word2vec)

Accuracy of recommendations based on Word2Vec vectors: 0.5155517682147422


### View results to explain the evaluation metric for different models

In [15]:
nearest_neighbors_df = nearest_neighbors_df = pd.DataFrame(top_5_indices_cosine, index=test.index, columns=[f'Nearest {i+1}' for i in range(5)])

test_new = test.reset_index()
train_new = train.reset_index()

n1 = nearest_neighbors_df.copy()
n1['test_variety'] = test_new['variety']

n1 = pd.merge(n1, train_new[['index','variety']], left_on='Nearest 1', right_on=['index'], how='left')
n1.rename(columns={'variety':'n1_var'}, inplace=True)
n1.drop('index', axis=1, inplace=True)

n1 = pd.merge(n1, train_new[['index','variety']], left_on='Nearest 2', right_on=['index'], how='left')
n1.rename(columns={'variety':'n2_var'}, inplace=True)
n1.drop('index', axis=1, inplace=True)

n1 = pd.merge(n1, train_new[['index','variety']], left_on='Nearest 3', right_on=['index'], how='left')
n1.rename(columns={'variety':'n3_var'}, inplace=True)
n1.drop('index', axis=1, inplace=True)

n1 = pd.merge(n1, train_new[['index','variety']], left_on='Nearest 4', right_on=['index'], how='left')
n1.rename(columns={'variety':'n4_var'}, inplace=True)
n1.drop('index', axis=1, inplace=True)

n1 = pd.merge(n1, train_new[['index','variety']], left_on='Nearest 5', right_on=['index'], how='left')
n1.rename(columns={'variety':'n5_var'}, inplace=True)
n1.drop('index', axis=1, inplace=True)

n1

Unnamed: 0,Nearest 1,Nearest 2,Nearest 3,Nearest 4,Nearest 5,test_variety,n1_var,n2_var,n3_var,n4_var,n5_var
0,58187,49423,49248,39889,76363,Red Blend,Red Blend,Red Blend,Rhône-style Red Blend,Rhône-style Red Blend,Rhône-style Red Blend
1,16672,9061,47038,62808,55782,Riesling,Riesling,Riesling,Riesling,Riesling,Riesling
2,70626,81524,63569,18154,35963,Chardonnay,Chardonnay,Sauvignon Blanc,Chardonnay,Chardonnay,Chardonnay
3,30345,29368,7225,76434,52830,Red Blend,Red Blend,Red Blend,Bordeaux-style Red Blend,Red Blend,Bordeaux-style Red Blend
4,11305,83787,53928,11327,50180,Bordeaux-style Red Blend,Bordeaux-style Red Blend,Bordeaux-style Red Blend,Red Blend,Red Blend,Bordeaux-style Red Blend
...,...,...,...,...,...,...,...,...,...,...,...
4689,56436,82828,34965,67159,71625,Pinot Noir,Bordeaux-style Red Blend,Pinot Noir,Pinot Noir,Zinfandel,Pinot Noir
4690,31404,42479,68909,11942,93,Riesling,Merlot,Riesling,Riesling,Riesling,Riesling
4691,12685,34925,12341,23182,44752,Pinot Noir,Pinot Noir,Pinot Noir,Pinot Noir,Pinot Noir,Pinot Noir
4692,41341,84332,29464,13324,10798,Bordeaux-style Red Blend,Red Blend,Red Blend,Red Blend,Red Blend,Red Blend
