# Modeling a Content-Based Filtering Recommender System

## Importing Data

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import joblib

In [2]:
# Load the joined wines/ratings table. Despite the .csv extension, the file is
# read as zstd-compressed (compression="zstd"); low_memory=False makes pandas
# parse the file in one pass rather than in chunks.
df = pd.read_csv('wines_and_ratings.csv', low_memory=False, compression="zstd")
df

Unnamed: 0,RatingID,UserID,Rating,Vintage,WineName,ABV,Body,Acidity,Country,RegionName,...,Harmonize3,Harmonize4,Harmonize5,Harmonize6,Harmonize7,Harmonize8,Harmonize9,Harmonize10,Harmonize11,Harmonize12
0,143,1356810,4.5,1950,Presidential Colheita Port,14.5,Very full bodied,High,Portugal,Porto,...,Hard Cheese,,,,,,,,,
1,199,1173759,5.0,1951,Pauillac Premier Grand Cru Classé,13.0,Full bodied,High,France,Pauillac,...,Game Meat,Poultry,,,,,,,,
2,348,1164877,5.0,1952,Pauillac Premier Grand Cru Classé,13.0,Full bodied,High,France,Pauillac,...,Game Meat,Poultry,,,,,,,,
3,374,1207665,5.0,1953,Saint Julien Grand Cru Classé,13.5,Full bodied,High,France,Saint Julien,...,Game Meat,Poultry,,,,,,,,
4,834,1075841,5.0,1955,Saint Julien Grand Cru Classé,14.0,Full bodied,High,France,Saint Julien,...,Game Meat,Poultry,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,21013438,1000052,4.5,N.V.,Blanc de Blancs Brut Champagne,12.5,Medium bodied,High,France,Champagne,...,Shellfish,Soft Cheese,,,,,,,,
149996,21013467,1180844,4.0,N.V.,Brut R de Ruinart Champagne,12.0,Medium bodied,High,France,Champagne,...,Shellfish,Soft Cheese,,,,,,,,
149997,21013494,1218581,3.5,N.V.,Crémant d Alsace Cuvée Julien Brut,12.5,Light bodied,High,France,Crémant d Alsace,...,Appetizer,Snack,Lean Fish,,,,,,,
149998,21013505,1106198,4.5,N.V.,Blanc de Blancs Brut Champagne,12.5,Medium bodied,High,France,Champagne,...,Shellfish,Soft Cheese,,,,,,,,


In [3]:
# Print column names, non-null counts and dtypes to see which text columns are sparse.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 37 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   RatingID     150000 non-null  int64  
 1   UserID       150000 non-null  int64  
 2   Rating       150000 non-null  float64
 3   Vintage      150000 non-null  object 
 4   WineName     150000 non-null  object 
 5   ABV          150000 non-null  float64
 6   Body         150000 non-null  object 
 7   Acidity      150000 non-null  object 
 8   Country      150000 non-null  object 
 9   RegionName   150000 non-null  object 
 10  WineryName   150000 non-null  object 
 11  Website      150000 non-null  object 
 12  Type1        150000 non-null  object 
 13  Type2        2522 non-null    object 
 14  Elaborate1   150000 non-null  object 
 15  Elaborate2   150000 non-null  object 
 16  Grapes1      150000 non-null  object 
 17  Grapes2      44538 non-null   object 
 18  Grapes3      28735 non-n

## Combining Selected Columns into a Single 'Attributes' Column

In [4]:
# Descriptive text columns that feed the content-based model, grouped by kind.
common_columns = ['WineName', 'Body', 'Acidity', 'Country', 'RegionName', 'WineryName']
type_columns = ['Type1', 'Type2']
elaborate_columns = ['Elaborate1', 'Elaborate2']
# Numbered column families: Grapes1..Grapes9 and Harmonize1..Harmonize12.
grapes_columns = [f'Grapes{n}' for n in range(1, 10)]
harmonize_columns = [f'Harmonize{n}' for n in range(1, 13)]
all_columns = [
    *common_columns,
    *type_columns,
    *elaborate_columns,
    *grapes_columns,
    *harmonize_columns,
]

# Replace missing values with empty strings, then concatenate the selected
# columns of each row (space-separated) into a single 'Attributes' document.
df = df.fillna("").assign(
    Attributes=lambda frame: frame[all_columns].astype(str).agg(' '.join, axis=1)
)
df

Unnamed: 0,RatingID,UserID,Rating,Vintage,WineName,ABV,Body,Acidity,Country,RegionName,...,Harmonize4,Harmonize5,Harmonize6,Harmonize7,Harmonize8,Harmonize9,Harmonize10,Harmonize11,Harmonize12,Attributes
0,143,1356810,4.5,1950,Presidential Colheita Port,14.5,Very full bodied,High,Portugal,Porto,...,,,,,,,,,,Presidential Colheita Port Very full bodied Hi...
1,199,1173759,5.0,1951,Pauillac Premier Grand Cru Classé,13.0,Full bodied,High,France,Pauillac,...,Poultry,,,,,,,,,Pauillac Premier Grand Cru Classé Full bodie...
2,348,1164877,5.0,1952,Pauillac Premier Grand Cru Classé,13.0,Full bodied,High,France,Pauillac,...,Poultry,,,,,,,,,Pauillac Premier Grand Cru Classé Full bodie...
3,374,1207665,5.0,1953,Saint Julien Grand Cru Classé,13.5,Full bodied,High,France,Saint Julien,...,Poultry,,,,,,,,,Saint Julien Grand Cru Classé Full bodied Hi...
4,834,1075841,5.0,1955,Saint Julien Grand Cru Classé,14.0,Full bodied,High,France,Saint Julien,...,Poultry,,,,,,,,,Saint Julien Grand Cru Classé Full bodied Hi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,21013438,1000052,4.5,N.V.,Blanc de Blancs Brut Champagne,12.5,Medium bodied,High,France,Champagne,...,Soft Cheese,,,,,,,,,Blanc de Blancs Brut Champagne Medium bodied H...
149996,21013467,1180844,4.0,N.V.,Brut R de Ruinart Champagne,12.0,Medium bodied,High,France,Champagne,...,Soft Cheese,,,,,,,,,Brut R de Ruinart Champagne Medium bodied Hi...
149997,21013494,1218581,3.5,N.V.,Crémant d Alsace Cuvée Julien Brut,12.5,Light bodied,High,France,Crémant d Alsace,...,Snack,Lean Fish,,,,,,,,Crémant d Alsace Cuvée Julien Brut Light bodie...
149998,21013505,1106198,4.5,N.V.,Blanc de Blancs Brut Champagne,12.5,Medium bodied,High,France,Champagne,...,Soft Cheese,,,,,,,,,Blanc de Blancs Brut Champagne Medium bodied H...


## Removing Duplicate Rows based on the 'Attributes' and 'WineName' Columns

In [5]:
# Collapse the ratings-level table to one row per wine. Each drop_duplicates
# call keeps the first occurrence; the index is then rebuilt as 0..n-1 so row
# labels match the row positions of the TF-IDF / similarity matrices built
# from this frame. The frame is rebound rather than mutated in place, so the
# object displayed by the previous cell is not changed behind the reader's back.
df = (
    df
    .drop_duplicates(subset=['Attributes'])
    .drop_duplicates(subset=['WineName'])
    .reset_index(drop=True)
)

# The recommender looks wines up by name, so names must be unique from here on.
assert df['WineName'].is_unique, "expected one row per WineName after de-duplication"
df

Unnamed: 0,RatingID,UserID,Rating,Vintage,WineName,ABV,Body,Acidity,Country,RegionName,...,Harmonize4,Harmonize5,Harmonize6,Harmonize7,Harmonize8,Harmonize9,Harmonize10,Harmonize11,Harmonize12,Attributes
0,143,1356810,4.5,1950,Presidential Colheita Port,14.5,Very full bodied,High,Portugal,Porto,...,,,,,,,,,,Presidential Colheita Port Very full bodied Hi...
1,199,1173759,5.0,1951,Pauillac Premier Grand Cru Classé,13.0,Full bodied,High,France,Pauillac,...,Poultry,,,,,,,,,Pauillac Premier Grand Cru Classé Full bodie...
2,374,1207665,5.0,1953,Saint Julien Grand Cru Classé,13.5,Full bodied,High,France,Saint Julien,...,Poultry,,,,,,,,,Saint Julien Grand Cru Classé Full bodied Hi...
3,1020,1147051,5.0,1955,Saint Estèphe Grand Cru Classé,12.8,Full bodied,High,France,Saint Estèphe,...,Poultry,,,,,,,,,Saint Estèphe Grand Cru Classé Full bodied H...
4,1664,1173759,5.0,1958,Barolo Tradizione,14.0,Very full bodied,High,Italy,Barolo,...,Game Meat,,,,,,,,,Barolo Tradizione Very full bodied High Ital...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
798,20587832,1138481,3.5,N.V.,Blu Prosecco Extra Dry,12.5,Very light bodied,High,Italy,Prosecco,...,Snack,Cured Meat,,,,,,,,Blu Prosecco Extra Dry Very light bodied High ...
799,20591243,1271440,4.5,N.V.,Grappa di Amarone della Valpolicella,14.5,Full bodied,High,Italy,Valpolicella,...,,,,,,,,,,Grappa di Amarone della Valpolicella Full bodi...
800,20593813,1004772,3.0,N.V.,Blanc de Blancs,12.9,Medium bodied,High,United States,Sonoma County,...,Lean Fish,,,,,,,,,Blanc de Blancs Medium bodied High United Stat...
801,20612284,1239551,4.0,N.V.,Vecchia Grappa Moscato,14.5,Full bodied,High,Brazil,Serra Gaúcha,...,,,,,,,,,,Vecchia Grappa Moscato Full bodied High Brazil...


## Converting 'Attributes' into a TF-IDF Matrix

In [6]:
# Vectorise each wine's 'Attributes' text with TF-IDF using the vectoriser's
# default settings. The result has one row per wine in df and one column per
# vocabulary term; the shape is displayed as a sanity check.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Attributes'])
tfidf_matrix.shape

(803, 2176)

## Computing Cosine Similarity between Wines based on TF-IDF Matrix

In [7]:
# Pairwise dot products between all wine vectors. TfidfVectorizer L2-normalises
# rows by default, so the linear kernel here equals cosine similarity.
# cosine_sim[i][j] is the similarity between rows i and j of df.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

## Testing the Model

In [8]:
def get_recommendations(WineName, top_n=10, wines=None, similarity=None):
    """Return the names of the wines most similar to ``WineName``.

    Parameters
    ----------
    WineName : str
        Exact value to look up in the ``'WineName'`` column.
    top_n : int, default 10
        Maximum number of recommendations to return.
    wines : pandas.DataFrame, optional
        Catalogue with a ``'WineName'`` column. Defaults to the notebook-level
        ``df``.
    similarity : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` refers to the
        ``i``-th row (by position) of ``wines``. Defaults to the
        notebook-level ``cosine_sim``.

    Returns
    -------
    pandas.Series
        ``'WineName'`` values of up to ``top_n`` other wines, ordered from
        most to least similar. The queried wine itself is never included.

    Raises
    ------
    IndexError
        If no row has ``WineName`` as its name.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    catalogue = df if wines is None else wines
    sim_matrix = cosine_sim if similarity is None else similarity

    # Look the wine up by row position so the result indexes the similarity
    # matrix correctly even if the frame's index labels are not 0..n-1.
    positions = (catalogue['WineName'] == WineName).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Wine {WineName!r} not found in the 'WineName' column")
    idx = int(positions[0])

    # Exclude the queried wine explicitly instead of assuming it sorts first:
    # another wine can tie with (or, through float rounding, exceed) the
    # self-similarity score, in which case slicing off element 0 would drop a
    # real neighbour and return the queried wine itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(sim_matrix[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    wine_indices = [pos for pos, _ in ranked[:top_n]]
    return catalogue['WineName'].iloc[wine_indices]

In [9]:
# Smoke test: request recommendations for one wine that is present in the de-duplicated frame.
get_recommendations('Vecchia Grappa Moscato')

780       Colheita Tardia Malvasia Moscato
36                                Moscatel
301                         Moscato d Asti
772                                Moscato
601                  Dulcis Moscato d Asti
196                     Espumante Moscatel
344    Moncalvina Moscato d Asti  Canelli 
311                     101 Moscato d Asti
74             Aquarela Moscatel Espumante
716        Colheita Tardia Malvasia Bianca
Name: WineName, dtype: object

## Exporting the Model

In [10]:
# Persist the similarity matrix together with the de-duplicated frame as one
# tuple; consumers must unpack it in the same (cosine_sim, df) order.
# NOTE(review): the frame is saved with all of its columns (including RatingID
# and UserID) — confirm the consumer needs more than 'WineName'.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted source.
joblib.dump((cosine_sim, df), 'content_based_recommender_model.pkl')

['content_based_recommender_model.pkl']