In [6]:
import pandas as pd
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords


## Build the Dataset

In [7]:
# Load the sample reviews CSV into df and print a summary of its columns,
# non-null counts and dtypes.
df = pd.read_csv('../data/csv/sample_cleaned_reviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  499 non-null    int64 
 1   beer_id     499 non-null    int64 
 2   username    499 non-null    object
 3   review      499 non-null    object
dtypes: int64(2), object(2)
memory usage: 15.7+ KB


In [8]:
# Create a function that lemmatizes documents and removes stop words

# Shared text-cleaning resources: a verb lemmatizer plus the English stop words,
# kept both as the list NLTK returns and as a set for membership tests.
wn = WordNetLemmatizer()
stop = stopwords.words('english')
stop_words_ = set(stop)

def black_txt(token):
    """Return True when `token` should be kept in the cleaned text.

    A token is kept only if it is not in ``stop_words_``, is not one of the
    single characters in ``string.punctuation``, and is longer than two
    characters.
    """
    if token in stop_words_:
        return False
    if token in list(string.punctuation):
        return False
    return len(token) > 2

def clean_text(text):
    """Normalize a review into a space-joined string of lemmatized tokens.

    Steps: remove apostrophes, replace every run of digits / non-word
    characters with a single space, lower-case and tokenize the result,
    lemmatize (as verbs) the tokens accepted by ``black_txt``, then filter the
    lemmas through ``black_txt`` a second time.

    Args:
        text: the raw review string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    without_apostrophes = re.sub("'", "", text)
    normalized = re.sub("(\\d|\\W)+", " ", without_apostrophes)

    lemmas = []
    for token in word_tokenize(normalized.lower()):
        if black_txt(token):
            lemmas.append(wn.lemmatize(token, pos="v"))

    # Second pass: apply the same keep-filter to the lemmatized forms.
    return " ".join(filter(black_txt, lemmas))
    

In [9]:
# Preview the first rows of df as loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,beer_id,username,review
0,0,271781,bluejacket74,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
1,3,125646,GratefulBeerGuy,0% 16 oz can. Funny story: As I finally walked...
2,4,125646,LukeGude,Classic TH NEIPA. Overflowing head and bouquet...
3,7,125646,MFMB,Pours a creamy opaque light straw yellow with ...
4,13,125646,jngrizzaffi,Pours a cloudy yellow color with a thin foamy ...


In [10]:
# Cast each review to str, run it through clean_text and lower-case the result,
# then keep only the beer_id, username and review columns.
df["review"] = df["review"].map(str).apply(clean_text).str.lower()
df = df.loc[:, ["beer_id", "username", "review"]]


Unnamed: 0,beer_id,username,review
0,271781,bluejacket74,bottle vintage bottle serve dfh snifter beer p...
1,125646,GratefulBeerGuy,funny story finally walk doors min wait line f...
2,125646,LukeGude,classic neipa overflow head bouquet strong fla...
3,125646,MFMB,pour creamy opaque light straw yellow whispy f...
4,125646,jngrizzaffi,pour cloudy yellow color thin foamy head head ...


In [20]:
# Show up to the first 15 rows of df to inspect the cleaned reviews.
df.head(15)

Unnamed: 0,beer_id,username,review
0,271781,bluejacket74,bottle vintage bottle serve dfh snifter beer p...
1,125646,GratefulBeerGuy,funny story finally walk doors min wait line f...
2,125646,LukeGude,classic neipa overflow head bouquet strong fla...
3,125646,MFMB,pour creamy opaque light straw yellow whispy f...
4,125646,jngrizzaffi,pour cloudy yellow color thin foamy head head ...
5,125646,PDOR1960,another great brew treehouse
6,125646,Lucular,pour cloudy yellow orange haze burst tropical ...
7,205644,Brutaltruth,tall boy enjoy snifter via friend work thank j...
8,205644,secondtooth,appearance pour deep black tan head little lac...
9,150672,Derek,beautiful crystal clear pour nice head good re...


## Evaluate TF-IDF Vectorization and Count Vectorization Recommendation Models

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize the TF-IDF vectorizer with its default arguments
tfidf = TfidfVectorizer()

# Construct the TF-IDF matrix by applying the fit_transform method to the cleaned reviews
tfidf_matrix = tfidf.fit_transform(df["review"])

# Get the shape of the tfidf_matrix
tfidf_matrix.shape

(499, 3486)

In [12]:
# Pairwise cosine similarity between the TF-IDF vectors of all reviews
from sklearn.metrics.pairwise import cosine_similarity

# With Y omitted, cosine_similarity compares the rows of X against themselves
cosine_sim = cosine_similarity(tfidf_matrix)


In [13]:
# Construct a reverse mapping from beer_id to the row index in df.
# Series.drop_duplicates() compares the *values* (the row indices, which are all
# unique), so it never removed repeated beer_ids. De-duplicate on the index
# labels instead, so every beer_id maps to exactly one row (its first review)
# and a lookup returns a scalar rather than a Series.
indices = pd.Series(df.index, index=df['beer_id'])
indices = indices[~indices.index.duplicated(keep='first')]

In [27]:
# Recommendation model with TF-IDF default arguments

def recommender(user_input, cosine_sim=cosine_sim, df=df, indices=indices):
    """Return the beer_ids of the five reviews most similar to a given beer's review.

    Args:
        user_input: key to look up in `indices` (a beer_id in this notebook).
        cosine_sim: square similarity matrix; row i holds the similarity of
            review i to every other review.
        df: DataFrame with a 'beer_id' column, indexed positionally via iloc.
        indices: mapping from beer_id to the row position of its review.

    Returns:
        A pandas Series of beer_id values for the (up to) five most similar
        rows, in descending order of similarity.

    Raises:
        KeyError: if `user_input` is not a key of `indices`.
    """
    try:
        idx = indices[user_input]
    except KeyError:
        raise KeyError(
            f"{user_input!r} is not a known beer_id; pass a beer_id present in `indices`"
        ) from None

    # A label that occurs several times makes the lookup return a Series;
    # fall back to the first matching row so that idx is a single position.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every row position with its similarity to the query row, leaving out
    # the query row itself. Skipping "the first sorted entry" is not reliable:
    # ties or an all-zero row can put another row in first place.
    sim_scores = [
        (row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx
    ]

    # Sort by similarity, highest first, and keep the five best matches.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    beer_indices = [row for row, _ in sim_scores[:5]]

    return df['beer_id'].iloc[beer_indices]

In [17]:
# Display the sparse TF-IDF matrix summary (shape, dtype, number of stored elements).
tfidf_matrix

<499x3486 sparse matrix of type '<class 'numpy.float64'>'
	with 24691 stored elements in Compressed Sparse Row format>

In [28]:
# NOTE(review): `indices` is keyed by beer_id, so a descriptive word such as
# "crisp" is not a valid key and raises KeyError; call this with a beer_id instead.
recommender("crisp")

KeyError: 'crisp'

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the count vectorizer with its default arguments
count = CountVectorizer()

# Construct the count matrix by applying the fit_transform method to the cleaned reviews
count_matrix = count.fit_transform(df["review"])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the count vectors of all reviews
# (with Y omitted, the rows of X are compared against themselves).
cosine_sim2 = cosine_similarity(count_matrix)

In [24]:
# Reset the index of df and construct the reverse mapping again.
# drop=True stops the old index from being inserted as an extra 'index' column,
# which also keeps this cell safe to re-run.
df = df.reset_index(drop=True)

# Keep a single row per beer_id (its first review) so that looking up a beer
# with several reviews yields a scalar position rather than a Series.
indices2 = pd.Series(df.index, index=df['beer_id'])
indices2 = indices2[~indices2.index.duplicated(keep='first')]

In [25]:
# Run recommender with the CountVectorizer similarity matrix and mapping.
# NOTE(review): `text` is not defined in any cell shown here, and the recorded
# KeyError shows it held a review string while indices2 is keyed by beer_id —
# presumably a beer_id should be passed instead; TODO confirm.

recommender(text, cosine_sim2, df, indices2)

KeyError: 'another great brew treehouse'

In [None]:
# Display the matrix produced by CountVectorizer.fit_transform on the reviews.
count_matrix