In [1]:
import pandas as pd
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords


## Build the Dataset

In [2]:
df = pd.read_csv('../data/csv/sample_cleaned_reviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      499 non-null    object
 1   beer_id   499 non-null    int64 
 2   username  499 non-null    object
 3   review    499 non-null    object
dtypes: int64(1), object(3)
memory usage: 15.7+ KB


In [3]:
# Create a function that lemmatizes documents and removes stop words

stop = stopwords.words('english')
stop_words_ = set(stopwords.words('english'))
wn = WordNetLemmatizer()

def black_txt(token):
    return  token not in stop_words_ and token not in list(string.punctuation)  and len(token)>2   

def clean_text(text):
    clean_text = []
    clean_text2 = []
    text = re.sub("'", "", text)
    text = re.sub("(\\d|\\W)+", " ", text)
    clean_text = [wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) if black_txt(word)]
    clean_text2 = [word for word in clean_text if black_txt(word)]
    return " ".join(clean_text2)
    

In [4]:
df.head()

Unnamed: 0,name,beer_id,username,review
0,Motorbreath Imperial Stout,271781,bluejacket74,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
1,Haze,125646,GratefulBeerGuy,0% 16 oz can. Funny story: As I finally walked...
2,Haze,125646,LukeGude,Classic TH NEIPA. Overflowing head and bouquet...
3,Haze,125646,MFMB,Pours a creamy opaque light straw yellow with ...
4,Haze,125646,jngrizzaffi,Pours a cloudy yellow color with a thin foamy ...


In [5]:
df["review"] = df["review"].map(str).apply(clean_text)
df["review"] = df["review"].str.lower()
df.head()

Unnamed: 0,name,beer_id,username,review
0,Motorbreath Imperial Stout,271781,bluejacket74,bottle vintage bottle serve dfh snifter beer p...
1,Haze,125646,GratefulBeerGuy,funny story finally walk doors min wait line f...
2,Haze,125646,LukeGude,classic neipa overflow head bouquet strong fla...
3,Haze,125646,MFMB,pour creamy opaque light straw yellow whispy f...
4,Haze,125646,jngrizzaffi,pour cloudy yellow color thin foamy head head ...


In [12]:
# Group allthe reviews for each beer into 1 column. Essentially making a bag of word for each beer
df = df.groupby(['name','beer_id'])['review'].apply(','.join).reset_index()

In [13]:
df.head()

Unnamed: 0,name,beer_id,review
0,Airdale Dark And Stormy,47606,tap beachwood pour dark brown black colour pro...
1,Backyard Porter,74580,bottle stfun appearance opaque black short las...
2,Barrel Aged Cockfight Farmhouse Ale,156241,tap cbc day vanilla farmhouse funk hint bourbo...
3,Barrel Aged The Jones Dog,104824,batch pour thick dark viscous black color thin...
4,Battle Point Stout,209991,serve tap pour deep dark ruby ting black color...


## Evaluate TF-IDF Vectorization and Count Vectorization Recommendation Models

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Intializ TF-IDF vecotorizer
tfidf = TfidfVectorizer()

# Construct the required TF-IDF matrix by applying the fit_transform method
tfidf_matrix = tfidf.fit_transform(df["review"])

# Get the shape of the tfidf_matrix
tfidf_matrix.shape

(72, 3486)

In [24]:
print(tfidf_matrix)

  (0, 1563)	0.030369967725686377
  (0, 1479)	0.025520481814393373
  (0, 3095)	0.018146110370031318
  (0, 3036)	0.0246512134781097
  (0, 1158)	0.025520481814393373
  (0, 2197)	0.025520481814393373
  (0, 2895)	0.0246512134781097
  (0, 1926)	0.02649224113816107
  (0, 1283)	0.019801727566816696
  (0, 2446)	0.032210995385737747
  (0, 737)	0.028865738600065558
  (0, 608)	0.032210995385737747
  (0, 1233)	0.01893245923053302
  (0, 3263)	0.0246512134781097
  (0, 297)	0.020273308395678886
  (0, 224)	0.0246512134781097
  (0, 1363)	0.023146984352488884
  (0, 2838)	0.0246512134781097
  (0, 867)	0.030369967725686377
  (0, 2903)	0.018529919962670022
  (0, 1726)	0.032210995385737747
  (0, 662)	0.027593930995918888
  (0, 885)	0.042611913384875015
  (0, 244)	0.030369967725686377
  (0, 214)	0.023864864617608
  :	:
  (71, 2404)	0.0888737752834459
  (71, 495)	0.11653344925740236
  (71, 1196)	0.040636581425753465
  (71, 3010)	0.0888737752834459
  (71, 2443)	0.057263289742074244
  (71, 3161)	0.08887377528344

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

#Initialize Count Vectorizer and create matrix
count = CountVectorizer()

# Construct the required CV matrix by applying the fit_transform method
count_matrix = count.fit_transform(df["review"])

In [26]:
print(count_matrix)

  (0, 3052)	3
  (0, 264)	1
  (0, 2354)	4
  (0, 795)	15
  (0, 426)	3
  (0, 319)	6
  (0, 640)	1
  (0, 2389)	1
  (0, 1568)	1
  (0, 2934)	8
  (0, 2820)	1
  (0, 1774)	5
  (0, 3044)	4
  (0, 1444)	8
  (0, 2884)	1
  (0, 1711)	5
  (0, 1182)	1
  (0, 1901)	4
  (0, 566)	14
  (0, 1486)	5
  (0, 2580)	8
  (0, 630)	11
  (0, 2002)	1
  (0, 231)	1
  (0, 2982)	1
  :	:
  (71, 1721)	1
  (71, 3457)	2
  (71, 2865)	2
  (71, 2603)	1
  (71, 584)	5
  (71, 1228)	1
  (71, 1738)	1
  (71, 718)	2
  (71, 1960)	1
  (71, 2944)	1
  (71, 2247)	1
  (71, 1438)	3
  (71, 1282)	1
  (71, 602)	1
  (71, 2509)	1
  (71, 2264)	1
  (71, 1924)	1
  (71, 2613)	2
  (71, 1854)	1
  (71, 1296)	1
  (71, 223)	1
  (71, 2381)	1
  (71, 1432)	1
  (71, 1245)	1
  (71, 2475)	1


In [18]:
# Compute the dot product
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [10]:
# Construct a reverse mapping of indices and beerid
indices = pd.Series(df.index, index=df['name'])

In [11]:
indices.head()

beer_id
271781    0
125646    1
125646    2
125646    3
125646    4
dtype: int64

In [10]:
# Recommendation model with TFIDF default arguments

def recommender(user_input, cosine_sim=cosine_sim):
    
    recommended_beers = []
    
    # Get the index of the beer that matches the user input
    idx = indices[indices == 'beer_id'].index[0]
    
    # Get the pairwise similarity scores of all beers with that description
    # convert it into a list od tuples
    sim_scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    #Sort the beer based on the cosine similarity score
    #sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    
    #Get the scores of the 5 most similar beers
    top_5_beers = list(sim_scores.iloc[1:6].index)
    
    #Get beer indices
    for i in top_5_beers:
        recommended_beers.append(list(df.index)[i])
    
    return recommended_beers

In [21]:
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
recommender("crisp")

  res_values = method(rvalues)


IndexError: index 0 is out of bounds for axis 0 with size 0

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

#Initialize Count Vectorizer and create matrix
count = CountVectorizer()

# Construct the required CV matrix by applying the fit_transform method
count_matrix = count.fit_transform(df["review"])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [24]:
# Reset index of df and construct reverse mapping again
df = df.reset_index()

indices2 = pd.Series(df.index, index=df['beer_id'])

In [25]:
# Run recommender with CV parameters

recommender(text, cosine_sim2, df, indices2)

KeyError: 'another great brew treehouse'

In [None]:
count_matrix