In [1]:
import os
import pandas as pd
import re
import spacy
from ast import literal_eval
from collections import Counter
import nltk
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rsj99\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [45]:
df = pd.read_csv('../collect_reviews/beer_scores.csv')
df.head()

Unnamed: 0,name,look,smell,taste,feel,review
0,Asahi Super Dry,4.0,3.5,3.75,4.25,11.2 oz. Bottle into pilsner glass\nLook: ligh...
1,Asahi Super Dry,3.25,3.0,3.25,3.25,Poured a 500 ml can into a pint glass. Best be...
2,Asahi Super Dry,3.75,3.75,3.5,3.5,I bought a can at the beerstore in Ontario 500...
3,Asahi Super Dry,3.75,3.25,3.25,3.25,"Small bbq at the neighbors. He knows I "" love ..."
4,Asahi Super Dry,2.25,1.5,1.75,2.0,This looks terrible. Two 11.2 ounce brown bott...


In [4]:
# 리뷰 갯수가 10개 미만인 맥주 지우기
drop_names = list(df['name'].value_counts()[df.name.value_counts() < 10].keys())
drop_index = df[df.name.str.contains('|'.join(drop_names))].index
beers = df.drop(index=drop_index)
beers

Unnamed: 0,name,look,smell,taste,feel,review
0,Asahi Super Dry,4.00,3.50,3.75,4.25,11.2 oz. Bottle into pilsner glass\nLook: ligh...
1,Asahi Super Dry,3.25,3.00,3.25,3.25,Poured a 500 ml can into a pint glass. Best be...
2,Asahi Super Dry,3.75,3.75,3.50,3.50,I bought a can at the beerstore in Ontario 500...
3,Asahi Super Dry,3.75,3.25,3.25,3.25,"Small bbq at the neighbors. He knows I "" love ..."
4,Asahi Super Dry,2.25,1.50,1.75,2.00,This looks terrible. Two 11.2 ounce brown bott...
...,...,...,...,...,...,...
4647,Coopers Best Extra Stout,3.50,3.50,3.50,3.50,"Black color with brown head. Aroma is malty, r..."
4648,Coopers Best Extra Stout,4.00,3.75,3.75,4.00,"This one's best by 2010, so rating is set to c..."
4649,Coopers Best Extra Stout,4.50,4.50,5.00,4.75,I had this one a long time ago and just recent...
4650,Coopers Best Extra Stout,4.00,4.00,4.25,4.25,"Jet black pour with tight, creamy tan head whi..."


In [5]:
beers.isna().sum()

name      0
look      0
smell     0
taste     0
feel      0
review    0
dtype: int64

In [6]:
beers.review[0]

'11.2 oz. Bottle into pilsner glass\nLook: light straw, 3 finger head, good lace and retention, nice lacing\nSmell: malt forward, sweet\nTaste: Cereal, medium bitterness, dry finish, \nFeel: Tongue tingler, excellent mouthfeel, flavor lingers\nOverall: A pleasing experience.\nPair with spicy food, sushi or a hamburger.Thursday at 11:41 PM'

In [7]:
beers.review[1]

'Poured a 500 ml can into a pint glass. Best before July 2022. A one finger white head on a clear straw. The head soon diminishes to a thin layer. Some patchy lacing.\nClean mild taste with a crisp finish.\nLight body with medium carbonation.\nOverall a clean refreshing lager with major faults.Oct 08, 2021'

In [8]:
beers.review[4]

'This looks terrible. Two 11.2 ounce brown bottles have just released their contents into a clear glass liter mug. The starchy white head, despite a deliberate pour, is ridiculous. Then there\'s the "color", one of the lightest ambers that still can be considered amber. Probably the most transparent beer you will find. There\'s only a little starchy character to smell. Only a little glass lacing rescues the visual, but nothing can salvage the lack of aroma. Taste, what taste? This is as tasteless a beer as one could find. There\'s remarkably only traces of hops and malt here. It drinks like a Coor\'s Light diluted with club soda. It\'s best feature is bringing a 5% ABV that\'s virtually undetectable, even with the lack of everything else. It is dry.Jul 01, 2021'

In [9]:
beers.name.nunique()

191

In [10]:
def text_tokenize(text):
    nlp = spacy.load('en_core_web_sm')
    stop_words = nlp.Defaults.stop_words.union('i', 'I', '\n', '\n\n')

    tokens = []
    doc = nlp(text)
    for token in doc:
        if (token.text.lower() not in stop_words) & (token.is_punct == False): # 불용어, 구두점 검출
            tokens.append(token.lemma_) # 표제어 추출하여 담기
    return tokens


In [11]:
beers['token'] = beers['review'].apply(text_tokenize)
beers.head()

Unnamed: 0,name,look,smell,taste,feel,review,token
0,Asahi Super Dry,4.0,3.5,3.75,4.25,11.2 oz. Bottle into pilsner glass\nLook: ligh...,"[11.2, oz, bottle, pilsner, glass, look, light..."
1,Asahi Super Dry,3.25,3.0,3.25,3.25,Poured a 500 ml can into a pint glass. Best be...,"[pour, 500, ml, pint, glass, well, July, 2022,..."
2,Asahi Super Dry,3.75,3.75,3.5,3.5,I bought a can at the beerstore in Ontario 500...,"[buy, beerstore, Ontario, 500ml, 5.2, interest..."
3,Asahi Super Dry,3.75,3.25,3.25,3.25,"Small bbq at the neighbors. He knows I "" love ...","[small, bbq, neighbor, know, love, fancy, beer..."
4,Asahi Super Dry,2.25,1.5,1.75,2.0,This looks terrible. Two 11.2 ounce brown bott...,"[look, terrible, 11.2, ounce, brown, bottle, r..."


In [12]:
beers.to_csv('beers_data/lemmatization_complete_beers.csv', index=False)

In [13]:
df = pd.read_csv('beers_data/lemmatization_complete_beers.csv')
df

Unnamed: 0,name,look,smell,taste,feel,review,token
0,Asahi Super Dry,4.00,3.50,3.75,4.25,11.2 oz. Bottle into pilsner glass\nLook: ligh...,"['11.2', 'oz', 'bottle', 'pilsner', 'glass', '..."
1,Asahi Super Dry,3.25,3.00,3.25,3.25,Poured a 500 ml can into a pint glass. Best be...,"['pour', '500', 'ml', 'pint', 'glass', 'well',..."
2,Asahi Super Dry,3.75,3.75,3.50,3.50,I bought a can at the beerstore in Ontario 500...,"['buy', 'beerstore', 'Ontario', '500ml', '5.2'..."
3,Asahi Super Dry,3.75,3.25,3.25,3.25,"Small bbq at the neighbors. He knows I "" love ...","['small', 'bbq', 'neighbor', 'know', 'love', '..."
4,Asahi Super Dry,2.25,1.50,1.75,2.00,This looks terrible. Two 11.2 ounce brown bott...,"['look', 'terrible', '11.2', 'ounce', 'brown',..."
...,...,...,...,...,...,...,...
4554,Coopers Best Extra Stout,3.50,3.50,3.50,3.50,"Black color with brown head. Aroma is malty, r...","['black', 'color', 'brown', 'head', 'Aroma', '..."
4555,Coopers Best Extra Stout,4.00,3.75,3.75,4.00,"This one's best by 2010, so rating is set to c...","['good', '2010', 'rating', 'set', 'current', '..."
4556,Coopers Best Extra Stout,4.50,4.50,5.00,4.75,I had this one a long time ago and just recent...,"['long', 'time', 'ago', 'recently', 'Chiang', ..."
4557,Coopers Best Extra Stout,4.00,4.00,4.25,4.25,"Jet black pour with tight, creamy tan head whi...","['Jet', 'black', 'pour', 'tight', 'creamy', 't..."


In [14]:
df['token'] = df['token'].apply(literal_eval)
df.head()

Unnamed: 0,name,look,smell,taste,feel,review,token
0,Asahi Super Dry,4.0,3.5,3.75,4.25,11.2 oz. Bottle into pilsner glass\nLook: ligh...,"[11.2, oz, bottle, pilsner, glass, look, light..."
1,Asahi Super Dry,3.25,3.0,3.25,3.25,Poured a 500 ml can into a pint glass. Best be...,"[pour, 500, ml, pint, glass, well, July, 2022,..."
2,Asahi Super Dry,3.75,3.75,3.5,3.5,I bought a can at the beerstore in Ontario 500...,"[buy, beerstore, Ontario, 500ml, 5.2, interest..."
3,Asahi Super Dry,3.75,3.25,3.25,3.25,"Small bbq at the neighbors. He knows I "" love ...","[small, bbq, neighbor, know, love, fancy, beer..."
4,Asahi Super Dry,2.25,1.5,1.75,2.0,This looks terrible. Two 11.2 ounce brown bott...,"[look, terrible, 11.2, ounce, brown, bottle, r..."


In [15]:
def token_cleaning(token):
    comp = re.compile('[^A-Za-z]') # 영어로 된 단어만 검출
    clean = []
    for t in token:
        t = comp.sub('', t)
        if len(t) > 4:
            clean.append(t.lower()) # 검출된 단어 소문자로 통합하여 저장
    return clean

df['clean_token'] = df['token'].apply(token_cleaning)
df.head()

Unnamed: 0,name,look,smell,taste,feel,review,token,clean_token
0,Asahi Super Dry,4.0,3.5,3.75,4.25,11.2 oz. Bottle into pilsner glass\nLook: ligh...,"[11.2, oz, bottle, pilsner, glass, look, light...","[bottle, pilsner, glass, light, straw, finger,..."
1,Asahi Super Dry,3.25,3.0,3.25,3.25,Poured a 500 ml can into a pint glass. Best be...,"[pour, 500, ml, pint, glass, well, July, 2022,...","[glass, finger, white, clear, straw, diminish,..."
2,Asahi Super Dry,3.75,3.75,3.5,3.5,I bought a can at the beerstore in Ontario 500...,"[buy, beerstore, Ontario, 500ml, 5.2, interest...","[beerstore, ontario, interesting, italy, japan..."
3,Asahi Super Dry,3.75,3.25,3.25,3.25,"Small bbq at the neighbors. He knows I "" love ...","[small, bbq, neighbor, know, love, fancy, beer...","[small, neighbor, fancy, fridge, large, straw,..."
4,Asahi Super Dry,2.25,1.5,1.75,2.0,This looks terrible. Two 11.2 ounce brown bott...,"[look, terrible, 11.2, ounce, brown, bottle, r...","[terrible, ounce, brown, bottle, release, cont..."


In [16]:
new_beers_df = round(df.groupby('name')[['look', 'smell', 'taste', 'feel']].mean(),2).reset_index()
new_beers_df

Unnamed: 0,name,look,smell,taste,feel
0,Aecht Schlenkerla Rauchbier Marzen,4.41,4.29,4.14,4.08
1,Aecht Schlenkerla Rauchbier Weizen,4.11,4.13,4.07,4.02
2,Amarcord Gradisca,3.19,3.00,3.12,3.11
3,Amarcord Tabachera,3.44,3.31,3.35,3.21
4,Anderson Valley Boont Amber Ale,4.04,3.89,3.95,3.83
...,...,...,...,...,...
186,Widmer Brothers Hefeweizen,3.95,3.33,3.56,3.71
187,Widmer Brothers Nelson Imperial IPA,3.96,4.00,3.92,3.88
188,Widmer Brothers Okto Festival Ale,3.54,3.30,3.42,3.40
189,Widmer Brothers Pitch Black IPA,3.96,3.59,3.73,3.61


In [17]:
token_df = df.groupby('name')['clean_token'].sum().reset_index()
new_beers_df = new_beers_df.merge(token_df, on='name')
new_beers_df.head()

Unnamed: 0,name,look,smell,taste,feel,clean_token
0,Aecht Schlenkerla Rauchbier Marzen,4.41,4.29,4.14,4.08,"[pours, slightly, brown, amber, light, carbona..."
1,Aecht Schlenkerla Rauchbier Weizen,4.11,4.13,4.07,4.02,"[wonder, flavor, profile, rauchbier, flavor, p..."
2,Amarcord Gradisca,3.19,3.0,3.12,3.11,"[thing, italy, craft, brewer, carry, purist, b..."
3,Amarcord Tabachera,3.44,3.31,3.35,3.21,"[copper, filter, little, bubbled, carbonation,..."
4,Anderson Valley Boont Amber Ale,4.04,3.89,3.95,3.83,"[draught, thick, creamy, undulating, ivory, lo..."


In [18]:
Counter(new_beers_df['clean_token'][7]).most_common(20)

[('light', 18),
 ('lager', 13),
 ('grain', 12),
 ('japan', 12),
 ('taste', 11),
 ('asahi', 11),
 ('roasted', 10),
 ('black', 9),
 ('drink', 9),
 ('flavor', 8),
 ('chocolate', 7),
 ('medium', 7),
 ('finger', 6),
 ('brown', 5),
 ('aroma', 5),
 ('bottle', 5),
 ('carbonation', 5),
 ('japanese', 5),
 ('super', 5),
 ('average', 5)]

In [19]:
# 필요없는 단어 확인중 smokey, smoky와 같이 훈연된 냄새를 가르키는 단어가 여러가지로 표현됨
# 그래 token들 바꿔줌
def cng_smok(tokens):
    cng = []
    for token in tokens:
        if 'smok' in token:
            token = 'smoke'
        cng.append(token)
    return cng
new_beers_df['clean_token'] = new_beers_df['clean_token'].apply(cng_smok)

In [20]:
# 자주 등장하는 단어 상위 20개 중 특징에 해당하지 않는 단어들을 검출함
stop_words = ['taste', 'flavor', 'bottle', 'color', 'mouthfeel', 'glass', 'lacing', 'finger',
              'little', 'finish', 'interesting', 'aftertaste', 'overall', 'slight', 'colour',
              'flavour', 'style', 'aroma', 'smell', 'review', 'leave', 'decent']

In [21]:
cleared = []
most_tokens = []
for token in new_beers_df['clean_token']:
    for t in token:
        if t in stop_words:
            continue
        else:
            cleared.append(t)
    cnt = Counter(cleared).most_common(20)
    for c in cnt:
        text, _ = c
        most_tokens.append(text)
res = Counter(most_tokens)
exp_tokens = sorted(res.items(), key=lambda x : x[1])
exp_tokens # 설명 가능한 토큰들의 집합

[('lightly', 1),
 ('different', 1),
 ('think', 1),
 ('heavy', 1),
 ('woody', 1),
 ('schlenkerla', 1),
 ('clove', 1),
 ('toffee', 1),
 ('pretty', 1),
 ('porter', 1),
 ('marzen', 2),
 ('weizen', 2),
 ('rauchbier', 2),
 ('bacon', 2),
 ('alcohol', 2),
 ('banana', 3),
 ('bodied', 3),
 ('roasted', 5),
 ('clean', 7),
 ('bread', 10),
 ('wheat', 10),
 ('black', 11),
 ('grain', 13),
 ('great', 42),
 ('citrus', 43),
 ('amber', 52),
 ('smoke', 57),
 ('malty', 57),
 ('yeast', 72),
 ('chocolate', 92),
 ('crisp', 98),
 ('fruit', 125),
 ('golden', 136),
 ('bitter', 162),
 ('orange', 173),
 ('sweetness', 176),
 ('slightly', 183),
 ('clear', 184),
 ('lager', 185),
 ('drink', 186),
 ('white', 189),
 ('bitterness', 190),
 ('medium', 190),
 ('light', 191),
 ('brown', 191),
 ('sweet', 191),
 ('carbonation', 191),
 ('caramel', 191),
 ('smooth', 191)]

In [22]:
# 빈도수가 3개 이상인 토큰만 선택
exp_token_list = []
for token in exp_tokens:
    text, cnt = token
    if cnt >= 3:
        exp_token_list.append(text)
exp_token_list

['banana',
 'bodied',
 'roasted',
 'clean',
 'bread',
 'wheat',
 'black',
 'grain',
 'great',
 'citrus',
 'amber',
 'smoke',
 'malty',
 'yeast',
 'chocolate',
 'crisp',
 'fruit',
 'golden',
 'bitter',
 'orange',
 'sweetness',
 'slightly',
 'clear',
 'lager',
 'drink',
 'white',
 'bitterness',
 'medium',
 'light',
 'brown',
 'sweet',
 'carbonation',
 'caramel',
 'smooth']

In [23]:
temp = []
for t in df.loc[df.name == 'Aecht Schlenkerla Rauchbier Marzen', 'clean_token']:
    temp.extend(t)
Counter(temp).most_common(20)

[('smoke', 61),
 ('taste', 24),
 ('light', 23),
 ('brown', 18),
 ('smoky', 18),
 ('flavor', 15),
 ('sweet', 14),
 ('aroma', 14),
 ('carbonation', 13),
 ('bottle', 12),
 ('finish', 11),
 ('marzen', 11),
 ('color', 11),
 ('bread', 10),
 ('smell', 10),
 ('caramel', 9),
 ('style', 8),
 ('smoked', 7),
 ('smooth', 7),
 ('mouthfeel', 7)]

In [24]:
# 토큰들을 통해 각 맥주별 토큰 빈도수 체크
token_count = []
for i in range(len(new_beers_df)):
    exp_token_dict = {}
    exp_token_dict = {k : 0 for k in exp_token_list}
    tokens = new_beers_df['clean_token'][i]
    for token in tokens:
        if token in exp_token_list:
            exp_token_dict[token] += 1
    token_count.append(exp_token_dict)
token_count_df = pd.DataFrame(token_count, index=new_beers_df.name)
token_count_df

Unnamed: 0_level_0,banana,bodied,roasted,clean,bread,wheat,black,grain,great,citrus,...,drink,white,bitterness,medium,light,brown,sweet,carbonation,caramel,smooth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aecht Schlenkerla Rauchbier Marzen,0,5,2,3,10,0,2,0,7,0,...,7,0,5,5,23,18,14,13,9,7
Aecht Schlenkerla Rauchbier Weizen,12,3,0,1,6,26,1,3,3,1,...,1,6,2,12,11,14,15,10,12,6
Amarcord Gradisca,0,2,0,3,0,1,0,7,2,1,...,1,6,5,5,23,0,5,10,1,1
Amarcord Tabachera,3,4,0,0,1,2,0,5,0,2,...,6,9,4,8,17,13,26,15,15,3
Anderson Valley Boont Amber Ale,0,8,1,6,4,0,0,1,3,2,...,2,9,2,13,12,3,15,13,29,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Widmer Brothers Hefeweizen,22,1,0,1,7,23,0,2,7,19,...,5,12,5,12,18,0,7,14,1,5
Widmer Brothers Nelson Imperial IPA,0,4,0,1,0,0,0,1,1,11,...,5,17,13,13,13,0,11,15,8,2
Widmer Brothers Okto Festival Ale,0,2,1,8,10,0,0,5,1,2,...,4,9,5,9,23,6,8,12,26,7
Widmer Brothers Pitch Black IPA,0,2,18,0,1,0,30,2,6,4,...,4,4,6,12,12,12,5,13,6,3


In [25]:
# 설명할 요소가 없는 맥주가 있는지 확인
(token_count_df.sum(axis=1) == 0).sum()

0

In [26]:
# 각 요소들을 가중치로 변환하기 위해 스케일러 적용
scaler = MinMaxScaler()
beers_df = pd.DataFrame(scaler.fit_transform(token_count_df),
                        columns=token_count_df.columns,
                        index = token_count_df.index)
beers_df.head()

Unnamed: 0_level_0,banana,bodied,roasted,clean,bread,wheat,black,grain,great,citrus,...,drink,white,bitterness,medium,light,brown,sweet,carbonation,caramel,smooth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aecht Schlenkerla Rauchbier Marzen,0.0,0.555556,0.105263,0.176471,0.769231,0.0,0.066667,0.0,0.777778,0.0,...,0.388889,0.0,0.294118,0.227273,0.338983,0.382979,0.209677,0.65,0.310345,0.466667
Aecht Schlenkerla Rauchbier Weizen,0.352941,0.333333,0.0,0.058824,0.461538,0.787879,0.033333,0.214286,0.333333,0.052632,...,0.055556,0.171429,0.117647,0.545455,0.135593,0.297872,0.225806,0.5,0.413793,0.4
Amarcord Gradisca,0.0,0.222222,0.0,0.176471,0.0,0.030303,0.0,0.5,0.222222,0.052632,...,0.055556,0.171429,0.294118,0.227273,0.338983,0.0,0.064516,0.5,0.034483,0.066667
Amarcord Tabachera,0.088235,0.444444,0.0,0.0,0.076923,0.060606,0.0,0.357143,0.0,0.105263,...,0.333333,0.257143,0.235294,0.363636,0.237288,0.276596,0.403226,0.75,0.517241,0.2
Anderson Valley Boont Amber Ale,0.0,0.888889,0.052632,0.352941,0.307692,0.0,0.0,0.071429,0.333333,0.105263,...,0.111111,0.257143,0.117647,0.590909,0.152542,0.06383,0.225806,0.65,1.0,0.2


In [27]:
last_beers_df = beers_df.reset_index().merge(new_beers_df, on='name')
last_beers_df.drop('clean_token', axis=1, inplace=True)
last_beers_df.head()

Unnamed: 0,name,banana,bodied,roasted,clean,bread,wheat,black,grain,great,...,light,brown,sweet,carbonation,caramel,smooth,look,smell,taste,feel
0,Aecht Schlenkerla Rauchbier Marzen,0.0,0.555556,0.105263,0.176471,0.769231,0.0,0.066667,0.0,0.777778,...,0.338983,0.382979,0.209677,0.65,0.310345,0.466667,4.41,4.29,4.14,4.08
1,Aecht Schlenkerla Rauchbier Weizen,0.352941,0.333333,0.0,0.058824,0.461538,0.787879,0.033333,0.214286,0.333333,...,0.135593,0.297872,0.225806,0.5,0.413793,0.4,4.11,4.13,4.07,4.02
2,Amarcord Gradisca,0.0,0.222222,0.0,0.176471,0.0,0.030303,0.0,0.5,0.222222,...,0.338983,0.0,0.064516,0.5,0.034483,0.066667,3.19,3.0,3.12,3.11
3,Amarcord Tabachera,0.088235,0.444444,0.0,0.0,0.076923,0.060606,0.0,0.357143,0.0,...,0.237288,0.276596,0.403226,0.75,0.517241,0.2,3.44,3.31,3.35,3.21
4,Anderson Valley Boont Amber Ale,0.0,0.888889,0.052632,0.352941,0.307692,0.0,0.0,0.071429,0.333333,...,0.152542,0.06383,0.225806,0.65,1.0,0.2,4.04,3.89,3.95,3.83


In [37]:
# 데이터 확인중 Hoegaarden이 Original로 검색되어 저장된 것을 확인하여 삭제
last_beers_df.drop(last_beers_df[last_beers_df.name == 'Original'].index, axis=0, inplace=True)

In [38]:
# 마지막 데이터 저장
last_beers_df.to_pickle('beers_data/Beers_TFIDF_ended.pkl')

In [39]:
import pickle
with open('beers_data/Beers_TFIDF_ended.pkl', 'rb') as f:
    last_beers_df = pickle.load(f)

In [40]:
matrix = last_beers_df.set_index('name').drop(['look', 'smell', 'taste', 'feel'], axis=1)
cosine_sim = cosine_similarity(matrix, matrix)

In [41]:
indices = pd.Series(data=last_beers_df.index, index=last_beers_df.name)
indices.head()

name
Aecht Schlenkerla Rauchbier Marzen    0
Aecht Schlenkerla Rauchbier Weizen    1
Amarcord Gradisca                     2
Amarcord Tabachera                    3
Anderson Valley Boont Amber Ale       4
dtype: int64

In [64]:
def get_recommendations(name, cosine_sim=cosine_sim):
    # 다른 맥주와의 유사도 가져오기
    idx = indices[name]

    # 해당 맥주와의 유사도 구하기
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 유사도에 따라 맥주 정렬
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # 가장 유사한 맥주 3개 가져오기
    sim_scores = sim_scores[1:4]

    # 가장 유사한 맥주 3개의 인덱스 가져오기
    beer_indices = [i[0] for i in sim_scores]

    return indices.iloc[beer_indices].index.tolist()

In [65]:
get_recommendations('Hoegaarden Original White Ale')

['Hitachino Nest Weizen', 'Blanche', 'Celis White']