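# Personalized Shopping Assistant

Collects product reviews from Naver Shopping and scores each candidate product against user-defined criteria with a Korean NLI model, so that products can be compared on the points the shopper cares about.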

## 1. Setup

In [None]:
%%shell
#@title Install requirements
# NOTE: the %%shell cell magic must be the very first line of the cell;
# IPython does not recognise a cell magic that follows other lines.

# Reference: https://pgh268400.tistory.com/286
# This cell only needs to be run once per runtime.
# Code change - "The reason is that the last Ubuntu update supports chromium driver just via snap."
# A recent Ubuntu update made the Chrome driver installable only through snap,
# so this cell uses the workaround below, which installs it without snap.
# Source : https://colab.research.google.com/drive/1cbEvuZOhkouYLda3RqiwtbM-o9hxGLyC
# Source2 : https://stackoverflow.com/questions/75155063/selenium-use-chrome-on-colab-got-unexpectedly-exited

# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into its own keyring so the signed-by entries above resolve
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the install cannot stall or abort
apt-get update
apt-get install -y chromium chromium-driver

In [None]:
!pip install -q selenium transformers kiwipiepy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.9/384.9 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.1 MB/s[0m eta

In [None]:
import time
import easydict
import pandas as pd
from kiwipiepy import Kiwi
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Notebook-wide settings, read through attribute access (cfg.device, cfg.model_name).
cfg = easydict.EasyDict(
    # Use the first GPU when one is present; otherwise fall back to the CPU so
    # that moving the model to cfg.device does not fail on GPU-less runtimes.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu',
    # Identifier handed to the transformers from_pretrained() loaders.
    model_name = 'yongsun-yoon/electra-ko-base-nli'
)

## 2. Run

In [None]:
def load_driver():
    """Create a headless Chrome WebDriver.

    The driver executable is not passed explicitly: Selenium 4.10 removed the
    positional executable-path argument of ``webdriver.Chrome``, so passing
    ``'chromedriver'`` positionally collides with ``options`` there. Leaving it
    out lets Selenium locate the driver itself, which also works on older
    releases whose default executable path is ``chromedriver``.

    Returns:
        The ``webdriver.Chrome`` instance configured with the flags below.
    """
    options = webdriver.ChromeOptions()

    # Flags are applied in the order listed.
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        'start-maximized',
        'disable-infobars',
        '--disable-extensions',
        '--disable-browser-side-navigation',
        '--disable-gpu',
        '--disable-default-apps',
        '--disable-translate',
        '--disable-notifications',
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)


def collect_reviews(product_id, driver, max_pages=10, wait_seconds=1):
    """Collect review texts for one product from its Naver Shopping catalog page.

    Opens the catalog page, clicks through the review pagination links one page
    at a time and gathers the text of every ``<p>`` element whose class starts
    with ``reviewItems_text``.

    Args:
        product_id: Catalog identifier interpolated into the catalog URL.
        driver: Selenium WebDriver used to load and navigate the page.
        max_pages: Maximum number of review pages to visit.
        wait_seconds: Pause after each click before reading the page source.

    Returns:
        list of str: review texts in page order. Collection stops early when a
        pagination link beyond the first page is missing.

    Raises:
        Whatever ``driver.find_element`` raises when the link for page 1 cannot
        be found (the review pagination is not on the page at all).
    """
    product_url = f'https://search.shopping.naver.com/catalog/{product_id}'
    driver.get(product_url)

    reviews = []
    for page in range(1, max_pages + 1):
        page_xpath = f"//a[@data-nclick='N=a:rev.page,r:{page}']"

        if page == 1:
            # The first link must exist; let find_element raise if it does not.
            page_btn = driver.find_element(By.XPATH, page_xpath)
        else:
            # find_elements returns an empty list instead of raising, which
            # lets products with fewer than max_pages pages finish cleanly.
            matches = driver.find_elements(By.XPATH, page_xpath)
            if not matches:
                break
            page_btn = matches[0]

        page_btn.click()
        # Fixed pause before reading the page source after the click.
        time.sleep(wait_seconds)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        reviews.extend(node.text for node in soup.select('p[class^="reviewItems_text"]'))

    return reviews


def split_reviews(reviews):
    """Flatten reviews into a single list of sentence strings.

    Every review is passed to ``split_into_sents`` of the module-level ``kiwi``
    object, and the ``text`` of each returned sentence is kept, in order.

    Args:
        reviews: Iterable of review strings.

    Returns:
        list of str: the sentences of all reviews, concatenated in input order.
    """
    return [
        sentence.text
        for review in reviews
        for sentence in kiwi.split_into_sents(review)
    ]


def evaluate_reviews(model, tokenizer, reviews, criteria, batch_size=16):
    """Score a set of reviews against each criterion with a sequence classifier.

    The reviews are split into sentences with ``split_reviews``. For every
    criterion, each (sentence, criterion) pair is classified by ``model``; a
    sentence predicted as class 0 adds one point and a sentence predicted as
    class 2 subtracts one. The total is divided by the number of original
    reviews (not by the number of sentences).

    Args:
        model: Classifier called as ``model(**inputs)`` whose output exposes
            ``logits``; its first parameter determines the device used.
        tokenizer: Callable that encodes paired text batches into tensors.
        reviews: List of review strings.
        criteria: Iterable of criterion strings, paired with every sentence.
        batch_size: Number of sentences per forward pass.

    Returns:
        list of float: one score per criterion, in the order of ``criteria``.
        Every score is 0.0 when ``reviews`` is empty.
    """
    num_reviews = len(reviews)
    if num_reviews == 0:
        # Nothing to score; avoids dividing by zero below.
        return [0.0 for _ in criteria]

    device = next(model.parameters()).device
    sentences = split_reviews(reviews)

    points = []
    # Inference only: skip autograd bookkeeping regardless of how the caller
    # configured the model.
    with torch.no_grad():
        for criterion in criteria:
            # NOTE(review): an earlier revision built a templated hypothesis
            # (the criterion prefixed with a "this product" phrase) but never
            # passed it to the tokenizer. The bare criterion is what the model
            # has always received, and that behavior is kept here.
            point = 0

            for start in range(0, len(sentences), batch_size):
                batch_reviews = sentences[start:start + batch_size]
                batch_criterion = [criterion] * len(batch_reviews)
                inputs = tokenizer(batch_reviews, batch_criterion, padding=True, truncation='only_first', max_length=128, return_tensors='pt')
                inputs = inputs.to(device)

                preds = model(**inputs).logits.argmax(dim=-1)
                # Presumably class 0 is entailment and class 2 is contradiction
                # for this checkpoint -- TODO confirm against model.config.id2label.
                point += int((preds == 0).sum()) - int((preds == 2).sum())

            points.append(point / num_reviews)

    return points

In [None]:
# Shared objects used by the helpers above: the browser session passed to
# collect_reviews, and the Kiwi instance that split_reviews reads as a
# module-level global for sentence splitting.
driver = load_driver()
kiwi = Kiwi()

In [None]:
# Load the tokenizer and the sequence classifier named by cfg.model_name.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name)

# Inference set-up: evaluation mode, no gradient tracking on the parameters,
# then move to the configured device. Binding the final result to `_` keeps
# the cell from echoing the model.
model.eval()
model.requires_grad_(False)
_ = model.to(cfg.device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/365 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/238k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/741k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/988 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [None]:
# Catalog IDs of the products to compare; each one is interpolated into the
# Naver Shopping catalog URL by collect_reviews.
candidates = [
    '20174570219',
    '28692516563',
    '33957955618',
]

In [None]:
# Statements each product is scored against; every one is paired with each
# review sentence by evaluate_reviews. English glosses are given on the right.
criterions = [
    '용량이 크다.',        # "The capacity is large."
    '설거지가 편하다.',    # "It is easy to wash up."
    '소음이 없다.',        # "There is no noise."
    '디자인이 예쁘다.',    # "The design is pretty."
    '사용하기 쉽다.',      # "It is easy to use."
]

In [None]:
# Score every candidate product on every criterion.
rows = []
for product_id in candidates:
    reviews = collect_reviews(product_id, driver)
    points = evaluate_reviews(model, tokenizer, reviews, criterions)
    # One record per product, keyed by criterion text.
    rows.append(dict(zip(criterions, points)))
    print(f'{product_id} done')

# One row per product (labelled by its ID), one column per criterion.
results = pd.DataFrame(rows, index=candidates)

20174570219 done
28692516563 done
33957955618 done


In [None]:
# Rows are product IDs and columns are criteria. Each value is
# (sentences predicted as class 0 - sentences predicted as class 2)
# divided by the number of reviews collected for that product.
results

Unnamed: 0,용량이 크다.,설거지가 편하다.,소음이 없다.,디자인이 예쁘다.,사용하기 쉽다.
20174570219,0.54,-0.745,0.12,0.275,-0.1
28692516563,0.315,-0.395,0.165,0.32,0.16
33957955618,0.15,-0.295,0.14,0.41,0.17
