<a href="https://colab.research.google.com/github/semishen/Chinese-NLP-Practices/blob/main/08_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# load data
import pandas as pd
import numpy as np

# Location of the review workbook that the rest of the notebook analyses.
url = 'https://raw.githubusercontent.com/ywchiu/tibame_tm/master/data/google_comments.xlsx'

# Column 0 of the sheet becomes the DataFrame index.
google = pd.read_excel(io=url, index_col=0)

# Show the dimensions first, then preview the leading rows as the cell output.
print(google.shape)
google.head()

(500, 3)


Unnamed: 0,username,score,comment
0,倪依芙,5,起初找不到入口，還以為沒座位，但是原來入口在旁邊樓梯～座位區不大，但是料理很好吃～肉圓的醬微...
1,Mimi龍,4,聞名已久，今天特別來現場試試，目前改成點餐機點餐， ⋯⋯更多聞名已久，今天特別來現場試試，目...
2,林昌逸,1,非常生氣服務員的態度😠服務很糟糕，麵都還沒吃完，一起身就把麵收走，擺明趕客人😡😡😡
3,林幸蓁,5,內用與外帶有不同的點餐方式內用須至店內使用機台點菜、輸入桌號並付款外帶好像只需在入口處點餐即...
4,台灣TAXI市區叫車禮車包車約拍找小鄭,5,乾淨衛生，廁所很大在地下室，有免費飲料可以喝，拉麵愛好吃，麻辣豆腐不錯、肉粽不錯，目前吃過很...


In [None]:
# convert score to status
def score_to_status(x):
  """Map a 0-5 review score to a coarse sentiment label.

  Args:
    x: Integer score in the closed range 0-5. Built-in ``int`` and any other
      ``numbers.Integral`` value (e.g. ``numpy.int64``) are accepted.

  Returns:
    ``'good'`` when ``x > 3``, ``'soso'`` when ``x == 3``, otherwise ``'bad'``.

  Raises:
    AssertionError: if ``x`` is not an integer or lies outside 0-5.
  """
  # Local import keeps the function self-contained within its cell.
  import numbers

  # numbers.Integral covers NumPy integer scalars as well as plain int, so the
  # function also works on values taken straight out of an ndarray.
  assert isinstance(x, numbers.Integral), 'x is not an int'
  assert 0 <= x <= 5, 'x is out of range 0-5'

  if x > 3:
    return 'good'
  if x == 3:
    return 'soso'
  return 'bad'


# Label every review by passing its numeric score through score_to_status,
# storing the result in a new 'status' column of the same DataFrame.
google['status'] = google['score'].map(score_to_status)
# Preview the first rows, now including the new column.
google.head()

Unnamed: 0,username,score,comment,status
0,倪依芙,5,起初找不到入口，還以為沒座位，但是原來入口在旁邊樓梯～座位區不大，但是料理很好吃～肉圓的醬微...,good
1,Mimi龍,4,聞名已久，今天特別來現場試試，目前改成點餐機點餐， ⋯⋯更多聞名已久，今天特別來現場試試，目...,good
2,林昌逸,1,非常生氣服務員的態度😠服務很糟糕，麵都還沒吃完，一起身就把麵收走，擺明趕客人😡😡😡,bad
3,林幸蓁,5,內用與外帶有不同的點餐方式內用須至店內使用機台點菜、輸入桌號並付款外帶好像只需在入口處點餐即...,good
4,台灣TAXI市區叫車禮車包車約拍找小鄭,5,乾淨衛生，廁所很大在地下室，有免費飲料可以喝，拉麵愛好吃，麻辣豆腐不錯、肉粽不錯，目前吃過很...,good


In [None]:
# Count how many reviews fall under each status label.
google['status'].value_counts()

good    239
bad     201
soso     60
Name: status, dtype: int64

In [None]:
# Restrict the data to reviews labelled 'good' or 'bad'; every other status is dropped.
is_good_or_bad = google['status'].isin(['good', 'bad'])
google_good_bad = google.loc[is_good_or_bad]
google_good_bad.shape

(440, 4)

In [None]:
# word segmentation and build a corpus
import jieba

# Tokenise each comment with jieba; every comment becomes one list of tokens,
# which is the input format handed to Gensim later on.
corpus = [list(jieba.cut(comment)) for comment in google_good_bad['comment']]
# Uncomment to inspect the first few tokenised comments:
# corpus[:5]


In [None]:
# build a word2vec model 
from gensim.models import word2vec

# params
seed = 42            # RNG seed handed to Word2Vec
sg = 0               # training algorithm: 1 = skip-gram, otherwise CBOW
window_size = 10     # context window passed as `window`
vector_dim = 500     # dimensionality of each word vector
min_count = 1        # keep every token, even those seen only once
# `workers` must be a positive thread count. The previous value of -1 is NOT a
# "use all cores" switch in gensim: no worker threads get started, so the
# vectors are never trained and keep their random initial values.
# A single worker is used because gensim documents workers=1 as a requirement
# for deterministic results when a seed is supplied.
workers = 1
epochs = 1000        # number of passes over the corpus
sample = 1e-5        # down-sampling threshold for high-frequency words

# NOTE: `size` and `iter` are the gensim 3.x keyword names, matching the rest
# of this notebook (gensim 4 renamed them to `vector_size` and `epochs`).
model = word2vec.Word2Vec(
  corpus,
  min_count=min_count,
  size=vector_dim,
  workers=workers,
  iter=epochs,
  window=window_size,
  sg=sg,
  seed=seed,
  sample=sample
)

In [None]:
# check vocabulary of model
# To list every vocabulary entry, uncomment the loop below:
# for item in model.wv.vocab:
#     print(item)
# Number of distinct tokens stored in the model's vocabulary.
len(model.wv.vocab)

3321

In [None]:
# Look up the embedding of a single word and confirm its dimensionality.
wordvec = model.wv['好吃']
wordvec.shape

(500,)

In [None]:
# check similarity of selected word
# Query through model.wv: calling most_similar directly on the Word2Vec model
# is deprecated in gensim 3.x (slated for removal in gensim 4.0).
for item in model.wv.most_similar('好吃'):
    print(item)

('冒出', 0.15061572194099426)
('少見', 0.14769664406776428)
('居然', 0.1376318782567978)
('髒', 0.1347464770078659)
('盛裝', 0.13122522830963135)
('無法', 0.13087119162082672)
('不建議', 0.12777474522590637)
('愣住', 0.12751369178295135)
('社會', 0.12421494722366333)
('粒粒', 0.12154917418956757)


  
  if np.issubdtype(vec.dtype, np.int):


In [None]:
# sentence to vector in good, bad
# Each tokenised comment is represented by the mean of its word vectors.
vecs = []
tags = []
for tag, s in zip(google_good_bad['status'], corpus):
  vec = np.zeros(vector_dim)
  cnt = 0
  for w in s:
    # Membership is tested on model.wv; `w in model` relies on a Word2Vec
    # method that gensim 3.x marks as deprecated (slated for removal in 4.0).
    if w in model.wv:
      vec += model.wv.get_vector(w)
      cnt += 1
  # A sentence with no in-vocabulary token is skipped together with its tag,
  # so the feature rows and the labels stay aligned.
  if cnt > 0:
    vecs.append(vec / cnt) # use average to represent sentence
    tags.append(tag)

# sentence vector
x = np.array(vecs)
print(x.shape)

# target
y = np.array(tags)
print(y.shape)

(440, 500)
(440,)


  


In [None]:
# split data in train, test
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentence vectors for testing; the fixed random_state
# keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
for features in (train_x, test_x):
  print(features.shape)

(352, 500)
(88, 500)


In [None]:
# SVM
from sklearn.svm import SVC
# Fit a support vector classifier with default settings, then label the
# held-out vectors.
clf = SVC().fit(train_x, train_y)  # fit() returns the fitted estimator itself
pred_y = clf.predict(test_x)

In [None]:
# model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix
# Fraction of held-out reviews whose predicted label matches the true one.
accuracy_score(y_true=test_y, y_pred=pred_y)

0.8295454545454546

In [None]:
# Rows are true labels and columns are predicted labels, both ordered as in
# class_order.
class_order = ['good', 'bad']
cm = confusion_matrix(y_true=test_y, y_pred=pred_y, labels=class_order)
print(*class_order)
print(cm)

good bad
[[39  9]
 [ 6 34]]
