In [20]:
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
說明：
    用RoBERTa把評論資料轉成詞嵌入向量(包含情感語義等特徵)

注意事項：
    無，照順序跑下去即可
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

import sys
import torch
from transformers import RobertaTokenizer, RobertaModel, logging
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from pymongo import MongoClient
import numpy as np
from tqdm import tqdm

def printf(format, *args):
    sys.stdout.write(format % args)

In [2]:
### 檢查有沒有啟動CUDA

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
### 定義文字前處理的函數
## 1. 用nltk移除停用詞
## 2. 移除標點符號
## 3. 移除空白
## 4. 轉成小寫
stop_words = set(stopwords.words('english'))
translator = str.maketrans('', '', string.punctuation)

def preprocess_text(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if not word.lower() in stop_words]
    filtered_text = ' '.join(filtered_words)
    removePunctuation_text = filtered_text.translate(translator)
    removeSpace_text = re.sub(r'\s+', ' ', removePunctuation_text).strip()
    return removeSpace_text.lower()

In [4]:
### 定義用Roberta將文字轉成詞嵌入的函數
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

def getSentimentVector(text):
    caption = [text]
    tokens = tokenizer(caption, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    output = model(input_ids, attention_mask, output_hidden_states=True)

    ### concatenate last 4 layers
    encoded_layers = output.hidden_states[-4:]
    token_emb = torch.cat(encoded_layers, -1)
    
    average_pooling = torch.mean(token_emb, dim=1).tolist()

    return average_pooling[0]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
client = MongoClient('localhost', 27017)
db_new = client.Yelp_New
review_new = db_new.review
review_sentiment = db_new.review_sentiment

# totalReview = review_new.count_documents({})
# count = 0
bar = tqdm(total=review_new.count_documents({}), desc='Sentiment Analysis')
tempIds = review_new.find({}, no_cursor_timeout=True, batch_size=10)
for item in tempIds:
    tempText = preprocess_text(item['text'])
    item['newText'] = tempText
    item['sentiment_vector'] = getSentimentVector(tempText)
    review_sentiment.insert_one(item)
    bar.update(1)
    # count += 1
    # if count % 10000 == 0:
    #     printf('%d/%d done\n', count, totalReview)
tempIds.close()
bar.close()
# print('All done')

  return Cursor(self, *args, **kwargs)
Sentiment Analysis: 100%|██████████| 77596/77596 [58:49<00:00, 21.99it/s]
