In [None]:
# !pip install transformers
# !pip install torch
# !pip install sentence_transformers
# !pip install textblob

In [1]:
from transformers import BertModel, BertTokenizer
from torch.optim import Adam
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import datetime

  from .autonotebook import tqdm as notebook_tqdm


### 原始評論(利用bert原始的分詞方法)(會把每一個字都拆開)

In [72]:
df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

In [None]:
# 開始測量(爬取時間)
startime = datetime.datetime.now()

# 初始化 RoBERTa-wwm-ext 模型和分詞器
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 評論文本
sentences = list(df_filtered.loc[:, '綜合評論'])

# 初始化一個空的 list 用於儲存飯店資訊
hotel_info_list = []

# 獲取每個文本的嵌入向量，並儲存對應的飯店資訊
embeddings = []
for idx, sentence in enumerate(sentences):
    tokens = tokenizer.tokenize(sentence)  # 分詞
    inputs = tokenizer(sentence, return_tensors="pt")

    # 檢查序列長度是否超過 512
    if len(inputs["input_ids"][0]) > 512:
        print("The sentence is too long. Skipping...")
        continue

    with torch.no_grad():
        outputs = model(**inputs)
    embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    hotel_info_list.append(df_filtered.iloc[idx].to_dict())

# 儲存飯店資訊與 embeddings
df_hotel_info = pd.DataFrame(hotel_info_list)
df_hotel_info.to_csv('hotel_info_roberta.csv', index=False)
np.save('embeddings_roberta.npy', embeddings)

# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)

#### 利用迴圈跑

In [5]:
import numpy as np
import pandas as pd
import os
import datetime
from transformers import BertModel, BertTokenizer
import torch
import datetime

df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

# 初始化 RoBERTa-wwm-ext 模型和分詞器
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 定義每個批次的大小
batch_size = 10000

# 計算需要進行的批次數
n_batches = (len(df_filtered) // batch_size) + 1;n_batches

# 開始進行批次處理
for batch_idx in range(n_batches):
    print(f"Processing batch {batch_idx + 1} of {n_batches}...")
    
    # 開始測量(時間)
    startime = datetime.datetime.now()
    
    start_idx = batch_idx * batch_size
    end_idx = (batch_idx + 1) * batch_size

    # 評論文本
    sentences = list(df_filtered.loc[start_idx:end_idx, '正評'])

    # 初始化一個空的 list 用於儲存飯店資訊
    hotel_info_list = []

    # 獲取每個文本的嵌入向量，並儲存對應的飯店資訊
    embeddings = []
    for idx, sentence in enumerate(sentences):
        if isinstance(sentence, str):
            tokens = tokenizer.tokenize(sentence)  # 分詞
            inputs = tokenizer(sentence, return_tensors="pt")
        else:
            print(f"Sentence at index {idx} is not a string. Skipping...")
            continue

        # 檢查序列長度是否超過 512
        if len(inputs["input_ids"][0]) > 512:
            print("The sentence is too long. Skipping...")
            continue

        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
        hotel_info_list.append(df_filtered.iloc[idx].to_dict())

    # 讀取原有的飯店資訊和 embeddings
    if os.path.exists('hotel_info_roberta.csv') and os.path.exists('embeddings_roberta.npy'):
        df_hotel_info_old = pd.read_csv('hotel_info_roberta.csv')
        embeddings_old = np.load('embeddings_roberta.npy')
    else:
        df_hotel_info_old = pd.DataFrame()
        embeddings_old = np.array([]).reshape(0,768)

    # 儲存飯店資訊與 embeddings
    df_hotel_info_new = pd.DataFrame(hotel_info_list)
    df_hotel_info = pd.concat([df_hotel_info_old, df_hotel_info_new], ignore_index=True)
    df_hotel_info.to_csv('hotel_info_roberta.csv', index=False)

    embeddings_new = np.array(embeddings)
    embeddings = np.vstack([embeddings_old, embeddings_new])
    np.save('embeddings_roberta.npy', embeddings)
    
    # 結束測量
    endtime = datetime.datetime.now()

    # 輸出結果
    print("執行時間：", endtime - startime)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing batch 1 of 110...
Sentence at index 34 is not a string. Skipping...
Sentence at index 36 is not a string. Skipping...
Sentence at index 43 is not a string. Skipping...
Sentence at index 45 is not a string. Skipping...
Sentence at index 60 is not a string. Skipping...
Sentence at index 69 is not a string. Skipping...
Sentence at index 90 is not a string. Skipping...
Sentence at index 95 is not a string. Skipping...
Sentence at index 101 is not a string. Skipping...
Sentence at index 106 is not a string. Skipping...
Sentence at index 107 is not a string. Skipping...
Sentence at index 120 is not a string. Skipping...
Sentence at index 133 is not a string. Skipping...
Sentence at index 138 is not a string. Skipping...
Sentence at index 139 is not a string. Skipping...
Sentence at index 142 is not a string. Skipping...
Sentence at index 144 is not a string. Skipping...
Sentence at index 145 is not a string. Skipping...
Sentence at index 148 is not a string. Skipping...
Sentence a

Sentence at index 1212 is not a string. Skipping...
Sentence at index 1213 is not a string. Skipping...
Sentence at index 1214 is not a string. Skipping...
Sentence at index 1215 is not a string. Skipping...
Sentence at index 1216 is not a string. Skipping...
Sentence at index 1217 is not a string. Skipping...
Sentence at index 1218 is not a string. Skipping...
Sentence at index 1219 is not a string. Skipping...
Sentence at index 1220 is not a string. Skipping...
Sentence at index 1221 is not a string. Skipping...
Sentence at index 1222 is not a string. Skipping...
Sentence at index 1223 is not a string. Skipping...
Sentence at index 1224 is not a string. Skipping...
Sentence at index 1225 is not a string. Skipping...
Sentence at index 1226 is not a string. Skipping...
Sentence at index 1227 is not a string. Skipping...
Sentence at index 1228 is not a string. Skipping...
Sentence at index 1229 is not a string. Skipping...
Sentence at index 1230 is not a string. Skipping...
Sentence at 

Sentence at index 2325 is not a string. Skipping...
Sentence at index 2329 is not a string. Skipping...
Sentence at index 2340 is not a string. Skipping...
Sentence at index 2350 is not a string. Skipping...
Sentence at index 2353 is not a string. Skipping...
Sentence at index 2355 is not a string. Skipping...
Sentence at index 2358 is not a string. Skipping...
Sentence at index 2367 is not a string. Skipping...
Sentence at index 2368 is not a string. Skipping...
Sentence at index 2370 is not a string. Skipping...
Sentence at index 2377 is not a string. Skipping...
Sentence at index 2379 is not a string. Skipping...
Sentence at index 2382 is not a string. Skipping...
Sentence at index 2385 is not a string. Skipping...
Sentence at index 2387 is not a string. Skipping...
Sentence at index 2399 is not a string. Skipping...
Sentence at index 2403 is not a string. Skipping...
Sentence at index 2406 is not a string. Skipping...
Sentence at index 2407 is not a string. Skipping...
Sentence at 

Sentence at index 3199 is not a string. Skipping...
Sentence at index 3202 is not a string. Skipping...
Sentence at index 3214 is not a string. Skipping...
Sentence at index 3222 is not a string. Skipping...
Sentence at index 3232 is not a string. Skipping...
Sentence at index 3234 is not a string. Skipping...
Sentence at index 3239 is not a string. Skipping...
Sentence at index 3247 is not a string. Skipping...
Sentence at index 3264 is not a string. Skipping...
Sentence at index 3272 is not a string. Skipping...
Sentence at index 3273 is not a string. Skipping...
Sentence at index 3275 is not a string. Skipping...
Sentence at index 3279 is not a string. Skipping...
Sentence at index 3290 is not a string. Skipping...
Sentence at index 3303 is not a string. Skipping...
Sentence at index 3343 is not a string. Skipping...
Sentence at index 3348 is not a string. Skipping...
Sentence at index 3356 is not a string. Skipping...
Sentence at index 3363 is not a string. Skipping...
Sentence at 

Sentence at index 4003 is not a string. Skipping...
Sentence at index 4006 is not a string. Skipping...
Sentence at index 4023 is not a string. Skipping...
Sentence at index 4045 is not a string. Skipping...
Sentence at index 4054 is not a string. Skipping...
Sentence at index 4057 is not a string. Skipping...
Sentence at index 4140 is not a string. Skipping...
Sentence at index 4148 is not a string. Skipping...
Sentence at index 4155 is not a string. Skipping...
Sentence at index 4161 is not a string. Skipping...
Sentence at index 4169 is not a string. Skipping...
Sentence at index 4172 is not a string. Skipping...
Sentence at index 4173 is not a string. Skipping...
Sentence at index 4182 is not a string. Skipping...
Sentence at index 4188 is not a string. Skipping...
Sentence at index 4219 is not a string. Skipping...
Sentence at index 4228 is not a string. Skipping...
Sentence at index 4246 is not a string. Skipping...
Sentence at index 4247 is not a string. Skipping...
Sentence at 

KeyboardInterrupt: 

#### 讀取並進行篩選

In [67]:
def get_similar_hotels(df, embeddings, filter_dict, sentence, top_n):
    
    # 篩選
    for key, value in filter_dict.items():
        if key in df.columns:
            operation, target_value = value
            if operation == "==":
                df = df[df[key] == target_value]
            elif operation == "!=":
                df = df[df[key] != target_value]
            elif operation == ">":
                df = df[df[key] > target_value]
            elif operation == "<":
                df = df[df[key] < target_value]
            elif operation == ">=":
                df = df[df[key] >= target_value]
            elif operation == "<=":
                df = df[df[key] <= target_value]
    
    embeddings_filtered = embeddings[df.index.values]

    # 新的文本
    tokens = tokenizer.tokenize(sentence)  # 分詞
    inputs = tokenizer(sentence, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
    new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

    # 計算與新文本的相似性
    similarities = cosine_similarity(np.vstack((embeddings_filtered, new_embedding)))

    # 建立新的 DataFrame，儲存相似度與對應的飯店名稱
    df_similarity = pd.DataFrame({'飯店名稱': df['飯店名稱'], '相似度': similarities[-1, :-1]})

    # 以飯店名稱分組，計算每間飯店的平均相似度
    df_avg_similarity = df_similarity.groupby('飯店名稱').mean()

    # 取出平均相似度最高的前 top_n 間飯店
    top_hotels = df_avg_similarity.sort_values(by='相似度', ascending=False).head(top_n)
    
    return top_hotels

# 讀取 DataFrame
df_hotel_info = pd.read_csv('hotel_info_roberta.csv')

# 讀取 embeddings
embeddings = np.load('embeddings_roberta.npy')

# 建立篩選條件字典
filter_dict = {'整體評分': (">", 7.0)}

# 使用函數
top_hotels = get_similar_hotels(df_hotel_info, embeddings, filter_dict, "房間好", 5)
print(top_hotels)

                           相似度
飯店名稱                          
朝日民宿 - Peng's Family  0.672762
亞哥之家                  0.667231
九份輝明民宿                0.663794
墾丁海園別館Hai Yuan Inn    0.662487
百事達國際飯店               0.658002


### 測試自行斷詞後的結果

In [8]:
df = pd.read_csv('0_10000.csv', header=0)

In [4]:
import ast

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

sentences_tokenized = [eval(d) for d in df.loc[:, "綜合評論_ws"].values]

# 獲取每個文本的嵌入向量
embeddings = []
sentences = []  # 新建一個列表來保存被處理的句子

for sentence in sentences_tokenized:
    
    inputs = tokenizer.encode_plus(sentence, is_split_into_words=True, return_tensors="pt")
    
    # 檢查序列長度是否超過 512
    if len(inputs["input_ids"][0]) > 512:
        print("The sentence is too long. Skipping...")
        continue
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings.append(outputs.last_hidden_state.mean(dim = 1).numpy().flatten())
    sentences.append(sentence)  # 將被處理的句子加到列表中

# 新的文本
new_sentence = "房間大"
new_tokens = ['房間', '大'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 50 個文本的索引
top50_indices = np.argsort(similarities[-1][:-1])[:-51:-1]

# 印出最相似的 50 個文本
for i, index in enumerate(top50_indices):
    print(f"相似的文本 (索引 {index}): {sentences[index]}, 相似度: {similarities[-1][index]}")

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The sentence is too long. Skipping...
The sentence is too long. Skipping...
相似的文本 (索引 9302): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 7713): ['房間', '算', '大'], 相似度: 0.9639410376548767
相似的文本 (索引 6559): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 6710): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 6730): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 3851): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 7572): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 6796): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 9294): ['房間', '偏', '小'], 相似度: 0.927994966506958
相似的文本 (索引 4188): ['房間', '稍', '小'], 相似度: 0.9203976392745972
相似的文本 (索引 6951): ['乾淨', '房間', '大'], 相似度: 0.9122157096862793
相似的文本 (索引 1642): ['房間', '太', '小'], 相似度: 0.900887131690979
相似的文本 (索引 7985): ['房間', '大小'], 相似度: 0.8921487331390381
相似的文本 (索引 1403): ['房間', '大小'], 相似度: 0.8921487331390381
相似的文本 (索引 8195): ['房間', '大小'], 相似度: 0.8921487331390381
相似的文本 (索引 6233): ['便宜', '房間', '大'], 相似度: 0.89132845401763

In [5]:
# 新的文本
new_sentence = "早餐好吃"
new_tokens = ['早餐', '好吃'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 50 個文本的索引
top50_indices = np.argsort(similarities[-1][:-1])[:-51:-1]

# 印出最相似的 50 個文本
for i, index in enumerate(top50_indices):
    print(f"相似的文本 (索引 {index}): {sentences[index]}, 相似度: {similarities[-1][index]}")

相似的文本 (索引 4343): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 1529): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 6567): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 7509): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4348): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 6958): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 6491): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4098): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 2429): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4827): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4190): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 6052): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 5181): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 6429): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 4300): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 9561): ['早餐', '超', '好吃'], 相似度: 0.9607133865356445
相似的文本 (索引 1245): ['早餐', '不', '好吃'], 相似度: 0.9587073922157288
相似的文本 (索引 9445): ['早餐', '很', '好吃', '呦'],

##### 結果不是很好

##### 無法有效判斷語意的相似度

##### 這個現象的出現是由於您在計算評論與新評論之間的相似度時，採用了cosine similarity。這種相似度衡量的是兩個向量之間的角度，即它們的方向是否相似，而不是他們的長度或大小
##### 如果您希望找到與新評論在語義上更相似的評論，一種可能的方法是使用一種能夠捕捉語義相似度的相似度度量，比如Word Mover's Distance。Word Mover's Distance是一種在詞嵌入空間中度量文本之間距離的方法，它可以捕捉詞與詞之間的相似性，並利用這種相似性來度量文本之間的距離。

##### 另一種可能的方法是，不僅僅依賴於評論的平均詞向量來表示評論。例如，您可以使用一些更複雜的方法來獲得評論的向量表示，比如使用Doc2Vec模型，或者將BERT模型中不同層的輸出合併起來，這可能會獲得更豐富的表示。這樣，即使評論的長度不同，也能得到更好的結果。

#### 使用Gensim的Doc2Vec模型

In [24]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 分詞後評論
sentences_tokenized = [eval(d) for d in df.loc[:, "綜合評論_ws"].values]

# 準備訓練數據(轉成模型會吃的樣子)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences_tokenized)]

# 訓練 Doc2Vec 模型
model = Doc2Vec(documents, vector_size=50, min_count=2, epochs=40)

# 现在，我们可以用模型获取句子的向量表示
embeddings = [model.infer_vector(doc) for doc in sentences_tokenized]

# 对新的句子执行相同的操作
new_sentence = "早餐不好"
new_embedding = model.infer_vector(new_sentence.split())

# 将嵌入向量列表转换为 numpy 数组，以便我们可以使用 cosine_similarity 函数
embeddings_np = np.array(embeddings)

# 计算新的句子和已有句子的相似度
similarities = cosine_similarity(embeddings_np, new_embedding.reshape(1, -1))

# 以降序排列相似度
top_indices = np.argsort(similarities, axis=0)[::-1]

# 输出最相似的 10 个句子
for i, index in enumerate(top_indices[:10]):
    print(f"第 {i+1} 相似的文本 (索引 {index}): {sentences_tokenized[index[0]]}, 相似度: {similarities[index[0]][0]}")

第 1 相似的文本 (索引 [283]): ['住', '頂樓', '海景房', '真的', '是', '很', '賭', '運氣', '跟', '天氣', '的', '房型', '如果', '天氣', '好', '無敵', '海景', '第一', '排', '真的', '很', '美', '可惜', '當天', '入住', '外圍', '環流', '下雨天', '天氣', '非常', '糟糕', '也', '因為', '下雨天', '這', '間', '房型', '的', '缺點', '完全', '顯現', '雨水', '打', '在', '頂樓', '鐵皮屋', '聲音', '非常', '大', '其實', '外面', '沒有', '到', '很', '大', '的', '雨', '但', '在', '房', '內', '卻', '感覺', '是', '滂沱大雨', '整', '晚', '睡', '不', '好', '房', '內', '有', '小', '蟲', '床', '上', '有', '螞蟻', '不', '知道', '是', '不', '是', '因為', '下雨', '所以', '都', '跑出來', '了', '是', '我們', '自己', '運氣', '不', '好', '遇到', '雨天', '所以', '雨天', '在', '這', '間', '房', '內', '就', '變', '的', '比', '其它', '房型', '值', '還要', '低', '雖然', '隔天', '早上', '雨', '停', '了', '但', '照片', '都', '是', '黃黃', '的', '海水', '很', '可惜'], 相似度: 0.13586002588272095
第 2 相似的文本 (索引 [83]): ['入住', '三', '次', '了', '每', '次', '都', '是', '兩', '晚', '以上', '覺得', '不錯', '房間', '細節', '的', '地方', '可以', '在', '加強', '喔'], 相似度: 0.12522174417972565
第 3 相似的文本 (索引 [123]): ['整體', '還', '不錯', '床', '很', '舒服', '整體', '配置', '還', '不錯'

#### 使用BERT的不同層输出

In [25]:
sentences_tokenized_ = [" ".join(eval(d)) for d in df_.loc[0:50, "綜合評論_ws"].values]

In [28]:
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext', output_hidden_states=True)

embeddings = []
for sentence in sentences_tokenized_:
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # 获取最后四层的输出
    last_four_layers = outputs.hidden_states[-4:]
    # 将它们合并起来
    embedding = torch.mean(torch.stack(last_four_layers), dim=0).mean(dim=1).numpy()
    embeddings.append(embedding.squeeze())  # 添加这个改动

new_tokens = ['早餐', '不好'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
last_four_layers = outputs.hidden_states[-4:]
new_embedding = torch.mean(torch.stack(last_four_layers), dim=0).mean(dim=1).numpy()

embeddings_np = np.array(embeddings)

similarities = cosine_similarity(embeddings_np, new_embedding.reshape(1, -1))
top_indices = np.argsort(similarities, axis=0)[::-1]

for i, index in enumerate(top_indices[:10]):
    print(f"第 {i+1} 相似的文本 (索引 {index}): {sentences_tokenized_[index[0]]}, 相似度: {similarities[index[0]][0]}")


Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


第 1 相似的文本 (索引 [45]): 早餐 超 豐盛 超, 相似度: 0.6999987363815308
第 2 相似的文本 (索引 [3]): 沒有 附 早餐, 相似度: 0.6805572509765625
第 3 相似的文本 (索引 [22]): 滿意, 相似度: 0.5698429346084595
第 4 相似的文本 (索引 [23]): 沒有 髒, 相似度: 0.558742105960846
第 5 相似的文本 (索引 [35]): 地 上 有 頭髮 不 夠 乾淨, 相似度: 0.5520568490028381
第 6 相似的文本 (索引 [39]): 房間 升級 房間 及 衛浴 空間 舒適 早餐 每 天 都 是 培根 起司 口味, 相似度: 0.5247654914855957
第 7 相似的文本 (索引 [40]): 草地 夠 大 風 好 大, 相似度: 0.5237064957618713
第 8 相似的文本 (索引 [17]): 地點 方便 停車位 多 早餐 也 很 不錯 補菜 的 速度 也 很 快 早餐 沒有 醬瓜 房間 太 昏暗, 相似度: 0.5115199089050293
第 9 相似的文本 (索引 [1]): 整體 上 都 不錯 無 飲水機 有點 不 方便, 相似度: 0.5094053745269775
第 10 相似的文本 (索引 [8]): 有 桌遊 可以 玩, 相似度: 0.5080175399780273


### 觀察自行分詞與bert內建分詞的差異

In [11]:
new_sentence = "房間小"

# inputs = tokenizer(new_sentence, return_tensors="pt")
new_tokens = tokenizer.tokenize(new_sentence)
inputs = tokenizer(new_tokens, return_tensors="pt")

print(new_tokens)
print(inputs)

new_tokens = ['房間', '小'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")

print(new_tokens)
print(inputs)

['房', '間', '小']
{'input_ids': tensor([[ 101, 2791,  102],
        [ 101, 7279,  102],
        [ 101, 2207,  102]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}
['房間', '小']
{'input_ids': tensor([[ 101, 2791, 7279, 2207,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


#### 嘗試只保留有意義的詞性

In [6]:
df = pd.read_csv('0_10000.csv', header=0)

In [11]:
stop_pos = set(['Nep', 'Nh', 'Nb'])  # 這 3 種詞性不保留

# 分詞後評論
sentences_tokenized = [eval(d) for d in df.loc[:, "綜合評論_ws"].values]

# 分詞後詞性
pos_tags = [eval(d) for d in df.loc[:, "綜合評論_pos"].values]

short_sentence = []  # 放過濾後的句子

for sentence, sentence_pos_tags in zip(sentences_tokenized, pos_tags):

    short_sentence_sentence = []  # 存储这个句子过滤后的词（不带词性）
    
    for word_ws, word_pos in zip(sentence, sentence_pos_tags):
        
        # 只留名詞和動詞
        is_N_or_V = word_pos.startswith("V") or word_pos.startswith("N")

        # 去掉名詞裡的某些詞性
        is_not_stop_pos = word_pos not in stop_pos

        if is_N_or_V and is_not_stop_pos:
            
            short_sentence_sentence.append(f"{word_ws}")

    short_sentence.append(short_sentence_sentence)

# print(sentences_tokenized[0:3])
# print(short_sentence[0:3])

# 去除空的列表
short_sentence = [sentence for sentence in short_sentence if sentence]  

### 利用篩選詞性後的結果做 roberta

In [10]:
import ast

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 獲取每個文本的嵌入向量
embeddings = []
sentences = []  # 新建一個列表來保存被處理的句子

for sentence in short_sentence:
    
    inputs = tokenizer.encode_plus(sentence, is_split_into_words=True, return_tensors="pt")
    
    # 檢查序列長度是否超過 512
    if len(inputs["input_ids"][0]) > 512:
        print("The sentence is too long. Skipping...")
        continue
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings.append(outputs.last_hidden_state.mean(dim = 1).numpy().flatten())
    sentences.append(sentence)  # 將被處理的句子加到列表中

# 新的文本
new_sentence = "房間大"
new_tokens = ['房間', '大'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 50 個文本的索引
top50_indices = np.argsort(similarities[-1][:-1])[:-51:-1]

# 印出最相似的 50 個文本
for i, index in enumerate(top50_indices):
    print(f"相似的文本 (索引 {index}): {sentences[index]}, 相似度: {similarities[-1][index]}")

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The sentence is too long. Skipping...
相似的文本 (索引 3834): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6677): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 7534): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6762): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6697): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6527): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 4170): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 9246): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 1637): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 7674): ['房間', '算', '大'], 相似度: 0.9639410376548767
相似的文本 (索引 8884): ['房間', '差'], 相似度: 0.9296784996986389
相似的文本 (索引 9238): ['房間', '偏', '小'], 相似度: 0.927994966506958
相似的文本 (索引 989): ['房間', '浴室', '大'], 相似度: 0.9196709394454956
相似的文本 (索引 8559): ['房間', '小', '床', '小'], 相似度: 0.9166770577430725
相似的文本 (索引 6916): ['乾淨', '房間', '大'], 相似度: 0.9122157096862793
相似的文本 (索引 7094): ['房間', '大', '寬敞'], 相似度: 0.9050126075744629
相似的文本 (索引 5854): ['床', '房間', '大小'], 相似度: 0.8997808694839478
相似

#### TF-IDF

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from ast import literal_eval

df_tf_idf = pd.read_csv('0_10000.csv', header=0)

sentences_tokenized = [eval(d) for d in df_tf_idf["綜合評論_ws"].values]
pos_tags = [eval(d) for d in df_tf_idf["綜合評論_pos"].values]

# 嘗試只保留有意義的詞性
stop_pos = set(['Nep', 'Nh', 'Nb'])  # 這 3 種詞性不保留

short_with_pos = []  # 放過濾後的詞性與句子
short_sentence = []  # 放過濾後的句子

for sentence, sentence_pos_tags in zip(sentences_tokenized, pos_tags):
    short_with_pos_sentence = []  # 儲存這個句子過濾後的詞（帶詞性）
    short_sentence_sentence = []  # 儲存這個句子過濾後的詞（不帶詞性）
    
    for word_ws, word_pos in zip(sentence, sentence_pos_tags):
        
        # 只留名詞和動詞
        is_N_or_V = word_pos.startswith("V") or word_pos.startswith("N")

        # 去掉名詞裡的某些詞性
        is_not_stop_pos = word_pos not in stop_pos

        if is_N_or_V and is_not_stop_pos:
            short_with_pos_sentence.append(f"{word_ws}({word_pos})")
            short_sentence_sentence.append(f"{word_ws}")

    short_with_pos.append(short_with_pos_sentence)
    short_sentence.append(short_sentence_sentence)

# 將 '綜合評論_ws' 欄位更新為 'short_sentence' 列表
df_tf_idf['綜合評論_ws'] = [' '.join(s) for s in short_sentence]

# 將綜合評論過濾後為空的資料去除
non_empty_mask = df_tf_idf['綜合評論_ws'].str.strip() != ''

# 使用此遮罩來過濾 DataFrame
df_tf_idf = df_tf_idf[non_empty_mask]

# 使用 TF-IDF 將所有評論文本轉換為數值向量
vectorizer = TfidfVectorizer()
vectorizer.fit(df_tf_idf['綜合評論_ws'])

# 假設新用戶輸入的需求
new_user_input = "房間 大"

# 將新用戶輸入轉換為相同的數值向量
new_vector = vectorizer.transform([new_user_input])

# 獲取所有的飯店名稱
hotel_names = df['飯店名稱'].unique()

# 初始化一個空的字典來儲存每個飯店的所有評論相似度和評論文本
hotel_reviews_similarities = {}

# 對每個飯店進行處理
for hotel_name in hotel_names:
    # 只選取該飯店的評論
    hotel_reviews = df_tf_idf[df_tf_idf['飯店名稱'] == hotel_name]['綜合評論_ws']
    
    # 使用擬合好的TF-IDF將飯店的評論轉換為數值向量
    X = vectorizer.transform(hotel_reviews)

    # 計算新用戶輸入與所有評論的 cosine similarity
    cos_similarities = cosine_similarity(new_vector, X).flatten()
    
    # 將這個飯店的所有評論相似度和評論文本儲存到字典中
    hotel_reviews_similarities[hotel_name] = list(zip(hotel_reviews, cos_similarities))

# 初始化一個空的字典來儲存每個飯店與新用戶需求的平均相似度
hotel_similarities = {hotel: np.mean([sim for _, sim in reviews]) for hotel, reviews in hotel_reviews_similarities.items()}

# 對飯店的平均相似度進行排序，並只取前10個
top10_hotels = sorted(hotel_similarities.items(), key=lambda x: x[1], reverse=True)[:10]

# 輸出與新用戶需求最相似的前 10 個飯店
for hotel_name, avg_sim in top10_hotels:
    print(f"飯店名稱：{hotel_name}")
    print(f"平均相似度:{avg_sim}")
    
    # 對這家飯店的所有評論相似度進行排序，並只取前 10 個
    top10_reviews = sorted(hotel_reviews_similarities[hotel_name], key=lambda x: x[1], reverse=True)[:10]

    # 輸出與新用戶需求最相似的前10條評論
    print("前10條相似的評論:")
    for review_text, sim in top10_reviews:
        print(f"評論：{review_text}，相似度：{sim}")
    
    print("\n")

飯店名稱：陽光滿屋民宿
平均相似度:0.11570316506156822
前10條相似的評論:
評論：舒適 房間，相似度：0.5655955612619636
評論：乾淨 舒適 房間，相似度：0.4529094419157692
評論：推薦 房間 乾淨 不錯 房間 內 無 冰箱 有 共用，相似度：0.3734171246078148
評論：房間 乾淨 看 出 屋主 用心，相似度：0.22074116484931985
評論：整體 棒 地點 不錯 大 片 窗戶 外面 有 連結感 地板 乾淨 熱水 燙 水壓 足 沒 暖氣 訂 個 房間 牆壁 上 有 幅 畫 房間 陽光感 風格 不符 感覺 看 久 頭 暈 正面 床 難 看到，相似度：0.1696819678971862
評論：陽光 地方 房間 不錯 熱水 熱 晚上 安靜 有 小 螞蟻，相似度：0.15946363273550007
評論：民宿 地點 活水湖 近 車程 五 分鐘 無 市區 鬧區 噪音 老闆 熱心 解答 任何 問題 房間 整潔 乾淨 希望 有 含 早餐 退房 時間 早點 房間 冷氣 涼 需要 設定到 低 溫度，相似度：0.15127160639049955
評論：海岸 公園 近 民宿 房間 內 看見 海 民宿 地點 不錯 安靜 停車 方便 進門處 鞋子 沒 地方 擺 顯 亂 房間 垃圾桶 裏 遺留 上 個 房客 泡麵碗 洗手台 螞蟻 一 大 堆 腳 撞 狂 床框，相似度：0.14818309750777228
評論：房間 乾淨 水壓 大 熱水 快 來 洗澡 舒服 市區 開車 分鐘 內 到達 路 邊 停車 方便 消毒水 味道 重，相似度：0.11527584378112371
評論：早上 五點 多 有 公雞 叫 起床 六點 多 有 戰機 飛過 房間 廁所 乾淨，相似度：0.09638061657609528


飯店名稱：家有囍宿
平均相似度:0.09766333643934164
前10條相似的評論:
評論：房間 大 間 衛浴 設備 讚 有 按摩 浴缸 值 高 下 次 來 時 有 房間 住，相似度：0.3924007059710153
評論：早餐 直接 送 餐券 方便 安靜 舒適 房間 大 枕頭 扁 床 軟 中間 陷下去 睡 右邊 一點 建議 翻面，相似度：0.09591597622569287
評論：老闆

#### 將同一間飯店評論合併後，使用 roberta 模型

In [28]:
import re
import pandas as pd
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from collections import defaultdict

def sentence_based_sliding_window(text, window_size):
    sentences = re.split('(?<=[。])', text)
    current_group = []
    current_length = 0
    grouped_sentences = []

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if sentence_length > window_size:
            # If the length of a single sentence exceeds the window size, skip that data
            continue
        elif current_length + sentence_length > window_size:
            grouped_sentences.append("".join(current_group))
            current_group = [sentence]
            current_length = sentence_length
        else:
            current_group.append(sentence)
            current_length += sentence_length

    # Add the remaining sentences to grouped_sentences
    if current_group:
        grouped_sentences.append("".join(current_group))

    return grouped_sentences

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')


df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# Merge the comprehensive comments of the same hotel using a period
df_grouped = df[0:10000].groupby('飯店名稱')['綜合評論'].apply(lambda x: '。'.join(x)).reset_index()

# Text
sentences = list(df_grouped.loc[:, '綜合評論'])

# Get the embeddings for each text
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    # If the sentence is too long, split it into multiple fragments
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# Compute the average embedding for each hotel
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)

# 新的文本
new_sentence = "房間大"
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()


# Calculate similarity with the new text
similarities = cosine_similarity(embeddings + [new_embedding])

# Get the indices of the top 10 most similar texts
top10_indices = np.argsort(similarities[-1][:-1])[:-10:-1]

# Print the top 10 most similar texts
for index in top10_indices:
    print(f"Similar hotel: {df_grouped.loc[hotel_indices[index], '飯店名稱']}, similarity: {similarities[-1][index]}")


Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: The size of tensor a (5910) must match the size of tensor b (512) at non-singleton dimension 1

#### 將同一間飯店評論合併後，使用可處理長句子的 bert

In [15]:
def sentence_based_sliding_window(text, window_size):
    sentences = re.split('(?<=[。])', text)
    current_group = []
    current_length = 0
    grouped_sentences = []

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if sentence_length > window_size:
            # 如果單一句子的長度超過窗口大小，跳過該筆數據
            continue
        elif current_length + sentence_length > window_size:
            grouped_sentences.append("".join(current_group))
            current_group = [sentence]
            current_length = sentence_length
        else:
            current_group.append(sentence)
            current_length += sentence_length

    # 把剩下的句子加入 grouped_sentences
    if current_group:
        grouped_sentences.append("".join(current_group))
    
    return grouped_sentences

In [16]:
from transformers import LongformerModel, LongformerTokenizer
from collections import defaultdict

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 將相同飯店的綜合評論用句號合併
df_grouped = df[0:10000].groupby('飯店名稱')['綜合評論'].apply(lambda x: '。'.join(x)).reset_index()

# 文本
sentences = list(df_grouped.loc[:, '綜合評論'])

# 獲取每個文本的嵌入向量
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    # 如果句子太長，就將其分割成多個片段
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# 計算每一間飯店的平均向量
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)

# 新的文本
new_sentence = "房間大"
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=False, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 10 個文本的索引
top10_indices = np.argsort(similarities[-1][:-1])[:-10:-1]

# 印出最相似的 10 個文本
for index in top10_indices:
    print(f"相似的飯店: {df_grouped.loc[hotel_indices[index], '飯店名稱']}, 相似度: {similarities[-1][index]}")

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


相似的飯店: 大寶的民宿 Tabohouse, 相似度: 0.9586089849472046
相似的飯店: 翠園歐風庭園民宿, 相似度: 0.9230974912643433
相似的飯店: 花爵墾丁, 相似度: 0.9208060503005981
相似的飯店: 白鷺灣 民宿(安平古堡), 相似度: 0.9181367754936218
相似的飯店: 希望恆春休閒會館, 相似度: 0.9181272983551025
相似的飯店: 卡爾登飯店 the Carlton, 相似度: 0.9166074991226196
相似的飯店: 陽光滿屋民宿, 相似度: 0.9149898886680603
相似的飯店: 家有囍宿, 相似度: 0.9144690036773682
相似的飯店: 海洋風情民宿, 相似度: 0.9132213592529297


#### 將同一間飯店評論合併後，使用可處理長句子的 bert，並排除評論數不足30的飯店

In [None]:
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

In [None]:
from collections import defaultdict

# 篩選評論數大於等於 30 的飯店
df_filtered = df[0:10000].groupby('飯店名稱').filter(lambda x: len(x) >= 30);df_filtered

# 將相同飯店的綜合評論用句號合併，並保留其他重要欄位
df_grouped = df_filtered.groupby('飯店名稱').agg({
    '綜合評論': lambda x: '。'.join(x),
    '縣市': lambda x: x.iloc[0],
    '鄉鎮': lambda x: x.iloc[0],
    '整體評分': lambda x: x.iloc[0],
    '單項評分_整潔度': lambda x: x.iloc[0],
    '單項評分_舒適程度': lambda x: x.iloc[0],
    '單項評分_住宿地點': lambda x: x.iloc[0],
    '單項評分_設施': lambda x: x.iloc[0],
    '單項評分_員工素質': lambda x: x.iloc[0],
    '單項評分_性價比': lambda x: x.iloc[0]
}).reset_index();df_grouped

# 文本
sentences = list(df_grouped.loc[:, '綜合評論'])

# 獲取每個文本的嵌入向量
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    
    # 如果句子太長，就分割成多個片段
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# 計算每間飯店的平均向量
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)

In [52]:
# 儲存嵌入向量
np.save('embeddings_longformer_base_4096.npy', embeddings)

# 儲存飯店資訊
df_grouped.loc[hotel_indices, :].to_csv('hotel_info_longformerbase4096.csv')

#### 使用迴圈跑

In [11]:
def sentence_based_sliding_window(text, window_size):
    sentences = re.split('(?<=[。])', text)
    current_group = []
    current_length = 0
    grouped_sentences = []

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if sentence_length > window_size:
            # If the length of a single sentence exceeds the window size, skip that data
            continue
        elif current_length + sentence_length > window_size:
            grouped_sentences.append("".join(current_group))
            current_group = [sentence]
            current_length = sentence_length
        else:
            current_group.append(sentence)
            current_length += sentence_length

    # Add the remaining sentences to grouped_sentences
    if current_group:
        grouped_sentences.append("".join(current_group))

    return grouped_sentences

In [13]:
import torch
import numpy as np
import pandas as pd
from collections import defaultdict
from transformers import LongformerModel, LongformerTokenizer
import re
from sklearn.metrics.pairwise import cosine_similarity

# 如果有 GPU 就使用，否則用 CPU 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model = model.to(device)

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 讀取之前的檔案
try:
    embeddings = np.load('embeddings_longformer_base_4096.npy').tolist()
    df_grouped = pd.read_csv('hotel_info_longformerbase4096.csv')
except FileNotFoundError:
    embeddings = []
    df_grouped = pd.DataFrame()

hotel_names = df['飯店名稱'].unique()
n_hotels = len(hotel_names)

# 分批執行(每50間飯店儲存一次)
for i in range(0, n_hotels, 50):
    print(f'Processing hotels {i} to {min(i + 50, n_hotels)}')
    batch_hotel_names = hotel_names[i:i + 50]
    df_filtered = df[df['飯店名稱'].isin(batch_hotel_names)]
    df_grouped_batch = df_filtered.groupby('飯店名稱').agg({
        '正評': lambda x: '。'.join(str(v) for v in x if not pd.isna(v)),
        '縣市': lambda x: x.iloc[0],
        '鄉鎮': lambda x: x.iloc[0],
        '整體評分': lambda x: x.iloc[0],
        '單項評分_整潔度': lambda x: x.iloc[0],
        '單項評分_舒適程度': lambda x: x.iloc[0],
        '單項評分_住宿地點': lambda x: x.iloc[0],
        '單項評分_設施': lambda x: x.iloc[0],
        '單項評分_員工素質': lambda x: x.iloc[0],
        '單項評分_性價比': lambda x: x.iloc[0]
    }).reset_index()
    
    sentences = list(df_grouped_batch.loc[:, '正評'])
    hotel_embeddings = defaultdict(list)

    for sentence in sentences:
        
        # 如果文本過長就拆分
        if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
            sentence_windows = sentence_based_sliding_window(sentence, 4000)
            for sentence_window in sentence_windows:
                if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                    continue
                inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True).to(device)
                with torch.no_grad():
                    outputs = model(**inputs)
                hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten())
        else:
            inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten())
    
    # 計算平均向量
    for _, hotel_embedding in hotel_embeddings.items():
        embeddings.append(np.mean(hotel_embedding, axis=0))
    
    # 合併檔案內容
    df_grouped = pd.concat([df_grouped, df_grouped_batch])
    
    # 儲存
    np.save('embeddings_longformer_base_4096.npy', np.array(embeddings))
    df_grouped.to_csv('hotel_info_longformerbase4096.csv', index=False)


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing hotels 0 to 50
Processing hotels 50 to 100
Processing hotels 100 to 150
Processing hotels 150 to 200
Processing hotels 200 to 250
Processing hotels 250 to 300
Processing hotels 300 to 350
Processing hotels 350 to 400
Processing hotels 400 to 450
Processing hotels 450 to 500
Processing hotels 500 to 550
Processing hotels 550 to 600
Processing hotels 600 to 650
Processing hotels 650 to 700
Processing hotels 700 to 750
Processing hotels 750 to 800
Processing hotels 800 to 850
Processing hotels 850 to 900
Processing hotels 900 to 950


Token indices sequence length is longer than the specified maximum sequence length for this model (4097 > 4096). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

#### 進行二次篩選(根據某些特定條件)

In [14]:
def get_similar_hotels(condition, embeddings, new_embedding, hotel_data, n):
    
    # 根據條件篩選出飯店
    filtered_hotels = hotel_data[condition]

    # 取得符合條件的飯店嵌入向量
    filtered_embeddings = embeddings[filtered_hotels.index]

    # 計算與新文本的相似性
    similarities = cosine_similarity(np.concatenate([filtered_embeddings, new_embedding[None, :]]))

    # 獲取最相似的 n 個文本的索引
    topn_indices = np.argsort(similarities[-1][:-1])[:-n-1:-1]

    # 印出最相似的 n 個飯店
    for index in topn_indices:
        print(f"相似的飯店:\n{filtered_hotels.iloc[index][['飯店名稱']]}")
        print(f"縣市:{filtered_hotels.iloc[index][['縣市']]}")
        print(f"鄉鎮:{filtered_hotels.iloc[index][['鄉鎮']]}")
        print(f"整體評分:{filtered_hotels.iloc[index][['整體評分']]}")
        print(f"相似度: {similarities[-1][index]}")

In [19]:
# 讀取 DataFrame
df_hotel_info = pd.read_csv('hotel_info_longformerbase4096.csv')
print(len(df_hotel_info))
# 讀取 embeddings
embeddings = np.load('embeddings_longformer_base_4096.npy')
print(len(embeddings))

# 新的文本
new_sentence = "房間大"
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()



#     # 新的文本
#     tokens = tokenizer.tokenize(sentence)  # 分詞
#     inputs = tokenizer(sentence, return_tensors="pt")

#     with torch.no_grad():
#         outputs = model(**inputs)
#     new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()


# 搜尋'南投縣'的前 5 個相似的飯店
get_similar_hotels(df_hotel_info['縣市'] == '南投縣', embeddings, new_embedding, df_hotel_info, 5)

# # 搜尋整體評分大於 8 的前 10 個相似的飯店
# get_similar_hotels(hotel_data['整體評分'] > 8, embeddings, new_embedding, hotel_data, 10)

# # 搜尋'南投縣'且整體評分大於 8 的前 15 個相似的飯店
# get_similar_hotels((hotel_data['縣市'] == '南投縣') & (hotel_data['整體評分'] > 8), embeddings, new_embedding, hotel_data, 15)

900
18


IndexError: index 128 is out of bounds for axis 0 with size 18

In [46]:
# top10_hotel_scores = []
# for name in top10_hotel_name:
#     score = df[df['飯店名稱'] == name]['整體評分'].values[0]
#     top10_hotel_scores.append((name, score))

# # 根據評分進行排序，評分高的在前
# top10_hotel_scores_sorted = sorted(top10_hotel_scores, key=lambda x: x[1], reverse=True)

# # 印出排序後的飯店和評分
# for name, score in top10_hotel_scores_sorted:
#     print(f"飯店名稱: {name}, 整體評分: {score}")

飯店名稱: 太魯閣阿騫的家民宿, 整體評分: 9.6
飯店名稱: 鹿台民宿, 整體評分: 9.1
飯店名稱: 花蓮綠舍 The Green Villa l 花蓮親子溜滑梯民宿, 整體評分: 8.8
飯店名稱: 親水棧民宿, 整體評分: 8.6
飯店名稱: 磐石旅店, 整體評分: 8.4
飯店名稱: 雁窩民宿, 整體評分: 8.3
飯店名稱: 朝日民宿 - Peng's Family, 整體評分: 8.0
飯店名稱: 卡爾登飯店 the Carlton, 整體評分: 7.9
飯店名稱: 樂活休閒海景飯店, 整體評分: 7.9


#### GPT

In [None]:
import openai
import time

openai.api_key = 'your-api-key'

questions = [
    "What is the capital of France?",
    "Who wrote the book '1984'?",
    "What is the distance from Earth to the Moon?"
]

responses = []

for question in questions:
    response = openai.Completion.create(
        engine="text-davinci-004",
        prompt=question,
        max_tokens=100
    )
    responses.append(response.choices[0].text.strip())
    time.sleep(1)  # 避免在短时间内发送过多请求

for question, answer in zip(questions, responses):
    print(f"Question: {question}")
    print(f"Answer: {answer}")


In [None]:
# 下載預訓練模型和分詞工具
model = BertModel.from_pretrained('bert-base-chinese', ignore_mismatched_sizes=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# 假設數據
texts = [('今天天氣真好', '今天天氣很不錯'), ('你好早安', '我想回家')]
scores = [1.0, 0.0]

# 对数据进行编码
input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True);input_ids

# 分詞
# input_ids = tokenizer.encode("你好，世界！", add_special_tokens=True);input_ids

# 創建 Tensors
scores = torch.tensor(scores)

# 使用模型得到句子的表示
outputs = model(input_ids)

# 計算句子表示的餘弦相似度
similarity_scores = torch.nn.functional.cosine_similarity(outputs[0], outputs[1])

# # 微调模型
# optimizer = Adam(model.parameters())
# loss = torch.nn.functional.mse_loss(similarity_scores, scores)
# loss.backward()
# optimizer.step()

In [1]:
from transformers import BertModel, BertTokenizer

# 載入預訓練模型及其分詞器
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# 輸入你的文本
text = "請將我換成你想要的任何文本。"

# 使用分詞器將文本轉換為模型可以理解的形式
inputs = tokenizer(text, return_tensors='pt')

# 使用模型獲得文本的表示
outputs = model(**inputs)

# 'outputs' 是一個 tuple，我們需要的是文本表示，它被儲存在第一個元素中
text_embedding = outputs[0]

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
print(text_embedding)

tensor([[[-0.7956,  0.8762, -0.1610,  ...,  1.1726, -0.1286, -0.5807],
         [-0.7632,  0.2071,  0.1727,  ...,  0.1169, -0.9048,  0.0898],
         [-0.9492,  0.2783,  0.9922,  ...,  1.1149, -0.1266,  0.0922],
         ...,
         [ 0.0631,  0.8451,  0.0971,  ...,  1.0712,  0.7042, -0.1742],
         [-0.3739,  1.4240,  0.2010,  ...,  0.6602, -0.2418, -0.4565],
         [-1.2477, -0.0717,  0.4747,  ...,  0.0043, -0.0576,  0.0467]]],
       grad_fn=<NativeLayerNormBackward0>)
