In [None]:
# !pip install transformers
# !pip install torch
# !pip install sentence_transformers
# !pip install textblob

In [1]:
from transformers import BertModel, BertTokenizer
from torch.optim import Adam
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import datetime

  from .autonotebook import tqdm as notebook_tqdm


## roberta 模型

### 原始評論(利用roberta原始的分詞方法)(會把每一個字都拆開)

In [5]:
df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

len(df_filtered['飯店名稱'].unique())

Unnamed: 0,飯店名稱,綜合評論
0,貓咪民宿Mini館-貓行為諮詢,乾淨、舒適、民宿內有3隻貓咪，非常可愛。民宿老闆很親切、開放提早中午就可以入住、熱心分享當地...
1,貓咪民宿Mini館-貓行為諮詢,注重生活機能的話要三思。民宿主人非常熱情回答我的養貓困擾，也給我很多建議。位置太偏僻，飲食選擇不多
2,貓咪民宿Mini館-貓行為諮詢,服務人員熱心親切，貓咪很乖很可愛。無
3,貓咪民宿Mini館-貓行為諮詢,貓可愛，老闆親切。一樓廚房有碗盤及雜物堆積，想洗環保餐具時看到水槽太髒，就躲回房間浴室洗了
4,貓咪民宿Mini館-貓行為諮詢,老闆娘人非常好，很熱心的介紹好吃的食物，民宿的貓都很可愛很有個性。第一天晚上隔壁房客的小孩尖...
...,...,...
1130754,華大旅店-南西館,電梯
1130755,一八遛遛居民宿,很棒的體驗，建築很新也很乾淨，房間裡的用品也都很有質感喔！小可惜是我們住的房間沒有桌椅，化妝...
1130756,風之島背包客民宿,讚。地點偏市區外整體住起來很舒服放假來這很愜意。
1130757,風之島背包客民宿,乾凈.舒適.安靜。


In [None]:
# 開始測量(爬取時間)
startime = datetime.datetime.now()

# 初始化 RoBERTa-wwm-ext 模型和分詞器
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 評論文本
sentences = list(df_filtered.loc[:, '綜合評論'])

# 初始化一個空的 list 用於儲存飯店資訊
hotel_info_list = []

# 獲取每個文本的嵌入向量，並儲存對應的飯店資訊
embeddings = []
for idx, sentence in enumerate(sentences):
    tokens = tokenizer.tokenize(sentence)  # 分詞
    inputs = tokenizer(sentence, return_tensors="pt")

    # 檢查序列長度是否超過 512
    if len(inputs["input_ids"][0]) > 512:
        print("The sentence is too long. Skipping...")
        continue

    with torch.no_grad():
        outputs = model(**inputs)
    embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    hotel_info_list.append(df_filtered.iloc[idx].to_dict())

# 儲存飯店資訊與 embeddings
df_hotel_info = pd.DataFrame(hotel_info_list)
df_hotel_info.to_csv('hotel_info_roberta.csv', index=False)
np.save('embeddings_roberta.npy', embeddings)

# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)

##### 利用迴圈跑

In [None]:
import numpy as np
import pandas as pd
import os
import datetime
from transformers import BertModel, BertTokenizer
import torch
import datetime

# 開始測量(時間)
startime = datetime.datetime.now()

df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

df_filtered = df_filtered[0:100000]

df_filtered = df_filtered.reset_index(drop=True)

# 初始化 RoBERTa-wwm-ext 模型和分詞器
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 定義每個批次的大小
batch_size = 5000

# 計算需要進行的批次數
n_batches = (len(df_filtered) // batch_size) + 1

# 開始進行批次處理
for batch_idx in range(n_batches):
    print(f"Processing batch {batch_idx + 1} of {n_batches}...")
    
    start_idx = batch_idx * batch_size

    end_idx = (batch_idx + 1) * batch_size

    # 評論文本
    sentences = list(df_filtered.loc[start_idx:end_idx-1, '綜合評論'])

    # 如果 sentences 是空的，則跳過該批次
    if not sentences:
        continue

    # 初始化一個空的 list 用於儲存飯店資訊
    hotel_info_list = []

    # 獲取每個文本的嵌入向量，並儲存對應的飯店資訊
    embeddings = []
    for idx, sentence in enumerate(sentences):
        if isinstance(sentence, str):
            tokens = tokenizer.tokenize(sentence)  # 分詞
            inputs = tokenizer(sentence, return_tensors="pt")
        else:
            print(f"Sentence at index {idx} is not a string. Skipping...")
            continue

        # 檢查序列長度是否超過 512
        if len(inputs["input_ids"][0]) > 512:
            print("The sentence is too long. Skipping...")
            continue

        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
        hotel_info_list.append(df_filtered.iloc[start_idx + idx].to_dict())

    # 讀取原有的飯店資訊和 embeddings
    if os.path.exists('hotel_info_roberta_綜合評論.csv') and os.path.exists('embeddings_roberta_綜合評論.npy'):
        df_hotel_info_old = pd.read_csv('hotel_info_roberta_綜合評論.csv')
        embeddings_old = np.load('embeddings_roberta_綜合評論.npy')

    else:
        df_hotel_info_old = pd.DataFrame()
        embeddings_old = np.array([]).reshape(0,768)

    # 儲存飯店資訊與 embeddings
    df_hotel_info_new = pd.DataFrame(hotel_info_list)
    df_hotel_info = pd.concat([df_hotel_info_old, df_hotel_info_new], ignore_index=True)
    df_hotel_info.to_csv('hotel_info_roberta_綜合評論.csv', index=False)

    embeddings_new = np.array(embeddings)
    embeddings = np.vstack([embeddings_old, embeddings_new])
    np.save('embeddings_roberta_綜合評論.npy', embeddings)
    
    print(f"第{batch_idx + 1}批次儲存完畢")
    
# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)

### 讀取並進行篩選(先計算相似度再取平均)

In [3]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

def get_similar_hotels(df, embeddings, filter_dict, sentence, top_n, batch_size=5000):
    # 篩選
    for key, value in filter_dict.items():
        if key in df.columns:
            operation, target_value = value
            if operation == "==":
                df = df[df[key] == target_value]
            elif operation == "!=":
                df = df[df[key] != target_value]
            elif operation == ">":
                df = df[df[key] > target_value]
            elif operation == "<":
                df = df[df[key] < target_value]
            elif operation == ">=":
                df = df[df[key] >= target_value]
            elif operation == "<=":
                df = df[df[key] <= target_value]

    embeddings_filtered = embeddings[df.index.values]

    # 新的文本
    inputs = tokenizer(sentence, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
    new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

    # 計算與新文本的相似性
    similarities = []
    num_batches = len(embeddings_filtered) // batch_size + 1
    for i in range(num_batches):
        start_index = i * batch_size
        end_index = start_index + batch_size
        batch = embeddings_filtered[start_index:end_index]
        batch_similarity = cosine_similarity(np.vstack((batch, new_embedding)))
        similarities.append(batch_similarity[-1, :-1])
    similarities = np.concatenate(similarities)

    # 建立新的 DataFrame，儲存相似度與對應的飯店名稱
    df_similarity = pd.DataFrame({'飯店名稱': df['飯店名稱'], '相似度': similarities})

    # 以飯店名稱分組，計算每間飯店的平均相似度
    df_avg_similarity = df_similarity.groupby('飯店名稱').mean()

    # 取出平均相似度最高的前 top_n 間飯店
    top_hotels = df_avg_similarity.sort_values(by='相似度', ascending=False).head(top_n)
    
    return top_hotels

In [None]:
# 讀取 DataFrame
df_hotel_info = pd.read_csv('hotel_info_roberta_綜合評論.csv')

# 讀取 embeddings
embeddings = np.load('embeddings_roberta_綜合評論.npy')

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

In [8]:
# 建立篩選條件字典
filter_dict = {'整體評分': (">", 7.0)}

# 使用函數
top_hotels = get_similar_hotels(df_hotel_info, embeddings, filter_dict, "房間好", 10)

print(top_hotels)

                      相似度
飯店名稱                     
磐石旅店             0.652121
太魯閣阿騫的家民宿        0.651672
鹿台民宿             0.651581
貓咪民宿Mini館-貓行為諮詢  0.650581
墾丁君臨農場           0.650515
嘉義優遊商旅           0.650436
星享道酒店            0.650195
宜蘭明水露渡假民宿        0.650051
台糖長榮酒店- 台南       0.649993
漫遊舍民宿            0.649875


### 讀取並進行篩選(先對向量取平均再計算相似度)

In [13]:
# 計算每間飯店的評論向量的平均值
def compute_average_vectors(df, embeddings):
    
    df['embedding'] = list(embeddings)
    df_avg_embedding = df.groupby('飯店名稱')['embedding'].apply(np.mean)

    return df_avg_embedding

# 根據篩選條件與新文本計算平均相似度
def compute_similarity(df_avg_embedding, df, filter_dict, sentence):
    
    # 篩選
    for key, value in filter_dict.items():
        if key in df.columns:
            operation, target_value = value
            if operation == "==":
                df = df[df[key] == target_value]
            elif operation == "!=":
                df = df[df[key] != target_value]
            elif operation == ">":
                df = df[df[key] > target_value]
            elif operation == "<":
                df = df[df[key] < target_value]
            elif operation == ">=":
                df = df[df[key] >= target_value]
            elif operation == "<=":
                df = df[df[key] <= target_value]

    df_avg_embedding = df_avg_embedding[df_avg_embedding.index.isin(df['飯店名稱'])]

    # 新的文本
    inputs = tokenizer(sentence, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
    new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

    # 計算與新文本的相似性
    similarities = cosine_similarity(np.vstack((df_avg_embedding.values.tolist(), new_embedding)))[-1, :-1]

    # 建立新的 DataFrame，儲存相似度與對應的飯店名稱
    df_similarity = pd.DataFrame({'飯店名稱': df_avg_embedding.index, '相似度': similarities})

    return df_similarity

In [16]:
# 讀取 DataFrame
df_hotel_info = pd.read_csv('hotel_info_roberta_綜合評論.csv')

# 讀取 embeddings
embeddings = np.load('embeddings_roberta_綜合評論.npy')

# 計算平均向量
df_avg_embedding = compute_average_vectors(df_hotel_info, embeddings)

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##### 篩選並計算相似度

In [18]:
# 建立篩選條件字典
filter_dict = {'整體評分': (">", 3.0),'縣市': ("==", "高雄市")}

df_similarity = compute_similarity(df_avg_embedding, df_hotel_info, filter_dict, "早餐好吃")

top_n = 10

top_hotels = df_similarity.sort_values(by='相似度', ascending=False).head(top_n)

print(top_hotels)

           飯店名稱       相似度
197        涵園民宿  0.756361
39         世奇商旅  0.754723
124    康橋商旅-七賢館  0.741242
203   玫瑰花園汽車旅館   0.735814
165       旗山三合院  0.734303
127    康橋商旅-覺民館  0.733816
93      威尼斯汽車旅館  0.732871
125  康橋商旅-三多商圈館  0.730270
67        喜達絲飯店  0.730034
94        媚力泊飯店  0.727803


##### 比較花費時間測試

In [6]:
import numpy as np
import pandas as pd
import os
import datetime
from transformers import BertModel, BertTokenizer
import torch
import datetime

# 開始測量(時間)
startime = datetime.datetime.now()

df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

df_filtered = df_filtered[0:100000]

df_filtered = df_filtered.reset_index(drop=True)

# 初始化 RoBERTa-wwm-ext 模型和分詞器
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 定義每個批次的大小
batch_size = 5000

# 計算需要進行的批次數
n_batches = (len(df_filtered) // batch_size) + 1

# 開始進行批次處理
for batch_idx in range(n_batches):
    print(f"Processing batch {batch_idx + 1} of {n_batches}...")
    
    start_idx = batch_idx * batch_size

    end_idx = (batch_idx + 1) * batch_size

    # 評論文本
    sentences = list(df_filtered.loc[start_idx:end_idx-1, '綜合評論'])

    # 如果 sentences 是空的，則跳過該批次
    if not sentences:
        continue

    # 初始化一個空的 list 用於儲存飯店資訊
    hotel_info_list = []

    # 獲取每個文本的嵌入向量，並儲存對應的飯店資訊
    embeddings = []
    for idx, sentence in enumerate(sentences):
        if isinstance(sentence, str):
            tokens = tokenizer.tokenize(sentence)  # 分詞
            inputs = tokenizer(sentence, return_tensors="pt")
        else:
            print(f"Sentence at index {idx} is not a string. Skipping...")
            continue

        # 檢查序列長度是否超過 512
        if len(inputs["input_ids"][0]) > 512:
            print("The sentence is too long. Skipping...")
            continue

        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
        hotel_info_list.append(df_filtered.iloc[start_idx + idx].to_dict())

    # 讀取原有的飯店資訊和 embeddings
    if os.path.exists('hotel_info_roberta_test.csv') and os.path.exists('embeddings_roberta_test.npy'):
        df_hotel_info_old = pd.read_csv('hotel_info_roberta_test.csv')
        embeddings_old = np.load('embeddings_roberta_test.npy')

    else:
        df_hotel_info_old = pd.DataFrame()
        embeddings_old = np.array([]).reshape(0,768)

    # 儲存飯店資訊與 embeddings
    df_hotel_info_new = pd.DataFrame(hotel_info_list)
    df_hotel_info = pd.concat([df_hotel_info_old, df_hotel_info_new], ignore_index=True)
    df_hotel_info.to_csv('hotel_info_roberta_test.csv', index=False)

    embeddings_new = np.array(embeddings)
    embeddings = np.vstack([embeddings_old, embeddings_new])
    np.save('embeddings_roberta_test.npy', embeddings)
    
    print(f"第{batch_idx + 1}批次儲存完畢")
    
# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing batch 1 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第1批次儲存完畢
Processing batch 2 of 21...
The sentence is too long. Skipping...
第2批次儲存完畢
Processing batch 3 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第3批次儲存完畢
Processing batch 4 of 21...
The sentence is too long. Skipping...
第4批次儲存完畢
Processing batch 5 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第5批次儲存完畢
Processing batch 6 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第6批次儲存完畢
Processing batch 7 of 21...
The sentence is too long. Sk

### 測試自行斷詞後的結果(roberta)

In [8]:
df = pd.read_csv('0_10000.csv', header=0)

In [4]:
import ast

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

sentences_tokenized = [eval(d) for d in df.loc[:, "綜合評論_ws"].values]

# 獲取每個文本的嵌入向量
embeddings = []
sentences = []  # 新建一個列表來保存被處理的句子

for sentence in sentences_tokenized:
    
    inputs = tokenizer.encode_plus(sentence, is_split_into_words=True, return_tensors="pt")
    
    # 檢查序列長度是否超過 512
    if len(inputs["input_ids"][0]) > 512:
        print("The sentence is too long. Skipping...")
        continue
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings.append(outputs.last_hidden_state.mean(dim = 1).numpy().flatten())
    sentences.append(sentence)  # 將被處理的句子加到列表中

# 新的文本
new_sentence = "房間大"
new_tokens = ['房間', '大'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 50 個文本的索引
top50_indices = np.argsort(similarities[-1][:-1])[:-51:-1]

# 印出最相似的 50 個文本
for i, index in enumerate(top50_indices):
    print(f"相似的文本 (索引 {index}): {sentences[index]}, 相似度: {similarities[-1][index]}")

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The sentence is too long. Skipping...
The sentence is too long. Skipping...
相似的文本 (索引 9302): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 7713): ['房間', '算', '大'], 相似度: 0.9639410376548767
相似的文本 (索引 6559): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 6710): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 6730): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 3851): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 7572): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 6796): ['房間', '很', '大'], 相似度: 0.9479893445968628
相似的文本 (索引 9294): ['房間', '偏', '小'], 相似度: 0.927994966506958
相似的文本 (索引 4188): ['房間', '稍', '小'], 相似度: 0.9203976392745972
相似的文本 (索引 6951): ['乾淨', '房間', '大'], 相似度: 0.9122157096862793
相似的文本 (索引 1642): ['房間', '太', '小'], 相似度: 0.900887131690979
相似的文本 (索引 7985): ['房間', '大小'], 相似度: 0.8921487331390381
相似的文本 (索引 1403): ['房間', '大小'], 相似度: 0.8921487331390381
相似的文本 (索引 8195): ['房間', '大小'], 相似度: 0.8921487331390381
相似的文本 (索引 6233): ['便宜', '房間', '大'], 相似度: 0.89132845401763

In [5]:
# 新的文本
new_sentence = "早餐好吃"
new_tokens = ['早餐', '好吃'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 50 個文本的索引
top50_indices = np.argsort(similarities[-1][:-1])[:-51:-1]

# 印出最相似的 50 個文本
for i, index in enumerate(top50_indices):
    print(f"相似的文本 (索引 {index}): {sentences[index]}, 相似度: {similarities[-1][index]}")

相似的文本 (索引 4343): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 1529): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 6567): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 7509): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4348): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 6958): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 6491): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4098): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 2429): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4827): ['早餐', '好吃'], 相似度: 1.000000238418579
相似的文本 (索引 4190): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 6052): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 5181): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 6429): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 4300): ['早餐', '很', '好吃'], 相似度: 0.9652129411697388
相似的文本 (索引 9561): ['早餐', '超', '好吃'], 相似度: 0.9607133865356445
相似的文本 (索引 1245): ['早餐', '不', '好吃'], 相似度: 0.9587073922157288
相似的文本 (索引 9445): ['早餐', '很', '好吃', '呦'],

#### 使用roberta的不同層輸出

In [25]:
sentences_tokenized_ = [" ".join(eval(d)) for d in df_.loc[0:50, "綜合評論_ws"].values]

In [28]:
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext', output_hidden_states=True)

embeddings = []
for sentence in sentences_tokenized_:
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # 獲取最後四層的輸出
    last_four_layers = outputs.hidden_states[-4:]
    # 合併
    embedding = torch.mean(torch.stack(last_four_layers), dim=0).mean(dim=1).numpy()
    embeddings.append(embedding.squeeze())

new_tokens = ['早餐', '不好'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
last_four_layers = outputs.hidden_states[-4:]
new_embedding = torch.mean(torch.stack(last_four_layers), dim=0).mean(dim=1).numpy()

embeddings_np = np.array(embeddings)

similarities = cosine_similarity(embeddings_np, new_embedding.reshape(1, -1))
top_indices = np.argsort(similarities, axis=0)[::-1]

for i, index in enumerate(top_indices[:10]):
    print(f"第 {i+1} 相似的文本 (索引 {index}): {sentences_tokenized_[index[0]]}, 相似度: {similarities[index[0]][0]}")


Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


第 1 相似的文本 (索引 [45]): 早餐 超 豐盛 超, 相似度: 0.6999987363815308
第 2 相似的文本 (索引 [3]): 沒有 附 早餐, 相似度: 0.6805572509765625
第 3 相似的文本 (索引 [22]): 滿意, 相似度: 0.5698429346084595
第 4 相似的文本 (索引 [23]): 沒有 髒, 相似度: 0.558742105960846
第 5 相似的文本 (索引 [35]): 地 上 有 頭髮 不 夠 乾淨, 相似度: 0.5520568490028381
第 6 相似的文本 (索引 [39]): 房間 升級 房間 及 衛浴 空間 舒適 早餐 每 天 都 是 培根 起司 口味, 相似度: 0.5247654914855957
第 7 相似的文本 (索引 [40]): 草地 夠 大 風 好 大, 相似度: 0.5237064957618713
第 8 相似的文本 (索引 [17]): 地點 方便 停車位 多 早餐 也 很 不錯 補菜 的 速度 也 很 快 早餐 沒有 醬瓜 房間 太 昏暗, 相似度: 0.5115199089050293
第 9 相似的文本 (索引 [1]): 整體 上 都 不錯 無 飲水機 有點 不 方便, 相似度: 0.5094053745269775
第 10 相似的文本 (索引 [8]): 有 桌遊 可以 玩, 相似度: 0.5080175399780273


### 觀察自行分詞與bert內建分詞的差異

In [11]:
new_sentence = "房間小"

# inputs = tokenizer(new_sentence, return_tensors="pt")
new_tokens = tokenizer.tokenize(new_sentence)
inputs = tokenizer(new_tokens, return_tensors="pt")

print(new_tokens)
print(inputs)

new_tokens = ['房間', '小'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")

print(new_tokens)
print(inputs)

['房', '間', '小']
{'input_ids': tensor([[ 101, 2791,  102],
        [ 101, 7279,  102],
        [ 101, 2207,  102]]), 'token_type_ids': tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]])}
['房間', '小']
{'input_ids': tensor([[ 101, 2791, 7279, 2207,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


#### 嘗試只保留有意義的詞性

In [6]:
df = pd.read_csv('0_10000.csv', header=0)

In [11]:
stop_pos = set(['Nep', 'Nh', 'Nb'])  # 這 3 種詞性不保留

# 分詞後評論
sentences_tokenized = [eval(d) for d in df.loc[:, "綜合評論_ws"].values]

# 分詞後詞性
pos_tags = [eval(d) for d in df.loc[:, "綜合評論_pos"].values]

short_sentence = []  # 放過濾後的句子

for sentence, sentence_pos_tags in zip(sentences_tokenized, pos_tags):

    short_sentence_sentence = []  # 儲存過濾後的句子（不帶詞性）
    
    for word_ws, word_pos in zip(sentence, sentence_pos_tags):
        
        # 只留名詞和動詞
        is_N_or_V = word_pos.startswith("V") or word_pos.startswith("N")

        # 去掉名詞裡的某些詞性
        is_not_stop_pos = word_pos not in stop_pos

        if is_N_or_V and is_not_stop_pos:
            
            short_sentence_sentence.append(f"{word_ws}")

    short_sentence.append(short_sentence_sentence)

# print(sentences_tokenized[0:3])
# print(short_sentence[0:3])

# 去除空的列表
short_sentence = [sentence for sentence in short_sentence if sentence]  

### 利用篩選詞性後的結果做 roberta

In [10]:
import ast

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')

# 獲取每個文本的嵌入向量
embeddings = []
sentences = []  # 新建一個列表來保存被處理的句子

for sentence in short_sentence:
    
    inputs = tokenizer.encode_plus(sentence, is_split_into_words=True, return_tensors="pt")
    
    # 檢查序列長度是否超過 512
    if len(inputs["input_ids"][0]) > 512:
        print("The sentence is too long. Skipping...")
        continue
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings.append(outputs.last_hidden_state.mean(dim = 1).numpy().flatten())
    sentences.append(sentence)  # 將被處理的句子加到 list

# 新的文本
new_sentence = "房間大"
new_tokens = ['房間', '大'] # 分詞
inputs = tokenizer(new_tokens, is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 50 個文本的索引
top50_indices = np.argsort(similarities[-1][:-1])[:-51:-1]

# 最相似的 50 個文本
for i, index in enumerate(top50_indices):
    print(f"相似的文本 (索引 {index}): {sentences[index]}, 相似度: {similarities[-1][index]}")

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The sentence is too long. Skipping...
相似的文本 (索引 3834): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6677): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 7534): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6762): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6697): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 6527): ['房間', '大'], 相似度: 0.9999998211860657
相似的文本 (索引 4170): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 9246): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 1637): ['房間', '小'], 相似度: 0.9714384078979492
相似的文本 (索引 7674): ['房間', '算', '大'], 相似度: 0.9639410376548767
相似的文本 (索引 8884): ['房間', '差'], 相似度: 0.9296784996986389
相似的文本 (索引 9238): ['房間', '偏', '小'], 相似度: 0.927994966506958
相似的文本 (索引 989): ['房間', '浴室', '大'], 相似度: 0.9196709394454956
相似的文本 (索引 8559): ['房間', '小', '床', '小'], 相似度: 0.9166770577430725
相似的文本 (索引 6916): ['乾淨', '房間', '大'], 相似度: 0.9122157096862793
相似的文本 (索引 7094): ['房間', '大', '寬敞'], 相似度: 0.9050126075744629
相似的文本 (索引 5854): ['床', '房間', '大小'], 相似度: 0.8997808694839478
相似

## Doc2Vec 模型

In [2]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 開始測量(時間)
startime = datetime.datetime.now()

df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered_doc2vec = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

df_filtered_doc2vec = df_filtered_doc2vec[0:100000]

df_filtered_doc2vec = df_filtered_doc2vec.reset_index(drop=True)

# 分詞後評論
sentences_tokenized = [eval(d) for d in df_filtered_doc2vec.loc[:, "綜合評論_cut"].values]

# 準備訓練數據(轉成模型會吃的樣子)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences_tokenized)]

# 訓練 Doc2Vec 模型
model = Doc2Vec(documents, vector_size=768, min_count=2, epochs=1000) # min_count:單詞最少須出現次數

# 使用模型將文本轉換為向量
embeddings = [model.infer_vector(doc) for doc in sentences_tokenized]
print(len(embeddings))

# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)

100000
執行時間： 2:39:24.482711


In [10]:
# 對新的句子執行相同的操作
new_sentence = "早餐好吃"
new_embedding = model.infer_vector(new_sentence.split())

# 將嵌入向量列表轉為 numpy array，以便後續使用 cosine_similarity 函数
embeddings_np = np.array(embeddings)

# 計算新的句子和已有句子的相似度
similarities = cosine_similarity(embeddings_np, new_embedding.reshape(1, -1))

# 排列相似度
top_indices = np.argsort(similarities, axis=0)[::-1]

# 印出最相似的 n 個句子
for i, index in enumerate(top_indices[:10]):
    print(f"第 {i+1} 相似的文本 (索引 {index}): {sentences_tokenized[index[0]]}, 相似度: {similarities[index[0]][0]}")

第 1 相似的文本 (索引 [20418]): ['支付', '路邊', '停車費', '充足', '停車', '位', '自行', '開車', '房客', '地點', '不錯', '晚餐', '散步', '至愛河', '入住', '家庭', '房', '玩具', '出發', '前有', '提醒', '消毒', '房內', '玩具', '積', '木盒', '開啟', '充滿', '未乾', '酒精', '自行', '乾布', '擦拭', '房務人員', '開啟', '盒子', '噴上', '酒精', '關上', '幼兒', '澡盆', '水瓢', '小椅子', '全是', '發黴', '狀態', '清潔', '加強', '早餐', '種類', '選擇', '少', '15', '用餐', '菜盤', '已空', '不補', '空盤', '空碗', '不補'], 相似度: 0.13786755502223969
第 2 相似的文本 (索引 [35488]): ['房內', '裝置', '老舊', '不好', '停車', '早餐', '選擇', '較少'], 相似度: 0.12939956784248352
第 3 相似的文本 (索引 [49408]): ['天悅剛', '營運', '每次', '入住', '很棒', '入住', '走道', '房內', '地毯', '久', '老舊', '裝置', '維護', '感謝', '房務人員', '用心', '整理', '房間', '謝謝', '早餐', '接待', '人員'], 相似度: 0.1287374198436737
第 4 相似的文本 (索引 [40442]): ['地點', '佳', '早餐', '豐盛', '用餐', '環境', '整潔', '餐廳', '入內時', '接待', '人員', '提醒', '取餐', '配戴', '口罩', '房內', '衛浴', '乾淨', '整潔', '裝置', '新穎', '值得', '推薦', '電視', '遙控器', '連線', '不佳', '隔音', '稍差', '房間', '內會', '聽到', '隔壁', '電視', '聲', '浴室', '水時', '聲音'], 相似度: 0.12402409315109253
第 5 相似的文本 (索引 [62702]): ['

## TF-IDF

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from ast import literal_eval

df_tf_idf = pd.read_csv('0_10000.csv', header=0)

sentences_tokenized = [eval(d) for d in df_tf_idf["綜合評論_ws"].values]
pos_tags = [eval(d) for d in df_tf_idf["綜合評論_pos"].values]

# 嘗試只保留有意義的詞性
stop_pos = set(['Nep', 'Nh', 'Nb'])  # 這 3 種詞性不保留

short_with_pos = []  # 放過濾後的詞性與句子
short_sentence = []  # 放過濾後的句子

for sentence, sentence_pos_tags in zip(sentences_tokenized, pos_tags):
    short_with_pos_sentence = []  # 儲存這個句子過濾後的詞（帶詞性）
    short_sentence_sentence = []  # 儲存這個句子過濾後的詞（不帶詞性）
    
    for word_ws, word_pos in zip(sentence, sentence_pos_tags):
        
        # 只留名詞和動詞
        is_N_or_V = word_pos.startswith("V") or word_pos.startswith("N")

        # 去掉名詞裡的某些詞性
        is_not_stop_pos = word_pos not in stop_pos

        if is_N_or_V and is_not_stop_pos:
            short_with_pos_sentence.append(f"{word_ws}({word_pos})")
            short_sentence_sentence.append(f"{word_ws}")

    short_with_pos.append(short_with_pos_sentence)
    short_sentence.append(short_sentence_sentence)

# 將 '綜合評論_ws' 欄位更新為 'short_sentence' 列表
df_tf_idf['綜合評論_ws'] = [' '.join(s) for s in short_sentence]

# 將綜合評論過濾後為空的資料去除
non_empty_mask = df_tf_idf['綜合評論_ws'].str.strip() != ''

# 使用此遮罩來過濾 DataFrame
df_tf_idf = df_tf_idf[non_empty_mask]

# 使用 TF-IDF 將所有評論文本轉換為數值向量
vectorizer = TfidfVectorizer()
vectorizer.fit(df_tf_idf['綜合評論_ws'])

# 假設新用戶輸入的需求
new_user_input = "房間 大"

# 將新用戶輸入轉換為相同的數值向量
new_vector = vectorizer.transform([new_user_input])

# 獲取所有的飯店名稱
hotel_names = df['飯店名稱'].unique()

# 初始化一個空的字典來儲存每個飯店的所有評論相似度和評論文本
hotel_reviews_similarities = {}

# 對每個飯店進行處理
for hotel_name in hotel_names:
    # 只選取該飯店的評論
    hotel_reviews = df_tf_idf[df_tf_idf['飯店名稱'] == hotel_name]['綜合評論_ws']
    
    # 使用擬合好的TF-IDF將飯店的評論轉換為數值向量
    X = vectorizer.transform(hotel_reviews)

    # 計算新用戶輸入與所有評論的餘弦相似度
    cos_similarities = cosine_similarity(new_vector, X).flatten()
    
    # 將這個飯店的所有評論相似度和評論文本儲存到字典中
    hotel_reviews_similarities[hotel_name] = list(zip(hotel_reviews, cos_similarities))

# 初始化一個空的字典來儲存每個飯店與新用戶需求的平均相似度
hotel_similarities = {hotel: np.mean([sim for _, sim in reviews]) for hotel, reviews in hotel_reviews_similarities.items()}

# 對飯店的平均相似度進行排序，並只取前10個
top10_hotels = sorted(hotel_similarities.items(), key=lambda x: x[1], reverse=True)[:10]

# 輸出與新用戶需求最相似的前 10 個飯店
for hotel_name, avg_sim in top10_hotels:
    print(f"飯店名稱：{hotel_name}")
    print(f"平均相似度:{avg_sim}")
    
    # 對這家飯店的所有評論相似度進行排序，並只取前 10 個
    top10_reviews = sorted(hotel_reviews_similarities[hotel_name], key=lambda x: x[1], reverse=True)[:10]

    # 輸出與新用戶需求最相似的前10條評論
    print("前10條相似的評論:")
    for review_text, sim in top10_reviews:
        print(f"評論：{review_text}，相似度：{sim}")
    
    print("\n")

飯店名稱：陽光滿屋民宿
平均相似度:0.11570316506156822
前10條相似的評論:
評論：舒適 房間，相似度：0.5655955612619636
評論：乾淨 舒適 房間，相似度：0.4529094419157692
評論：推薦 房間 乾淨 不錯 房間 內 無 冰箱 有 共用，相似度：0.3734171246078148
評論：房間 乾淨 看 出 屋主 用心，相似度：0.22074116484931985
評論：整體 棒 地點 不錯 大 片 窗戶 外面 有 連結感 地板 乾淨 熱水 燙 水壓 足 沒 暖氣 訂 個 房間 牆壁 上 有 幅 畫 房間 陽光感 風格 不符 感覺 看 久 頭 暈 正面 床 難 看到，相似度：0.1696819678971862
評論：陽光 地方 房間 不錯 熱水 熱 晚上 安靜 有 小 螞蟻，相似度：0.15946363273550007
評論：民宿 地點 活水湖 近 車程 五 分鐘 無 市區 鬧區 噪音 老闆 熱心 解答 任何 問題 房間 整潔 乾淨 希望 有 含 早餐 退房 時間 早點 房間 冷氣 涼 需要 設定到 低 溫度，相似度：0.15127160639049955
評論：海岸 公園 近 民宿 房間 內 看見 海 民宿 地點 不錯 安靜 停車 方便 進門處 鞋子 沒 地方 擺 顯 亂 房間 垃圾桶 裏 遺留 上 個 房客 泡麵碗 洗手台 螞蟻 一 大 堆 腳 撞 狂 床框，相似度：0.14818309750777228
評論：房間 乾淨 水壓 大 熱水 快 來 洗澡 舒服 市區 開車 分鐘 內 到達 路 邊 停車 方便 消毒水 味道 重，相似度：0.11527584378112371
評論：早上 五點 多 有 公雞 叫 起床 六點 多 有 戰機 飛過 房間 廁所 乾淨，相似度：0.09638061657609528


飯店名稱：家有囍宿
平均相似度:0.09766333643934164
前10條相似的評論:
評論：房間 大 間 衛浴 設備 讚 有 按摩 浴缸 值 高 下 次 來 時 有 房間 住，相似度：0.3924007059710153
評論：早餐 直接 送 餐券 方便 安靜 舒適 房間 大 枕頭 扁 床 軟 中間 陷下去 睡 右邊 一點 建議 翻面，相似度：0.09591597622569287
評論：老闆

##### 將同一間飯店評論合併後，使用 roberta 模型(有誤  還須測試)

In [None]:
import re
import pandas as pd
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from collections import defaultdict

def sentence_based_sliding_window(text, window_size):
    sentences = re.split('(?<=[。])', text)
    current_group = []
    current_length = 0
    grouped_sentences = []

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if sentence_length > window_size:
            # If the length of a single sentence exceeds the window size, skip that data
            continue
        elif current_length + sentence_length > window_size:
            grouped_sentences.append("".join(current_group))
            current_group = [sentence]
            current_length = sentence_length
        else:
            current_group.append(sentence)
            current_length += sentence_length

    # Add the remaining sentences to grouped_sentences
    if current_group:
        grouped_sentences.append("".join(current_group))

    return grouped_sentences

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
model = BertModel.from_pretrained('hfl/chinese-roberta-wwm-ext')


df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# Merge the comprehensive comments of the same hotel using a period
df_grouped = df[0:10000].groupby('飯店名稱')['綜合評論'].apply(lambda x: '。'.join(x)).reset_index()

# Text
sentences = list(df_grouped.loc[:, '綜合評論'])

# Get the embeddings for each text
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    # If the sentence is too long, split it into multiple fragments
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# Compute the average embedding for each hotel
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)

# 新的文本
new_sentence = "房間大"
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()


# Calculate similarity with the new text
similarities = cosine_similarity(embeddings + [new_embedding])

# Get the indices of the top 10 most similar texts
top10_indices = np.argsort(similarities[-1][:-1])[:-10:-1]

# Print the top 10 most similar texts
for index in top10_indices:
    print(f"Similar hotel: {df_grouped.loc[hotel_indices[index], '飯店名稱']}, similarity: {similarities[-1][index]}")


## Longformer 模型

#### 將同一間飯店評論合併後，使用可處理長句子的 longformer 模型

In [15]:
def sentence_based_sliding_window(text, window_size):
    sentences = re.split('(?<=[。])', text)
    current_group = []
    current_length = 0
    grouped_sentences = []

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if sentence_length > window_size:
            # 如果單一句子的長度超過大小(4096)，跳過該筆數據
            continue
        elif current_length + sentence_length > window_size:
            grouped_sentences.append("".join(current_group))
            current_group = [sentence]
            current_length = sentence_length
        else:
            current_group.append(sentence)
            current_length += sentence_length

    # 把剩下的句子加入 grouped_sentences
    if current_group:
        grouped_sentences.append("".join(current_group))
    
    return grouped_sentences

In [16]:
from transformers import LongformerModel, LongformerTokenizer
from collections import defaultdict

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 將相同飯店的綜合評論用句號合併
df_grouped = df[0:10000].groupby('飯店名稱')['綜合評論'].apply(lambda x: '。'.join(x)).reset_index()

# 文本
sentences = list(df_grouped.loc[:, '綜合評論'])

# 獲取每個文本的嵌入向量
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    # 如果句子太長，就將其分割成多個片段
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# 計算每一間飯店的平均向量
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)

# 新的文本
new_sentence = "房間大"
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=False, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 計算與新文本的相似性
similarities = cosine_similarity(embeddings + [new_embedding])

# 獲取最相似的 10 個文本的索引
top10_indices = np.argsort(similarities[-1][:-1])[:-10:-1]

# 印出最相似的 10 個文本
for index in top10_indices:
    print(f"相似的飯店: {df_grouped.loc[hotel_indices[index], '飯店名稱']}, 相似度: {similarities[-1][index]}")

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


相似的飯店: 大寶的民宿 Tabohouse, 相似度: 0.9586089849472046
相似的飯店: 翠園歐風庭園民宿, 相似度: 0.9230974912643433
相似的飯店: 花爵墾丁, 相似度: 0.9208060503005981
相似的飯店: 白鷺灣 民宿(安平古堡), 相似度: 0.9181367754936218
相似的飯店: 希望恆春休閒會館, 相似度: 0.9181272983551025
相似的飯店: 卡爾登飯店 the Carlton, 相似度: 0.9166074991226196
相似的飯店: 陽光滿屋民宿, 相似度: 0.9149898886680603
相似的飯店: 家有囍宿, 相似度: 0.9144690036773682
相似的飯店: 海洋風情民宿, 相似度: 0.9132213592529297


#### 將同一間飯店評論合併後，使用可處理長句子的 longformer，並排除評論數不足30的飯店

In [8]:
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from collections import defaultdict

# 篩選評論數大於等於 30 的飯店
df_filtered = df[0:10000].groupby('飯店名稱').filter(lambda x: len(x) >= 30);df_filtered

# 將相同飯店的綜合評論用句號合併，並保留其他重要欄位
df_grouped = df_filtered.groupby('飯店名稱').agg({
    '綜合評論': lambda x: '。'.join(x),
    '縣市': lambda x: x.iloc[0],
    '鄉鎮': lambda x: x.iloc[0],
    '整體評分': lambda x: x.iloc[0],
    '單項評分_整潔度': lambda x: x.iloc[0],
    '單項評分_舒適程度': lambda x: x.iloc[0],
    '單項評分_住宿地點': lambda x: x.iloc[0],
    '單項評分_設施': lambda x: x.iloc[0],
    '單項評分_員工素質': lambda x: x.iloc[0],
    '單項評分_性價比': lambda x: x.iloc[0]
}).reset_index();df_grouped

# 文本
sentences = list(df_grouped.loc[:, '綜合評論'])

# 獲取每個文本的嵌入向量
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    
    # 如果句子太長，就分割成多個片段
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# 計算每間飯店的平均向量
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)

In [52]:
# 儲存嵌入向量
np.save('embeddings_longformer_base_4096.npy', embeddings)

# 儲存飯店資訊
df_grouped.loc[hotel_indices, :].to_csv('hotel_info_longformerbase4096.csv')

#### 使用迴圈跑

In [11]:
def sentence_based_sliding_window(text, window_size):
    sentences = re.split('(?<=[。])', text)
    current_group = []
    current_length = 0
    grouped_sentences = []

    for sentence in sentences:
        sentence_length = len(tokenizer.tokenize(sentence))
        if sentence_length > window_size:
            # 如果單一文本太長就跳過
            continue
        elif current_length + sentence_length > window_size:
            grouped_sentences.append("".join(current_group))
            current_group = [sentence]
            current_length = sentence_length
        else:
            current_group.append(sentence)
            current_length += sentence_length

    # 把剩下的句子加入 grouped_sentences
    if current_group:
        grouped_sentences.append("".join(current_group))

    return grouped_sentences

In [None]:
import torch
import numpy as np
import pandas as pd
from collections import defaultdict
from transformers import LongformerModel, LongformerTokenizer
import re
from sklearn.metrics.pairwise import cosine_similarity

# 如果有 GPU 就使用，否則用 CPU 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model = model.to(device)

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 讀取之前的檔案
try:
    embeddings = np.load('embeddings_longformer_base_4096.npy').tolist()
    df_grouped = pd.read_csv('hotel_info_longformerbase4096.csv')
except FileNotFoundError:
    embeddings = {}
    df_grouped = pd.DataFrame()

hotel_names = df['飯店名稱'].unique()
n_hotels = len(hotel_names)

# 分批執行(每50間飯店儲存一次)
for i in range(0, n_hotels, 50):
    print(f'Processing hotels {i} to {min(i + 50, n_hotels)}')
    batch_hotel_names = hotel_names[i:i + 50]
    df_filtered = df[df['飯店名稱'].isin(batch_hotel_names)]
    df_grouped_batch = df_filtered.groupby('飯店名稱').agg({
        '正評': lambda x: '。'.join(str(v) for v in x if not pd.isna(v)),
        '縣市': lambda x: x.iloc[0],
        '鄉鎮': lambda x: x.iloc[0],
        '整體評分': lambda x: x.iloc[0],
        '單項評分_整潔度': lambda x: x.iloc[0],
        '單項評分_舒適程度': lambda x: x.iloc[0],
        '單項評分_住宿地點': lambda x: x.iloc[0],
        '單項評分_設施': lambda x: x.iloc[0],
        '單項評分_員工素質': lambda x: x.iloc[0],
        '單項評分_性價比': lambda x: x.iloc[0]
    }).reset_index()
    
    sentences = list(df_grouped_batch.loc[:, '正評'])
    hotel_embeddings = defaultdict(list)

    for hotel_name, sentence in zip(batch_hotel_names, sentences):
        
        # 如果文本過長就拆分
        if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
            sentence_windows = sentence_based_sliding_window(sentence, 4000)
            for sentence_window in sentence_windows:
                if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                    continue
                inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True).to(device)
                with torch.no_grad():
                    outputs = model(**inputs)
                hotel_embeddings[hotel_name].append(outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten())
        else:
            inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[hotel_name].append(outputs.last_hidden_state.mean(dim=1).cpu().numpy().flatten())
            
    # 計算平均向量
    for hotel_name, embeds in hotel_embeddings.items():
        if len(embeds) > 0:
            embeddings[hotel_name] = np.mean(embeds, axis=0)

    # 合併文件内容
    df_grouped = pd.concat([df_grouped, df_grouped_batch])
    
    # 儲存
    np.savez('embeddings_longformer_base_4096.npy', **embeddings)
    df_grouped.to_csv('hotel_info_longformerbase4096.csv', index=False)


#### 進行二次篩選(根據某些特定條件)

In [14]:
def get_similar_hotels(condition, embeddings, new_embedding, hotel_data, n):
    
    # 根據條件篩選出飯店
    filtered_hotels = hotel_data[condition]

    # 取得符合條件的飯店嵌入向量
    filtered_embeddings = embeddings[filtered_hotels.index]

    # 計算與新文本的相似性
    similarities = cosine_similarity(np.concatenate([filtered_embeddings, new_embedding[None, :]]))

    # 獲取最相似的 n 個文本的索引
    topn_indices = np.argsort(similarities[-1][:-1])[:-n-1:-1]

    # 印出最相似的 n 個飯店
    for index in topn_indices:
        print(f"相似的飯店:\n{filtered_hotels.iloc[index][['飯店名稱']]}")
        print(f"縣市:{filtered_hotels.iloc[index][['縣市']]}")
        print(f"鄉鎮:{filtered_hotels.iloc[index][['鄉鎮']]}")
        print(f"整體評分:{filtered_hotels.iloc[index][['整體評分']]}")
        print(f"相似度: {similarities[-1][index]}")

In [None]:
# 讀取 DataFrame
df_hotel_info = pd.read_csv('hotel_info_longformerbase4096.csv')

# 讀取 embeddings
embeddings = np.load('embeddings_longformer_base_4096.npy.npz')


# 新的文本
new_sentence = "房間大"
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
new_embedding = outputs.last_hidden_state.mean(dim=1).numpy().flatten()

# 搜尋'南投縣'的前 5 個相似的飯店
get_similar_hotels(df_hotel_info['縣市'] == '南投縣', embeddings, new_embedding, df_hotel_info, 5)

# 搜尋整體評分大於 8 的前 10 個相似的飯店
get_similar_hotels(hotel_data['整體評分'] > 8, embeddings, new_embedding, hotel_data, 10)

# 搜尋'南投縣'且整體評分大於 8 的前 15 個相似的飯店
get_similar_hotels((hotel_data['縣市'] == '南投縣') & (hotel_data['整體評分'] > 8), embeddings, new_embedding, hotel_data, 15)

##### 花費時間測試

In [13]:
from collections import defaultdict
from transformers import LongformerModel, LongformerTokenizer
import re

# 開始測量(時間)
startime = datetime.datetime.now()

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

df = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df[0:100000].groupby('飯店名稱').filter(lambda x: len(x) >= 30);df_filtered

# 將相同飯店的綜合評論用句號合併，並保留其他重要欄位
df_grouped = df_filtered.groupby('飯店名稱').agg({
    '綜合評論': lambda x: '。'.join(x),
    '縣市': lambda x: x.iloc[0],
    '鄉鎮': lambda x: x.iloc[0],
    '整體評分': lambda x: x.iloc[0],
    '單項評分_整潔度': lambda x: x.iloc[0],
    '單項評分_舒適程度': lambda x: x.iloc[0],
    '單項評分_住宿地點': lambda x: x.iloc[0],
    '單項評分_設施': lambda x: x.iloc[0],
    '單項評分_員工素質': lambda x: x.iloc[0],
    '單項評分_性價比': lambda x: x.iloc[0]
}).reset_index();df_grouped

# 文本
sentences = list(df_grouped.loc[:, '綜合評論'])

# 獲取每個文本的嵌入向量
embeddings = []
hotel_indices = []
hotel_embeddings = defaultdict(list)

for i, sentence in enumerate(sentences):
    
    # 如果句子太長，就分割成多個片段
    if len(tokenizer.tokenize(sentence)) > tokenizer.model_max_length:
        sentence_windows = sentence_based_sliding_window(sentence, 4000)
        for sentence_window in sentence_windows:
            if len(tokenizer.tokenize(sentence_window)) > tokenizer.model_max_length:
                continue
            inputs = tokenizer(sentence_window, return_tensors="pt", truncation=False, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
    else:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=False, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hotel_embeddings[i].append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())

# 計算每間飯店的平均向量
for i, hotel_embedding in hotel_embeddings.items():
    embeddings.append(np.mean(hotel_embedding, axis=0))
    hotel_indices.append(i)
    
# 儲存嵌入向量
np.save('embeddings_longformer_base_4096_test.npy', embeddings)

# 儲存飯店資訊
df_grouped.loc[hotel_indices, :].to_csv('hotel_info_longformerbase4096_test.csv')

# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


執行時間： 3:49:31.820817


## BERT 模型

In [11]:
import numpy as np
import pandas as pd
import os
import datetime
from transformers import BertModel, BertTokenizer
import torch
import datetime

# 開始測量(時間)
startime = datetime.datetime.now()

df_ = pd.read_csv('./booking_comments_分詞update.csv', header=0)

# 篩選評論數大於等於 30 的飯店
df_filtered = df_.groupby('飯店名稱').filter(lambda x: len(x) >= 30)

df_filtered = df_filtered[0:100000]

df_filtered = df_filtered.reset_index(drop=True)

# 初始化 BERT 模型和分詞器
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# 定義每個批次的大小
batch_size = 5000

# 計算需要進行的批次數
n_batches = (len(df_filtered) // batch_size) + 1

# 開始進行批次處理
for batch_idx in range(n_batches):
    print(f"Processing batch {batch_idx + 1} of {n_batches}...")
    
    start_idx = batch_idx * batch_size
    end_idx = (batch_idx + 1) * batch_size

    # 評論文本
    sentences = list(df_filtered.loc[start_idx:end_idx-1, '綜合評論'])

    # 如果 sentences 是空的，則跳過該批次
    if not sentences:
        continue

    # 初始化一個空的 list 用於儲存飯店資訊
    hotel_info_list = []

    # 獲取每個文本的嵌入向量，並儲存對應的飯店資訊
    embeddings = []
    for idx, sentence in enumerate(sentences):
        if isinstance(sentence, str):
            tokens = tokenizer.tokenize(sentence)  # 分詞
            inputs = tokenizer(sentence, return_tensors="pt")
        else:
            print(f"Sentence at index {idx} is not a string. Skipping...")
            continue

        # 檢查序列長度是否超過 512
        if len(inputs["input_ids"][0]) > 512:
            print("The sentence is too long. Skipping...")
            continue

        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
        hotel_info_list.append(df_filtered.iloc[start_idx + idx].to_dict())

    # 讀取原有的飯店資訊和 embeddings
    if os.path.exists('hotel_info_bert_綜合評論.csv') and os.path.exists('embeddings_bert_綜合評論.npy'):
        df_hotel_info_old = pd.read_csv('hotel_info_bert_綜合評論.csv')
        embeddings_old = np.load('embeddings_bert_綜合評論.npy')

    else:
        df_hotel_info_old = pd.DataFrame()
        embeddings_old = np.array([]).reshape(0,768)

    # 儲存飯店資訊與 embeddings
    df_hotel_info_new = pd.DataFrame(hotel_info_list)
    df_hotel_info = pd.concat([df_hotel_info_old, df_hotel_info_new], ignore_index=True)
    df_hotel_info.to_csv('hotel_info_bert_綜合評論.csv', index=False)

    embeddings_new = np.array(embeddings)
    embeddings = np.vstack([embeddings_old, embeddings_new])
    np.save('embeddings_bert_綜合評論.npy', embeddings)
    
    print(f"第{batch_idx + 1}批次儲存完畢")
    
# 結束測量
endtime = datetime.datetime.now()

# 輸出結果
print("執行時間：", endtime - startime)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing batch 1 of 21...


Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第1批次儲存完畢
Processing batch 2 of 21...
The sentence is too long. Skipping...
第2批次儲存完畢
Processing batch 3 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第3批次儲存完畢
Processing batch 4 of 21...
The sentence is too long. Skipping...
第4批次儲存完畢
Processing batch 5 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第5批次儲存完畢
Processing batch 6 of 21...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
The sentence is too long. Skipping...
第6批次儲存完畢
Processing batch 7 of 21...
The sentence is too long. Skipping...
The sentence is to

#### GPT

In [None]:
import openai
import time

openai.api_key = 'your-api-key'

questions = [
    "What is the capital of France?",
    "Who wrote the book '1984'?",
    "What is the distance from Earth to the Moon?"
]

responses = []

for question in questions:
    response = openai.Completion.create(
        engine="text-davinci-004",
        prompt=question,
        max_tokens=100
    )
    responses.append(response.choices[0].text.strip())
    time.sleep(1)

for question, answer in zip(questions, responses):
    print(f"Question: {question}")
    print(f"Answer: {answer}")
