## 基礎範例

In [5]:
from openai import OpenAI
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os

# Load environment variables (python-dotenv's load_dotenv).
load_dotenv()

# Read the OpenAI API key from the environment and build the shared client.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is made.
    ``model`` is the name of the embedding model to use. The request carries a
    single input, and the embedding of the first response item is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# Sample DataFrame holding the review texts to embed.
df = pd.DataFrame({
    'combined': [
        'Good Quality Dog Food. I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.',
        'Not as Advertised. Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. The vendor refused to refund my purchase or send me the correct product.',
        'Delicious. These are the best cookies I have ever eaten! I highly recommend them to anyone who loves chocolate.',
        'Terrible customer service. The product was okay, but the customer service was terrible. I will not be buying from this company again.',
        'Just okay. The product was neither great nor bad. It was just okay. I might buy it again if there are no better options.',
    ]
})

# Embed every review. Keyword arguments given to apply() are forwarded to
# get_embedding, so each text is embedded with the named model.
df['embedding'] = df['combined'].apply(get_embedding, model='text-embedding-3-small')

# Standard-library literal parser, used below to read the stored vectors back.
import ast

# Save the texts and their embeddings to a CSV file.
df.to_csv('embedded_reviews.csv', index=False)

# Reload the saved data. Each embedding comes back from the CSV as the text of
# a Python list, so it has to be parsed into numbers again. ast.literal_eval
# accepts only Python literals, whereas eval would execute any expression that
# happens to be in the file.
df = pd.read_csv('embedded_reviews.csv')
df['embedding'] = df['embedding'].apply(ast.literal_eval).apply(np.array)

# Show the first rows.
print(df.head())


                                            combined  \
0  Good Quality Dog Food. I have bought several o...   
1  Not as Advertised. Product arrived labeled as ...   
2  Delicious. These are the best cookies I have e...   
3  Terrible customer service. The product was oka...   
4  Just okay. The product was neither great nor b...   

                                           embedding  
0  [0.015021142549812794, -0.007734853308647871, ...  
1  [-0.014828150160610676, -0.005706844385713339,...  
2  [0.03156798705458641, -0.06490229815244675, -0...  
3  [-0.03399420902132988, -0.01097179763019085, -...  
4  [-0.04210306704044342, 0.019184265285730362, -...  


## 搜尋 Search

根據查詢字串找到最相關的文本，通過計算嵌入向量之間的餘弦相似度來實現。

In [6]:
import numpy as np

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is made.
    ``model`` is the name of the embedding model to use. The request carries a
    single input, and the embedding of the first response item is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

def find_similar_reviews(query, df, model="text-embedding-3-small"):
    """Rank the rows of ``df`` by cosine similarity to ``query``.

    Args:
        query: Text to search for; it is embedded with ``get_embedding``.
        df: DataFrame with an ``embedding`` column of numeric vectors.
        model: Embedding model name passed on to ``get_embedding``.

    Returns:
        A new DataFrame containing the rows of ``df`` plus a ``similarity``
        column, sorted from most to least similar. ``df`` itself is not
        modified.
    """
    query_embedding = np.asarray(get_embedding(query, model=model))
    # The query norm is identical for every row, so compute it only once.
    query_norm = np.linalg.norm(query_embedding)
    similarity = df['embedding'].apply(
        lambda x: np.dot(query_embedding, x) / (query_norm * np.linalg.norm(x))
    )
    # assign() returns a new frame rather than adding a column to the caller's.
    ranked = df.assign(similarity=similarity)
    return ranked.sort_values('similarity', ascending=False)

# Example search: rank the reviews against a short query string.
query = "best cookies"
similar_reviews = find_similar_reviews(query=query, df=df)
print(similar_reviews.loc[:, ['combined', 'similarity']])


                                            combined  similarity
2  Delicious. These are the best cookies I have e...    0.545966
0  Good Quality Dog Food. I have bought several o...    0.195261
1  Not as Advertised. Product arrived labeled as ...    0.129095
4  Just okay. The product was neither great nor b...    0.079523
3  Terrible customer service. The product was oka...    0.065597


## 聚類 Clustering

將相似的文本分組，可以使用 `K-means` 或 `DBSCAN` 等算法來實現。

`pip install scikit-learn`

In [8]:
from sklearn.cluster import KMeans

# Gather the vectors stored in the 'embedding' column into one array.
embeddings = np.array(list(df['embedding']))

# Fit K-means with three clusters (random_state is fixed at 0) and record the
# cluster label assigned to each row.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(embeddings)
df['cluster'] = kmeans.labels_

print(df[['combined', 'cluster']])

                                            combined  cluster
0  Good Quality Dog Food. I have bought several o...        0
1  Not as Advertised. Product arrived labeled as ...        2
2  Delicious. These are the best cookies I have e...        0
3  Terrible customer service. The product was oka...        1
4  Just okay. The product was neither great nor b...        1


## 推薦 Recommendations

根據用戶的興趣推薦相似的內容，可以使用餘弦相似度來實現。

In [9]:
def recommend_similar_items(item_index, df):
    """Rank the rows of ``df`` by cosine similarity to one reference row.

    Args:
        item_index: Position (as used by ``iloc``) of the reference row.
        df: DataFrame with an ``embedding`` column of numeric vectors.

    Returns:
        A new DataFrame containing the rows of ``df`` plus a ``similarity``
        column, sorted from most to least similar. ``df`` itself is not
        modified.
    """
    item_embedding = np.asarray(df.iloc[item_index]['embedding'])
    # The reference norm is identical for every row, so compute it only once.
    item_norm = np.linalg.norm(item_embedding)
    similarity = df['embedding'].apply(
        lambda x: np.dot(item_embedding, x) / (item_norm * np.linalg.norm(x))
    )
    # assign() returns a new frame rather than adding a column to the caller's.
    ranked = df.assign(similarity=similarity)
    return ranked.sort_values('similarity', ascending=False)

# Recommend the items most similar to the first review (row position 0).
recommended_items = recommend_similar_items(item_index=0, df=df)
print(recommended_items.loc[:, ['combined', 'similarity']])


                                            combined  similarity
0  Good Quality Dog Food. I have bought several o...    1.000000
4  Just okay. The product was neither great nor b...    0.240480
2  Delicious. These are the best cookies I have e...    0.199340
1  Not as Advertised. Product arrived labeled as ...    0.164630
3  Terrible customer service. The product was oka...    0.123355


## 異常檢測 Anomaly Detection

識別與大多數文本不同的異常點，可以使用 Isolation Forest 或 One-Class SVM 來實現。

In [10]:
from sklearn.ensemble import IsolationForest

# Gather the embeddings into one array and fit an Isolation Forest on them.
embeddings = np.array(list(df['embedding']))
clf = IsolationForest(random_state=0)
clf.fit(embeddings)

# Store the per-row decision score and the per-row predicted label.
df['anomaly_score'] = clf.decision_function(embeddings)
df['anomaly'] = clf.predict(embeddings)

print(df[['combined', 'anomaly', 'anomaly_score']])


                                            combined  anomaly  anomaly_score
0  Good Quality Dog Food. I have bought several o...        1       0.034910
1  Not as Advertised. Product arrived labeled as ...        1       0.040418
2  Delicious. These are the best cookies I have e...        1       0.017985
3  Terrible customer service. The product was oka...        1       0.036293
4  Just okay. The product was neither great nor b...        1       0.055233


## 多樣性測量 Diversity Measurement

分析文本之間的相似性分佈，可以通過計算嵌入向量之間的距離分佈來實現。

In [11]:
from scipy.spatial.distance import pdist, squareform

# Pairwise cosine distances between all embeddings: pdist yields the condensed
# distances and squareform expands them into a square matrix.
embeddings = np.array(list(df['embedding']))
distance_matrix = squareform(pdist(embeddings, 'cosine'))

print("Distance Matrix:\n", distance_matrix)


Distance Matrix:
 [[0.         0.8353697  0.80065967 0.87664512 0.75951974]
 [0.8353697  0.         0.88520441 0.62127699 0.73445364]
 [0.80065967 0.88520441 0.         0.88961617 0.88532285]
 [0.87664512 0.62127699 0.88961617 0.         0.4774982 ]
 [0.75951974 0.73445364 0.88532285 0.4774982  0.        ]]


## 分類 Classification

根據最相似的標籤對文本進行分類，可以使用邏輯迴歸、支持向量機或神經網絡等分類器來實現。

為 DataFrame 添加一個 label 列，可以用來標記每個評論的情感，比如 `正面（1）` 或 `負面（0）`。

In [24]:
from openai import OpenAI
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load environment variables (python-dotenv's load_dotenv).
load_dotenv()

# Read the OpenAI API key from the environment and build the shared client.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is made.
    ``model`` is the name of the embedding model to use. The request carries a
    single input, and the embedding of the first response item is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# A larger, labelled set of training reviews.
df = pd.DataFrame({
    'combined': [
        '優質狗糧。 我買了幾款 Vitality 罐裝狗糧產品，發現它們的品質都很好。 該產品看起來更像是燉菜，而不是加工過的肉，而且味道更好。 我的拉布拉多很挑剔，她比大多數人都更欣賞這個產品。',
        '不像廣告中那樣。 產品到達時標記為巨型鹹花生……花生實際上是小尺寸的未加鹽的。 供應商拒絕退還我購買的商品或向我發送正確的產品。',
        '可口的。 這是我吃過的最好的餅乾！ 我強烈推薦給所有喜歡巧克力的人。',
        '糟糕的客戶服務。 產品還可以，但客戶服務很糟糕。 我不會再從這家公司購買產品。',
        '就這樣吧。 該產品既不好也不壞。 沒關係。 如果沒有更好的選擇，我可能會再次購買。',
        '非常滿意的購物體驗。 產品質量很高，物流也很快。',
        '糟糕的產品質量。 產品不到一周就壞了，我很失望。',
        '超級好吃的零食。 每次吃都讓我很開心，真是太棒了。',
        '客戶服務非常棒。 他們非常熱心，解決了我的所有問題。',
        '我很失望。 產品與描述不符，我不會再購買了。',
        '這是我吃過的最好的一頓飯。 我真的很喜歡。',
        '這是我見過的最糟糕的產品。 我再也不會買了。',
        '味道很好。 我會再次購買。',
        '質量很差。 我很不滿意。'
    ],
    # Sentiment labels: 1 marks a positive review, 0 a negative one.
    'label': [1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0]
})

# Embed every review. Keyword arguments given to apply() are forwarded to
# get_embedding, so each text is embedded with the named model.
df['embedding'] = df['combined'].apply(get_embedding, model='text-embedding-3-small')

# Build the feature array and label series, then make a stratified split that
# holds out 20% of the rows for testing.
X = np.array(list(df['embedding']))
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Classify with logistic regression.
clf = LogisticRegression(random_state=0)

# Score the model with 3-fold cross-validation on the training split.
# StratifiedKFold is used so the class distribution stays even across folds.
cv = StratifiedKFold(n_splits=3)
cv_scores = cross_val_score(clf, X_train, y_train, cv=cv)
print("交叉驗證分數: ", cv_scores)

# Fit on the training split and predict the held-out test rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Show the classification report.
print(classification_report(y_test, y_pred))

# Helper that predicts the class of a new piece of text.
def classify_new_text(text):
    """Return the label that the fitted classifier predicts for ``text``.

    The text is embedded with ``get_embedding`` and handed to the module-level
    ``clf`` as a one-row batch; the single prediction is returned.
    """
    features = [get_embedding(text, model='text-embedding-3-small')]
    return clf.predict(features)[0]

# New example texts. The commented-out assignments are alternative inputs.
# new_text_1 = "這款產品真的很棒，我會再次購買。"
new_text_1 = "我覺得這款產品還不錯耶。"
# new_text_0 = "這款產品真的不太好，我不會再買了。"
new_text_0 = "這款產品有點讓人不太滿意。"

# Classify both texts first, then report each predicted label.
predicted_label_1 = classify_new_text(text=new_text_1)
predicted_label_0 = classify_new_text(text=new_text_0)
print(f"新文本的預測標籤為: {predicted_label_1}")
print(f"新文本的預測標籤為: {predicted_label_0}")


交叉驗證分數:  [0.5 0.5 1. ]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

新文本的預測標籤為: 1
新文本的預測標籤為: 0
