# In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import seaborn as sns
from collections import defaultdict
import re
from matplotlib.font_manager import FontProperties
import matplotlib as mpl

# Configure Matplotlib for Chinese text: candidate sans-serif fonts to try,
# and no Unicode minus sign on the axes.
for rc_key, rc_value in {
    'font.sans-serif': ['Arial Unicode MS', 'STHeiti', 'SimHei'],
    'axes.unicode_minus': False,
}.items():
    plt.rcParams[rc_key] = rc_value

# Build a synthetic review dataset, since no real data is available.
# The fixed seed makes the generated rows repeatable.
np.random.seed(42)

# Ten hypothetical sightseeing locations.
# NOTE(review): "太魯閣國峽" looks like a typo of the place name — confirm the
# intended spelling before changing it.
locations = [
    "台北101", "故宮博物院", "九份老街", "日月潭", "阿里山",
    "墾丁國家公園", "太魯閣國峽", "高雄愛河", "淡水老街", "西門町"
]

# Five hypothetical visitor nationalities.
nationalities = ["日本", "美國", "中國", "韓國", "歐洲"]

# Sampling probabilities over `locations` (same index order) per nationality.
# Anything without an entry uses the fallback profile (the "歐洲" case).
fallback_prefs = [0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05]
prefs_by_nationality = {
    "日本": [0.3, 0.3, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.0, 0.0],
    "美國": [0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.0],
    "中國": [0.1, 0.3, 0.2, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05, 0.0],
    "韓國": [0.2, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.0],
}

# Generate one itinerary (2-5 reviews) per simulated user.
n_samples = 1000
data = []

for user_no in range(n_samples):
    nationality = np.random.choice(nationalities)
    prefs = prefs_by_nationality.get(nationality, fallback_prefs)

    # Number of distinct places this user visits: 2 to 5 inclusive.
    n_places = np.random.randint(2, 6)
    user_id = f"user_{user_no}"

    # Draw distinct location indices, then order them ascending to stand in
    # for the visit order.
    user_places = np.sort(
        np.random.choice(len(locations), size=n_places, replace=False, p=prefs)
    )

    for i, place_idx in enumerate(user_places):
        location = locations[place_idx]

        # Favoured (nationality, place) pairs only score 4-5; all other
        # reviews score 1-5.
        favoured = (
            (nationality == "日本" and place_idx < 2)
            or (nationality == "中國" and place_idx == 1)
        )
        score = np.random.randint(4 if favoured else 1, 6)

        # Dates are drawn independently per review (month first, then day),
        # so they do not follow the sorted visit order above.
        review_date = f"2023-{np.random.randint(1, 13):02d}-{np.random.randint(1, 29):02d}"

        data.append({
            "gmap_location": location,
            "location_ID": f"loc_{place_idx}",
            "user_name": f"User {user_no}",
            "user_id": user_id,
            "review_id": f"review_{user_no}_{i}",
            "score": score,
            "date": review_date,
            "comments": f"This is a comment about {location}",
            "language": nationality,  # nationality doubles as the language
            "translated_comments": f"This is a translated comment about {location}",
            "user_page": f"user_page_{user_no}",
            "_merge": "both"
        })

# Assemble the records into a DataFrame.
df = pd.DataFrame(data)

# Preview the first rows and the overall shape.
print(df.head())
print("\n數據形狀:", df.shape)

# Row count per nationality.
print("\n每個國籍的樣本數:")
print(df["language"].value_counts())

# Build the graph data: one directed transition graph per nationality.
# An edge (A, B) with weight w means w date-consecutive visits went from A to B
# among that nationality's users.
nationality_graphs = {}

for nationality in nationalities:
    # Rows belonging to this nationality only.
    nat_df = df[df["language"] == nationality]

    G = nx.DiGraph()

    # Register every location up front so places without transitions still
    # appear as isolated nodes.
    for loc in locations:
        G.add_node(loc)

    # (source, target) -> number of observed transitions.
    edge_weights = defaultdict(int)

    for _, user_data in nat_df.groupby("user_id"):
        # Order this user's visits by date. Dates can tie within one user, so
        # a stable sort is requested explicitly: tied rows keep their original
        # row order instead of depending on the sort algorithm.
        visited_locations = (
            user_data.sort_values("date", kind="stable")["gmap_location"].tolist()
        )

        # Count each pair of consecutive visits as one transition.
        for source, target in zip(visited_locations, visited_locations[1:]):
            edge_weights[(source, target)] += 1

    # Materialise the counts as weighted edges.
    for (source, target), weight in edge_weights.items():
        G.add_edge(source, target, weight=weight)

    nationality_graphs[nationality] = G

# Draw each nationality's itinerary graph in its own panel of a 2x3 grid.
plt.figure(figsize=(20, 15))

for panel, (nationality, G) in enumerate(nationality_graphs.items(), start=1):
    plt.subplot(2, 3, panel)

    # Edge weights in G.edges() order, rescaled so the heaviest edge is drawn
    # with width 3.
    raw_weights = [G[u][v]['weight'] for u, v in G.edges()]
    heaviest = max(raw_weights, default=1)
    edge_widths = [3 * w / heaviest for w in raw_weights]

    # Node size grows with the total weight of edges touching the node
    # (incoming plus outgoing).
    traffic = dict.fromkeys(G.nodes(), 0)
    for u, v in G.edges():
        w = G[u][v]['weight']
        traffic[u] += w
        traffic[v] += w
    node_sizes = [100 + 10 * traffic[node] for node in G.nodes()]

    # Seeded spring layout so the picture is the same on every run.
    pos = nx.spring_layout(G, seed=42)

    # Nodes, then weight-scaled curved edges, then labels.
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.5, edge_color='gray',
                           connectionstyle='arc3,rad=0.1', arrowsize=15)
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

    plt.title(f"{nationality}旅客行程模式", fontsize=16)
    plt.axis('off')

plt.tight_layout()
plt.show()

# GNN with PyTorch Geometric.
# First convert the itinerary graphs into a single PyTorch Geometric `Data`
# object.

# Merged graph across all nationalities, used for GNN training.
combined_graph = nx.DiGraph()

# Every location is a node.
for loc in locations:
    combined_graph.add_node(loc)

# Sum each edge's weight over all nationality graphs.
# The loop variable is named `attrs` (not `data`) so it does not clobber the
# module-level `data` name while iterating.
all_edge_weights = defaultdict(int)
for G in nationality_graphs.values():
    for u, v, attrs in G.edges(data=True):
        all_edge_weights[(u, v)] += attrs['weight']

# Add the merged edges to the combined graph.
for (source, target), weight in all_edge_weights.items():
    combined_graph.add_edge(source, target, weight=weight)

# Node features: [weighted in-degree, weighted out-degree] per node, in
# combined_graph.nodes() order.
node_features = [
    [
        combined_graph.in_degree(node, weight='weight'),
        combined_graph.out_degree(node, weight='weight'),
    ]
    for node in combined_graph.nodes()
]

# Convert to a float tensor.
x = torch.tensor(node_features, dtype=torch.float)

# Edge list as integer node indices, plus the matching edge weights.
node_to_idx = {node: i for i, node in enumerate(combined_graph.nodes())}
edge_index = []
edge_attr = []

for u, v, attrs in combined_graph.edges(data=True):
    edge_index.append([node_to_idx[u], node_to_idx[v]])
    edge_attr.append(attrs['weight'])

# view(-1, 2) pins the (E, 2) shape before transposing, so a graph without
# edges still produces a (2, 0) index tensor rather than a 1-D empty tensor.
edge_index = torch.tensor(edge_index, dtype=torch.long).view(-1, 2).t().contiguous()
edge_attr = torch.tensor(edge_attr, dtype=torch.float).view(-1, 1)

# Bundle everything into the PyTorch Geometric data object.
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# GNN model definition.
class GNN(torch.nn.Module):
    """Three stacked GCNConv layers producing one output vector per node.

    The defaults reproduce the original fixed architecture (2 -> 16 -> 8 -> 2
    with dropout 0.2), so ``GNN()`` behaves as before.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of the first convolution.
        embed_channels: Output size of the second convolution.
        out_channels: Output size of the final convolution.
        dropout: Dropout probability applied after the first convolution
            (active only in training mode).
    """

    def __init__(self, in_channels: int = 2, hidden_channels: int = 16,
                 embed_channels: int = 8, out_channels: int = 2,
                 dropout: float = 0.2):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, embed_channels)
        self.conv3 = GCNConv(embed_channels, out_channels)

    def forward(self, data):
        """Run the three convolutions over ``data.x`` / ``data.edge_index``.

        Only ``x`` and ``edge_index`` are read from ``data``; edge attributes
        are not passed to the convolutions.

        Returns:
            The output of the last convolution (no activation applied).
        """
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = F.relu(self.conv2(x, edge_index))

        return self.conv3(x, edge_index)

# Create the model.
# Seed PyTorch first so weight initialisation and dropout are repeatable,
# in line with the fixed seeds this script already uses for NumPy and the
# graph layouts.
torch.manual_seed(42)
model = GNN()

# Train the model.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Self-supervised regression target: log(1 + node feature), i.e. the log of
# each node's weighted degrees. It does not change between epochs, so it is
# computed once outside the loop.
target = torch.log(data.x + 1)

model.train()
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data)

    loss = F.mse_loss(out, target)

    loss.backward()
    optimizer.step()

    # Report progress every 20 epochs.
    if (epoch + 1) % 20 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# Use the trained model's per-node outputs as node embeddings (eval mode
# disables dropout; no_grad allows the direct NumPy conversion).
model.eval()
with torch.no_grad():
    node_embeddings = model(data).numpy()

# Scatter-plot the node embeddings, one labelled point per location.
plt.figure(figsize=(10, 8))
for i, loc in enumerate(combined_graph.nodes()):
    point = node_embeddings[i]
    px, py = point[0], point[1]
    plt.scatter(px, py, s=100)
    # Nudge the label slightly so it does not sit on top of the marker.
    plt.text(px + 0.01, py + 0.01, loc, fontsize=12)

plt.xlabel('嵌入維度 1')
plt.ylabel('嵌入維度 2')
plt.title('景點嵌入空間可視化', fontsize=16)
plt.grid(True)
plt.show()

# Itinerary patterns per nationality: print the five most frequent
# transitions and draw a transition-count heatmap in a 2x3 grid.
plt.figure(figsize=(15, 10))

# Row/column position of each location in the heatmap matrix.
location_to_idx = {loc: i for i, loc in enumerate(locations)}

for panel, nationality in enumerate(nationalities, start=1):
    plt.subplot(2, 3, panel)

    G = nationality_graphs[nationality]

    # Transitions ranked by weight, heaviest first; ties keep edge order.
    ranked = sorted(
        ((u, v, d['weight']) for u, v, d in G.edges(data=True)),
        key=lambda edge: edge[2],
        reverse=True,
    )

    print(f"\n{nationality}旅客最常見的5個路線:")
    for u, v, w in ranked[:5]:
        print(f"{u} -> {v}: {w}次")

    # Dense source-by-target matrix of transition counts.
    adj_matrix = np.zeros((len(locations), len(locations)))
    for u, v, d in G.edges(data=True):
        adj_matrix[location_to_idx[u], location_to_idx[v]] = d['weight']

    sns.heatmap(adj_matrix, annot=True, fmt=".0f", cmap="YlGnBu",
                xticklabels=locations, yticklabels=locations)

    plt.title(f"{nationality}旅客行程轉換熱力圖", fontsize=14)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

# Average itinerary length per nationality: mean number of review rows per
# user among that nationality's rows.
avg_trip_lengths = {
    nationality: df[df["language"] == nationality].groupby("user_id").size().mean()
    for nationality in nationalities
}

# Bar chart of the averages.
plt.figure(figsize=(10, 6))
plt.bar(avg_trip_lengths.keys(), avg_trip_lengths.values(), color='skyblue')
plt.title('不同國籍旅客的平均行程長度', fontsize=16)
plt.xlabel('國籍')
plt.ylabel('平均訪問景點數量')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Write each value just above its bar.
for bar_pos, mean_len in enumerate(avg_trip_lengths.values()):
    plt.text(bar_pos, mean_len + 0.05, f'{mean_len:.2f}', ha='center')

plt.show()

# Location popularity: number of review rows for every
# (location, nationality) pair.
location_popularity = {
    location: {
        nationality: len(
            df[(df["language"] == nationality) & (df["gmap_location"] == location)]
        )
        for nationality in nationalities
    }
    for location in locations
}

# Heatmap of the counts.
plt.figure(figsize=(12, 8))

# Rows are locations, columns are nationalities.
loc_names = list(location_popularity)
nat_names = nationalities
data_matrix = np.zeros((len(loc_names), len(nat_names)))

for row, loc in enumerate(loc_names):
    data_matrix[row, :] = [location_popularity[loc][nat] for nat in nat_names]

sns.heatmap(data_matrix, annot=True, fmt=".0f", cmap="YlGnBu",
            xticklabels=nat_names, yticklabels=loc_names)

plt.title('不同國籍旅客對各景點的訪問頻率', fontsize=16)
plt.xlabel('國籍')
plt.ylabel('景點')
plt.tight_layout()
plt.show()

# Predict the next location a visitor is likely to go to.
def predict_next_location(current_location, nationality):
    """Return the most frequent next stop after ``current_location``.

    Looks up the nationality's graph in the module-level
    ``nationality_graphs`` and picks the outgoing edge with the largest
    ``'weight'``. When several edges share the largest weight, the one listed
    first by ``out_edges`` wins.

    Args:
        current_location: Node to start from.
        nationality: Key into ``nationality_graphs``.

    Returns:
        The target node of the heaviest outgoing edge, or a message string
        beginning with "無法預測" when the location is not in the graph or has
        no outgoing edges.

    Raises:
        KeyError: If ``nationality`` has no entry in ``nationality_graphs``.
    """
    graph = nationality_graphs[nationality]

    if current_location not in graph.nodes():
        return "無法預測：該地點不在圖中"

    # All (source, target, attrs) edges leaving the current location.
    candidates = list(graph.out_edges(current_location, data=True))
    if not candidates:
        return "無法預測：該地點沒有出邊"

    # max() keeps the first of several equally heavy edges.
    _, best_target, _ = max(candidates, key=lambda edge: edge[2]['weight'])
    return best_target

# Predict the next stop for every (nationality, location) pair.
# Result layout: prediction_results[nationality][location] -> next location,
# or the "無法預測…" message returned by predict_next_location.
prediction_results = {
    nationality: {
        location: predict_next_location(location, nationality)
        for location in locations
    }
    for nationality in nationalities
}

# Print the predictions. The header names the actual method: the predictions
# come from per-nationality transition counts, not from the GNN trained
# earlier in this script.
print("\n基於行程轉移頻率的下一個景點預測:")
for nationality, predictions in prediction_results.items():
    print(f"\n{nationality}旅客:")
    for current_loc, next_loc in predictions.items():
        print(f"  從 {current_loc} 到 {next_loc}")

# Visualise the predictions: one "predicted next stop" graph per nationality
# in a 2x3 grid.
plt.figure(figsize=(20, 15))

for panel, nationality in enumerate(prediction_results, start=1):
    predictions = prediction_results[nationality]
    plt.subplot(2, 3, panel)

    # Fresh directed graph containing every location as a node.
    G = nx.DiGraph()
    for loc in locations:
        G.add_node(loc)

    # One edge per location that has a prediction; the "無法預測…" messages
    # are skipped.
    for current_loc, next_loc in predictions.items():
        unpredictable = isinstance(next_loc, str) and next_loc.startswith("無法預測")
        if not unpredictable:
            G.add_edge(current_loc, next_loc)

    # Seeded layout, then nodes, curved edges and labels.
    pos = nx.spring_layout(G, seed=42)
    nx.draw_networkx_nodes(G, pos, node_size=700, node_color='lightblue', alpha=0.8)
    nx.draw_networkx_edges(G, pos, width=1.5, alpha=0.7, edge_color='gray',
                           connectionstyle='arc3,rad=0.1', arrowsize=15)
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

    plt.title(f"{nationality}旅客預測的下一個景點", fontsize=16)
    plt.axis('off')

plt.tight_layout()
plt.show()

# Output: ModuleNotFoundError: No module named 'torch_geometric'