<a href="https://colab.research.google.com/github/samko5sam/programming-language-class/blob/main/0327_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Chicago sex offenders
path = kagglehub.dataset_download("chicago/chicago-sex-offenders")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/chicago/chicago-sex-offenders?dataset_version_number=44...


100%|██████████| 26.2k/26.2k [00:00<00:00, 9.18MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/chicago/chicago-sex-offenders/versions/44





In [2]:
!ls {path}

sex-offenders.csv  socrata_metadata.json


In [3]:
import pandas as pd

data = pd.read_csv(path+"/sex-offenders.csv")
data

Unnamed: 0,LAST,FIRST,BLOCK,GENDER,RACE,BIRTH DATE,AGE,HEIGHT,WEIGHT,VICTIM MINOR
0,MCGINNIS,DELL,0000X E 100TH PL,MALE,BLACK,07/26/1982,37.0,505,141,Y
1,WHITE,CHARLES,0000X E 100TH ST,MALE,BLACK,05/02/1961,58.0,509,180,Y
2,SIMON,GERA,0000X E 110TH PL,MALE,BLACK,05/21/1952,67.0,504,110,Y
3,WARD,RICHARD,0000X E 110TH PL,MALE,BLACK,07/23/1949,70.0,506,190,Y
4,WORTHON,SEBASTIAN,0000X E 119TH PL,MALE,BLACK,10/22/1982,37.0,600,180,Y
...,...,...,...,...,...,...,...,...,...,...
1075,OLANIRAN,ADELANI,13XXX S EDBROOKE AVE,MALE,BLACK,08/07/1968,51.0,509,172,Y
1076,BORAH,DAVID,13XXX S FORRESTVILLE AVE,MALE,BLACK,09/19/1971,48.0,506,250,N
1077,MYLES,JAMES,13XXX S MICHIGAN AVE,MALE,BLACK,01/06/1965,54.0,601,195,Y
1078,WATSON,JEROME,13XXX S RHODES AVE,MALE,BLACK,09/03/1980,39.0,506,160,Y


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LAST          1080 non-null   object 
 1   FIRST         1080 non-null   object 
 2   BLOCK         1080 non-null   object 
 3   GENDER        1080 non-null   object 
 4   RACE          1080 non-null   object 
 5   BIRTH DATE    1080 non-null   object 
 6   AGE           1080 non-null   float64
 7   HEIGHT        1080 non-null   int64  
 8   WEIGHT        1080 non-null   int64  
 9   VICTIM MINOR  1080 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 84.5+ KB


In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import plotly.express as px

# 1. 選擇用於 PCA 和 K-Means 的數值型欄位
numerical_features = ['AGE', 'HEIGHT', 'WEIGHT']
categorical_features = ['GENDER', 'RACE']

# 2. 建立預處理的 ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# 3. 建立預處理和 PCA 的 Pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=None))
])

# 4. 執行 Pipeline
data_clean = data.dropna()  # 移除缺失值以便簡化處理
principal_components = pipe.fit_transform(data_clean)

# 5. 分析 PCA 結果
pca = pipe.named_steps['pca']
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

# 使用 Plotly Express 繪製累積解釋的變異比例
fig_variance = px.line(x=range(1, len(explained_variance_ratio) + 1),
                       y=cumulative_variance_ratio,
                       title='累積解釋的變異比例',
                       labels={'x': '主成分數量', 'y': '累積解釋的變異比例'})
fig_variance.update_traces(mode='markers+lines')
fig_variance.show()

# 根據上圖選擇要保留的主成分數量
n_components_to_keep = 3  # 這是一個示例值，請根據您的圖表決定
pca_final = PCA(n_components=n_components_to_keep)
principal_components_final = pca_final.fit_transform(principal_components)

pca_df = pd.DataFrame(data=principal_components_final,
                      columns=[f'principal_component_{i + 1}' for i in range(n_components_to_keep)])

# 6. 執行 K-Means 分群
n_clusters = 4  # 這是一個示例值
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(principal_components_final)

cluster_series = pd.Series(clusters, index=data_clean.index, name='cluster')
data_with_clusters = data_clean.merge(cluster_series, left_index=True, right_index=True, how='left')

# 7. 使用 Plotly Express 可視化分群結果
if n_components_to_keep >= 2:
    pca_df_with_cluster = pca_df.merge(cluster_series.rename('cluster'), left_index=True, right_index=True)
    fig_scatter_pca = px.scatter(pca_df_with_cluster, x='principal_component_1', y='principal_component_2',
                                 color='cluster',
                                 title=f'PCA 降維後的 K-Means 分群 (K={n_clusters})',
                                 labels={'principal_component_1': '主成分 1',
                                         'principal_component_2': '主成分 2',
                                         'cluster': '群組'})
    fig_scatter_pca.show()
else:
    print("降維後的主成分數量少於 2，無法繪製二維散佈圖。")

# 8. 使用 Plotly Express 分析每個群組的特徵均值（數值特徵）
cluster_analysis = data_with_clusters.groupby('cluster')[numerical_features].mean().reset_index()
cluster_melted = pd.melt(cluster_analysis, id_vars=['cluster'], var_name='feature', value_name='mean_value')

fig_bar_cluster_means = px.bar(cluster_melted, x='cluster', y='mean_value',
                               color='feature',
                               title='每個群組的特徵均值',
                               labels={'cluster': '群組', 'mean_value': '均值', 'feature': '特徵'})
fig_bar_cluster_means.show()

# 定義年齡區間
bins = [0, 18, 30, 45, 60, float('inf')]
labels = ['0-18', '19-30', '31-45', '46-60', '61+']

# 將年齡分組
data_with_clusters['AGE_GROUP'] = pd.cut(data_with_clusters['AGE'], bins=bins, labels=labels, right=False)

# 按照新年齡分組後聚合數據
age_group_distribution = data_with_clusters.groupby('cluster')['AGE_GROUP'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
age_group_melted = pd.melt(age_group_distribution, id_vars=['cluster'], var_name='AGE_GROUP', value_name='proportion')

# 繪製分布圖
fig_bar_age_group_distribution = px.bar(age_group_melted, x='cluster', y='proportion',
                                        color='AGE_GROUP',
                                        title='每個群組的年齡分布比例',
                                        labels={'cluster': '群組', 'proportion': '比例', 'AGE_GROUP': '年齡'})
fig_bar_age_group_distribution.show()

In [10]:
# 分析 GENDER 在每個簇中的分佈
gender_distribution = data_with_clusters.groupby('cluster')['GENDER'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
gender_melted = pd.melt(gender_distribution, id_vars=['cluster'], var_name='GENDER', value_name='proportion')

fig_gender_distribution = px.bar(
    gender_melted,
    x='cluster',
    y='proportion',
    color='GENDER',
    title="每個群組的性別分布比例",
    labels={'cluster': '群組', 'proportion': '比例', 'GENDER': '性別'}
)
fig_gender_distribution.show()

# 分析 RACE 在每個簇中的分佈
race_distribution = data_with_clusters.groupby('cluster')['RACE'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
race_melted = pd.melt(race_distribution, id_vars=['cluster'], var_name='RACE', value_name='proportion')

fig_race_distribution = px.bar(
    race_melted,
    x='cluster',
    y='proportion',
    color='RACE',
    title="每個群組的種族分布比例",
    labels={'cluster': '群組', 'proportion': '比例', 'RACE': '種族'}
)
fig_race_distribution.show()

In [12]:
import pandas as pd
import plotly.express as px
import numpy as np

# 從 Pipeline 中提取 PCA 和 preprocessor
pca = pipe.named_steps['pca']
preprocessor = pipe.named_steps['preprocessor']

# 獲取特徵名稱
# 數值特徵
num_features = numerical_features
# 類別特徵（One-Hot 編碼後的特徵名稱）
cat_transformer = preprocessor.named_transformers_['cat']
cat_features_encoded = cat_transformer.get_feature_names_out(categorical_features).tolist()
# 合併所有特徵名稱
feature_names = num_features + cat_features_encoded

# 獲取主成分的特徵權重（components_）
pca_components = pd.DataFrame(
    pca.components_,
    columns=feature_names,
    index=[f'PC{i+1}' for i in range(pca.n_components_)]
)

# 選擇前幾個主成分（例如前 3 個）進行可視化
n_components_to_plot = min(3, pca.n_components_)
pca_components_subset = pca_components.iloc[:n_components_to_plot]

# 將數據轉為長格式以便繪圖
pca_components_melted = pca_components_subset.reset_index().melt(
    id_vars='index',
    var_name='Feature',
    value_name='Weight'
)

# 使用 Plotly Express 繪製熱圖
fig_pca_loadings = px.imshow(
    pca_components_subset,
    labels=dict(x="特徵", y="主成分", color="權重"),
    title="PCA 主成分的特徵權重",
    color_continuous_scale='RdBu',
    aspect='auto'
)
fig_pca_loadings.show()

# 或者繪製條形圖，顯示每個主成分中特徵的權重
fig_pca_loadings_bar = px.bar(
    pca_components_melted,
    x='Weight',
    y='Feature',
    color='index',
    title="PCA 主成分的特徵權重（條形圖）",
    labels={'Weight': '權重', 'Feature': '特徵', 'index': '主成分'},
    orientation='h'
)
fig_pca_loadings_bar.show()

In [13]:
import plotly.graph_objects as go

# 提取前兩個主成分
pca_2d = PCA(n_components=2)
principal_components_2d = pca_2d.fit_transform(preprocessor.fit_transform(data_clean))
pca_df_2d = pd.DataFrame(principal_components_2d, columns=['PC1', 'PC2'])
pca_df_2d['cluster'] = clusters

# 獲取特徵向量（主成分的權重）
feature_vectors = pca_2d.components_.T
scaling_factor = 5  # 調整特徵向量的長度以便可視化

# 繪製 Biplot
fig_biplot = go.Figure()

# 繪製數據點（散點圖）
fig_biplot.add_trace(
    go.Scatter(
        x=pca_df_2d['PC1'],
        y=pca_df_2d['PC2'],
        mode='markers',
        marker=dict(color=pca_df_2d['cluster'], colorscale='Viridis', size=5),
        name='數據點'
    )
)

# 繪製特徵向量
for i, feature in enumerate(feature_names):
    fig_biplot.add_trace(
        go.Scatter(
            x=[0, feature_vectors[i, 0] * scaling_factor],
            y=[0, feature_vectors[i, 1] * scaling_factor],
            mode='lines+text',
            line=dict(color='red', width=2),
            text=['', feature],
            textposition='middle right',
            name=feature
        )
    )

fig_biplot.update_layout(
    title="PCA Biplot：數據點與特徵向量",
    xaxis_title="主成分 1",
    yaxis_title="主成分 2",
    showlegend=False
)
fig_biplot.show()

手機分析

In [None]:
import kagglehub

# Phones in India
path = kagglehub.dataset_download("informrohit1/smartphones-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/informrohit1/smartphones-dataset?dataset_version_number=1...


100%|██████████| 23.1k/23.1k [00:00<00:00, 21.2MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/informrohit1/smartphones-dataset/versions/1





In [None]:
!ls {path}

smartphones_cleaned_v6.csv


In [None]:
import pandas as pd

data = pd.read_csv(path+"/smartphones_cleaned_v6.csv")
brands_to_keep = ['samsung', 'motorola', 'realme', 'apple', 'xiaomi',
       'nothing', 'oppo', 'vivo', 'poco', 'google',
       'redmi', 'asus', 'sony',
       'tcl', 'sharp']
data = data[data['brand_name'].isin(brands_to_keep)]
# 假設當前的匯率是 1 盧比 = 0.41 新台幣
exchange_rate = 0.4
data['price'] = data['price'] * exchange_rate
data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,brand_name,model,price,rating,has_5g,has_nfc,has_ir_blaster,processor_brand,num_cores,processor_speed,...,refresh_rate,num_rear_cameras,num_front_cameras,os,primary_camera_rear,primary_camera_front,extended_memory_available,extended_upto,resolution_width,resolution_height
2,samsung,Samsung Galaxy A14 5G,6599.6,75.0,True,False,False,exynos,8.0,2.40,...,90,3,1.0,android,50.0,13.0,1,1024.0,1080,2408
3,motorola,Motorola Moto G62 5G,5999.6,81.0,True,False,False,snapdragon,8.0,2.20,...,120,3,1.0,android,50.0,16.0,1,1024.0,1080,2400
4,realme,Realme 10 Pro Plus,9999.6,82.0,True,False,False,dimensity,8.0,2.60,...,120,3,1.0,android,108.0,16.0,0,,1080,2412
5,samsung,Samsung Galaxy F23 5G (6GB RAM + 128GB),6799.6,80.0,True,True,False,snapdragon,8.0,2.20,...,120,3,1.0,android,50.0,8.0,1,1024.0,1080,2408
6,apple,Apple iPhone 14,26399.6,81.0,True,True,False,bionic,6.0,3.22,...,60,2,1.0,ios,12.0,12.0,0,,1170,2532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,vivo,Vivo X Fold 2,47996.0,,True,True,False,snapdragon,8.0,3.20,...,120,3,1.0,android,50.0,32.0,0,,1916,2160
975,motorola,Motorola Moto Edge S30 Pro,13996.0,83.0,True,False,False,snapdragon,8.0,3.00,...,120,3,1.0,android,64.0,16.0,0,,1080,2460
977,poco,POCO X4 GT 5G (8GB RAM + 256GB),11596.0,85.0,True,True,True,dimensity,8.0,2.85,...,144,3,1.0,android,64.0,16.0,0,,1080,2460
978,motorola,Motorola Moto G91 5G,7996.0,80.0,True,True,False,snapdragon,8.0,2.20,...,60,3,1.0,android,108.0,32.0,1,1024.0,1080,2400


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 741 entries, 2 to 979
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   brand_name                 741 non-null    object 
 1   model                      741 non-null    object 
 2   price                      741 non-null    float64
 3   rating                     678 non-null    float64
 4   has_5g                     741 non-null    bool   
 5   has_nfc                    741 non-null    bool   
 6   has_ir_blaster             741 non-null    bool   
 7   processor_brand            730 non-null    object 
 8   num_cores                  736 non-null    float64
 9   processor_speed            705 non-null    float64
 10  battery_capacity           730 non-null    float64
 11  fast_charging_available    741 non-null    int64  
 12  fast_charging              591 non-null    float64
 13  ram_capacity               741 non-null    float64
 14 

In [None]:
data['brand_name'].unique()

array(['samsung', 'motorola', 'realme', 'apple', 'xiaomi', 'nothing',
       'oppo', 'vivo', 'poco', 'google', 'redmi', 'asus', 'sony', 'tcl',
       'sharp'], dtype=object)

In [None]:
import plotly.express as px

# 1. 不同品牌手機的平均價格長條圖
avg_price_by_brand = data.groupby('brand_name')['price'].mean().reset_index()
fig_bar = px.bar(avg_price_by_brand, x='brand_name', y='price',
                 title='不同品牌手機的平均價格',
                 labels={'brand_name': '品牌名稱', 'price': '平均價格'})
fig_bar.update_layout(xaxis_tickangle=-45)
fig_bar.show()

In [None]:
# 2. 手機價格與螢幕大小的散佈圖
fig_scatter = px.scatter(data, x='screen_size', y='price',
                         title='手機價格與螢幕大小的關係',
                         labels={'screen_size': '螢幕大小 (吋)', 'price': '價格'},
                         hover_name='model')
fig_scatter.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# 1. 選擇用於 PCA 和 K-Means 的數值型欄位
# numerical_features = ['price', 'rating', 'num_cores', 'processor_speed',
#                       'battery_capacity', 'fast_charging', 'ram_capacity',
#                       'internal_memory', 'screen_size', 'refresh_rate',
#                       'num_rear_cameras', 'num_front_cameras',
#                       'primary_camera_rear', 'primary_camera_front',
#                       'resolution_width', 'resolution_height']
# numerical_features = ['price', 'rating', 'processor_speed',
#                       'battery_capacity', 'ram_capacity',
#                       'internal_memory', 'screen_size',
#                       'num_rear_cameras', 'num_front_cameras',
#                       'primary_camera_rear', 'primary_camera_front']
numerical_features = ['price', 'processor_speed',
                      'battery_capacity']

data_for_clustering = data[numerical_features].dropna().copy()

# 2. 資料標準化
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_for_clustering)

# 3. 執行 PCA 降維
pca = PCA(n_components=None)
principal_components = pca.fit_transform(scaled_features)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

# 使用 Plotly Express 繪製累積解釋的變異比例
fig_variance = px.line(x=range(1, len(explained_variance_ratio) + 1),
                       y=cumulative_variance_ratio,
                       title='累積解釋的變異比例',
                       labels={'x': '主成分數量', 'y': '累積解釋的變異比例'})
fig_variance.update_traces(mode='markers+lines')
fig_variance.show()

# 根據上圖選擇要保留的主成分數量
n_components_to_keep = 3 # 這是一個範例值，請根據您的圖表決定
pca_final = PCA(n_components=n_components_to_keep)
principal_components_final = pca_final.fit_transform(scaled_features)

pca_df = pd.DataFrame(data=principal_components_final,
                      columns=[f'principal_component_{i+1}' for i in range(n_components_to_keep)])

# 4. 執行 K-Means 分群
n_clusters = 3 # 這是一個範例值
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(principal_components_final)

cluster_series = pd.Series(clusters, index=data_for_clustering.index, name='cluster')
data_with_clusters = data.merge(cluster_series, left_index=True, right_index=True, how='left')

# 5. 使用 Plotly Express 可視化分群結果
if n_components_to_keep >= 2:
    pca_df_with_cluster = pca_df.merge(cluster_series.rename('cluster'), left_index=True, right_index=True)
    fig_scatter_pca = px.scatter(pca_df_with_cluster, x='principal_component_1', y='principal_component_2',
                                 color='cluster',
                                 title=f'PCA 降維後的 K-Means 分群 (K={n_clusters})',
                                 labels={'principal_component_1': '主成分 1',
                                         'principal_component_2': '主成分 2',
                                         'cluster': '群組'})
    fig_scatter_pca.show()
else:
    print("降維後的主成分數量少於 2，無法繪製二維散佈圖。")

# 使用 Plotly Express 分析每個群組的特徵均值
cluster_analysis = data_with_clusters.groupby('cluster')[numerical_features].mean().reset_index()
cluster_melted = pd.melt(cluster_analysis, id_vars=['cluster'], var_name='feature', value_name='mean_value')

fig_bar_cluster_means = px.bar(cluster_melted, x='cluster', y='mean_value',
                               color='feature',
                               title='每個群組的特徵均值',
                               labels={'cluster': '群組', 'mean_value': '平均值', 'feature': '特徵'})
fig_bar_cluster_means.show()

# 使用 Plotly Express 分析每個群組中不同品牌的分布
brand_distribution = data_with_clusters.groupby('cluster')['brand_name'].value_counts(normalize=True).unstack(fill_value=0).reset_index()
brand_melted = pd.melt(brand_distribution, id_vars=['cluster'], var_name='brand_name', value_name='proportion')

fig_bar_brand_distribution = px.bar(brand_melted, x='cluster', y='proportion',
                                     color='brand_name',
                                     title='每個群組的品牌分布比例',
                                     labels={'cluster': '群組', 'proportion': '比例', 'brand_name': '品牌名稱'})
fig_bar_brand_distribution.show()

🔽 老師的範例

金門縣政府提供教育相關數據

https://drive.google.com/drive/folders/1ZnEmLN_Gbx074-dv_8m3chn2vvhaeUSK?usp=sharing

金門縣康軒國中小使用數據

https://docs.google.com/spreadsheets/d/1LvcLpeVNIa-OfICFnakSbzQgSKBCkfFACmQRIKekgRA/edit?usp=sharing

In [None]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

In [None]:
import pandas as pd
# read data and put it in a dataframe
# 在 google 工作表載入 gsheets
gsheets = gc.open_by_url('https://docs.google.com/spreadsheets/d/1LvcLpeVNIa-OfICFnakSbzQgSKBCkfFACmQRIKekgRA/edit?usp=sharing')

In [None]:
# 從 gsheets 的 All-whiteboard-device 載入 sheets
sheets = gsheets.worksheet('anonymized_xapi').get_all_values()
# 將 sheets1 資料載入 pd 的 DataFrame 進行分析
df = pd.DataFrame(sheets[1:], columns=sheets[0])
# 取得最前面的5筆資料
df.head()

Unnamed: 0,id,json,timestamp
0,281744,"{""id"": ""5d3ddc37-c43c-4342-8015-c1697ac5148b"",...",6/11/24 22:17
1,281746,"{""id"": ""b27139a8-101a-4d94-9f24-0fb6be429de7"",...",6/11/24 22:17
2,281747,"{""id"": ""73d06d71-56b5-4fd2-a951-7f3e485294ba"",...",6/11/24 22:17
3,281748,"{""id"": ""3c0c29d1-c8f9-441f-a820-bafd90e0fc58"",...",6/11/24 22:17
4,281749,"{""id"": ""6cce33e0-b8d4-47f4-a00d-6dd335c409c2"",...",6/11/24 22:17


In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
alldays = max(set(df['timestamp'].dt.date)) - min(set(df['timestamp'].dt.date))

  df['timestamp'] = pd.to_datetime(df['timestamp'])


In [None]:
alldays.days

37

In [None]:
len(df)

42928

In [None]:
import json
df_expanded = pd.json_normalize(df['json'].apply(json.loads))

In [None]:
type(df_expanded['timestamp'][0])

str

In [None]:
df_expanded['id'] = df['id']
df_expanded['timestamp'] = pd.to_datetime(df['timestamp'])
df_expanded['Date'] = df_expanded['timestamp'].dt.date    # 提取日期部分
df_expanded['Time'] = df_expanded['timestamp'].dt.time    # 提取時間部分
df_expanded['Weekday'] = df_expanded['timestamp'].dt.day_name()  # 以星期幾的名稱表示（例如：Monday）
df_expanded.head()

Unnamed: 0,id,timestamp,actor.objectType,actor.name,actor.account.homePage,actor.account.name,verb.id,verb.display.zh-TW,object.objectType,object.id,...,context.team.mailto,context.team.mbox,object.definition.extensions.https://w3id.org/xapi/acrossx/extensions/alignment,object.definition.name.zh-TW,object.definition.interactionType,object.definition.correctResponsesPattern,object.definition.choices,Date,Time,Weekday
0,281744,2024-06-11 22:17:00,Agent,學生298,mailto:kc0368@cnc.km.edu.tw,學生,https://w3id.org/xapi/acrossx/verbs/watched,觀看,Activity,https://ksjh-km.kschool.com.tw/video-player/44...,...,,,,,,,,2024-06-11,22:17:00,Tuesday
1,281746,2024-06-11 22:17:00,Agent,學生298,mailto:kc0368@cnc.km.edu.tw,學生,https://w3id.org/xapi/acrossx/verbs/paused,暫停,Activity,https://ksjh-km.kschool.com.tw/video-player/44...,...,,,,,,,,2024-06-11,22:17:00,Tuesday
2,281747,2024-06-11 22:17:00,Agent,學生298,mailto:kc0368@cnc.km.edu.tw,學生,https://w3id.org/xapi/acrossx/verbs/watched,觀看,Activity,https://ksjh-km.kschool.com.tw/video-player/44...,...,,,,,,,,2024-06-11,22:17:00,Tuesday
3,281748,2024-06-11 22:17:00,Agent,學生298,mailto:kc0368@cnc.km.edu.tw,學生,https://w3id.org/xapi/acrossx/verbs/paused,暫停,Activity,https://ksjh-km.kschool.com.tw/video-player/44...,...,,,,,,,,2024-06-11,22:17:00,Tuesday
4,281749,2024-06-11 22:17:00,Agent,學生298,mailto:kc0368@cnc.km.edu.tw,學生,https://w3id.org/xapi/acrossx/verbs/watched,觀看,Activity,https://ksjh-km.kschool.com.tw/video-player/44...,...,,,,,,,,2024-06-11,22:17:00,Tuesday


In [None]:
# 提取影片的類型、標題和主題
video_activity_info = df_expanded[['actor.name', 'Date', 'Time',	'Weekday', 'verb.display.zh-TW']]

# 添加影片的定義類型和主題
video_activity_info['theme'] = df_expanded['object.definition.extensions.https://w3id.org/xapi/acrossx/extensions/alignment'].apply(lambda x: x[0] if isinstance(x, list) and x else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  video_activity_info['theme'] = df_expanded['object.definition.extensions.https://w3id.org/xapi/acrossx/extensions/alignment'].apply(lambda x: x[0] if isinstance(x, list) and x else None)


In [None]:
video_activity_info

Unnamed: 0,actor.name,Date,Time,Weekday,verb.display.zh-TW,theme
0,學生298,2024-06-11,22:17:00,Tuesday,觀看,
1,學生298,2024-06-11,22:17:00,Tuesday,暫停,
2,學生298,2024-06-11,22:17:00,Tuesday,觀看,
3,學生298,2024-06-11,22:17:00,Tuesday,暫停,
4,學生298,2024-06-11,22:17:00,Tuesday,觀看,
...,...,...,...,...,...,...
42923,學生584,2024-07-18,11:24:00,Thursday,暫停,1-2物質進出細胞的方式
42924,學生584,2024-07-18,11:26:00,Thursday,觀看,1-2物質進出細胞的方式
42925,學生584,2024-07-18,11:26:00,Thursday,暫停,1-2物質進出細胞的方式
42926,學生584,2024-07-18,11:28:00,Thursday,觀看,1-2物質進出細胞的方式


In [None]:
allStudents = len(video_activity_info['actor.name'].unique())

In [None]:
video_activity_info[video_activity_info['actor.name'] == '學生315']

Unnamed: 0,actor.name,Date,Time,Weekday,verb.display.zh-TW,theme
23,學生315,2024-06-12,08:40:00,Wednesday,開始嘗試,
208,學生315,2024-06-12,08:57:00,Wednesday,回答,
209,學生315,2024-06-12,08:57:00,Wednesday,回答,
210,學生315,2024-06-12,08:57:00,Wednesday,回答,
211,學生315,2024-06-12,08:57:00,Wednesday,回答,
...,...,...,...,...,...,...
41588,學生315,2024-06-27,12:42:00,Thursday,回答,"SO ,12,"
41589,學生315,2024-06-27,12:42:00,Thursday,回答,"SO ,12,"
41590,學生315,2024-06-27,12:42:00,Thursday,回答,"SO ,12,"
41591,學生315,2024-06-27,12:42:00,Thursday,回答,"SO ,12,"


In [None]:
distinct_types = video_activity_info['verb.display.zh-TW'].unique()

In [None]:
distinct_types

array(['觀看', '暫停', '開始嘗試', '回答', '完成', '查看'], dtype=object)

In [None]:
theme = video_activity_info['theme'].unique()
theme

array([None, '第4單元 複習影片', '第3單元 複習影片', 'MA ,32,', 'NA ,22,', 'L11重點句型',
       'SO ,12,', 'L6戰後臺灣的文化與社會發展', 'SO ,52,', 'L6 Word Bank', 'CH ,32,',
       'L6戰後臺灣的經濟變遷', 'MA ,12,', 'L12文意理解', 'NA ,32,',
       'B4L6-2_文法動畫_if_although句型', 'B4L6-1_文法動畫_不定代名詞',
       'B4L6 Reading 動畫', 'B4L6 Dialogue 動畫', 'EN ,5,', '複習&統整CH4',
       '複習&統整CH5', 'L12重點句型', 'L11文意理解', 'CH ,12,', 'L6臺灣的區域差異',
       'L6臺灣的區域特色', 'L6臺灣六都的特色', 'L8 文意理解', 'L6社會安全與國家責任', 'L8 詞成語',
       'L8 形音義', 'NA ,12,', '4-2反推不等式', '4-2應用題--幾何', '3-6鳥類和哺乳類',
       '3-6魚類、兩生類和爬蟲類', '3-6節肢動物門', '3-6軟體、環節、刺絲胞、棘皮、扁形動物門', '軍犬立大功',
       '和喜鵲學築巢', 'CH ,51,52,', '迎向第一次段考讀書計畫表', '國語二上第7課教學動畫愛漂亮的國王',
       '1-1生命現象', '1-1細胞的發現、型態與功能', '1-1動、植物細胞構造', '1-2組成細胞的物質',
       '1-2物質進出細胞的方式'], dtype=object)

In [None]:
# Filter out rows where 'theme' is None for meaningful pairing
filtered_df = video_activity_info.dropna(subset=['theme'])

# Pivot the table to show 'verb.display.zh-TW' as columns and 'theme' as corresponding values
pivoted_df = filtered_df.pivot_table(index=['actor.name', 'Date'], columns='verb.display.zh-TW', values='theme', aggfunc=lambda x: ', '.join(x))

# Reset index for better readability
pivoted_df = pivoted_df.reset_index()

In [None]:
pivoted_df

verb.display.zh-TW,actor.name,Date,回答,完成,暫停,查看,觀看,開始嘗試
0,學生298,2024-06-24,"SO ,12,, SO ,12,, SO ,12,, SO ,12,, SO ,12,, S...","SO ,12,",,,,"SO ,12,"
1,學生299,2024-06-26,,,,,,"NA ,12,, NA ,12,"
2,學生299,2024-06-27,,,,,,"SO ,12,"
3,學生300,2024-06-24,"CH ,12,, CH ,12,, CH ,12,, CH ,12,, CH ,12,, C...","CH ,12,, SO ,12,, SO ,12,",,,,"CH ,12,, SO ,12,, SO ,12,"
4,學生300,2024-06-25,"NA ,12,, NA ,12,, NA ,12,, NA ,12,, NA ,12,, N...","NA ,12,",,,,"NA ,12,, NA ,12,, NA ,12,, NA ,12,, NA ,12,"
...,...,...,...,...,...,...,...,...
233,學生590,2024-06-25,"EN ,5,, EN ,5,, EN ,5,, EN ,5,, EN ,5,, EN ,5,...","EN ,5,, EN ,5,, EN ,5,",,,,"EN ,5,, EN ,5,, EN ,5,"
234,學生591,2024-06-25,"EN ,5,, EN ,5,, EN ,5,, EN ,5,, EN ,5,, EN ,5,...","EN ,5,, EN ,5,, EN ,5,",,,,"EN ,5,, EN ,5,, EN ,5,, EN ,5,"
235,學生592,2024-06-28,,,,"軍犬立大功, 和喜鵲學築巢",,
236,學生593,2024-07-02,,,,迎向第一次段考讀書計畫表,,


Grouping and Counting Behaviors per Theme

In [None]:
# Assuming the dataframe is already loaded into df and contains columns like 'actor.name', 'Date', 'verb.display.zh-TW', 'theme'

# Filter out rows where theme is None (if applicable)
df_filtered = video_activity_info.dropna(subset=['theme'])

# Group by 'actor.name', 'theme', and 'verb.display.zh-TW' (behavior)
behavior_counts = df_filtered.groupby(['actor.name', 'Date', 'theme', 'verb.display.zh-TW']).size().unstack(fill_value=0)

# Display the behavior counts per student and theme
behavior_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,verb.display.zh-TW,回答,完成,暫停,查看,觀看,開始嘗試
actor.name,Date,theme,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
學生298,2024-06-24,"SO ,12,",40,1,0,0,0,1
學生299,2024-06-26,"NA ,12,",0,0,0,0,0,2
學生299,2024-06-27,"SO ,12,",0,0,0,0,0,1
學生300,2024-06-24,"CH ,12,",40,1,0,0,0,1
學生300,2024-06-24,"SO ,12,",80,2,0,0,0,2
...,...,...,...,...,...,...,...,...
學生591,2024-06-25,"EN ,5,",30,3,0,0,0,4
學生592,2024-06-28,和喜鵲學築巢,0,0,0,1,0,0
學生592,2024-06-28,軍犬立大功,0,0,0,1,0,0
學生593,2024-07-02,迎向第一次段考讀書計畫表,0,0,0,1,0,0


In [None]:
behavior_counts.index

MultiIndex([('學生298', 2024-06-24,           'SO ,12,'),
            ('學生299', 2024-06-26,           'NA ,12,'),
            ('學生299', 2024-06-27,           'SO ,12,'),
            ('學生300', 2024-06-24,           'CH ,12,'),
            ('學生300', 2024-06-24,           'SO ,12,'),
            ('學生300', 2024-06-25,           'NA ,12,'),
            ('學生300', 2024-06-26,           'NA ,12,'),
            ('學生300', 2024-06-27,           'SO ,12,'),
            ('學生301', 2024-06-24,           'CH ,12,'),
            ('學生301', 2024-06-26,           'NA ,12,'),
            ...
            ('學生589', 2024-06-25,         'L6臺灣的區域特色'),
            ('學生589', 2024-06-25,          '複習&統整CH4'),
            ('學生589', 2024-06-25,          '複習&統整CH5'),
            ('學生589', 2024-06-26,          '複習&統整CH5'),
            ('學生590', 2024-06-25,            'EN ,5,'),
            ('學生591', 2024-06-25,            'EN ,5,'),
            ('學生592', 2024-06-28,            '和喜鵲學築巢'),
            ('學生592', 2024-06-28

In [None]:
from datetime import date
# Accessing 'actor.name' and 'theme' from MultiIndex
actor_names = behavior_counts.index.get_level_values('actor.name')
themes = behavior_counts.index.get_level_values('theme')
dates = behavior_counts.index.get_level_values('Date')

# Example: Combine into a DataFrame for easy viewing
df_with_index = pd.DataFrame({
    'actor.name': actor_names,
    'theme': themes,
    'Date': dates
})

In [None]:
df_with_index

Unnamed: 0,actor.name,theme,Date
0,學生298,"SO ,12,",2024-06-24
1,學生299,"NA ,12,",2024-06-26
2,學生299,"SO ,12,",2024-06-27
3,學生300,"CH ,12,",2024-06-24
4,學生300,"SO ,12,",2024-06-24
...,...,...,...
352,學生591,"EN ,5,",2024-06-25
353,學生592,和喜鵲學築巢,2024-06-28
354,學生592,軍犬立大功,2024-06-28
355,學生593,迎向第一次段考讀書計畫表,2024-07-02


In [None]:
merged_df = pd.merge(df_with_index, behavior_counts, on=['actor.name', 'theme', 'Date'])

In [None]:
merged_df[merged_df['actor.name'] == '學生315']

Unnamed: 0,actor.name,theme,Date,回答,完成,暫停,查看,觀看,開始嘗試
75,學生315,"SO ,12,",2024-06-23,40,1,0,0,0,2
76,學生315,"CH ,12,",2024-06-24,40,1,0,0,0,1
77,學生315,"SO ,12,",2024-06-24,40,1,0,0,0,1
78,學生315,"NA ,12,",2024-06-25,120,3,0,0,0,8
79,學生315,"NA ,12,",2024-06-26,83,2,0,0,0,4
80,學生315,"SO ,12,",2024-06-27,40,1,0,0,0,1


In [None]:
merged_df

Unnamed: 0,actor.name,theme,Date,回答,完成,暫停,查看,觀看,開始嘗試
0,學生298,"SO ,12,",2024-06-24,40,1,0,0,0,1
1,學生299,"NA ,12,",2024-06-26,0,0,0,0,0,2
2,學生299,"SO ,12,",2024-06-27,0,0,0,0,0,1
3,學生300,"CH ,12,",2024-06-24,40,1,0,0,0,1
4,學生300,"SO ,12,",2024-06-24,80,2,0,0,0,2
...,...,...,...,...,...,...,...,...,...
352,學生591,"EN ,5,",2024-06-25,30,3,0,0,0,4
353,學生592,和喜鵲學築巢,2024-06-28,0,0,0,1,0,0
354,學生592,軍犬立大功,2024-06-28,0,0,0,1,0,0
355,學生593,迎向第一次段考讀書計畫表,2024-07-02,0,0,0,1,0,0


In [None]:
merged_df[merged_df['actor.name'] == '學生315']

Unnamed: 0,actor.name,theme,Date,回答,完成,暫停,查看,觀看,開始嘗試
75,學生315,"SO ,12,",2024-06-23,40,1,0,0,0,2
76,學生315,"CH ,12,",2024-06-24,40,1,0,0,0,1
77,學生315,"SO ,12,",2024-06-24,40,1,0,0,0,1
78,學生315,"NA ,12,",2024-06-25,120,3,0,0,0,8
79,學生315,"NA ,12,",2024-06-26,83,2,0,0,0,4
80,學生315,"SO ,12,",2024-06-27,40,1,0,0,0,1


In [None]:
view = merged_df[merged_df['觀看'] > 0]

In [None]:
view

Unnamed: 0,actor.name,theme,Date,回答,完成,暫停,查看,觀看,開始嘗試
83,學生317,L11文意理解,2024-06-24,0,0,2,0,2,0
84,學生317,L11重點句型,2024-06-24,0,0,1,0,1,0
85,學生317,L12文意理解,2024-06-24,0,0,1,0,1,0
86,學生317,L12重點句型,2024-06-24,0,0,1,0,1,0
129,學生356,第3單元 複習影片,2024-06-23,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
347,學生589,L6臺灣的區域特色,2024-06-25,0,0,2,0,2,0
348,學生589,複習&統整CH4,2024-06-25,0,0,1,0,1,0
349,學生589,複習&統整CH5,2024-06-25,0,0,1,0,1,0
350,學生589,複習&統整CH5,2024-06-26,0,0,0,0,1,0


In [None]:
import plotly.express as px

# Convert 'Date' to datetime
view['Date'] = pd.to_datetime(view['Date'])

# Group the data by Date and theme, summing the '觀看' counts
df_view_counts = view.groupby(['Date', 'theme'])['觀看'].sum().reset_index()

# Plot using Plotly
fig = px.bar(df_view_counts, x='Date', y='觀看', color='theme',
             title="Number of Views by Theme Over Time",
             labels={'觀看': 'View Count', 'theme': 'Theme'},
             barmode='stack')

# Update layout for better visualization
fig.update_layout(xaxis_title="Date", yaxis_title="View Count",
                  legend_title="Theme", title_x=0.5)

# Show the plot
fig.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  view['Date'] = pd.to_datetime(view['Date'])


In [None]:
TEMP = merged_df[merged_df['Date'] == date(2024, 6, 26)]

In [None]:
TEMP['theme'].unique()

array(['NA ,12,', 'SO ,12,', '複習&統整CH4', '複習&統整CH5', 'B4L6-1_文法動畫_不定代名詞',
       'L6臺灣的區域特色', '3-6節肢動物門', '3-6軟體、環節、刺絲胞、棘皮、扁形動物門', '3-6魚類、兩生類和爬蟲類',
       '3-6鳥類和哺乳類', 'L6社會安全與國家責任', 'L6臺灣的區域差異', 'MA ,12,',
       'L6戰後臺灣的文化與社會發展', 'L6戰後臺灣的經濟變遷', 'L6臺灣六都的特色'], dtype=object)

In [None]:
viewOnes = merged_df[merged_df['完成'] >= 1]

In [None]:
finishTasks = len(viewOnes['actor.name'].unique())

In [None]:
finishTasks / allStudents

0.40404040404040403

In [None]:
# Grouping the data by 'actor.name' and 'Date', and summarizing actions (sum for numeric columns)
grouped_df = viewOnes.groupby(['actor.name', 'Date']).sum().reset_index()

In [None]:
grouped_df[grouped_df['觀看'] > 1]

Unnamed: 0,actor.name,Date,theme,回答,完成,暫停,查看,觀看,開始嘗試


1. 只做題庫，不看影片 -> 想要知道是哪些學生，是看哪幾個科目
2. 只看影片，不做題庫 -> 想要知道是哪些學生，是看哪幾個科目
3. 只進來晃一下，不看影片，也不做題庫 -> 想要知道是哪些學生
4. 只做題庫，但沒有明確的題型標題 (None) 異常類別，需要待確認

依照這四類學生樣態，來歸納出學習行為的各類比例

被匡列出來學生樣態，對應的學習成績，這次探勘做不到 (目前因為康軒的資料沒有辦法提供學習成績)

In [None]:
import pandas as pd
import plotly.express as px

# 只做題庫，不看影片
only_quiz = merged_df[(merged_df['觀看'] == 0) & (merged_df['完成'] > 0)]
only_quiz_students = only_quiz['actor.name'].unique()

# 只看影片，不做題庫
only_watch = merged_df[(merged_df['觀看'] > 0) & (merged_df['完成'] == 0)]
only_watch_students = only_watch['actor.name'].unique()

# 只進來晃一下，不看影片，也不做題庫
only_lurk = merged_df[(merged_df['觀看'] == 0) & (merged_df['完成'] == 0)]
only_lurk_students = only_lurk['actor.name'].unique()

# 只做題庫，但沒有明確的題型標題 (None) 異常類別
only_quiz_no_title = merged_df[(merged_df['theme'].isna()) & (merged_df['完成'] > 0)]
only_quiz_no_title_students = only_quiz_no_title['actor.name'].unique()

# 統計不同樣態的學生數量
summary = {
    '只做題庫': len(only_quiz_students),
    '只看影片': len(only_watch_students),
    '只進來晃一下': len(only_lurk_students),
    '只做題庫(無題型)': len(only_quiz_no_title_students)
}

# 將樣態視覺化
df_summary = pd.DataFrame(list(summary.items()), columns=['行為樣態', '學生數量'])
fig = px.bar(df_summary, x='行為樣態', y='學生數量', title="不同學習行為樣態的學生數量", labels={'學生數量': '學生數量'})
fig.show()

# 顯示只做題庫的學生和科目
print("只做題庫的學生與科目：")
only_quiz_subjects = merged_df[(merged_df['actor.name'].isin(only_quiz_students))]
print(only_quiz_subjects[['actor.name', 'theme']].drop_duplicates())

# 顯示只看影片的學生和科目
print("只看影片的學生與科目：")
only_watch_subjects = merged_df[(merged_df['actor.name'].isin(only_watch_students))]
print(only_watch_subjects[['actor.name', 'theme']].drop_duplicates())

# 顯示只進來晃一下的學生
print("只進來晃一下的學生：")
print(only_lurk[['actor.name']].drop_duplicates())

# 顯示只做題庫但無題型標題的異常學生
print("只做題庫但無題型標題的異常學生：")
print(only_quiz_no_title[['actor.name']].drop_duplicates())


只做題庫的學生與科目：
    actor.name     theme
0        學生298   SO ,12,
3        學生300   CH ,12,
4        學生300   SO ,12,
5        學生300   NA ,12,
8        學生301   CH ,12,
..         ...       ...
340      學生588   MA ,12,
341      學生588  複習&統整CH4
342      學生588  複習&統整CH5
351      學生590    EN ,5,
352      學生591    EN ,5,

[199 rows x 2 columns]
只看影片的學生與科目：
    actor.name              theme
83       學生317            L11文意理解
84       學生317            L11重點句型
85       學生317            L12文意理解
86       學生317            L12重點句型
129      學生356          第3單元 複習影片
..         ...                ...
346      學生589          L6臺灣的區域差異
347      學生589          L6臺灣的區域特色
348      學生589           複習&統整CH4
349      學生589           複習&統整CH5
356      學生594  國語二上第7課教學動畫愛漂亮的國王

[145 rows x 2 columns]
只進來晃一下的學生：
    actor.name
1        學生299
9        學生301
24       學生304
34       學生306
40       學生307
42       學生308
46       學生309
49       學生310
67       學生313
81       學生316
90       學生320
159      學生373
168      學生385

不同行為與科目之間的交互關係分析

    分析不同學生在不同科目上的行為特徵，例如學生觀看的影片內容是否集中在特定科目，或者是否傾向於只做特定科目的題庫。
    視覺化建議：
        使用熱力圖（Heatmap）來展示不同學生在各個科目中不同行為的頻率。
        X軸表示科目，Y軸表示學生，色彩強度表示某一學生對某科目的觀看或完成次數。

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# 使用數據透視表進行重新排列表格
pivot_subject_behavior = subject_behavior.pivot_table(index='actor.name', columns='theme', values='觀看', fill_value=0)

# 使用 Plotly 的熱力圖繪製
fig = go.Figure(data=go.Heatmap(
    z=pivot_subject_behavior.values,
    x=pivot_subject_behavior.columns,
    y=pivot_subject_behavior.index,
    colorscale='YlGnBu'
))

# 設定圖表標題和軸標籤
fig.update_layout(
    title="不同學生在各科目觀看影片的次數",
    xaxis_title="科目",
    yaxis_title="學生",
    height=800,  # 設定圖表高度
    width=1000   # 設定圖表寬度
)

# 顯示圖表
fig.show()

學生行為分群分析

    對學生行為進行聚類分析（Clustering），將行為模式相似的學生分群，幫助理解不同學生群體的學習行為特徵。
    視覺化建議：
        可以用散佈圖（Scatter Plot）或主成分分析（PCA）後的二維投影，展示不同學生群體的行為特徵。
        不同顏色的點表示不同群組的學生，X和Y軸表示主要的行為指標（如「觀看次數」、「完成次數」）。

In [None]:
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# 計算每個學生的行為數據
student_behavior_summary = merged_df.groupby('actor.name').agg({'觀看': 'sum', '完成': 'sum'}).reset_index()

# 使用KMeans進行聚類
kmeans = KMeans(n_clusters=4)
student_behavior_summary['cluster'] = kmeans.fit_predict(student_behavior_summary[['觀看', '完成']])

# 使用PCA進行降維視覺化
pca = PCA(n_components=2)
components = pca.fit_transform(student_behavior_summary[['觀看', '完成']])
student_behavior_summary['pca1'] = components[:, 0]
student_behavior_summary['pca2'] = components[:, 1]

# 視覺化學生行為聚類結果
fig = px.scatter(
    student_behavior_summary,
    x='pca1',
    y='pca2',
    color='cluster',
    title="學生行為聚類結果",
    labels={'pca1': '主成分1', 'pca2': '主成分2', 'cluster': '聚類群組'},
    hover_data=['actor.name', '觀看', '完成']  # 在互動中顯示更多細節
)

# 更新圖表的佈局
fig.update_layout(
    xaxis_title="主成分1",
    yaxis_title="主成分2",
    legend_title="群組",
    title_x=0.5,  # 圖表標題居中
    height=600,
    width=800
)

# 顯示圖表
fig.show()