In [None]:
# -*- coding: utf-8 -*-
# Final version — this is the script that was actually used.
"""
Hybrid Phase Clustering  |  Actor 6D + Topic 11D
────────────────────────────────────────────────────────────────────────
· weight grid   : (1‥3)×(1‥3) , K = 3‥7
· 3-year centred MA
· K-Means(n_init=20) & Ward hierarchical
· Phase         : (km==hc  or  ±1y)  +  ≥3y  +  gap absorption (≤2y)
· Hungarian mapping → global ARI & phase-purity
· Visualisation : Silhouette / Elbow / CH grid, Phase timeline, profile heatmap
"""

# ───────────────────── 0. 라이브러리 ─────────────────────
import os, sys, warnings, itertools
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster       import KMeans
from sklearn.metrics       import silhouette_score, calinski_harabasz_score, adjusted_rand_score
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.optimize          import linear_sum_assignment

# Figure text uses Malgun Gothic (presumably chosen so Hangul labels render — confirm
# the font is installed on the target machine); FutureWarning messages are suppressed.
plt.rcParams.update({"font.family": "Malgun Gothic"})
warnings.simplefilter("ignore", category=FutureWarning)

# ───────────────────── 1. Paths / files ─────────────────────
# Absolute path of the input workbook; every output file is written next to it.
FILE = r"C:\Users\Administrator\Desktop\0.졸업논문\2. Chapter 2\Data\결과값\DTM_statement per actor_뉴스기사\DTM_statement per actor\전처리\actor,topic으로해서 시기구분\DTM_Doc_Topic_Distribution_20250523기준최종업데이트_1st revision_Na제거 - 복사본.xlsx"

if not os.path.isfile(FILE):
    # Stop immediately with a readable message instead of failing later in pandas.
    sys.exit(f"[Error] 파일이 없습니다 → {FILE}")

SAVE_DIR = os.path.split(FILE)[0]              # directory that contains FILE
STAMP    = f"{datetime.now():%Y%m%d_%H%M%S}"   # run timestamp appended to output names

# ───────────────────── 2. Load data ─────────────────────
# Names of the two categorical columns the rest of the script groups by.
ACTOR_COL = "broader_type"
TOPIC_COL = "DTM_DominantTopic"

# Read the workbook, forcing the dtypes of the three columns the analysis relies on.
df = pd.read_excel(
    FILE,
    engine="openpyxl",
    dtype={"year": int, ACTOR_COL: str, TOPIC_COL: int},
)

# ───────────────────── 3. Year × actor / topic shares ─────────────────────
# Row count per (year, actor); columns are pinned to the sorted non-null actor names.
actor_levels = sorted(df[ACTOR_COL].dropna().unique())
actor_cnt = df.groupby(["year", ACTOR_COL]).size().unstack(fill_value=0)
actor_cnt = actor_cnt.reindex(columns=actor_levels)
# Divide each row by its total so every year sums to 1 across actors.
actor_share = actor_cnt.div(actor_cnt.sum(axis=1), axis=0)

# Same construction for topics; columns are pinned to ids 0‥10 and ids that never
# occur become all-zero columns.
topic_cnt = df.groupby(["year", TOPIC_COL]).size().unstack(fill_value=0)
topic_cnt = topic_cnt.reindex(columns=range(11), fill_value=0)
topic_share = topic_cnt.div(topic_cnt.sum(axis=1), axis=0)

# ───────────────────── 4. Cut-off + 3-year MA ─────────────────────
# Years with fewer than MIN_DOCS actor-tagged rows are excluded from clustering.
MIN_DOCS = 15
mask     = (actor_cnt.sum(axis=1) >= MIN_DOCS)
actor_share = actor_share.loc[mask]
topic_share = topic_share.loc[mask]

# One row per retained year: actor shares followed by topic shares.
pivot_base = actor_share.join(topic_share, how="inner")
pivot_base.columns = pivot_base.columns.astype(str)
if pivot_base.empty:
    sys.exit("[Error] 유효 연도가 0개입니다. MIN_DOCS 값을 낮춰 보세요.")

# The window must cover calendar years, not table rows. Once the cut-off has removed
# a year, a plain rolling(3) over the remaining rows would average years that are
# more than one year apart. Re-indexing onto the continuous year range inserts NaN
# rows for the removed years; rolling().mean() skips them (min_periods=1), so each
# retained year is averaged only with retained neighbours at most one year away.
calendar_years = np.arange(pivot_base.index.min(), pivot_base.index.max() + 1)
pivot_ma = (
    pivot_base
      .reindex(calendar_years)
      .rolling(window=3, center=True, min_periods=1)
      .mean()
      .loc[pivot_base.index]          # keep only the years that passed the cut-off
      .dropna(how="any")
).copy()
if pivot_ma.empty:
    sys.exit("[Error] 유효 연도가 0개입니다. MIN_DOCS 값을 낮춰 보세요.")

yrs    = pivot_ma.index                     # clustered years
A_cols = actor_share.columns.astype(str)    # actor feature names
T_cols = topic_share.columns.astype(str)    # topic feature names ("0"‥"10")

# ───────────────────── 5. weight × K grid ─────────────────────
# Every (actor weight, topic weight) pair with both weights in 1‥3 → 9 combinations.
weights = [(a, t) for a in range(1, 4) for t in range(1, 4)]
# Candidate numbers of clusters: 3, 4, 5, 6, 7.
k_range = range(3, 8)
# One row per (weights, k) evaluation; filled by the grid loop below.
records = []

def metrics(X, k):
    """Fit K-Means with ``k`` clusters on ``X`` and score the partition.

    Args:
        X: Feature matrix passed to ``KMeans``.
        k: Number of clusters.

    Returns:
        Tuple ``(silhouette, calinski_harabasz, inertia)`` for the fitted labels.
    """
    model = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = model.fit_predict(X)
    sil = silhouette_score(X, labels)
    ch = calinski_harabasz_score(X, labels)
    return sil, ch, model.inertia_

# Standardise every feature once; the weights are applied to the z-scores.
X_std = pd.DataFrame(
    StandardScaler().fit_transform(pivot_ma),
    index=pivot_ma.index,
    columns=pivot_ma.columns,
)

for wa, wt in weights:
    # Scale the actor block and the topic block of a fresh copy.
    Xw = X_std.copy()
    Xw[A_cols] = Xw[A_cols] * wa
    Xw[T_cols] = Xw[T_cols] * wt
    # One record per k: weights, k, then the three scores returned by metrics().
    records.extend([wa, wt, k, *metrics(Xw.values, k)] for k in k_range)

# Best configuration first: highest Silhouette, ties broken by highest CH.
grid = pd.DataFrame(
    records,
    columns=["W_actor", "W_topic", "k", "Silhouette", "CH", "SSE"],
).sort_values(by=["Silhouette", "CH"], ascending=False)
grid.to_excel(
    os.path.join(SAVE_DIR, f"metric_weight_k_grid_{STAMP}.xlsx"),
    index=False,
)

# ── (optional) grid plots: one figure per metric, one line per weight pair
for col, ylabel in [("Silhouette","Silhouette"),
                    ("SSE","SSE"),
                    ("CH","Calinski-Harabasz")]:
    fig, ax = plt.subplots(figsize=(6,4))
    for (wa,wt), g in grid.groupby(["W_actor","W_topic"]):
        # grid is ordered by Silhouette, not by k; sort so each line runs left to
        # right instead of zig-zagging between k values.
        g = g.sort_values("k")
        ax.plot(g["k"], g[col], marker="o", label=f"A{wa}:T{wt}")
    ax.set_xlabel("k"); ax.set_ylabel(ylabel); ax.set_title(f"{ylabel} vs k")
    ax.legend(); fig.tight_layout()
    fig.savefig(os.path.join(SAVE_DIR, f"plot_{col.lower()}_grid_{STAMP}.png"))
    # Close the figure (clf() only empties it and leaves it open).
    plt.close(fig)

# ───────────────────── 6. Best weight & k ─────────────────────
# grid is sorted best-first, so its first row is the selected configuration.
best = grid.iloc[0]
WA = int(best["W_actor"])
WT = int(best["W_topic"])
best_K = int(best["k"])
print(f"\n✅ best → A:{WA}, T:{WT}, K:{best_K}  (Sil={best.Silhouette:.3f})")

# ───────────────────── 7. Fit the final models ─────────────────────
# Standardise first and apply the block weights afterwards, exactly as the grid
# search does. Weighting the raw shares before StandardScaler would have no effect:
# the scaler divides every column by its own standard deviation, which cancels any
# constant per-column factor, so the selected WA/WT would be ignored.
Xopt = pd.DataFrame(
    StandardScaler().fit_transform(pivot_ma),
    index=pivot_ma.index,
    columns=pivot_ma.columns,
)
Xopt[A_cols] *= WA
Xopt[T_cols] *= WT
X_std2 = Xopt.to_numpy()   # weighted z-scores fed to both clusterers

# K-Means labels and Ward-linkage labels (fcluster ids start at 1 → shift to 0-based).
km_lbl = KMeans(n_clusters=best_K, random_state=42, n_init=20).fit(X_std2).labels_
hc_lbl = fcluster(linkage(X_std2, method="ward"), t=best_K, criterion="maxclust") - 1

# Hungarian mapping: rename the Ward cluster ids to the K-Means ids they overlap most.
# cost[i, j] = -(number of years with K-Means id i and Ward id j); the matrix is padded
# to a square so the assignment also works when the two methods return a different
# number of clusters.
max_k  = max(km_lbl.max()+1, hc_lbl.max()+1)
cost   = np.zeros((max_k, max_k), dtype=int)
np.subtract.at(cost, (km_lbl, hc_lbl), 1)
row_ind, col_ind = linear_sum_assignment(cost)
hc_map = hc_lbl.copy()
# row_ind holds K-Means ids, col_ind the Ward id assigned to each of them, so every
# Ward id (column) must be rewritten to its K-Means id (row) — not the other way round.
# Masks are taken on the untouched hc_lbl so successive renames cannot chain.
for km_id, hc_id in zip(row_ind, col_ind):
    hc_map[hc_lbl == hc_id] = km_id
print(f"   Hungarian ARI = {adjusted_rand_score(km_lbl, hc_map):.3f}")

# ───────────────────── 8. Phase extraction & gap absorption ─────────────────────
MIN_PHASE_LEN = 3   # minimum span of a phase, in years
GAP_TOL       = 2   # same-label phases separated by at most this many years are merged

# A year "agrees" when its aligned Ward label equals the K-Means label of the same
# year or of an adjacent row (±1 tolerance). The comparison uses hc_map because raw
# K-Means and Ward ids are arbitrary and not comparable. A phase is a run of agreeing
# years that share one K-Means label; it is closed when agreement stops or the label
# changes, so a phase never spans more than one cluster.
phase_raw, open_st, last_lb = [], None, None
n_obs = len(km_lbl)
for i, (yr, lk, lh) in enumerate(zip(yrs, km_lbl, hc_map)):
    agree = (
        lk == lh
        or (i > 0 and km_lbl[i-1] == lh)
        or (i < n_obs-1 and km_lbl[i+1] == lh)
    )
    if open_st is not None and (not agree or lk != last_lb):
        # Close the running phase at the previous year; keep it only if long enough.
        if (yrs[i-1] - open_st + 1) >= MIN_PHASE_LEN:
            phase_raw.append([open_st, yrs[i-1], last_lb])
        open_st = None
    if agree and open_st is None:
        open_st, last_lb = yr, lk
# Flush a phase that is still open at the last year.
if open_st is not None and (yrs[-1] - open_st + 1) >= MIN_PHASE_LEN:
    phase_raw.append([open_st, yrs[-1], last_lb])

# Gap absorption: extend the previous phase when the next one has the same label and
# starts no more than GAP_TOL years after the previous one ended.
phase = []
for s, e, l in phase_raw:
    if phase and l == phase[-1][2] and (s - phase[-1][1] - 1) <= GAP_TOL:
        phase[-1][1] = e
    else:
        phase.append([s, e, l])

print("\n[Phase]")
for s, e, l in phase:
    print(f"  P-{l+1}: {s}–{e}")

# ───────────────────── 9. Label propagation (ffill/bfill) ─────────────────────
# Expand each label series to every calendar year between the first and last
# clustered year; a year without a label takes the previous label (or, failing that,
# the next one). One CSV per clustering method.
full_years = np.arange(yrs.min(), yrs.max() + 1)
for labels, csv_name in (
    (km_lbl, f"km_label_full_{STAMP}.csv"),
    (hc_lbl, f"hc_label_full_{STAMP}.csv"),
):
    filled = pd.Series(labels, index=yrs).reindex(full_years).ffill().bfill().astype(int)
    filled.to_csv(os.path.join(SAVE_DIR, csv_name))

# ───────────────────── 10. Phase purity / ARI ─────────────────────
# For each phase: number of clustered years inside it, the share of those years where
# the K-Means label equals the Hungarian-mapped Ward label, and the ARI between the
# two raw labelings restricted to the phase.
rows = []
for s, e, l in phase:
    mask = (yrs >= s) & (yrs <= e)
    km_in, hc_in, hc_aligned = km_lbl[mask], hc_lbl[mask], hc_map[mask]
    purity = round((km_in == hc_aligned).mean(), 3)
    ari = round(adjusted_rand_score(km_in, hc_in), 3)
    rows.append([f"{s}-{e}", l, mask.sum(), purity, ari])

purity_table = pd.DataFrame(
    rows, columns=["phase", "km_lbl", "n_years", "hung_purity", "ARI"]
)
purity_table.to_excel(os.path.join(SAVE_DIR, f"phase_purity_{STAMP}.xlsx"), index=False)

# ───────────────────── 11. Save results ─────────────────────
# Per-year table: smoothed shares plus both cluster labels.
pivot_ma["km_lbl"] = km_lbl
pivot_ma["hc_lbl"] = hc_lbl
pivot_ma.reset_index().to_excel(os.path.join(SAVE_DIR, f"phase_table_{STAMP}.xlsx"), index=False)

# Cluster profile = mean share of every feature per K-Means cluster. hc_lbl is a
# nominal id whose mean carries no meaning, so it is dropped before averaging rather
# than being written to the profile as if it were a feature.
profile = pivot_ma.drop(columns="hc_lbl").groupby("km_lbl").mean().round(3)
profile.to_excel(os.path.join(SAVE_DIR, f"cluster_profiles_{STAMP}.xlsx"))

# For each cluster, the three actors and the three topics with the largest mean share.
top = [
    [
        cl,
        ", ".join(row[A_cols].nlargest(3).index),
        ", ".join(row[T_cols].nlargest(3).index),
    ]
    for cl, row in profile.iterrows()
]
pd.DataFrame(top, columns=["cluster", "top3_actors", "top3_topics"])\
  .to_excel(os.path.join(SAVE_DIR, f"cluster_topkeys_{STAMP}.xlsx"), index=False)

# ── Visualisation: phase timeline
fig, ax = plt.subplots(figsize=(8, 2))
# Colour each year with the same "C<label>" cycle colour that shades its phase below.
# Passing the integer labels through cmap="tab10" min-max normalises them first, so
# e.g. six labels would land on tab10 entries 0, 2, 4, 6, 8, 9 and no longer match
# the C0‥C5 spans.
ax.scatter(yrs, np.zeros(len(yrs)), c=[f"C{l}" for l in km_lbl], s=110)
for s, e, l in phase:
    ax.axvspan(s-0.5, e+0.5, color=f"C{l}", alpha=.13)
ax.set_yticks([]); ax.set_xlabel("Year"); ax.set_title("Phase timeline")
fig.tight_layout()
fig.savefig(os.path.join(SAVE_DIR, f"timeline_phase_{STAMP}.png"))
# Close the figure (clf() only empties it and leaves it open).
plt.close(fig)

# Cluster profile heatmap over the actor and topic features only.
feature_cols = list(A_cols) + list(T_cols)
heat = profile.loc[:, feature_cols]
fig = plt.figure(figsize=(11, 4))
# Centre the diverging colour map on the mean of the values actually drawn; taking
# the mean over all of `profile` would also include any label column stored there.
sns.heatmap(heat,
            cmap="RdYlGn", center=heat.values.mean(),
            cbar_kws={"label": "mean share"})
plt.title("Cluster profile heatmap"); plt.tight_layout()
plt.savefig(os.path.join(SAVE_DIR, f"heatmap_profile_{STAMP}.png"))
# Close the figure (clf() only empties it and leaves it open).
plt.close(fig)

print(f"\n✅ 모든 결과가 {SAVE_DIR} 폴더에 저장되었습니다  (stamp={STAMP})")



✅ best → A:2, T:1, K:6  (Sil=0.388)
   Hungarian ARI = 0.602

[Phase]
  P-6: 2000–2002
  P-1: 2007–2010
  P-2: 2021–2024

✅ 모든 결과가 C:\Users\Administrator\Desktop\0.졸업논문\2. Chapter 2\Data\결과값\DTM_statement per actor_뉴스기사\DTM_statement per actor\전처리\actor,topic으로해서 시기구분 폴더에 저장되었습니다  (stamp=20250525_180654)


<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 800x200 with 0 Axes>

<Figure size 1100x400 with 0 Axes>