In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Load the KDD sub-session tables for each year from the CSV files under ../scraper/
# (paths are relative to the notebook's working directory).
df_2024 = pd.read_csv("../scraper/kdd2024/kdd2024_subsessions.csv")
df_2025 = pd.read_csv("../scraper/kdd2025/kdd2025_subsessions.csv")
# Cell output: (rows, columns) of both tables as a quick sanity check.
df_2024.shape, df_2025.shape

In [None]:
# Titles per year. Missing titles are dropped (and the rest coerced to str) so that
# " ".join() below cannot fail with a TypeError on NaN cells coming from the CSV.
titles_2024 = df_2024["title"].dropna().astype(str).tolist()
titles_2025 = df_2025["title"].dropna().astype(str).tolist()

# One unigram word cloud per year, generated from all titles joined into a single text.
wordcloud_2024 = WordCloud(background_color='white', colormap='tab20', width=800, height=600).generate(" ".join(titles_2024))
wordcloud_2025 = WordCloud(background_color='white', colormap='tab20', width=800, height=600).generate(" ".join(titles_2025))

# Side-by-side panels; the per-year drawing code is shared instead of copy-pasted.
fig, axs = plt.subplots(1, 2, figsize=(20, 16))
panels = (
    (wordcloud_2024, 'KDD 2024 Word Cloud'),
    (wordcloud_2025, 'KDD 2025 Word Cloud'),
)
for ax, (cloud, panel_title) in zip(axs, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(panel_title, fontsize=14)

# Render explicitly so the cell does not echo the repr of the last set_title() call.
plt.show()


In [None]:
# Spot check: every 2025 title that contains "graph" as a case-insensitive substring.
[title for title in titles_2025 if "graph" in title.lower()]

In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans

# 1) Extract the titles that contain the word "graph"/"graphs" (case-insensitive),
#    pooled over both years.
df = pd.concat([df_2024[['title']], df_2025[['title']]], ignore_index=True)
# "graphs?" instead of "graph(s)?": a capture group makes Series.str.contains emit
# a UserWarning ("This pattern ... has match groups") without changing the result.
graph_df = df[df['title'].str.contains(r'\bgraphs?\b', case=False, na=False)].copy()
titles = graph_df['title'].tolist()

# 2) Rule-based classification (a quick way to get the overall picture).
# Each pattern is searched in the lower-cased title. Alternatives are wrapped in
# word boundaries, so plural / derived forms have to be spelled out explicitly:
# a bare stem such as "llm" or "recommend" followed by \b does NOT match
# "llms" or "recommendation".
rules = {
    "GNN/Graph Neural Network": r"\b(?:gnns?|graph neural|message[- ]passing)\b",
    "Knowledge Graph / KG": r"\b(?:knowledge graphs?|kgs?)\b",
    "Heterogeneous / Attributed Graph": r"\b(?:heterogeneous|attributed|multi-?relational|meta-?paths?)\b",
    "Temporal / Dynamic / Spatio-temporal": r"\b(?:temporal|dynamics?|time[- ]series|spatio\w*|trajector(?:y|ies))\b",
    "Recommender on Graph": r"\b(?:recommend\w*|ctr|ranking)\b",
    "Anomaly / Fraud / Detection": r"\b(?:anomal(?:y|ies|ous)|frauds?|outliers?|intrusions?|detection)\b",
    "Generation / Diffusion on Graph": r"\b(?:generat(?:e|es|ing|ion|ive)|diffusion|synthesis)\b",
    "Contrastive / Self-supervised": r"\b(?:contrastive|self[- ]supervised|ssl)\b",
    "Causal / Reasoning / Logic": r"\b(?:causal\w*|reasoning|logic(?:al)?|symbolic)\b",
    "LLM × Graph / KG-RAG": r"\b(?:llms?|large language|retrieval|rag|agents?)\b",
}

def tag_title(t):
    """Return the names of all rules whose pattern occurs in title ``t``.

    The title is lower-cased before matching. The result follows the order of
    ``rules``; a title matching no rule is tagged ``["Other"]``.
    """
    lowered = t.lower()
    hits = [name for name, pat in rules.items() if re.search(pat, lowered)]
    return hits if hits else ["Other"]

# Attach the list of matching rule names to every graph-related title.
graph_df["rule_tags"] = graph_df["title"].apply(tag_title)

# Tally: one row per (title, tag) pair, then the number of titles per tag, largest first.
tags_long = graph_df.explode("rule_tags")
rule_counts = tags_long.groupby("rule_tags").size().sort_values(ascending=False)
print("=== Rule-based counts ===")
print(rule_counts)

# 3) Automatic topics (NMF): put "graph" and other generic title words on the
#    stop list so that the more specific terms stand out.
stop = set([
    "graph","graphs","based","model","models","learning","data","via","toward","using",
    "method","framework","approach","task","large","scale","neural","network","networks"
])
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1,2),
    max_df=0.6, min_df=2,
    # scikit-learn >= 1.2 validates this parameter and accepts only 'english', a list
    # or None; handing over the set itself raises InvalidParameterError at fit time.
    stop_words=sorted(stop)
)
X = vectorizer.fit_transform(titles)

n_topics = 8  # adjust to the amount of data
nmf = NMF(n_components=n_topics, random_state=0, init="nndsvd")
nmf.fit(X)
W = nmf.transform(X)   # one row per title, one column per topic
H = nmf.components_    # one row per topic, one column per vocabulary term
terms = vectorizer.get_feature_names_out()

def top_terms(component, k=10):
    """Return the ``k`` vocabulary terms with the largest weights in ``component``.

    ``component`` is one row of ``H``; the terms are looked up in the global ``terms``.
    """
    order = component.argsort()[::-1]
    return [terms[pos] for pos in order[:k]]

print("\n=== NMF topics (top terms) ===")
topic_terms = [top_terms(comp, k=10) for comp in H]
for t_id, words in enumerate(topic_terms):
    print(f"Topic {t_id}: {', '.join(words)}")

# Representative titles per topic (the three titles with the highest topic weight).
print("\n=== Representative titles per topic ===")
for t_id in range(n_topics):
    best = W[:, t_id].argsort()[::-1][:3]
    print(f"\n[Topic {t_id}]")
    for i in best:
        print(" -", titles[i])

# 4) KMeans on the same TF-IDF matrix; the top centroid terms of each cluster
#    serve as hints for naming it.
import numpy as np

k = 8
km = KMeans(n_clusters=k, n_init="auto", random_state=0).fit(X)
graph_df["kmeans_cluster"] = km.labels_

# Top terms of every cluster, ranked by centroid weight.
# (The former per-cluster row mask was computed but never used, so it is gone.)
print("\n=== KMeans clusters (top terms) ===")
for c, centroid in enumerate(km.cluster_centers_):
    idx = np.argsort(centroid)[::-1][:10]
    print(f"Cluster {c}: {', '.join(terms[i] for i in idx)}")


In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF

# 1) Keep only the titles that mention "graph" / "graphs".
def extract_graph_titles(df):
    """Return the rows of ``df`` whose title contains the whole word "graph" or "graphs".

    Matching is case-insensitive and rows with a missing title are skipped.
    The result is a copy with an additional ``clean_title`` column in which every
    run of whitespace is collapsed to a single space and both ends are stripped;
    ``df`` itself is not modified.
    """
    # "graphs?" has no capture group: for "graph(s)?" pandas emits a UserWarning
    # ("This pattern ... has match groups") on every call.
    has_graph = df["title"].str.contains(r"\bgraphs?\b", case=False, na=False)
    g = df[has_graph].copy()
    g["clean_title"] = g["title"].str.replace(r"\s+", " ", regex=True).str.strip()
    return g

# Graph-related titles per year, then one combined frame labelled with the year.
g24, g25 = (extract_graph_titles(frame) for frame in (df_2024, df_2025))
all_graph = pd.concat(
    [subset.assign(year=yr) for yr, subset in ((2024, g24), (2025, g25))],
    ignore_index=True,
)

# 2) Stop words (the key point is to hand them over as a list).
custom = {
    "graph","graphs","based","model","models","learning","data","via","toward",
    "using","method","framework","approach","task","large","scale",
    "neural","network","networks"
}
stop_words = list(ENGLISH_STOP_WORDS | custom)

# 3) TF-IDF (bigrams capture multi-word expressions; min_df=1 so that it also
#    works with a small number of titles).
vec = TfidfVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z\-]+\b",  # also keeps meta-path, time-series, etc.
    stop_words=stop_words,
    ngram_range=(1,2),
    max_df=0.6,
    min_df=1,
    lowercase=True,
)

X = vec.fit_transform(all_graph["clean_title"])
terms = vec.get_feature_names_out()

# 4) Topic extraction with NMF.
n_topics = 8  # adjust to the amount of data (increase it for larger corpora)
nmf = NMF(n_components=n_topics, init="nndsvd", random_state=0)
W = nmf.fit_transform(X)   # one row per title, one column per topic
H = nmf.components_        # one row per topic, one column per vocabulary term

def top_terms(component, k=10):
    """Return the ``k`` vocabulary terms with the largest weights in ``component``.

    ``component`` is one row of ``H``; the terms are looked up in the global ``terms``.
    """
    order = component.argsort()[::-1]
    return [terms[pos] for pos in order[:k]]

print("=== NMF topics (top terms) ===")
for t_id in range(len(H)):
    joined = ', '.join(top_terms(H[t_id]))
    print(f"Topic {t_id}: {joined}")

# 5) Topic strength per year (mean topic weight over the titles of that year).
all_graph["topic"] = np.argmax(W, axis=1)
weights_by_title = pd.DataFrame(W)
weights_by_title["year"] = all_graph["year"].to_numpy()
topic_strength_by_year = weights_by_title.groupby("year").mean()
print("\n=== Topic strength by year (mean W) ===")
print(topic_strength_by_year)

# 6) Inspect the three highest-weighted titles of every topic.
title_column = all_graph["title"]
for t_id in range(n_topics):
    strongest = np.argsort(W[:, t_id])[::-1][:3]
    print(f"\n[Topic {t_id} representatives]")
    for row_pos in strongest:
        print("-", title_column.iloc[row_pos])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def make_ngram_wordcloud(titles, ngram_range=(2, 3), stop_words="english", colormap="tab20"):
    """Build a word cloud sized by n-gram frequency over ``titles``.

    ``titles`` is an iterable of strings. ``ngram_range`` and ``stop_words`` are
    passed to ``CountVectorizer``; ``colormap`` is passed to ``WordCloud``.
    Returns the result of ``WordCloud.generate_from_frequencies``.
    """
    # Total number of occurrences of every n-gram across all titles.
    counter = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    counts = counter.fit_transform(titles)
    totals = counts.toarray().sum(axis=0)
    freqs = {ngram: total for ngram, total in zip(counter.get_feature_names_out(), totals)}

    # White 800x600 canvas, drawn from the frequency table.
    cloud = WordCloud(background_color="white", colormap=colormap, width=800, height=600)
    return cloud.generate_from_frequencies(freqs)

titles_2024 = df_2024["title"].tolist()
titles_2025 = df_2025["title"].tolist()

# Bigram word clouds for both years.
wc_2024 = make_ngram_wordcloud(titles_2024, ngram_range=(2, 2))
wc_2025 = make_ngram_wordcloud(titles_2025, ngram_range=(2, 2))

# Side-by-side panels: 2024 on the left, 2025 on the right.
fig, axs = plt.subplots(1, 2, figsize=(20, 16))
panels = (
    (wc_2024, "KDD 2024 Word Cloud (2 gram)"),
    (wc_2025, "KDD 2025 Word Cloud (2 gram)"),
)
for ax, (cloud, panel_title) in zip(axs, panels):
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(panel_title, fontsize=14)

plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# ===== Parameters =====
NGRAM_RANGE = (2, 2)          # n-gram sizes to count (as configured: bigrams only)
TOPN = 50                     # how many top entries to show
MIN_2025_COUNT = 3            # minimum 2025 count for an n-gram to be considered (noise removal)
EXTRA_STOP = {
    # Connectives and other meaningless phrases to exclude in addition (extend as needed).
    "based on","using","with","without","by","for","of","and","to","from",
    "toward","via","state of","real world","large scale","case study"
}
# Word-level exclusion list: the English stop words plus every single word of EXTRA_STOP.
# NOTE(review): the CountVectorizer in this cell is built with stop_words="english",
# not with STOP_WORDS - confirm whether this set is meant to be used.
STOP_WORDS = ENGLISH_STOP_WORDS | {word for phrase in EXTRA_STOP for word in phrase.split()}

# ===== 1) Vectorise with one shared vocabulary (fit on both years, transform each year) =====
corpus_all = titles_2024 + titles_2025
vectorizer = CountVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z\-]+\b",
    stop_words="english",     # single-word stop list: built-in English
    ngram_range=NGRAM_RANGE,
    max_df=0.9,               # drop the most widespread n-grams a little to avoid skew
    min_df=1,
    lowercase=True,
)
X_all = vectorizer.fit_transform(corpus_all)
vocab = np.array(vectorizer.get_feature_names_out())

# Additional phrase removal (EXTRA_STOP is matched against whole n-grams).
mask_keep = np.full(vocab.shape, True)
if EXTRA_STOP:
    # Exact match only; switch to a substring test if partial matches such as
    # "... of" or "using the" should be dropped as well.
    bad = np.array([ngram in EXTRA_STOP for ngram in vocab])
    mask_keep = mask_keep & ~bad

# Per-year count matrices over the shared vocabulary.
X_24 = vectorizer.transform(titles_2024)
X_25 = vectorizer.transform(titles_2025)

# Column totals per year, restricted to the n-grams that are kept.
c24 = np.asarray(X_24.sum(axis=0)).ravel()[mask_keep]
c25 = np.asarray(X_25.sum(axis=0)).ravel()[mask_keep]
vocab = vocab[mask_keep]

# ===== 2) Metrics =====
n24, n25 = len(titles_2024), len(titles_2025)
alpha = 0.5  # additive smoothing for the growth ratio (guards against zero counts)

df = pd.DataFrame({"ngram": vocab, "count_2024": c24, "count_2025": c25}).assign(
    # Frequencies corrected for corpus size (occurrences per 100 titles).
    freq_2024_per100=lambda d: d["count_2024"] / max(n24, 1) * 100,
    freq_2025_per100=lambda d: d["count_2025"] / max(n25, 1) * 100,
    delta_abs=lambda d: d["count_2025"] - d["count_2024"],
    delta_per100=lambda d: d["freq_2025_per100"] - d["freq_2024_per100"],
    # Smoothed growth ratio of the raw counts.
    growth_ratio=lambda d: (d["count_2025"] + alpha) / (d["count_2024"] + alpha),
    is_new_in_2025=lambda d: (d["count_2024"] == 0) & (d["count_2025"] > 0),
)

# Noise reduction: require a minimum number of occurrences in 2025.
frequent_enough = df["count_2025"].ge(MIN_2025_COUNT)
df_filt = df.loc[frequent_enough].copy()

# ===== 3) Rankings =====
# (1) Net increase after the corpus-size correction.
by_delta = df_filt.sort_values(["delta_per100","count_2025"], ascending=False)
rank_delta = by_delta.head(TOPN)

# (2) Growth ratio, limited to n-grams that already appeared at least twice in 2024
#     (excludes tiny denominators).
seen_in_2024 = df_filt.loc[df_filt["count_2024"].ge(2)]
rank_growth = seen_in_2024.sort_values(["growth_ratio","count_2025"], ascending=False).head(TOPN)

delta_columns = ["ngram","count_2024","count_2025","freq_2024_per100","freq_2025_per100","delta_per100"]
growth_columns = ["ngram","count_2024","count_2025","growth_ratio"]

print("\n=== 2024→2025 増加量ランキング（per 100 titles 基準） ===")
print(rank_delta[delta_columns].reset_index(drop=True))

print("\n=== 2024→2025 成長率ランキング（母数>=2 & スムージング） ===")
print(rank_growth[growth_columns].reset_index(drop=True))

# Save to CSV if needed.
# rank_delta.to_csv("ngram_delta_rank.csv", index=False)
# rank_growth.to_csv("ngram_growth_rank.csv", index=False)


In [None]:
import re
import pandas as pd

def extract_titles_for_phrase(df, phrase):
    """Return the ``title`` column of the rows of ``df`` whose title contains ``phrase``.

    phrase: whitespace-separated words such as "anomaly detection" (any number
        of words, at least one).
    - Matching is case-insensitive and anchored on word boundaries.
    - Consecutive words may be separated by hyphens and/or whitespace
      (e.g. "anomaly-detection").
    - If the last word ends in "model", an optional plural "s" is accepted
      ("models").
    - Rows with a missing title never match.

    Returns a one-column copy that keeps the original index.
    Raises ValueError if ``phrase`` contains no word.
    """
    words = phrase.split()
    if not words:
        raise ValueError("phrase must contain at least one word")
    # Escape each word separately so the separator class stays a real regex.
    body = r"[-\s]+".join(re.escape(w) for w in words)
    # Plain "s?" rather than "(s)?": a capture group makes Series.str.contains
    # emit a UserWarning about match groups.
    plural_ok = "s?" if words[-1].lower().endswith("model") else ""
    pat = rf"\b{body}{plural_ok}\b"
    m = df["title"].str.contains(pat, case=False, regex=True, na=False)
    return df.loc[m, ["title"]].copy()

# --- Per-year extraction (index renumbered from 0 for the listing below) ---
anom_2024 = extract_titles_for_phrase(df_2024, "anomaly detection").reset_index(drop=True)
anom_2025 = extract_titles_for_phrase(df_2025, "anomaly detection").reset_index(drop=True)

# Header with the number of hits, followed by a 1-based numbered list of titles.
listings = (
    ("======== Anomaly Detection 2024:", anom_2024),
    ("======== Anomaly Detection 2025:", anom_2025),
)
for header, hits in listings:
    print(header, len(hits))
    for pos, title in enumerate(hits["title"], start=1):
        print(f"{pos}: {title}")

# Label each subset with its year and combine them (e.g. for saving).
anom_2024["year"] = 2024
anom_2025["year"] = 2025

anom_all = pd.concat([anom_2024, anom_2025], ignore_index=True)

# Display (in Jupyter).
# NOTE(review): lm_all is not defined anywhere in the visible cells.
# display(anom_all)
# display(lm_all)

# To save as CSV:
# anom_all.to_csv("titles_anomaly_detection_2024_2025.csv", index=False)
# lm_all.to_csv("titles_language_models_2024_2025.csv", index=False)
