# 3C structure

## heatmap

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mticker

# ------------------ Plot style ------------------
sns.set_style("whitegrid")
rc_overrides = {
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
}
plt.rcParams.update(rc_overrides)
sns.set_context("notebook", font_scale=1.2)

# ------------------ GnBu gradient used as the heatmap colormap ------------------
cmap = sns.color_palette("GnBu", as_cmap=True)

# ------------------ Load the Excel table ------------------
# The first column becomes the row index; an "Accuracy" row, if present, is dropped.
df = pd.read_excel("./vis/structure.xlsx", index_col=0).astype(float)
df = df.drop(index="Accuracy", errors="ignore")

# ------------------ Column groups ------------------
group1 = ["Metal Source", "Organic Linkers Source", "Modulator Source", "Solvent Source",
          "Quantity of Metal", "Quantity of Organic Linkers", "Quantity of Modulator", "Quantity of Solvent"]
group2 = ["pH", "Synthesis Temperature", "Synthesis Time", "Equipment"]
group3 = ["Crystal Morphology", "Yield"]

# ------------------ Select the plotted columns ------------------
# All groups are concatenated directly; no blank spacer columns are inserted.
all_columns = [*group1, *group2, *group3]
selected_df = df[all_columns]

# ------------------ Figure size: a fixed number of inches per cell ------------------
cell_size = 2
fig_width = cell_size * len(all_columns)
fig_height = cell_size * len(df.index)

# ------------------ Draw ------------------
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

heatmap_kwargs = dict(
    annot=True,
    fmt=".2f",
    cmap=cmap,
    square=True,
    linewidths=0.5,
    linecolor='black',
    cbar_kws={'label': 'Value'},
    annot_kws={"fontsize": 16, "ha": "center", "va": "center"},
)
heatmap = sns.heatmap(selected_df, ax=ax, **heatmap_kwargs)

# Tick labels sit at the centre of each cell (hence the +0.5).
ax.set_xticks(np.arange(len(all_columns)) + 0.5)
ax.set_xticklabels(all_columns, rotation=45, ha='right', fontsize=18)
ax.set_yticks(np.arange(len(df.index)) + 0.5)
ax.set_yticklabels(df.index, fontsize=18)

# Thick white vertical lines at the two boundaries between column groups.
for boundary in (len(group1), len(group1) + len(group2)):
    ax.axvline(x=boundary, color='white', linewidth=3)

# Group titles, horizontally centred over each group at y = -0.05.
group_fontsize = 20
group_start = 0
for members, title in ((group1, "Chemicals"), (group2, "Conditions"), (group3, "Crystallizations")):
    ax.text(group_start + len(members)/2, -0.05, title, ha='center', fontsize=group_fontsize)
    group_start += len(members)

# Colorbar cosmetics: tick size, label size, two-decimal tick labels.
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=16)
cbar.set_label("Value", fontsize=18)
cbar.ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))

plt.tight_layout()
plt.savefig("heatmapclass.pdf", format="pdf", bbox_inches="tight")
plt.show()

## std

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ======================
# 0) Config
# ======================
xlsx_path = "./vis/structure.xlsx"
std_sheet = "std"
mean_sheet = "mean"   # optional: when this sheet is absent only the std values are plotted

metrics = ["Precision", "Recall", "F1"]

# Column order on the x axis (keep it in sync with the reference figure).
desired_cols = [
    "Metal Source",
    "Organic Linkers Source",
    "Modulator Source",
    "Solvent Source",
    "Quantity of Metal",
    "Quantity of Organic Linkers",
    "Quantity of Modulator",
    "Quantity of Solvent",
    "pH",
    "Synthesis Temperature",
    "Synthesis Time",
    "Equipment",
    "Crystal Morphology",
    "Yield",
]

# ======================
# 1) Style
# ======================
sns.set_style("whitegrid")
plt.rcParams.update({
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
})
sns.set_context("notebook", font_scale=1.2)

palette = sns.color_palette("GnBu", n_colors=len(metrics))
colors = dict(zip(metrics, palette))
markers = {"Accuracy": "o", "Precision": "s", "Recall": "^", "F1": "D"}


# ======================
# 2) Load sheets
# ======================
def _load_metric_sheet(workbook, sheet_name):
    """Read one sheet and return ``(raw, table)``.

    ``raw`` is the sheet with string headers stripped, indexed by its
    "Metric" column (or by its first column when there is no "Metric"
    header). ``table`` is ``raw`` reindexed to ``metrics`` x ``desired_cols``
    and coerced to numbers; missing or non-numeric cells become NaN.
    """
    raw = pd.read_excel(workbook, sheet_name=sheet_name)
    # Tolerate stray whitespace around header names.
    raw.columns = [c.strip() if isinstance(c, str) else c for c in raw.columns]
    index_col = "Metric" if "Metric" in raw.columns else raw.columns[0]
    raw = raw.set_index(index_col)
    table = raw.reindex(index=metrics, columns=desired_cols)
    return raw, table.apply(pd.to_numeric, errors="coerce")


# The context manager closes the workbook handle once both sheets are read.
with pd.ExcelFile(xlsx_path) as xls:
    std_df_raw, std_df = _load_metric_sheet(xls, std_sheet)

    has_mean = (mean_sheet in xls.sheet_names)
    mean_df = None
    if has_mean:
        mean_df_raw, mean_df = _load_metric_sheet(xls, mean_sheet)

# ======================
# 3) Plot
# ======================
# Keep only the columns that really exist in the std sheet. The reindexed
# table always contains every desired column (as NaN), so the raw header is
# what has to be checked here.
all_cols = [c for c in desired_cols if c in std_df_raw.columns]
x = np.arange(len(all_cols))
width = 0.18
# Horizontal dodge per metric, centred on the tick for any number of metrics.
offsets = (np.arange(len(metrics)) - (len(metrics) - 1) / 2) * width

fig_w = max(10, len(all_cols) * 0.75)
fig, ax = plt.subplots(figsize=(fig_w, 5))

if has_mean:
    # mean ± std drawn as markers with error bars
    for i, m in enumerate(metrics):
        y = mean_df.loc[m, all_cols].values.astype(float)
        e = std_df.loc[m, all_cols].values.astype(float)

        ax.errorbar(
            x + offsets[i],
            y,
            yerr=e,
            fmt=markers[m],
            color=colors[m],
            ecolor=colors[m],
            capsize=3,
            elinewidth=1.4,
            capthick=1.4,
            markersize=6,
            label=m,
            linestyle="none"
        )

    ax.set_ylabel("Performance (mean ± std)")
else:
    # Only std is available: plot the std values as plain markers (no error bars).
    for i, m in enumerate(metrics):
        y = std_df.loc[m, all_cols].values.astype(float)
        ax.plot(
            x,
            y,
            marker=markers[m],
            markersize=6,
            linestyle="none",
            color=colors[m],
            label=m
        )

    ax.set_ylabel("Std (across runs)")

ax.set_xticks(x)
ax.set_xticklabels(all_cols, rotation=45, ha="right")

# y range: fit the data (including error-bar ends) with a small margin,
# clamped to [0, 1].
if has_mean:
    mean_vals = mean_df.loc[:, all_cols].to_numpy(dtype=float)
    std_vals = std_df.loc[:, all_cols].to_numpy(dtype=float)
    y_all = np.r_[mean_vals.flatten(),
                  (mean_vals + std_vals).flatten(),
                  (mean_vals - std_vals).flatten()]
else:
    y_all = std_df.loc[:, all_cols].to_numpy(dtype=float).flatten()

y_all = y_all[np.isfinite(y_all)]
if len(y_all) > 0:
    ymin = max(0.0, float(np.min(y_all)) - 0.02)
    ymax = min(1.0, float(np.max(y_all)) + 0.02)
    ax.set_ylim(ymin, ymax)

ax.grid(axis="y", alpha=0.25)

ax.legend(
    frameon=False,
    ncol=4,
    loc="lower center",
    bbox_to_anchor=(0.5, 1.02)
)

plt.tight_layout()

# Save
plt.savefig("structurestd.pdf", bbox_inches="tight")
plt.show()

# FT-model

## ft

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Seaborn style and matplotlib rc parameters
sns.set_style("whitegrid")
rc_overrides = {
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
}
plt.rcParams.update(rc_overrides)
sns.set_context("notebook", font_scale=1.2)

# Read the data from Excel (make sure the path is correct)
df = pd.read_excel('./vis/shot.xlsx')
labels = df['model'].tolist()
values = df['Cosine'].tolist()

# Sort ascending by Cosine: smallest value on the innermost ring, largest outermost.
sorted_data = sorted(zip(labels, values), key=lambda pair: pair[1])
sorted_labels, sorted_values = zip(*sorted_data)
max_value = max(sorted_values)
N = len(sorted_labels)

# One discrete GnBu color per ring
colors = sns.color_palette("GnBu", n_colors=N)

# Angular layout
arc_max_deg = 300                          # widest arc in degrees; leaves a 60° opening
arc_max_rad = np.deg2rad(arc_max_deg)
theta_start_deg = 90                       # every ring starts at this angle
theta_start_rad = np.deg2rad(theta_start_deg)

# Radial layout
r0 = 0.5             # inner radius of the innermost ring
ring_thickness = 0.3 # radial thickness of each ring
gap = 0.05           # radial gap between neighbouring rings

# Font size of the text labels (tunable)
label_fontsize = 14

# Polar axes
fig = plt.figure(figsize=(10, 10))
ax = plt.subplot(111, polar=True)

# One polar bar ("ring") per model
for ring_idx, (label, value) in enumerate(zip(sorted_labels, sorted_values)):
    # Arc width is proportional to the value, relative to the largest value.
    arc = (value / max_value) * arc_max_rad
    # Inner radius of this ring and the radius at which its texts are placed.
    inner_r = r0 + ring_idx * (ring_thickness + gap)
    mid_r = inner_r + ring_thickness / 2

    # ax.bar takes the centre angle of the bar, i.e. start + half the width.
    ax.bar(theta_start_rad + arc / 2, ring_thickness, width=arc, bottom=inner_r,
           color=colors[ring_idx], edgecolor='white', alpha=0.8)

    # Model name at the start of the arc, Cosine value at its end.
    captions = (
        (theta_start_rad, f"{label}"),
        (theta_start_rad + arc, f"{value}"),
    )
    for angle, caption in captions:
        ax.text(angle, mid_r, caption, ha='left', va='center',
                fontsize=label_fontsize, color='black')

# Axes cosmetics: no ticks, no outer spine, no grid
ax.set_ylim(0, r0 + N * (ring_thickness + gap) + 0.5)
ax.set_xticks([])
ax.set_yticks([])
ax.spines['polar'].set_visible(False)
ax.grid(False)

plt.tight_layout()
plt.savefig('nested_polar_bar_chart.png', dpi=300, bbox_inches='tight')
plt.show()

## std

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# =========================
# 1) Style
# =========================
sns.set_style("whitegrid")
plt.rcParams.update({
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
})
sns.set_context("notebook", font_scale=1.2)

# =========================
# 2) Load data
# =========================
df = pd.read_excel('./vis/shot.xlsx')

# Tolerate stray whitespace in header names; non-string headers (e.g. numeric
# column labels) are left untouched instead of raising AttributeError.
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

df = df[["model", "Cosine", "std"]].copy()
df["Cosine"] = pd.to_numeric(df["Cosine"], errors="coerce")
df["std"] = pd.to_numeric(df["std"], errors="coerce")
df = df.dropna(subset=["Cosine", "std"])

# Fail early with a readable message; otherwise the y-limit computation below
# raises an opaque "zero-size array" ValueError.
if df.empty:
    raise ValueError("No rows with numeric 'Cosine' and 'std' values in ./vis/shot.xlsx")

# Sort by Cosine, low to high, so the plot reads left to right.
df = df.sort_values("Cosine", ascending=True).reset_index(drop=True)

labels = df["model"].tolist()
means  = df["Cosine"].to_numpy()
stds   = df["std"].to_numpy()

# One discrete GnBu color per model
colors = sns.color_palette("GnBu", n_colors=len(df))

# =========================
# 3) Dot plot (mean ± std)
# =========================
fig, ax = plt.subplots(figsize=(4.0, 3.6))

x = np.arange(len(df))

# One errorbar call per point so that marker and error bar share a color.
for x_i, mean_i, std_i, color_i in zip(x, means, stds, colors):
    ax.errorbar(
        x_i, mean_i, yerr=std_i,
        fmt="o",
        markersize=7,
        color=color_i,           # marker color
        ecolor=color_i,          # error bar color = marker color
        elinewidth=1.6,
        capsize=4,
        capthick=1.6
    )

ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=0)

ax.set_ylabel("Cosine similarity")
ax.set_xlabel("Model")

# Tighten the y range around the error bars, clamped to [0, 1].
ymin = max(0, float(np.min(means - stds)) - 0.01)
ymax = min(1.0, float(np.max(means + stds)) + 0.01)
ax.set_ylim(ymin, ymax)

plt.tight_layout()

# =========================
# 4) Save
# =========================
plt.savefig("cosine_dotplot_with_errorbars.png", dpi=300, bbox_inches="tight")
plt.show()

# Co-reference resolution

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# ------------------ Figure size and spacing parameters ------------------ #
FIGURE_WIDTH = 15
FIGURE_HEIGHT_PER_PUB = 1
SUBPLOT_WSPACE = 0.2
SUBPLOT_HSPACE = 0.1
LEFT_MARGIN = 0.15
RIGHT_MARGIN = 0.75
TOP_MARGIN = 0.95
BOTTOM_MARGIN = 0.1

# ------------------ Plot style ------------------ #
sns.set_style("whitegrid")
plt.rcParams.update({
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
})
sns.set_context("notebook", font_scale=1.2)

# ------------------ Color scheme ------------------ #
total_color = '#084384'
pub_colors = ['#1373b2', '#42a6cb', '#77cac5', '#b2e1b9', '#d6efd0']
alpha_value = 0.8
overall_non_t_color = pub_colors[-1]

# ============================================================
# Read the pre-aggregated Excel workbook:
#   sheet1: pub | total | t_count | non_t | t_percent
#   sheet2: overall_total | overall_t_count | overall_non_t | overall_t_percent
# ============================================================
input_stats_file = "./vis/cr.xlsx"

# Open the workbook once, read both sheets from the same handle, and close it
# when done. Sheets are looked up by name ("per_pub" / "overall"); if a name is
# missing, the first / second sheet is used instead.
with pd.ExcelFile(input_stats_file) as xlsx:
    sheet_names = xlsx.sheet_names

    per_pub_sheet = "per_pub" if "per_pub" in sheet_names else sheet_names[0]
    overall_sheet = "overall" if "overall" in sheet_names else (sheet_names[1] if len(sheet_names) > 1 else sheet_names[0])

    grouped_no_total = pd.read_excel(xlsx, sheet_name=per_pub_sheet)
    overall_df = pd.read_excel(xlsx, sheet_name=overall_sheet)

# Strip whitespace from all header names
grouped_no_total.columns = grouped_no_total.columns.astype(str).str.strip()
overall_df.columns = overall_df.columns.astype(str).str.strip()

# Required columns (per_pub sheet)
required_pub_cols = {"pub", "total", "t_count", "non_t", "t_percent"}
missing_pub = required_pub_cols - set(grouped_no_total.columns)
if missing_pub:
    raise KeyError(f"Missing columns in per_pub sheet: {missing_pub}. Found: {list(grouped_no_total.columns)}")

# Required columns (overall sheet)
required_overall_cols = {"overall_total", "overall_t_count", "overall_non_t", "overall_t_percent"}
missing_overall = required_overall_cols - set(overall_df.columns)
if missing_overall:
    raise KeyError(f"Missing columns in overall sheet: {missing_overall}. Found: {list(overall_df.columns)}")

# Drop the "Total" row, if there is one (case-insensitive match)
grouped_no_total["pub"] = grouped_no_total["pub"].fillna("None").astype(str).str.strip()
grouped_no_total = grouped_no_total[grouped_no_total["pub"].str.lower() != "total"].copy()

# Overall figures come from the first row of the overall sheet
overall_total = float(overall_df.loc[0, "overall_total"])
overall_t = float(overall_df.loc[0, "overall_t_count"])
overall_non_t = float(overall_df.loc[0, "overall_non_t"])
overall_t_pct = float(overall_df.loc[0, "overall_t_percent"])


# ------------------ Plot order: ascending total (small inside, large outside) ------------------ #
labels = grouped_no_total['pub'].tolist()
values = grouped_no_total['total'].tolist()

sorted_data = sorted(zip(labels, values, range(len(labels))), key=lambda x: x[1])
sorted_labels, sorted_values, original_indices = zip(*sorted_data) if sorted_data else ([], [], [])
sorted_grouped = grouped_no_total.iloc[list(original_indices)].reset_index(drop=True) if sorted_data else grouped_no_total.copy()

N = len(sorted_labels)
max_value = max(sorted_values) if N > 0 else 1
if max_value == 0:
    # Every publisher has total == 0: avoid dividing by zero below
    # (all arcs then get zero width).
    max_value = 1

# GnBu gradients: base ring colors and the reversed gradient for the t part
base_colors = sns.color_palette("GnBu", n_colors=N if N > 0 else 1)
t_colors = sns.color_palette("GnBu_r", n_colors=N if N > 0 else 1)

# ------------------ Polar layout parameters ------------------ #
arc_max_deg = 300
arc_max_rad = np.deg2rad(arc_max_deg)
theta_start_deg = 90
theta_start_rad = np.deg2rad(theta_start_deg)

r0 = 1.0
ring_thickness = 1
gap = 0.15
label_fontsize = 20

# ------------------ Polar axes ------------------ #
fig = plt.figure(figsize=(12, 12))
ax = plt.subplot(111, polar=True)

# One ring per publisher, from the inside out. After reset_index the row
# label i is also the ring position 0..N-1.
for i, row in sorted_grouped.iterrows():
    pub_name = row['pub']
    total = float(row['total'])
    t_count = float(row['t_count'])
    t_pct = float(row['t_percent'])

    # Arc width proportional to this publisher's total; ax.bar takes the
    # centre angle, hence start + width / 2.
    width_i = (total / max_value) * arc_max_rad
    theta_i = theta_start_rad + width_i / 2
    bottom_i = r0 + i * (ring_thickness + gap)

    # Full-width base bar for the ring
    ax.bar(theta_i, ring_thickness, width=width_i, bottom=bottom_i,
           color=base_colors[i], edgecolor='white', alpha=alpha_value, linewidth=1)

    # t share, drawn over the outer half of the ring
    if t_count > 0 and total > 0:
        t_width = width_i * (t_count / total)
        t_theta = theta_start_rad + t_width / 2
        ax.bar(t_theta, ring_thickness/2, width=t_width, bottom=bottom_i + ring_thickness/2,
               color=t_colors[i], edgecolor='white', alpha=alpha_value, linewidth=1)

    # Publisher name at the start of the arc
    text_r = bottom_i + ring_thickness / 2
    ax.text(theta_start_rad, text_r, f"{pub_name}", ha='left', va='center',
            fontsize=label_fontsize, color='black')

    # Total and t percentage at the end of the arc
    ax.text(theta_start_rad + width_i, text_r, f"{int(total)} ({t_pct:.1f}%)",
            ha='left', va='center_baseline', fontsize=label_fontsize, color='black')

# Outermost ring: overall totals, spanning the full circle
outer_bottom = r0 + N * (ring_thickness + gap)
outer_thickness = ring_thickness

outer_total_width = 2 * np.pi
outer_non_t_width = outer_total_width * (overall_non_t / overall_total) if overall_total > 0 else 0.0
outer_t_width = outer_total_width * (overall_t / overall_total) if overall_total > 0 else 0.0

# The non-t segment starts at the common start angle; the t segment follows it.
theta_non_t_center = theta_start_rad + outer_non_t_width / 2
theta_t_center = theta_start_rad + outer_non_t_width + outer_t_width / 2

ax.bar(theta_non_t_center, outer_thickness, width=outer_non_t_width, bottom=outer_bottom,
       color=overall_non_t_color, edgecolor='white', alpha=alpha_value, linewidth=1)
ax.bar(theta_t_center, outer_thickness, width=outer_t_width, bottom=outer_bottom,
       color=total_color, edgecolor='white', alpha=alpha_value, linewidth=1)

outer_text_r = outer_bottom + outer_thickness/2
ax.text(theta_start_rad, outer_text_r, f"Overall\n{int(overall_total)} ({overall_t_pct:.1f}%)",
        ha='left', va='center', fontsize=label_fontsize, color='black')

# Axes cosmetics
ax.set_ylim(0, r0 + (N+1) * (ring_thickness + gap) + 0.5)
ax.set_xticks([])
ax.set_yticks([])
ax.spines['polar'].set_visible(False)
ax.grid(False)

fig.tight_layout()
plt.savefig('cr.pdf', dpi=300, bbox_inches='tight')
plt.show()

## std

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mticker

# ======================
# Figure size and style
# ======================
FIGURE_WIDTH = 6
LEFT_MARGIN = 0.15
RIGHT_MARGIN = 0.75
TOP_MARGIN = 0.95
BOTTOM_MARGIN = 0.1

sns.set_style("whitegrid")
plt.rcParams.update({
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
})
sns.set_context("notebook", font_scale=1.2)

# ======================
# GnBu colormap
# ======================
cmap = sns.color_palette("GnBu", as_cmap=True)

# ======================
# 1) Read Excel (mean & std)
# ======================
xlsx_path = "./vis/cr.xlsx"

df_mean = pd.read_excel(xlsx_path, sheet_name="mean")
df_sted = pd.read_excel(xlsx_path, sheet_name="std")

# Normalised publisher names of the mean sheet define the row order.
pub_order = df_mean["pub"].astype(str).str.strip()
df_mean["pub"] = pub_order

# ======================
# 2) Combine mean + std
# ======================
mean_values = df_mean["t_percent"].astype(float)
if "pub" in df_sted.columns:
    # Pair each std with its mean by publisher name rather than by row
    # position, so the two sheets may list publishers in different orders.
    # Publishers missing from the std sheet get NaN (no error bar).
    df_sted["pub"] = df_sted["pub"].astype(str).str.strip()
    std_by_pub = (
        df_sted.drop_duplicates("pub")
        .set_index("pub")["t_percent"]
        .astype(float)
    )
    sted_values = pub_order.map(std_by_pub)
else:
    # No publisher column in the std sheet: fall back to row-by-row pairing.
    df_sted["pub"] = pub_order
    sted_values = df_sted["t_percent"].astype(float)

stats = pd.DataFrame({
    "pub": pub_order,
    "mean": mean_values,
    "sted": sted_values
})

# ======================
# Force the x-axis order
# ======================
desired_order = ["ACS", "Elsevier", "RSC", "Springer", "Wiley", "Total"]

stats["pub"] = stats["pub"].astype(str)
stats = stats.set_index("pub").reindex(desired_order).reset_index()

# ======================
# 3) Plot: GnBu markers with same-colored error bars
# ======================
fig, ax = plt.subplots(figsize=(FIGURE_WIDTH, 5))

plt.subplots_adjust(
    left=LEFT_MARGIN,
    right=RIGHT_MARGIN,
    top=TOP_MARGIN,
    bottom=BOTTOM_MARGIN
)

x = np.arange(len(stats))
y = stats["mean"].to_numpy()
e = stats["sted"].to_numpy()

# Sample the gradient between 0.25 and 0.9 to skip its palest end.
colors = cmap(np.linspace(0.25, 0.9, len(stats)))

for x_i, y_i, e_i, color_i in zip(x, y, e, colors):
    ax.errorbar(
        x_i, y_i, yerr=e_i,
        fmt="o",
        markersize=7,
        elinewidth=1.6,
        capsize=4,
        capthick=1.6,
        color=color_i,           # marker color
        ecolor=color_i           # error bar color = marker color
    )

# ======================
# 4) Axes and formatting
# ======================
ax.set_xticks(x)
ax.set_xticklabels(stats["pub"], rotation=30, ha="right")
ax.set_ylabel("Resolution rate (%)")

ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.1f"))

# Keep only the horizontal (y) grid lines
ax.grid(axis="y", alpha=0.25)
ax.grid(axis="x", visible=False)

# Black frame on all four sides
for spine in ax.spines.values():
    spine.set_visible(True)
    spine.set_color("black")
    spine.set_linewidth(1.5)

# ======================
# 5) Save
# ======================
plt.tight_layout()
plt.savefig("crstd.pdf", dpi=300, bbox_inches="tight")
plt.show()

# Paragraph

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mticker

# ------------------ Figure size and spacing parameters ------------------ #
FIGURE_WIDTH = 15  # figure width, inches
FIGURE_HEIGHT_PER_PUB = 1  # height of each publisher subplot, inches
SUBPLOT_WSPACE = 0.2  # horizontal spacing between subplots
SUBPLOT_HSPACE = 0.1  # vertical spacing between subplots
LEFT_MARGIN = 0.15  # left margin
RIGHT_MARGIN = 0.75  # right margin, kept small to leave room for the legend
TOP_MARGIN = 0.95  # top margin
BOTTOM_MARGIN = 0.1  # bottom margin

# ------------------ Plot style ------------------ #
sns.set_style("whitegrid")
plt.rcParams.update({
    "axes.edgecolor": "black",  # black frame
    "axes.linewidth": 1.0,      # frame width
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
})
sns.set_context("notebook", font_scale=1.2)

# ------------------ Color scheme ------------------ #
total_color = '#084384'  # color of the Total reference line
pub_colors = ['#1373b2', '#42a6cb', '#77cac5', '#b2e1b9', '#d6efd0']  # publisher line colors
alpha_value = 1  # lines are drawn fully opaque

# ------------------ Load data ------------------ #
df = pd.read_excel("./vis/parag.xlsx")  # expected columns: pub, 25-shot, 50-shot, 75-shot, 100-shot

# The "Total" row is the reference series (assumes exactly one such row).
total_data = df[df['pub'] == 'Total'].iloc[0]

# Every other row is a publisher
publishers_df = df[df['pub'] != 'Total']

# Long format: one row per (publisher, shot)
df_long = pd.melt(publishers_df, id_vars=['pub'], var_name='shot', value_name='value')

# Numeric shot count, used for ordering and as the x coordinate.
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
df_long['shot_num'] = df_long['shot'].str.extract(r'(\d+)').astype(int)
df_long = df_long.sort_values('shot_num')

# Publisher names, in order of first appearance
unique_pub = df_long['pub'].unique()

# Mean value per publisher; it decides which color each publisher gets.
pub_averages = {
    pub: df_long.loc[df_long['pub'] == pub, 'value'].mean()
    for pub in unique_pub
}

# Publishers ranked by mean value, highest first
sorted_pubs = sorted(pub_averages.items(), key=lambda x: x[1], reverse=True)
sorted_pub_names = [name for name, _ in sorted_pubs]

# Highest-ranked publisher gets the first color, and so on; publishers beyond
# the end of the color list all reuse the last color.
pub_color_map = {
    pub: pub_colors[min(rank, len(pub_colors) - 1)]
    for rank, pub in enumerate(sorted_pub_names)
}

# x-axis labels and tick positions
shot_columns = ['25-shot', '50-shot', '75-shot', '100-shot']
shot_nums = [int(col.split('-')[0]) for col in shot_columns]

# ------------------ Subplots (small multiples) ------------------ #
# Total height grows with the number of publishers
total_height = FIGURE_HEIGHT_PER_PUB * len(unique_pub)
fig, axes = plt.subplots(nrows=len(unique_pub), ncols=1, sharex=True,
                         figsize=(FIGURE_WIDTH, total_height))

# Spacing between subplots and figure margins
plt.subplots_adjust(wspace=SUBPLOT_WSPACE, hspace=SUBPLOT_HSPACE,
                    left=LEFT_MARGIN, right=RIGHT_MARGIN,
                    top=TOP_MARGIN, bottom=BOTTOM_MARGIN)

# Handles and labels for the single figure-level legend
legend_handles = []
legend_labels = []

# plt.subplots returns a bare Axes (not an array) when there is one publisher.
axes_list = np.atleast_1d(axes)

# One subplot per publisher
for i, (pub_name, ax) in enumerate(zip(unique_pub, axes_list)):
    # Rows of the current publisher
    subset = df_long[df_long['pub'] == pub_name]

    # Color assigned from the publisher's mean-value rank
    pub_color = pub_color_map[pub_name]

    # Publisher line
    line1, = ax.plot(subset['shot_num'], subset['value'], marker='o', markersize=6,
                     color=pub_color, alpha=alpha_value, label=pub_name)

    # Total drawn in every subplot as a dashed reference line
    line2, = ax.plot(shot_nums, [total_data[col] for col in shot_columns],
                     marker='s', markersize=5, color=total_color, alpha=alpha_value,
                     linestyle='--', label='Total')

    # The Total entry is added to the legend only once
    if i == 0:
        legend_handles.append(line2)
        legend_labels.append('Total')

    # Legend entry for this publisher
    legend_handles.append(line1)
    legend_labels.append(pub_name)

    # y tick labels with two decimals
    ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f'))

    # Only the middle subplot carries the y-axis label
    if i == len(unique_pub) // 2:
        ax.set_ylabel("Cosine similarity", rotation=90, labelpad=15, va='center', fontsize=18)
    else:
        ax.set_ylabel("")

    # Only the last subplot shows x ticks and labels
    if i < len(unique_pub) - 1:
        ax.set_xticklabels([])
    else:
        ax.set_xlabel("Shot number")
        ax.set_xticks(shot_nums)
        ax.set_xticklabels(shot_columns)

    # Black frame on all four sides
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color('black')
        spine.set_linewidth(1.0)

    # Remove any per-subplot legend
    if ax.get_legend():
        ax.get_legend().remove()

# Single legend to the right of the figure, anchored outside the axes
fig.legend(legend_handles, legend_labels, loc='center left',
           bbox_to_anchor=(1, 0.5), frameon=False)

fig.tight_layout()
plt.savefig('parag.pdf', dpi=300, bbox_inches='tight')
plt.show()

## std

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# =========================
# 1) Style
# =========================
sns.set_style("whitegrid")
rc_overrides = {
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
}
plt.rcParams.update(rc_overrides)
sns.set_context("notebook", font_scale=1.2)

# =========================
# 2) Load mean + std
# =========================
xlsx_path = "./vis/parag.xlsx"
df_mean = pd.read_excel(xlsx_path, sheet_name="mean")
df_std  = pd.read_excel(xlsx_path, sheet_name="std")

# Strip stray whitespace from the header names of both sheets
df_mean.columns = [c.strip() for c in df_mean.columns]
df_std.columns  = [c.strip() for c in df_std.columns]

shot_columns = ['25-shot', '50-shot', '75-shot', '100-shot']

# =========================
# 3) Long format (the Total row is kept; see the optional filter below)
# =========================
mean_long = df_mean.melt(id_vars=['pub'], value_vars=shot_columns,
                         var_name='shot', value_name='value')
std_long  = df_std.melt(id_vars=['pub'], value_vars=shot_columns,
                        var_name='shot', value_name='std')

# One row per (publisher, shot) carrying both the mean and the std
df_long = mean_long.merge(std_long, on=['pub', 'shot'])

# To leave Total out of the plot, uncomment the next line
# df_long = df_long[df_long['pub'] != 'Total']

# Ordered categorical so the shots keep their declared order
df_long['shot'] = pd.Categorical(
    df_long['shot'],
    categories=shot_columns,
    ordered=True
)

# =========================
# 4) Plot: single panel, markers + error bars
# =========================
plt.figure(figsize=(10, 5))

palette = sns.color_palette("GnBu", n_colors=len(shot_columns))
markers = ['o', 's', '^', 'D']

# One series per shot setting, each with its own marker and color
for i, shot in enumerate(shot_columns):
    sub = df_long.loc[df_long['shot'] == shot]
    series_color = palette[i]

    plt.errorbar(
        sub['pub'],
        sub['value'],
        yerr=sub['std'],
        fmt=markers[i],
        linestyle='none',
        markersize=7,
        color=series_color,
        ecolor=series_color,
        elinewidth=1.5,
        capsize=3,
        capthick=1.5,
        label=shot
    )

# =========================
# 5) Axis & legend
# =========================
plt.ylabel("Cosine similarity")
plt.xlabel("Publisher")

plt.ylim(0.88, 1.00)  # fixed range; adjust to the data if needed
plt.xticks(rotation=30, ha='right')

legend_options = dict(
    frameon=False,
    ncol=4,
    loc='upper center',
    bbox_to_anchor=(0.5, 1.15),
)
plt.legend(**legend_options)

# Black frame on all four sides
ax = plt.gca()
for spine in ax.spines.values():
    spine.set_visible(True)
    spine.set_color("black")
    spine.set_linewidth(1.0)

plt.tight_layout()
plt.savefig("paragstd.pdf", dpi=300, bbox_inches="tight")
plt.show()

# compare

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mticker
from matplotlib.colors import LinearSegmentedColormap

# ------------------ Plot style ------------------
sns.set_style("whitegrid")
plt.rcParams.update({
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    "axes.facecolor": "white",
    "grid.color": "gray",
    "grid.alpha": 0.2,
    "grid.linestyle": "-",
    "xtick.color": "black",
    "ytick.color": "black",
    "text.color": "black",
    "font.size": 12,
    "figure.facecolor": "white",
})
sns.set_context("notebook", font_scale=1.2)

# ------------------ Excel path: point this at your own .xlsx ------------------
excel_path = "./vis/compare.xlsx"

# ------------------ Column groups; all_columns is their concatenation, in this order ------------------
group1 = ["Metal Source", "Organic Linkers Source", "Modulator Source", "Solvent Source",
          "Quantity of Metal", "Quantity of Organic Linkers", "Quantity of Modulator", "Quantity of Solvent"]
group2 = ["pH", "Synthesis Temperature", "Synthesis Time", "Equipment"]
group3 = ["Crystal Morphology", "Yield"]
all_columns = group1 + group2 + group3

# ------------------ Header normalisation (whitespace differences would otherwise cause KeyError) ------------------
def norm(s: str) -> str:
    """Return ``str(s)`` with each whitespace run collapsed to one space and the ends trimmed."""
    # Splitting on whitespace drops leading/trailing runs and empty pieces,
    # so re-joining with single spaces collapses and trims in one step.
    return " ".join(str(s).split())

# Whitespace-normalised copies of the column lists, used for all lookups below.
all_columns_n = list(map(norm, all_columns))
group1_n = list(map(norm, group1))
group2_n = list(map(norm, group2))
group3_n = list(map(norm, group3))

# ------------------ Colours: pure white for 0, GnBu gradient for everything else ------------------
base = sns.color_palette("GnBu", n_colors=256)
colors = [(1, 1, 1), *base]  # the leading entry is pure white, reserved for 0
cmap0 = LinearSegmentedColormap.from_list(
    "GnBu_with_white0",
    colors,
    N=257,
)

# ------------------ Read the Excel file & plot only sheets "1"-"3" ------------------
# NOTE: sheets are matched by *name* ("1", "2", "3"). If your workbook uses
# names such as "Sheet1/Sheet2/Sheet3", edit the list below accordingly.
# The workbook is opened in a `with` block so that its file handle is
# released as soon as the wanted sheets have been loaded.
with pd.ExcelFile(excel_path) as xls:
    target_sheets = [s for s in ["1", "2", "3"] if s in xls.sheet_names]

    if not target_sheets:
        raise ValueError(f"未找到名为 1/2/3 的 sheet。当前 sheet_names = {xls.sheet_names}")

    sheet_frames = {
        name: pd.read_excel(xls, sheet_name=name, index_col=0)
        for name in target_sheets
    }


def _format_cell(v):
    """Return the annotation text for one cell: "" for zero, else two decimals."""
    return "" if float(v) == 0.0 else f"{float(v):.2f}"


for sheet_name, df in sheet_frames.items():
    # Drop the "Accuracy" row (if present)
    df = df.drop(index="Accuracy", errors="ignore")

    # Normalise the column headers
    df.columns = [norm(c) for c in df.columns]

    # N/A / blank / any other non-numeric cell -> 0.
    # `errors="coerce"` turns every value that is not a number into NaN, so a
    # separate replace step for "N/A", "NA" and "" is not needed.
    df = df.apply(pd.to_numeric, errors="coerce").fillna(0.0)

    # Select the plotted columns in their fixed order; columns missing from
    # this sheet are added and filled with 0 so every figure has the same layout.
    selected_df = df.reindex(columns=all_columns_n, fill_value=0.0)

    # Do not print zeros: the annotation is a string matrix where 0 -> "".
    # Column-wise Series.map is used rather than the deprecated
    # DataFrame.applymap.
    annot = selected_df.apply(lambda col: col.map(_format_cell))

    # Zero cells are masked out so they receive no colour at all
    mask_zero = (selected_df == 0)

    # ------------------ Figure size ------------------
    cell_size = 2
    fig_width = len(all_columns_n) * cell_size
    fig_height = len(selected_df.index) * cell_size

    fig = plt.figure(figsize=(fig_width, fig_height))

    ax = sns.heatmap(
        selected_df,
        annot=annot, fmt="",
        cmap=cmap0,
        square=True,
        linewidths=0.5,
        linecolor="black",
        mask=mask_zero,                 # leave zero cells uncoloured
        cbar_kws={"label": "Value"},
        annot_kws={"fontsize": 16, "ha": "center", "va": "center"},
        vmin=0, vmax=1                  # drop or adjust if values fall outside [0, 1]
    )

    # Masked cells show the axes background, so make that pure white
    ax.set_facecolor("white")

    # Tick labels, centred on each cell
    ax.set_xticks(np.arange(len(all_columns_n)) + 0.5)
    ax.set_xticklabels(all_columns_n, rotation=45, ha="right", fontsize=18)
    ax.set_yticks(np.arange(len(selected_df.index)) + 0.5)
    ax.set_yticklabels(selected_df.index, fontsize=18)

    # White separator on the y axis after the second row.
    # NOTE(review): the position is hard-coded to y=2; presumably the first
    # two rows form one group -- confirm against the sheet layout.
    ax.axhline(y=2, color="white", linewidth=3)

    # White separators between the three column groups
    ax.axvline(x=len(group1_n), color="white", linewidth=3)
    ax.axvline(x=len(group1_n) + len(group2_n), color="white", linewidth=3)

    # Group titles, centred over each column group
    group_fontsize = 20
    ax.text(len(group1_n)/2, -0.05, "Chemicals", ha="center", fontsize=group_fontsize)
    ax.text(len(group1_n) + len(group2_n)/2, -0.05, "Conditions", ha="center", fontsize=group_fontsize)
    ax.text(len(group1_n) + len(group2_n) + len(group3_n)/2, -0.05, "Crystallizations", ha="center", fontsize=group_fontsize)

    # Colorbar styling
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=18)
    cbar.set_label("Value", fontsize=18)
    cbar.ax.yaxis.set_major_formatter(mticker.FormatStrFormatter("%.2f"))

    plt.tight_layout()
    plt.savefig(f"heatmap_sheet_{sheet_name}.pdf", format="pdf", bbox_inches="tight")
    plt.show()
    # Release the figure so figures do not pile up across sheets when this
    # runs outside an inline notebook backend.
    plt.close(fig)

# kappa

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score

# =========================
# 1. Global plotting style
# =========================
sns.set_style("whitegrid")
plt.rcParams.update({
    # Backgrounds
    "figure.facecolor": "white",
    "axes.facecolor": "white",
    # Axes frame
    "axes.edgecolor": "black",
    "axes.linewidth": 1.0,
    # Grid
    "grid.color": "gray",
    "grid.linestyle": "-",
    "grid.alpha": 0.2,
    # Text and ticks
    "text.color": "black",
    "xtick.color": "black",
    "ytick.color": "black",
    "font.size": 12,
})
sns.set_context("notebook", font_scale=1.2)

# GnBu colormap (publication-friendly, low saturation)
cmap = sns.color_palette(
    "GnBu",
    as_cmap=True,
)

# =========================
# 2. Load annotation data
# =========================
xlsx_path = "./vis/paragraphmeta.xlsx"
df = pd.read_excel(xlsx_path)

# Columns 6-9 (positions 5..8) hold the expert labels (Scheme B: 0/1/2).
labels = df.iloc[:, 5:9].copy()
labels.columns = [
    "expert1",
    "expert2",
    "expert3",
    "expert4",
]

# Coerce every cell to a number, then discard rows in which any expert's
# label is missing or could not be parsed.
labels = labels.apply(pd.to_numeric, errors="coerce").dropna()

# Keep only the rows where all four labels are one of 0, 1 or 2.
valid_mask = labels.isin([0, 1, 2]).all(axis=1)
labels = labels[valid_mask].astype(int)

raters = list(labels.columns)

print(labels.head())

# =========================
# 3. Pairwise Cohen’s Kappa
# =========================
# Square rater-by-rater matrix: the diagonal is fixed at 1.0 and every
# off-diagonal cell (row i, column j) is cohen_kappa_score(labels[i], labels[j]).
kappa_by_column = {
    j: {
        i: 1.0 if i == j else cohen_kappa_score(labels[i], labels[j])
        for i in raters
    }
    for j in raters
}
K = pd.DataFrame(kappa_by_column, index=raters, columns=raters, dtype=float)

# =========================
# 4. Heatmap visualization
# =========================
fig, ax = plt.subplots(figsize=(8, 4))

# Appearance of the kappa matrix: fixed 0-1 colour range, two-decimal cell
# annotations and black cell borders.
heatmap_style = {
    "cmap": cmap,
    "vmin": 0,
    "vmax": 1,
    "annot": True,
    "fmt": ".2f",
    "linewidths": 0.8,
    "linecolor": "black",
    "square": True,
    "cbar_kws": {"label": "Cohen’s κ"},
}
sns.heatmap(K, ax=ax, **heatmap_style)

# Optional title (disabled):
# ax.set_title("Scheme B: Pairwise Cohen’s Kappa (3-level quality)")

# The rater names on the ticks are self-explanatory, so blank the axis labels.
ax.set(xlabel="", ylabel="")

plt.tight_layout()
fig.savefig("pairwise_kappa.pdf", bbox_inches="tight")
plt.show()

# =========================
# 5. Consensus distribution
# =========================
# Consensus = per-paragraph median across the experts (robust to outliers).
# NOTE(review): with an even number of raters the median can fall on x.5;
# `astype(int)` truncates it, i.e. such ties resolve to the LOWER level.
# Confirm that this tie-breaking rule is the intended one.
consensus = labels.median(axis=1).astype(int)

# Count every level of the 0/1/2 scale, including levels that no paragraph
# received. Without the reindex a missing level is dropped from the counts,
# the categorical bars shift left and no longer line up with the three
# tick positions set below.
quality_levels = [0, 1, 2]
counts = consensus.value_counts().reindex(quality_levels, fill_value=0)

fig, ax = plt.subplots(figsize=(5, 3.5))

sns.barplot(
    x=counts.index.astype(int),
    y=counts.values,
    palette="GnBu",
    edgecolor="black",
    ax=ax
)

ax.set_xlabel("Consensus quality level (median of 4 experts)")
ax.set_ylabel("Count of paragraphs")
# One tick per quality level; the bars sit at categorical positions 0, 1, 2.
ax.set_xticks([0, 1, 2])

plt.tight_layout()
fig.savefig("consensus_distribution.pdf", bbox_inches="tight")
plt.show()