In [None]:
# main figure c 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm

# --- Load the enrichment table ------------------------------------------
# Tab-separated input. The columns read below are p_adjusted, log2FC,
# nNovelPair and Category.
file_path = "CATH_cntA_cntB_testtype_pval_padjusted_direction_log2FC_-log10p-adj_nNovelPair_meannDomain_cathname-significant.tsv"
df = pd.read_csv(file_path, sep="\t")

# y axis of the volcano plot.
# NOTE(review): a p_adjusted of exactly 0 gives +inf here and is not removed
# by the dropna below - confirm the input never contains 0.
df["-log10(p_adjusted)"] = -np.log10(df["p_adjusted"])

# Thresholds shared by the colouring, the highlighted subset and the guide lines.
p_adjusted_threshold = 0.05
log2fc_threshold = 1

# Rows passing the adjusted p-value cut-off.
significant_df = df[df["p_adjusted"] < p_adjusted_threshold]

# --- Colour scale over nNovelPair of the significant rows ----------------
# cmap and norm are always defined so that the colouring below and the legend
# cell can run even when there are no significant rows or when all of them
# share a single nNovelPair value (Normalize maps everything to 0 in that case).
cmap = cm.viridis
min_nNovelPair = significant_df["nNovelPair"].min()
max_nNovelPair = significant_df["nNovelPair"].max()
if pd.notna(min_nNovelPair) and pd.notna(max_nNovelPair):
    norm = mcolors.Normalize(vmin=min_nNovelPair, vmax=max_nNovelPair)
else:
    # No usable nNovelPair among significant rows: fall back to a unit range.
    norm = mcolors.Normalize(vmin=0, vmax=1)

# --- One colour per row --------------------------------------------------
#   |log2FC| <= threshold                  -> "grey" (takes precedence)
#   significant with a known nNovelPair    -> RGB tuple from the colormap
#   anything else                          -> None (removed by dropna below)
point_colors = []
for fold_change, p_adj, n_novel in zip(df["log2FC"], df["p_adjusted"], df["nNovelPair"]):
    if abs(fold_change) <= log2fc_threshold:
        point_colors.append("grey")
    elif p_adj < p_adjusted_threshold and pd.notna(n_novel):
        # Keep only R, G, B; alpha is set globally when plotting.
        point_colors.append(tuple(cmap(norm(n_novel))[:3]))
    else:
        point_colors.append(None)
df["color"] = pd.Series(point_colors, index=df.index, dtype=object)

# Drop rows with missing values in the columns the plot needs.
df = df.dropna(subset=["log2FC", "-log10(p_adjusted)", "color", "nNovelPair", "Category"])

# Highlighted subset: among significant rows with log2FC above the threshold,
# the 10 with the largest nNovelPair.
top_abun_points = significant_df[significant_df["log2FC"] > log2fc_threshold] \
                                .nlargest(10, "nNovelPair")

# --- Static volcano plot ---------------------------------------------------
# Reads df, top_abun_points, p_adjusted_threshold and log2fc_threshold, which
# are defined earlier in this cell.
fig, ax = plt.subplots(figsize=(15, 6))

# Rows whose Category appears in top_abun_points get a thicker outline.
is_highlighted = df["Category"].isin(top_abun_points["Category"])
outline_widths = np.where(is_highlighted, 1.2, 0.45)

# Draw all points in a single scatter call (one collection instead of one per
# row). Marker area is nNovelPair scaled by 1.2; `color` sets both face and
# edge colour, so the outline width only thickens the marker in its own colour.
if not df.empty:
    ax.scatter(
        df["log2FC"].to_numpy(),
        df["-log10(p_adjusted)"].to_numpy(),
        color=list(df["color"]),
        marker="o",
        s=(df["nNovelPair"] * 1.2).to_numpy(),
        alpha=0.7,
        linewidths=outline_widths,
    )

# Dashed vertical guides at -/+ the log2FC threshold.
for x_cut in (-log2fc_threshold, log2fc_threshold):
    ax.axvline(x=x_cut, linestyle="dashed", color="gray", linewidth=0.5)

# Dashed horizontal guide at the adjusted p-value threshold.
ax.axhline(y=-np.log10(p_adjusted_threshold), linestyle="dashed", color="lightgrey", linewidth=0.5)

# Axis labels
ax.set_xlabel("Log₂(freq. of CATH category in Novel set / freq. of CATH category in Non-novel set)", fontsize=15)
ax.set_ylabel("−Log₁₀(multiplicity-corrected p-value)", fontsize=15)

# Save plot as SVG
plt.savefig("volcano_svg_final.svg", format="svg")

# Show plot
plt.show()

In [None]:
# Create a manual legend with circles of varying color and size
from matplotlib.lines import Line2D

# Stand-alone horizontal legend in which each circle encodes its value twice:
# by colour and by marker area. It reuses `cmap` and `norm` created in the
# volcano-plot cell, so that cell has to be executed first.
fig, ax = plt.subplots(figsize=(3, 1.5))  # width can be reduced later if needed

# Values shown in the legend.
# NOTE(review): 113 is presumably the largest nNovelPair in the data - confirm.
legend_values = [1, 25, 50, 75, 100, 113]

# Circles are laid out left to right, 0.5 axis units apart.
x_positions = [idx * 0.5 for idx in range(len(legend_values))]

# Circles: same colour mapping and same 1.2 area scaling as the volcano plot.
for x_pos, n_pairs in zip(x_positions, legend_values):
    ax.scatter(x_pos, 0, s=n_pairs * 1.2, color=cmap(norm(n_pairs)), alpha=0.9)

# Numeric label centred underneath each circle.
for x_pos, n_pairs in zip(x_positions, legend_values):
    ax.text(x_pos, -0.15, str(n_pairs), ha='center', va='top', fontsize=10)

# Frame the row of circles with half a unit of padding and hide the axes.
ax.set_xlim(-0.5, (len(legend_values) - 1) * 0.5 + 0.5)
ax.set_ylim(-0.5, 0.5)
ax.axis("off")

# Write the legend as a tightly cropped SVG with a transparent background.
plt.savefig("volcano_svg_final_circle-legend.svg", format="svg", bbox_inches='tight', transparent=True)
plt.show()
