In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.preprocessing import StandardScaler

# =========================
# 1. Load data
# =========================
# Input table. A relative path is resolved against the current working
# directory, so the result depends on where the notebook/script is started.
DATA_PATH = Path("ppi.csv")

# Fail early and report the fully resolved location, so that a wrong working
# directory is obvious from the message itself.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Input file not found: '{DATA_PATH.resolve()}'. "
        "Start the notebook from the directory that contains it, "
        "or point DATA_PATH at the correct location."
    )

df = pd.read_csv(DATA_PATH)
print("Data shape:", df.shape)

# =========================
# 2. Define label column (from your file)
# =========================
label_col = "p_interface"   # <-- your true label column

# Check the label column up front so a wrong name is reported here, with
# context, instead of as a bare KeyError when the labels are extracted below.
if label_col not in df.columns:
    raise KeyError(f"Label column '{label_col}' not found in the loaded data")

# =========================
# 3. Exclude non-feature columns
# =========================
# The label itself plus identifier / bookkeeping columns that must not be
# fed to the embedding.
exclude_cols = [
    label_col,
    "Unnamed: 0",
    "domain",
    "aa_ProtPosition",
    "uniprot_id",
    "sequence",
]

# Keep only numeric feature columns
feature_cols = [
    c
    for c in df.columns
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(df[c])
]

print("Number of features used:", len(feature_cols))

# An empty feature matrix would only fail later inside the scaler with a
# less specific message, so stop here.
if not feature_cols:
    raise ValueError(
        "No numeric feature columns remain after applying exclude_cols"
    )

# Feature matrix (one row per record, one column per selected feature) and
# the label vector cast to integers.
X = df[feature_cols].to_numpy()
y = df[label_col].astype(int).to_numpy()

# =========================
# 4. Standardize features (IMPORTANT)
# =========================
# Fit the scaler on the feature matrix first, then apply it to the same
# matrix; the fitted `scaler` object is kept for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# =========================
# 5. Run UMAP
# =========================
# Hyperparameters are gathered in one mapping and unpacked into the
# constructor, so they can be inspected or logged alongside the result.
umap_params = {
    "n_neighbors": 15,
    "min_dist": 0.1,
    "n_components": 2,
    "random_state": 42,
}
reducer = umap.UMAP(**umap_params)

# Fit on the standardized features and keep the resulting embedding.
X_umap = reducer.fit_transform(X_scaled)
print("UMAP embedding shape:", X_umap.shape)

# =========================
# 6. Plot
# =========================
# Explicit figure/axes handles instead of the implicit pyplot state.
fig, ax = plt.subplots()

# One marker per embedded row, coloured by its integer label in `y`.
sc = ax.scatter(X_umap[:, 0], X_umap[:, 1], c=y, s=5)

ax.set_title("UMAP of PPI residues (colored by p_interface)")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")

# NOTE(review): the colorbar is a continuous scale although the label text
# describes a 0/1 variable - confirm this is the intended presentation.
fig.colorbar(sc, ax=ax, label="p_interface (0=non-interface, 1=interface)")

fig.tight_layout()
plt.show()

# =========================
# 7. Save embedding
# =========================
# Build the two coordinate columns from the embedding, then attach the
# labels as a third column.
umap_df = pd.DataFrame(
    {name: X_umap[:, i] for i, name in enumerate(("UMAP1", "UMAP2"))}
)
umap_df["p_interface"] = y

# Write without the row index so the file holds only the three columns.
umap_df.to_csv("ppi_umap_embedding.csv", index=False)

print("UMAP embedding saved to ppi_umap_embedding.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'ppi.csv'