## Clustering

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_rand_score
from sklearn_extra.cluster import KMedoids
from sklearn.datasets import make_blobs
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans
import os
import sys

# Locate the directory that is added to the import path.
# os.path.abspath("") resolves to the current working directory, so
# `current_dir` is the *parent* of the working directory (not the folder the
# notebook itself lives in, despite the name).
current_dir = os.path.dirname(os.path.abspath(""))

# One more level up: the grandparent of the working directory.
# NOTE(review): presumably this is where the `sparsemedoid` package lives
# relative to where the notebook is launched from — confirm.
parent_dir = os.path.dirname(current_dir)

# Make that directory importable. The membership check keeps the cell
# idempotent: re-running it does not pile duplicate entries onto sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from sparsemedoid import clustering

In [2]:
# Parameter grid swept by the clustering cell further down.
# Each value is forwarded as the `s` argument of clustering.sparse_kmedoids
# (presumably its sparsity/tuning parameter — TODO confirm against sparsemedoid).
hyperparams = [1.2, 1.5, 3, 10]
# Each name is forwarded as the `distance_type` argument of clustering.sparse_kmedoids.
distance_types = ['gower','wishart','podani']
# Each value is forwarded as the `k` argument (number of clusters).
clusters = [4, 7, 9] # TODO: try larger cluster counts later
# Relative path, resolved against the notebook's working directory.
maf_file = "gene_level_data/merged.csv"

# The first CSV column is used as the row index rather than as a data column.
maf_df = pd.read_csv(maf_file, index_col=0)

In [3]:
# Take the first 200 rows and keep only the columns that have more than one
# distinct non-null value within those rows. The callable handed to .loc
# receives that 200-row frame, so the slice is only spelled out once.
# NOTE(review): the saved output of this cell shows a Tumor_Sample_Barcode
# column surviving the filter; it looks like a sample identifier rather than a
# feature — confirm whether it should be excluded before clustering.
subset_dfs = [maf_df.head(200).loc[:, lambda frame: frame.nunique() > 1]]
subset_dfs

[             Tumor_Sample_Barcode  PIK3CA  TP53  TTN  CDH1  GATA3  MUC16  \
 1    TCGA-3C-AAAU-01A-11D-A41F-09       0     0    0     0      1      0   
 2    TCGA-3C-AALI-01A-11D-A41F-09       0     1    1     0      0      0   
 3    TCGA-3C-AALJ-01A-31D-A41F-09       0     0    1     0      1      0   
 4    TCGA-3C-AALK-01A-11D-A41F-09       1     0    0     0      0      0   
 5    TCGA-4H-AAAK-01A-12D-A41F-09       0     0    0     1      0      0   
 ..                            ...     ...   ...  ...   ...    ...    ...   
 196  TCGA-A8-A095-01A-11W-A019-09       1     0    1     0      0      1   
 197  TCGA-A8-A096-01A-11W-A019-09       1     0    0     0      0      0   
 198  TCGA-A8-A097-01A-11W-A050-09       0     1    1     0      0      0   
 199  TCGA-A8-A099-01A-11W-A019-09       0     0    0     0      0      0   
 200  TCGA-A8-A09A-01A-11W-A019-09       0     0    0     0      0      1   
 
      MAP3K1  KMT2C  HMCN1  ...  In_Frame_Del.y  In_Frame_Ins.y  \
 1     

In [13]:
# Rebuild the input subset so this cell only depends on `maf_df`:
# first 200 rows, keeping columns with more than one distinct value.
subset_dfs = [maf_df.head(200).loc[:, maf_df.head(200).nunique() > 1]]

# One label per entry of `subset_dfs`; used in file names and progress messages.
runs = ["numerical+categorical"]

# Every file produced by this cell is written directly into this directory,
# so it is created once up front instead of before each individual write.
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

for i, subset_df in enumerate(subset_dfs):
    run_name = runs[i]

    # NOTE(review): `.values` on a frame mixing string and numeric columns
    # yields an object array; the saved output of the earlier display cell
    # shows a Tumor_Sample_Barcode column in this subset — confirm that
    # clustering.sparse_kmedoids is meant to receive it as a feature.
    X_full = subset_df.values
    X = X_full

    N = X.shape[0]
    P = X.shape[1]

    # Every (K, distance, S) combination, in the same order as nested loops
    # over clusters -> distance_types -> hyperparams.
    param_grid = [
        (K, distance, S)
        for K in clusters
        for distance in distance_types
        for S in hyperparams
    ]
    trials = len(param_grid)

    # One column per trial in each of the result tables.
    Scores = np.zeros((1, trials))
    all_feature_weights = np.zeros((P, trials))
    all_feature_orders = np.zeros((P, trials))
    all_cluster_labels = np.zeros((N, trials))

    result_labels = []

    # The counter is called `trial` (not `iter`) so the builtin iter() is not
    # shadowed for the remainder of the kernel session.
    for trial, (K, distance, S) in enumerate(param_grid):
        results_path_prefix = f"Run={run_name}_P={P}_K={K}_dist={distance}_S={S}"
        result_labels.append(results_path_prefix)

        print(f"Run={run_name}_P={P} | K={K} | {distance} {S} started")

        (
            cluster_labels,
            feature_weights,
            feature_order,
            weighted_distances,
        ) = clustering.sparse_kmedoids(
            X,
            distance_type=distance,
            k=K,
            s=S,
            max_attempts=6,
            method="pam",
            init="build",
            max_iter=100,
            random_state=None,
        )

        # `weighted_distances` is passed as a precomputed distance matrix,
        # so the silhouette is evaluated on it directly.
        Scores[0, trial] = metrics.silhouette_score(
            weighted_distances, cluster_labels, metric="precomputed"
        )

        all_feature_weights[:, trial] = feature_weights
        all_cluster_labels[:, trial] = cluster_labels
        # Concatenate the three per-type entries of `feature_order` into one
        # vector (numerical, then binary, then categorical).
        feature_order_vec = (
            feature_order["Numerical Features"]
            + feature_order["Binary Features"]
            + feature_order["Categorical Features"]
        )
        all_feature_orders[:, trial] = feature_order_vec

        # Persist this trial's distance matrix under its own prefix.
        weighted_distances_df = pd.DataFrame(weighted_distances)
        weighted_distances_df.to_csv(
            f"{results_dir}/{results_path_prefix}_distance_matrix.csv", index=False
        )

    # Summary tables hold one column per trial, so they are named after the run
    # as a whole. Naming them with `results_path_prefix` would tag them with
    # whichever (K, distance, S) combination happened to run last, and would
    # fail with a NameError when the parameter grid is empty.
    run_prefix = f"Run={run_name}_P={P}"

    labels_df = pd.DataFrame(all_cluster_labels, columns=result_labels)
    weights_df = pd.DataFrame(all_feature_weights, columns=result_labels)
    Scores_df = pd.DataFrame(Scores, columns=result_labels)
    orders_df = pd.DataFrame(all_feature_orders, columns=result_labels)

    summary_tables = {
        "_cluster_labels.csv": labels_df,
        "_cluster_weights.csv": weights_df,
        "_cluster_scores.csv": Scores_df,
        "_feature_orders.csv": orders_df,
    }
    for suffix, table in summary_tables.items():
        table.to_csv(f"{results_dir}/{run_prefix}{suffix}", index=False)

Run=numerical+categorical_P=7113 | K=4 | gower 1.2 started
Run=numerical+categorical_P=7113 | K=4 | gower 1.5 started
Run=numerical+categorical_P=7113 | K=4 | gower 3 started
Run=numerical+categorical_P=7113 | K=4 | gower 10 started
Run=numerical+categorical_P=7113 | K=4 | wishart 1.2 started
Run=numerical+categorical_P=7113 | K=4 | wishart 1.5 started
Run=numerical+categorical_P=7113 | K=4 | wishart 3 started
Run=numerical+categorical_P=7113 | K=4 | wishart 10 started
Run=numerical+categorical_P=7113 | K=4 | podani 1.2 started
Run=numerical+categorical_P=7113 | K=4 | podani 1.5 started
Run=numerical+categorical_P=7113 | K=4 | podani 3 started
Run=numerical+categorical_P=7113 | K=4 | podani 10 started
Run=numerical+categorical_P=7113 | K=7 | gower 1.2 started
Run=numerical+categorical_P=7113 | K=7 | gower 1.5 started
Run=numerical+categorical_P=7113 | K=7 | gower 3 started
Run=numerical+categorical_P=7113 | K=7 | gower 10 started
Run=numerical+categorical_P=7113 | K=7 | wishart 1.2 sta

In [None]:
# Display the first (and only) filtered subset held in `subset_dfs`.
# NOTE(review): the saved output under this cell, `(981, 15451)`, looks like a
# `.shape` tuple left over from an earlier version of the cell and a different
# subset — re-run the cell to refresh it.
subset_dfs[0]

(981, 15451)