In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from qiime2 import Artifact
from scipy.spatial.distance import cdist

```bash
qiime taxa collapse \
  --i-table Data/noMito_noChloro-filtered-table.qza \
  --i-taxonomy Data/taxonomy-silva.qza \
  --p-level 6 \
  --o-collapsed-table Data/tax-6-table.qza
```

In [97]:
# Load the level-6 collapsed feature table and view it as a pandas DataFrame.
df = (
    Artifact.load("Data/tax-6-table.qza")
    .view(pd.DataFrame)
)

In [98]:
# Convert each row of the table to proportions that sum to 1.
# A plain float ndarray replaces np.matrix (which NumPy discourages), and the
# out-of-place division also works if the table holds integer counts.
# keepdims=True keeps the row sums as a column so they broadcast across each row.
counts = df.to_numpy(dtype=float)
mat = counts / counts.sum(axis=1, keepdims=True)
df = pd.DataFrame(mat,
                  index=df.index,
                  columns=df.columns)

In [99]:
# Read the tab-separated sample metadata, keyed by the "Sampleid" column.
metadata = pd.read_csv(
    "Data/IBS-metadata.txt",
    sep="\t",
    index_col="Sampleid",
)

In [100]:
# True only when df and metadata contain exactly the same set of sample IDs.
# Set equality also catches IDs present in metadata but missing from df, which
# would otherwise become all-NaN rows when df is reindexed to metadata.index.
set(df.index) == set(metadata.index)

True

In [101]:
# Put the rows of df in the same order as metadata's index, so that positional
# outputs (e.g. cluster labels) line up with the metadata rows.
df = df.reindex(index=metadata.index)

## Cluster time

`!pip install scikit-learn-extra`

In [102]:
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import completeness_score, v_measure_score

In [103]:
# K-medoids with the PAM method on Bray-Curtis distance, asking for 4 clusters;
# random_state is fixed so the result is repeatable.
kmed = KMedoids(
    n_clusters=4,
    metric="braycurtis",
    method="pam",
    random_state=42,
)
kmed.fit(df)

# Cluster label assigned to each row of df.
kmed.labels_

array([2, 1, 2, 3, 1, 2, 2, 1, 2, 1, 3, 1, 3, 1, 2, 2, 2, 2, 0, 0, 3, 2,
       2, 1, 0, 2, 3, 2, 2, 0, 2, 2, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0,
       1])

In [104]:
# Completeness of the cluster labels against the "IBS Type" column
# (arguments are labels_true, labels_pred).
completeness_score(metadata["IBS Type"], kmed.labels_)

0.208667222804086

In [105]:
# V-measure of the cluster labels against the "IBS Type" column
# (arguments are labels_true, labels_pred).
v_measure_score(metadata["IBS Type"], kmed.labels_)

0.20148707539445432

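Very poor performance: both completeness and V-measure are only about 0.2 on a 0–1 scale, so the four clusters barely line up with IBS type.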