In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from qiime2 import Artifact
from scipy.spatial.distance import cdist

>qiime taxa collapse \
--i-table Data/noMito_noChloro-filtered-table.qza \
--i-taxonomy Data/taxonomy-silva.qza \
--p-level 6 \
--o-collapsed-table Data/tax-6-table.qza

In [48]:
# Load the taxonomy-level-6 collapsed feature table from its QIIME 2 artifact
# and view it as a pandas DataFrame.
tax6_artifact = Artifact.load("Data/tax-6-table.qza")
df = tax6_artifact.view(pd.DataFrame)

In [49]:
# Sample metadata: a tab-separated text file, indexed by its "Sampleid" column.
metadata = pd.read_csv(
    "Data/IBS-metadata.txt",
    sep="\t",
    index_col="Sampleid",
)

In [50]:
# True only when the feature table and the metadata describe exactly the same
# set of sample IDs. Comparing the size of the intersection with len(df.index)
# alone would still pass when the metadata lists extra samples, and those would
# become all-NaN rows once the table is reindexed to the metadata order.
set(df.index) == set(metadata.index)

True

In [80]:
# Put the feature-table rows in the same order as the metadata rows so that
# cluster labels and metadata columns line up position-for-position.
# reindex() fills rows for unknown labels with NaN instead of failing, so check
# first that every metadata sample is actually present in the table.
missing_samples = metadata.index.difference(df.index)
if len(missing_samples) > 0:
    raise KeyError(
        f"{len(missing_samples)} metadata sample(s) missing from the feature table: "
        f"{list(missing_samples)}"
    )
df = df.reindex(metadata.index)

## Cluster time

`!pip install scikit-learn-extra`

In [86]:
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import completeness_score, v_measure_score

In [87]:
# k-medoids (PAM) clustering of the samples into four groups, using Bray-Curtis
# dissimilarity between rows of the feature table. random_state is pinned so
# repeated runs use the same seed.
kmed = KMedoids(
    n_clusters=4,
    metric="braycurtis",
    method="pam",
    random_state=42,
)
kmed.fit(df)  # fit() hands back the estimator itself, so `kmed` is the fitted model
kmed.labels_

array([0, 3, 0, 1, 3, 0, 0, 3, 0, 3, 2, 2, 1, 3, 0, 0, 0, 0, 3, 3, 1, 3,
       0, 3, 3, 2, 2, 0, 0, 3, 0, 0, 2, 1, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3,
       3])

In [84]:
# Completeness of the clustering against the recorded IBS type
# (arguments are labels_true, labels_pred).
completeness_score(metadata["IBS Type"], kmed.labels_)

0.1964614507653102

In [88]:
# V-measure of the clustering against the recorded IBS type
# (arguments are labels_true, labels_pred).
v_measure_score(metadata["IBS Type"], kmed.labels_)

0.18174420878998115