In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Determine the optimal number of clusters using the Elbow method.
#
# Only the diversity features are clustered: other cells in this notebook add a
# 'Cluster' label column to diversity_df, and feeding those labels back into
# K-means would distort the inertia curve. Dropping the column (if present)
# makes this cell give the same result no matter how often or when it is run.
elbow_features = diversity_df.drop(columns='Cluster', errors='ignore')

# K-means cannot form more clusters than there are samples, so the candidate
# range 1..9 is capped at the number of subjects available.
k_values = range(1, min(10, len(elbow_features) + 1))
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(elbow_features)
    # inertia_ is the within-cluster sum of squared distances for this k
    inertia.append(kmeans.inertia_)

# Plot the Elbow graph
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o', linestyle='-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.show()


In [None]:
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)

# Fit on the diversity features only. Without dropping an existing 'Cluster'
# column, re-running this cell would cluster on the previous run's labels and
# silently change the assignment.
cluster_features = diversity_df.drop(columns='Cluster', errors='ignore')
diversity_df['Cluster'] = kmeans.fit_predict(cluster_features)

# Check the first rows, including the new cluster labels
diversity_df.head()


In [None]:
# Reload necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Read the OTU abundance table from disk; the cells below rely on an
# 'OTU_ID' column in this frame and treat the remaining columns as subjects.
file_path = "/mnt/data/pan_otutab.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Define functions for diversity indices
def simpson_index(counts):
    """Return the Simpson diversity index ``1 - sum(p_i ** 2)``.

    ``p_i`` is each entry of ``counts`` divided by the sum of all entries.

    Parameters
    ----------
    counts : array-like of numbers
        Abundance counts for one sample.

    Returns
    -------
    number
        ``0`` when the counts sum to zero (the proportions are undefined),
        otherwise one minus the sum of squared proportions.
    """
    abundance_sum = np.sum(counts)
    if abundance_sum != 0:
        relative_abundance = counts / abundance_sum
        return 1 - np.sum(relative_abundance ** 2)
    # Nothing observed: report zero diversity instead of dividing by zero.
    return 0

# Number of most-abundant OTUs per subject that feed the diversity indices
TOP_N_OTUS = 30

# Per-subject diversity indices, keyed by subject (column) name
diversity_data = {}

# Every column except the OTU identifier is treated as a subject. Selecting by
# name rather than by position stays correct even if OTU_ID is not the first
# column, and matches how the table is later reshaped via set_index('OTU_ID').
subject_columns = df.columns.drop('OTU_ID')
for subject in subject_columns:
    # Keep the subject's top OTUs based on abundance
    top_otus = df[['OTU_ID', subject]].nlargest(TOP_N_OTUS, subject)
    abundances = top_otus[subject].values

    # Compute diversity indices.
    # scipy's entropy() normalises by the sum of its input, so an all-zero
    # subject would produce NaN (0/0) and later make K-means fail. Report 0
    # in that case, mirroring simpson_index's handling of a zero total.
    if abundances.sum() == 0:
        shannon = 0.0
    else:
        shannon = entropy(abundances)  # Shannon index (natural log)
    simpson = simpson_index(abundances)  # Simpson index

    diversity_data[subject] = {'Shannon': shannon, 'Simpson': simpson}

# Assemble the per-subject diversity table: one row per subject, one column per index
diversity_df = pd.DataFrame.from_dict(diversity_data, orient='index')

# Partition the subjects with K-means (k=3) and store each subject's label
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(diversity_df)
diversity_df['Cluster'] = cluster_labels

# Build a Subject -> Cluster lookup table from the frame's index
subject_cluster_map = (
    diversity_df[['Cluster']]
    .reset_index()
    .rename(columns={'index': 'Subject'})
)

# Transpose the OTU table so subjects are rows, then attach the cluster labels
otu_data = df.set_index('OTU_ID').T
otu_data = otu_data.merge(
    subject_cluster_map,
    left_index=True,
    right_on='Subject',
).set_index('Subject')

# Total abundance of every OTU within each cluster (OTUs as rows, clusters as columns)
clustered_otus = otu_data.groupby('Cluster').sum().T

# For each cluster, list the 30 OTUs with the largest summed abundance
top_otus_per_cluster = {}
for cluster, cluster_totals in clustered_otus.items():
    top_otus_per_cluster[cluster] = cluster_totals.nlargest(30).index.tolist()

top_otus_per_cluster


In [None]:
## Cluster 0:

Prevotella copri
Faecalibacterium prausnitzii
Bacteroides plebeius
Prevotella stercorea
Pseudobutyrivibrio ruminis
Parasutterella excrementihominis
Megasphaera elsdenii

## Cluster 1:

Prevotella copri
Faecalibacterium prausnitzii
Haemophilus parainfluenzae
Bacteroides plebeius
Roseburia faecis
Megasphaera elsdenii
Lactobacillus rogosae

## Cluster 2:

Prevotella copri
Faecalibacterium prausnitzii
Bacteroides plebeius
Haemophilus parainfluenzae
Megasphaera elsdenii
Prevotella stercorea
Roseburia faecis