In [None]:
# Define BMI categories based on standard classification
def categorize_bmi(bmi):
    """Map a numeric BMI value to a weight-category label.

    The categories are contiguous half-open intervals, so every finite
    value belongs to exactly one of them:

    * ``bmi < 18.5``        -> "Underweight"
    * ``18.5 <= bmi < 25``  -> "Normal weight"
    * ``25 <= bmi < 30``    -> "Overweight"
    * ``bmi >= 30``         -> "Obese"

    Parameters
    ----------
    bmi : float or None
        Body-mass index. ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        One of the four labels above, or "Unknown" when the value is missing.
    """
    # NaN is the only float that is not equal to itself. Without this guard a
    # missing BMI fails every comparison below and would be labelled "Obese".
    if bmi is None or bmi != bmi:
        return "Unknown"
    if bmi < 18.5:
        return "Underweight"
    # Upper bounds are 25 and 30 (not 24.9 / 29.9) so that values such as
    # 24.95 or 29.95 do not fall through the gaps into the final branch.
    if bmi < 25:
        return "Normal weight"
    if bmi < 30:
        return "Overweight"
    return "Obese"

# Label every row of the metadata with its BMI class
meta_df["BMI_Category"] = meta_df["BMI"].map(categorize_bmi)

# Build the combined "<life style pattern> - <BMI class>" grouping key
meta_df["Lifestyle_BMI_Group"] = (
    meta_df["Life style pattern"]
    + " - "
    + meta_df["BMI_Category"]
)

# Show how many rows fall into each Lifestyle-BMI combination
meta_df["Lifestyle_BMI_Group"].value_counts()


In [None]:
# Reload OTU data
otu_file_path = "/mnt/data/pan_otutab.csv"
otu_df = pd.read_csv(otu_file_path)

# Number of highest-count OTUs per subject that enter the diversity indices
TOP_N_OTUS = 30

# Ensure Subject_IDs align across datasets
otu_subjects = otu_df.columns[1:]  # Excluding OTU_ID column
meta_subjects = meta_df["Subject_ID"].unique()

# Keep only matching subjects in metadata and OTU data
common_subjects = set(otu_subjects).intersection(set(meta_subjects))

# A set has no defined iteration order (and for string IDs the order can
# change between kernel sessions), so derive a stable ordering from the OTU
# table's own column order. This keeps column order, row order of
# diversity_df and everything computed from them reproducible.
ordered_subjects = [subject for subject in otu_subjects if subject in common_subjects]
if not ordered_subjects:
    raise ValueError(
        "No Subject_ID is shared between the OTU table columns and the "
        "metadata; check that the IDs have the same format and dtype."
    )

otu_df = otu_df[["OTU_ID"] + ordered_subjects]
# .copy() makes the filtered metadata an independent frame, so later column
# assignments on it do not trigger SettingWithCopyWarning.
meta_df = meta_df[meta_df["Subject_ID"].isin(common_subjects)].copy()

# Recalculate diversity indices for each Lifestyle-BMI group
diversity_data = {}
for subject in ordered_subjects:
    # Keep only this subject's TOP_N_OTUS largest counts
    top_otus = otu_df[["OTU_ID", subject]].nlargest(TOP_N_OTUS, subject)

    # Compute Shannon and Simpson indices.
    # NOTE(review): `entropy` and `simpson_index` are defined outside this
    # cell (presumably scipy.stats.entropy and a notebook helper) -- confirm
    # they are in scope after Restart & Run All.
    shannon = entropy(top_otus[subject])
    simpson = simpson_index(top_otus[subject].values)

    # Store in dictionary
    diversity_data[subject] = {'Shannon': shannon, 'Simpson': simpson}

# Convert to DataFrame (one row per subject, indexed by Subject_ID)
diversity_df = pd.DataFrame.from_dict(diversity_data, orient='index')

# Merge with Lifestyle-BMI group labels.
# NOTE(review): if meta_df holds several rows per Subject_ID, this merge
# repeats the subject's diversity row once per metadata row -- verify.
diversity_df = diversity_df.merge(
    meta_df[['Subject_ID', 'Lifestyle_BMI_Group']],
    left_index=True,
    right_on='Subject_ID',
)

# Display grouped data
diversity_df.head()


In [None]:
# Perform clustering within each Lifestyle-BMI group
N_CLUSTERS = 3  # chosen arbitrarily, not tuned
clustered_results = {}

for group, group_df in diversity_df.groupby("Lifestyle_BMI_Group"):
    # K-means cannot form more clusters than there are samples, so skip
    # groups that are smaller than the requested number of clusters.
    if len(group_df) < N_CLUSTERS:
        continue

    # A separate model is fitted per group, so cluster labels (0, 1, 2) are
    # only comparable between subjects of the same group.
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(group_df[["Shannon", "Simpson"]])

    # Build the result as a new frame instead of writing a column into the
    # groupby sub-frame, which raises SettingWithCopyWarning.
    clustered_results[group] = group_df[["Subject_ID"]].assign(Cluster=cluster_labels)

# Combine clustered results; pd.concat raises on an empty collection, so fall
# back to an empty frame with the expected columns when every group was skipped.
if clustered_results:
    clustered_diversity_df = pd.concat(clustered_results.values())
else:
    clustered_diversity_df = pd.DataFrame(columns=["Subject_ID", "Cluster"])

# Display sample of clustered subjects
clustered_diversity_df.head()


In [None]:
## We again get a list of bacteria whose dominance is found throughout the group
Cluster 0:

Prevotella copri
Faecalibacterium prausnitzii
Bacteroides plebeius
Haemophilus parainfluenzae
Megasphaera elsdenii

Cluster 1:

Prevotella copri
Faecalibacterium prausnitzii
Bacteroides plebeius
Haemophilus parainfluenzae
Roseburia faecis

Cluster 2:

Prevotella copri
Faecalibacterium prausnitzii
Bacteroides plebeius
Haemophilus parainfluenzae
Roseburia faecis
Ruminococcus gnavus