In [None]:
!pip install scikit-optimize



In [None]:
import pandas as pd



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import GaussianMixture
from sklearn.utils import resample
from sklearn.metrics import silhouette_score, davies_bouldin_score


#------------------------------------------

def preprocess_diabetes_data(diabetes_path):
    """
    Load the diabetes CSV and prepare it for clustering.

    The pipeline runs four steps in order and prints progress for each:
    load the file, drop outliers with an Isolation Forest, standardize the
    remaining features, and project them with PCA.

    Args:
        diabetes_path: Path (or any source accepted by ``pd.read_csv``) of
            the CSV file. Its first row is treated as a header and replaced
            by the fixed column names used below.

    Returns:
        A 4-tuple ``(X_pca, y_clean, pca, scaler)``:
        the PCA-projected feature array, the ``Outcome`` column restricted
        to the rows kept after outlier removal (original row index is
        retained), the fitted ``PCA`` object and the fitted
        ``StandardScaler`` object.
    """
    target_column = 'Outcome'
    contamination_rate = 0.05
    variance_to_keep = 0.95
    seed = 42

    print("Loading Diabetes dataset...")
    columns = [
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', target_column,
    ]

    # header=0 together with names= discards the file's own header row and
    # substitutes the names above.
    frame = pd.read_csv(diabetes_path, names=columns, header=0)
    print(f"Diabetes dataset shape: {frame.shape}")
    print("First few rows of the diabetes dataset:")
    print(frame.head())

    features = frame.drop(target_column, axis=1)
    target = frame[target_column]

    print("\nPerforming outlier removal...")

    # fit_predict labels inliers as 1 and outliers as -1; keep the inliers.
    detector = IsolationForest(contamination=contamination_rate, random_state=seed)
    is_inlier = detector.fit_predict(features) == 1
    kept_features = features[is_inlier]
    kept_target = target[is_inlier]

    removed_count = features.shape[0] - kept_features.shape[0]
    print(f"Removed {removed_count} outliers")
    print(f"Clean diabetes dataset shape: {kept_features.shape}")

    print("\nPerforming feature scaling...")
    scaler = StandardScaler()
    standardized = scaler.fit_transform(kept_features)

    print("\nPerforming PCA dimensionality reduction...")
    # A fractional n_components asks PCA to keep as many components as are
    # needed to reach that share of explained variance.
    pca = PCA(n_components=variance_to_keep, random_state=seed)
    projected = pca.fit_transform(standardized)

    print(f"Reduced dimensions: {projected.shape[1]} components explain 95% of variance")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

    return projected, kept_target, pca, scaler

def train_evaluate_gmm(X, dataset_name, y=None, n_components=3, subsample_size=10000):
    """
    Fit a full-covariance Gaussian mixture to ``X`` and print quality metrics.

    Args:
        X: Feature matrix with samples in rows; must expose ``.shape``.
        dataset_name: Label used only in the printed banner.
        y: Optional reference labels aligned with the rows of ``X``. When
            given, the Adjusted Rand Index between ``y`` and the predicted
            component labels is printed as well.
        n_components: Number of mixture components to fit.
        subsample_size: If ``X`` has more rows than this, a random subset of
            exactly this many rows is used for fitting and evaluation.

    Returns:
        A tuple ``(gmm, n_components)`` with the fitted ``GaussianMixture``
        and the number of components that was requested.
    """
    print(f"\n===== Training GMM on {dataset_name} Dataset =====")

    if X.shape[0] > subsample_size:
        # Subsample WITHOUT replacement. resample() defaults to
        # replace=True (bootstrap), which would inject duplicate rows and
        # bias both the mixture fit and the distance-based cluster scores.
        # replace=False is always valid here because this branch is only
        # reached when there are more rows than subsample_size.
        if y is not None:
            X, y = resample(
                X, y, replace=False, n_samples=subsample_size, random_state=42
            )
        else:
            X = resample(
                X, replace=False, n_samples=subsample_size, random_state=42
            )

    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type='full',
        random_state=42
    )
    gmm.fit(X)
    labels = gmm.predict(X)

    bic = gmm.bic(X)
    aic = gmm.aic(X)

    # score() is the mean per-sample log-likelihood; scale by the sample
    # count to report the total.
    log_likelihood = gmm.score(X) * X.shape[0]

    # Silhouette and Davies-Bouldin need at least two populated clusters;
    # a fitted mixture can leave components empty after hard assignment.
    has_multiple_clusters = n_components > 1 and np.unique(labels).size > 1

    print("\n===== GMM Performance Metrics =====")
    print(f"Number of components: {n_components}")
    print(f"Covariance type: {gmm.covariance_type}")
    print(f"BIC: {bic:.2f}")
    print(f"AIC: {aic:.2f}")
    print(f"Log Likelihood: {log_likelihood:.2f}")
    if has_multiple_clusters:
        silhouette = silhouette_score(X, labels)
        db_index = davies_bouldin_score(X, labels)
        print(f"Silhouette Score: {silhouette:.4f}")
        print(f"Davies-Bouldin Index: {db_index:.4f}")
    else:
        print("Silhouette Score: Not applicable")
        print("Davies-Bouldin Index: Not applicable")

    if y is not None:
        ari_score = adjusted_rand_score(y, labels)
        print(f"Adjusted Rand Index (ARI): {ari_score:.4f}")

    return gmm, n_components


if __name__ == "__main__":

    diabetes_path = 'diabetes.csv'

    print("Using Diabetes dataset...")
    try:
        X, y, _, _ = preprocess_diabetes_data(diabetes_path)
    except FileNotFoundError:
        print(f"File not found: {diabetes_path}. Please check the file path.")
        raise

    # X and y are guaranteed to be bound here because the handler above
    # re-raises on failure. No NameError guard is used around training: it
    # could only hide a genuine bug inside train_evaluate_gmm behind a
    # misleading "dataset not found" message.
    n_components = 4
    diabetes_gmm, diabetes_n_components = train_evaluate_gmm(
        X,
        dataset_name='Diabetes',
        y=y,
        n_components=n_components
    )

Using Diabetes dataset...
Loading Diabetes dataset...
Diabetes dataset shape: (768, 9)
First few rows of the diabetes dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Performing outlier removal...
Removed 39 outliers
Clean diabetes dataset shape: (729, 8)

Performing feature scaling...

Performing PCA dimensionality reduction...
Reduced dim