In [1]:
%%writefile pre_processing.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest


#------------------------------------------

def preprocess_wine_data(wine_path):
    """Load the Wine CSV and run outlier filtering, scaling and PCA on it.

    Steps, in order: read the header-less CSV at ``wine_path`` using the
    fixed column names below, split off the ``Class`` column, drop the rows
    an IsolationForest (contamination=0.05) does not predict as ``1``,
    standardize the remaining features, and project them with a PCA that
    keeps enough components for 95% of the variance. Progress is printed
    along the way, and a scatter plot is shown only when the PCA ends up
    with exactly two components.

    Args:
        wine_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        Tuple ``(X_pca, y_clean, pca, scaler)``: the PCA-projected feature
        array, the ``Class`` Series for the kept rows (original row index
        retained), the fitted PCA and the fitted StandardScaler.
    """
    print("Loading Wine dataset...")

    feature_columns = [
        'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ]
    raw = pd.read_csv(wine_path, names=['Class'] + feature_columns)

    print(f"Wine dataset shape: {raw.shape}")
    print("First few rows of the wine dataset:")
    print(raw.head())

    targets = raw['Class']
    features = raw.drop(columns='Class')

    print("\nPerforming outlier removal...")
    detector = IsolationForest(contamination=0.05, random_state=42)
    # Rows whose prediction equals 1 are kept; everything else is dropped.
    keep = detector.fit_predict(features) == 1
    features_kept = features.loc[keep]
    targets_kept = targets.loc[keep]

    removed_count = len(features) - len(features_kept)
    print(f"Removed {removed_count} outliers")
    print(f"Clean wine dataset shape: {features_kept.shape}")

    print("\nPerforming feature scaling...")
    scaler = StandardScaler()
    standardized = scaler.fit_transform(features_kept)

    print("\nPerforming PCA dimensionality reduction...")
    # A float n_components asks PCA for the smallest number of components
    # reaching that fraction of explained variance.
    pca = PCA(n_components=0.95, random_state=42)
    projected = pca.fit_transform(standardized)
    component_count = projected.shape[1]

    print(f"Reduced dimensions: {component_count} components explain 95% of variance")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

    # The scatter plot is only drawn for a strictly two-dimensional projection.
    if component_count == 2:
        plt.figure(figsize=(10, 6))
        for wine_class in np.unique(targets_kept):
            members = (targets_kept == wine_class).to_numpy()
            plt.scatter(projected[members, 0], projected[members, 1],
                        label=f'Class {wine_class}')
        plt.legend()
        plt.title('PCA of Wine Dataset (2 components)')
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.show()

    return projected, targets_kept, pca, scaler

Writing pre_processing.py


In [2]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import GaussianMixture
from sklearn.utils import resample
from sklearn.metrics import silhouette_score, davies_bouldin_score

def train_evaluate_gmm(X, dataset_name, y=None, n_components=3, subsample_size=10000):
    """Fit a full-covariance Gaussian mixture on ``X`` and print its metrics.

    If ``X`` has more rows than ``subsample_size`` it is first subsampled
    (together with ``y`` when given). The fitted model is then scored with
    BIC, AIC and total log-likelihood, plus silhouette and Davies-Bouldin
    scores when more than one cluster label is actually assigned. When
    ``y`` is provided the Adjusted Rand Index against it is printed too.

    Args:
        X: 2-D feature array; must expose ``.shape``.
        dataset_name: Label used only in the printed banner.
        y: Optional reference labels aligned row-for-row with ``X``.
        n_components: Number of mixture components to fit.
        subsample_size: Maximum number of rows used for fitting/scoring.

    Returns:
        Tuple ``(gmm, n_components)``: the fitted GaussianMixture and the
        number of components that was requested.
    """
    print(f"\n===== Training GMM on {dataset_name} Dataset =====")

    if X.shape[0] > subsample_size:
        # replace=False is required here: resample() defaults to bootstrap
        # sampling (with replacement), which would put duplicated rows in
        # the "subsample" and skew both the fit and the cluster metrics.
        if y is not None:
            X, y = resample(X, y, replace=False,
                            n_samples=subsample_size, random_state=42)
        else:
            X = resample(X, replace=False,
                         n_samples=subsample_size, random_state=42)

    gmm = GaussianMixture(
        n_components=n_components,
        covariance_type='full',
        random_state=42
    )
    gmm.fit(X)
    labels = gmm.predict(X)

    bic = gmm.bic(X)
    aic = gmm.aic(X)
    # score() is the per-sample average log-likelihood; scale to the total.
    log_likelihood = gmm.score(X) * X.shape[0]

    # Silhouette / Davies-Bouldin need at least two distinct assigned labels;
    # a mixture can collapse onto one label even when n_components > 1.
    has_multiple_clusters = n_components > 1 and len(np.unique(labels)) > 1

    print("\n===== GMM Performance Metrics =====")
    print(f"Number of components: {n_components}")
    print(f"Covariance type: {gmm.covariance_type}")
    print(f"BIC: {bic:.2f}")
    print(f"AIC: {aic:.2f}")
    print(f"Log Likelihood: {log_likelihood:.2f}")
    if has_multiple_clusters:
        silhouette = silhouette_score(X, labels)
        db_index = davies_bouldin_score(X, labels)
        print(f"Silhouette Score: {silhouette:.4f}")
        print(f"Davies-Bouldin Index: {db_index:.4f}")
    else:
        print("Silhouette Score: Not applicable")
        print("Davies-Bouldin Index: Not applicable")

    if y is not None:
        ari_score = adjusted_rand_score(y, labels)
        print(f"Adjusted Rand Index (ARI): {ari_score:.4f}")

    return gmm, n_components

if __name__ == "__main__":
    # Driver: preprocess the Wine data, then fit and report a GMM on it.
    from pre_processing import preprocess_wine_data

    wine_path = 'wine.data'
    n_components = 4

    print("Using Wine dataset...")
    try:
        # A missing data file surfaces here as FileNotFoundError (raised by
        # the CSV read inside preprocess_wine_data), so this is the call
        # that needs guarding -- not the GMM training below.
        X, y, _, _ = preprocess_wine_data(wine_path)
    except FileNotFoundError:
        print(f"Wine dataset not found at '{wine_path}'. "
              "Make sure the data file exists before running this cell.")
    else:
        # Deliberately not wrapped in a broad handler: errors raised while
        # training/evaluating should propagate rather than be reported as
        # a missing dataset.
        wine_gmm, wine_n_components = train_evaluate_gmm(
            X,
            dataset_name='Wine',
            y=y,
            n_components=n_components
        )

Using Wine dataset...
Loading Wine dataset...
Wine dataset shape: (178, 14)
First few rows of the wine dataset:
   Class  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0      1    14.23        1.71  2.43               15.6        127   
1      1    13.20        1.78  2.14               11.2        100   
2      1    13.16        2.36  2.67               18.6        101   
3      1    14.37        1.95  2.50               16.8        113   
4      1    13.24        2.59  2.87               21.0        118   

   Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color intensity   Hue  OD280/OD315 of dilut