In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# A1. Load the data and separate the 'failure' label from the feature columns.
# (An earlier variant of this step concatenated separate train/test frames
# before dropping the label; the single CSV below is what is used now.)
dataset = pd.read_csv('hdddata.csv')
y = dataset['failure']
X = dataset.drop(columns='failure')

# Fill each missing value with the mean of its own column.
X = X.fillna(X.mean())

In [None]:
# A2. Determine the ideal k value using the elbow method.
# K-Means is fitted once per candidate k and the inertia of each fit is
# recorded, so `distortions` ends up with exactly one entry per value in K_range.
distortions = []
K_range = range(1, 32)

for k in K_range:
    # n_init=5 limits the number of restarts per k to keep the sweep cheaper;
    # random_state fixes the initialisation so reruns give the same curve.
    kmeans = KMeans(n_clusters=k, n_init=5, max_iter=300, random_state=42)
    kmeans.fit(X)
    distortions.append(kmeans.inertia_)

# The x and y series passed to plot() must have the same length.
assert len(distortions) == len(K_range), "expected one inertia value per k"

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, distortions, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
# KMeans.inertia_ is a sum of squared distances, not an average distance.
ax.set_ylabel('Inertia (Sum of Squared Distances to Closest Cluster Center)')
plt.show()

KeyboardInterrupt: ignored

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 177, in where
KeyboardInterrupt: 


In [None]:
# A3. Hierarchical (agglomerative) clustering: compute the Ward linkage over
# the feature rows, then draw the resulting merge tree as a dendrogram.
fig, ax = plt.subplots(figsize=(15, 8))
ward_linkage = linkage(X, method='ward')
dendrogram(ward_linkage, ax=ax)
ax.set_title('Dendrogram for Hierarchical Clustering')
ax.set_xlabel('Samples')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# A4. Perform Sequential Feature Selection (forward direction).
# scikit-learn's SequentialFeatureSelector needs an estimator to score each
# candidate feature subset with cross-validation. The scaler is placed inside
# the pipeline so it is re-fitted on every CV training fold.
# NOTE(review): LogisticRegression is a baseline choice made here because the
# notebook never named a model for this step -- swap in the intended estimator.
sfs_estimator = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# n_features_to_select='auto' together with a tol keeps adding features until
# the CV accuracy gain from one more feature drops below tol.
# NOTE(review): 'auto' and tol need scikit-learn >= 1.1 -- confirm the
# installed version.
sfs = SequentialFeatureSelector(
    sfs_estimator,
    n_features_to_select='auto',
    tol=1e-3,
    direction='forward',
    scoring='accuracy',
    cv=5,
)
sfs.fit(X, y)

# get_support() returns a boolean mask over X's columns marking the kept features.
optimal_features = list(X.columns[sfs.get_support()])


In [None]:
# A5. Perform PCA, then train and evaluate a classifier on the components.

# Split BEFORE fitting any preprocessing so the scaler and PCA only learn
# from training rows; the held-out rows stay unseen until evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Fit a full PCA on the training rows to find the smallest number of
# components whose cumulative explained variance reaches 95%.
pca = PCA()
pca.fit(X_train_scaled)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# argmax returns the first index where the threshold is met; +1 turns the
# zero-based index into a component count.
num_components = int(np.argmax(cumulative_variance >= 0.95) + 1)

# Fit PCA with the optimal number of components
pca = PCA(n_components=num_components)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Projections of the complete feature matrix, produced with the
# train-fitted scaler and PCA (transform only, no refitting).
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Train and evaluate the model.
# NOTE(review): the notebook never defined `clf`; LogisticRegression is used
# as a baseline here -- replace it with the intended classifier if different.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
