In [None]:
from google.colab import files
import pandas as pd

# Open the Colab upload dialog; the four CSV files read below are expected
# to be among the uploaded files.
uploaded = files.upload()

# Read the four CSV files by name, in a fixed order, and bind each resulting
# DataFrame to its own variable.
X_train_scaled, X_test_scaled, y_train, y_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'X_train_scaled.csv',
        'X_test_scaled.csv',
        'y_train.csv',
        'y_test.csv',
    )
)

# Preview the top rows of the training frame as this cell's output
X_train_scaled.head()


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit on feature columns only. A later cell stores a 'Cluster' label column
# in X_train_scaled; without this guard, re-running the elbow search after
# that cell would feed the labels to K-Means as an extra feature and distort
# the WCSS curve. errors='ignore' makes the drop a no-op on a fresh run.
elbow_features = X_train_scaled.drop(columns='Cluster', errors='ignore')

# Candidate cluster counts (1 through 10), shared by the fits and the x-axis
k_values = range(1, 11)

# Calculate WCSS (KMeans.inertia_) for each candidate number of clusters
wcss = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(elbow_features)
    wcss.append(kmeans.inertia_)

# Plot the Elbow graph: WCSS against the number of clusters
plt.figure(figsize=(10, 6))
plt.plot(k_values, wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()


In [None]:
# Assuming the optimal number of clusters is 3 based on the Elbow method (adjust if different)
optimal_clusters = 3

# Select the feature columns only. This cell writes its labels back into
# X_train_scaled as 'Cluster', so on a re-run that column already exists;
# dropping it here keeps the previous labels from being used as a feature
# (which would also change the width of kmeans.cluster_centers_).
# errors='ignore' makes the drop a no-op the first time the cell runs.
cluster_features = X_train_scaled.drop(columns='Cluster', errors='ignore')

# Train the K-Means model on the features
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmeans.fit(cluster_features)

# Predict the cluster of every training row with the fitted model
clusters = kmeans.predict(cluster_features)

# Add (or overwrite) the cluster labels on the training data; later cells
# read them from X_train_scaled['Cluster']
X_train_scaled['Cluster'] = clusters

# Display the first few rows of the data with cluster labels
X_train_scaled.head()


In [None]:
from sklearn.decomposition import PCA

# Project the feature columns (everything except the 'Cluster' labels) onto
# two PCA components so the clusters can be drawn on a plane.
pca = PCA(n_components=2)
feature_frame = X_train_scaled.drop(columns='Cluster')
pca_components = pca.fit_transform(feature_frame)

# Scatter the projected rows, coloured by their cluster label
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    pca_components[:, 0],
    pca_components[:, 1],
    c=X_train_scaled['Cluster'],
    cmap='viridis',
    marker='o',
)
ax.set_title('Clusters Visualization')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()


In [None]:
import joblib

# Save the trained K-Means model
joblib.dump(kmeans, 'kmeans_model.pkl')

# Save the training data together with its cluster labels
X_train_scaled.to_csv('train_with_clusters.csv', index=False)

# Save the cluster centers for interpretation, one column per feature.
# The feature names are taken by removing 'Cluster' by name rather than by
# slicing off the last column, so the header stays correct even if another
# column is ever appended after 'Cluster'.
feature_columns = X_train_scaled.columns.drop('Cluster')
cluster_centers = pd.DataFrame(kmeans.cluster_centers_, columns=feature_columns)
cluster_centers.to_csv('cluster_centers.csv', index=False)

# Download the model and the cluster data
for artifact in ('kmeans_model.pkl', 'train_with_clusters.csv', 'cluster_centers.csv'):
    files.download(artifact)
