In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the abalone table from a CSV path resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer=r"abalone.csv")

In [None]:
# Preview the first rows of the frame.
# `head` has to be *called*: printing the bare attribute emits the bound-method
# repr (which embeds a dump of the frame) rather than the five-row preview.
# Leaving the call as the cell's last expression lets Jupyter render it as a table.
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the values of the 'Sex' column with integer codes.
# The encoder first learns the distinct values, then maps the column onto
# their codes; the fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(data['Sex'])
data['Sex'] = label_encoder.transform(data['Sex'])

In [None]:
# Show the first five rows again (after the 'Sex' column has been encoded above).
data.head(n=5)

In [None]:
# Pairwise correlation between the columns of the frame.
corr = data.corr()

# Draw the matrix as an annotated heatmap on an explicitly created axes
# (values formatted to two decimals, thin separators between cells).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Abalone Dataset')
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# K-Means clustering of `data` (the frame loaded in an earlier cell):
#   1. standardize the seven size/weight columns,
#   2. project them onto two principal components,
#   3. pick the number of clusters k in 2..10 with the highest silhouette score,
#   4. plot the clustering for that k in the PCA plane.

# Only these measurement columns take part in the clustering.
X = data[['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight']]

# Standardize the features so each column contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce to 2 dimensions; both the clustering and the plot use this projection.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Track the best silhouette score, its k, and the fitted model that produced it.
# Starting from -inf guarantees the first candidate is always accepted.
best_silhouette_score = float("-inf")
best_k = -1
kmeans_best = None

# Iterate over different values of k.
for k in range(2, 11):
    # n_init is pinned so the outcome does not depend on the installed
    # scikit-learn release's default (and no FutureWarning is emitted);
    # random_state fixes the centroid initialisation.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(X_pca)
    silhouette_avg = silhouette_score(X_pca, cluster_labels)
    print(f"For k={k}, silhouette score is {silhouette_avg}")

    # Keep the fitted estimator itself so the winner is not refit afterwards.
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_k = k
        kmeans_best = kmeans

# Print best silhouette score and corresponding k.
print("Best Silhouette Score:", best_silhouette_score)
print("Best number of clusters (k):", best_k)

# Labels of the winning model, reused directly from the search loop.
cluster_labels_best = kmeans_best.labels_

# Scatter plot of the clustered points in the PCA plane.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels_best, cmap='viridis', alpha=0.5)
ax.set_title(f'Clustering Results (k={best_k})')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
# Cluster ids are discrete, so put the colorbar ticks on the integer ids only.
fig.colorbar(scatter, ax=ax, label='Cluster', ticks=list(range(best_k)))
plt.show()
