In [17]:
import os
import tarfile
      3 import urllib.request
----> 4 import pandas as pd
      5 import matplotlib.pyplot as plt
      6 import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Constants for downloading the dataset
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Function to fetch housing data
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

# Fetch the dataset
fetch_housing_data()

# Load the data into a DataFrame
csv_path = os.path.join(HOUSING_PATH, "housing.csv")
housing_data = pd.read_csv(csv_path)

# Selecting relevant columns for clustering: 'longitude', 'latitude', and 'median_income'
housing_data = housing_data[['longitude', 'latitude', 'median_income']]

# Step 1: Preprocessing - Normalize the data using StandardScaler
scaler = StandardScaler()
scaled_housing_data = scaler.fit_transform(housing_data)

# Step 2: Optimize the Number of Clusters Using Silhouette Score
silhouette_scores = []
k_values = range(2, 11)  # Testing k values from 2 to 10

# K-Means Clustering and Silhouette Score Calculation
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(scaled_housing_data)
    score = silhouette_score(scaled_housing_data, labels)
    silhouette_scores.append(score)

# Step 3: Choose the Optimal K (based on the highest silhouette score)
optimal_k = k_values[silhouette_scores.index(max(silhouette_scores))]
print(f"Optimal number of clusters: {optimal_k}")

# Step 4: Apply K-Means with Optimal K
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42)
optimal_labels = kmeans_optimal.fit_predict(scaled_housing_data)

# Step 5: Add cluster labels to the original DataFrame
housing_data['Cluster'] = optimal_labels

# Visualization: Boxplot for Median Income Distribution in Each Cluster
plt.figure(figsize=(12, 6))
sns.boxplot(x='Cluster', y='median_income', data=housing_data)
plt.title('Distribution of Median Income in Each Cluster')
plt.xlabel('Cluster')
plt.ylabel('Median Income')
plt.xticks(rotation=0)
plt.grid()
plt.show()


IndentationError: unexpected indent (<ipython-input-17-332faaad054b>, line 3)

In [19]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the California Housing dataset
california = fetch_california_housing(as_frame=True)
df = california.frame

# Select features for clustering: Longitude, Latitude, and Median Income
X = df[['Longitude', 'Latitude', 'MedInc']]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Function to calculate silhouette scores for different values of k
def calculate_silhouette_scores(X, max_k=10):
    silhouette_scores = []
    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        score = silhouette_score(X, labels)
        silhouette_scores.append(score)
        print(f'For k = {k}, Silhouette Score = {score:.4f}')
    return silhouette_scores

# Calculate silhouette scores for k from 2 to 10
silhouette_scores = calculate_silhouette_scores(X_scaled, max_k=10)

# Plot silhouette scores to identify the optimal k
plt.figure(figsize=(8, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='--', color='b')
plt.title('Silhouette Scores for Different Values of k', fontsize=14)
plt.xlabel('Number of clusters (k)', fontsize=12)
plt.ylabel('Silhouette Score', fontsize=12)
plt.grid(True)
plt.show()

# Function to perform KMeans clustering and plot the clusters
def plot_clusters(X, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_

    plt.figure(figsize=(8, 6))
    


ModuleNotFoundError: No module named 'pandas'