In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Data Loading: Reads the dataset and assigns column names
data = pd.read_csv('House_Price_dataset.csv', error_bad_lines=False, skip_blank_lines=True)
data.columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", 
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"
]

# Preprocessing: Drops missing values and selects features for clustering
data = data.dropna()
features = data[["RM", "LSTAT", "CRIM"]]
scaler = StandardScaler()  # Normalizes data for clustering
scaled_features = scaler.fit_transform(features)

# Clustering: Groups data into 3 clusters and appends labels to the dataset
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaled_features)
data['Cluster'] = clusters

# Visualization: Scatter plot for clusters and box plot for crime rate
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='RM', y='LSTAT', hue='Cluster', palette='viridis')
plt.title("K-Means Clustering: RM vs LSTAT")
plt.xlabel("Average Number of Rooms (RM)")
plt.ylabel("Lower Status Population (%)")
plt.legend(title="Cluster")
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(data=data, x='Cluster', y='CRIM', palette='viridis')
plt.title("Box Plot of Crime Rate by Cluster")
plt.xlabel("Cluster")
plt.ylabel("Crime Rate (CRIM)")
plt.show()