In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import plotly.express as px


from sklearn.cluster import KMeans                        # for KMeans
from scipy.cluster.hierarchy import dendrogram, linkage   # hierarchical clustering
from sklearn.decomposition import PCA                     # for PCA

from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Read the apartment listings into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='apartments_for_rent.csv')
df.head(n=5)

# Data Visualization

In [None]:
# Distribution of listing prices (50 bins).
fig = px.histogram(
    data_frame=df,
    x="price",
    nbins=50,
    title="Distribution of Apartment Prices",
)
fig.show()

# Price against floor area.
fig = px.scatter(
    data_frame=df,
    x="square_feet",
    y="price",
    title="Price vs. Size",
)
fig.show()

In [None]:
# Inspect the listings with a floor area of 20,000 sq ft or more.
df.loc[df['square_feet'] >= 20000]

In [None]:
# Remove the floor-area outliers (20,000 sq ft or more).
# The mask is a negated ">=" rather than "<" because every comparison with NaN
# evaluates to False: "square_feet < 20000" would also silently discard rows
# whose square_feet is missing, even though missing square_feet values are
# imputed later in the preprocessing section.
df = df[~(df['square_feet'] >= 20000)]

In [None]:
# Price against floor area, coloured by state; hovering shows the listing id.
fig = px.scatter(data_frame=df, x='square_feet', y='price', color='state', hover_data=['id'])
fig.update_layout(
    title='Apartment Price vs. square_feet',
    xaxis_title='square_feet (sqft)',
    yaxis_title='Price ($)',
)
fig.show()

# Same relationship in 3-D, with the bedroom count on the z axis.
scene_labels = {
    'xaxis_title': 'square_feet (sqft)',
    'yaxis_title': 'Price ($)',
    'zaxis_title': 'Number of Rooms',
}
fig = px.scatter_3d(data_frame=df, x='square_feet', y='price', z='bedrooms', color='state', hover_data=['id'])
fig.update_layout(title='Apartment Price vs. square_feet vs. Rooms', scene=scene_labels)
fig.show()

# Price distribution, split by state.
fig = px.histogram(
    data_frame=df,
    x='price',
    color='state',
    nbins=50,
    title="Price Distribution by Location",
)
fig.show()

# Per-state price spread; every individual listing is drawn next to its box.
fig = px.box(data_frame=df, x="state", y="price", points="all")
fig.show()

# Pairwise correlations between the main numeric features.
numeric_features = ['price', 'square_feet', 'bedrooms', 'bathrooms']
correlation_matrix = df[numeric_features].corr()
fig = px.imshow(correlation_matrix, text_auto=True, color_continuous_scale='RdBu')
fig.show()

# Price against bedroom count.
fig = px.scatter(data_frame=df, x='bedrooms', y='price')
fig.update_layout(title='Price vs. Bedrooms')
fig.show()

# Bedrooms against bathrooms, coloured by the price_type column.
fig = px.scatter(data_frame=df, x='bedrooms', y='bathrooms', color='price_type')
fig.update_layout(title='Bedrooms vs. Bathrooms (Colored by Payment Type)')
fig.show()

# Price against floor area, coloured by bedroom count.
fig = px.scatter(data_frame=df, x='square_feet', y='price', color='bedrooms')
fig.update_layout(title='Price vs. Square Feet (Colored by Number of Bedrooms)')
fig.show()

# Data Preprocessing

In [None]:
# Show the rows with more than 100 bathrooms, then those with more than 15 bedrooms.
for column, limit in (('bathrooms', 100), ('bedrooms', 15)):
    print(df[df[column] > limit])

In [None]:
# Remove rows with implausible room counts (> 100 bathrooms or > 15 bedrooms).
# The mask selects the outliers explicitly and is then negated. A "<=" filter
# would also discard rows where the count is NaN (any comparison with NaN is
# False), yet missing bathrooms/bedrooms are meant to be imputed later on.
implausible_rooms = (df['bathrooms'] > 100) | (df['bedrooms'] > 15)
df = df[~implausible_rooms]

In [None]:
# Sanity check: both selections should now print empty frames.
too_many_bathrooms = df.loc[df['bathrooms'] > 100]
too_many_bedrooms = df.loc[df['bedrooms'] > 15]
print(too_many_bathrooms)
print(too_many_bedrooms)

In [None]:
# Remove the 'id' and 'pets_allowed' columns before clustering.
df = df.drop(columns=['id', 'pets_allowed'])

# Every remaining column whose dtype is not numeric.
non_numeric_columns = df.select_dtypes(exclude='number').columns
print("Non-numeric columns in the dataset:", non_numeric_columns, sep="\n")

# One-hot encode all of those columns in a single pass.
df = pd.get_dummies(data=df, columns=non_numeric_columns)

# Preview the encoded frame.
print(df.head())

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Discard every row whose latitude or longitude is missing.
df = df.dropna(subset=['latitude', 'longitude'])

In [None]:
# Impute missing 'bathrooms', 'bedrooms' and 'square_feet' with each column's mode.
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): that form operates on an intermediate
# Series, is deprecated, and leaves `df` unchanged under pandas Copy-on-Write.
for col in ['bathrooms', 'bedrooms', 'square_feet']:
    mode_val = df[col].mode()[0]  # most frequent value; first one if there are ties
    df[col] = df[col].fillna(mode_val)

# Impute missing 'price' with the mean of the observed prices.
mean_price = df['price'].mean()
df['price'] = df['price'].fillna(mean_price)

In [None]:
# Re-count the missing values per column after imputation.
df.isnull().sum()

In [None]:
# Convert every boolean column to 0/1 integers.
bool_columns = df.select_dtypes(include=['bool']).columns
df = df.astype({name: 'int64' for name in bool_columns})
df.head()

# Data Modeling

## K-Means

In [None]:
# scale the dataset
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)
# data_scaled = StandardScaler.fit_transformed(data)

# create and train the model
kmeans_model = KMeans(n_clusters=65, max_iter=50000)
kmeans_model.fit(df)

In [None]:
centroids = kmeans_model.cluster_centers_ # centriods

In [None]:
clusters = kmeans_model.fit_predict(df) # Clusters

In [None]:
# visulaize the clusters and centriods
plt.scatter(x=df['price'], y=df['bedrooms'], c=clusters)

# plot centriod for cluster 1
plt.scatter(x=centroids[0, 0], y=centroids[0, 1], c='red', marker='X')
# plot centriod for cluster 2
plt.scatter(x=centroids[1,0], y=centroids[1,1], c='blue', marker='X')


In [None]:
# Standardize the features and keep the original column names on the result.
df_scaled = pd.DataFrame(data=scaler.fit_transform(df), columns=df.columns)
df_scaled.head()

In [None]:
# Fit a PCA that keeps as many components as there are columns, then project
# the standardized data onto those components.
n_features = df_scaled.shape[1]
pca_model = PCA(n_components=n_features).fit(df_scaled)
df_components = pca_model.transform(df_scaled)
df_components.shape

In [None]:
# Amount of variance explained by each principal component.
pca_model.explained_variance_

In [None]:
# Fraction of the total variance explained by each principal component.
pca_model.explained_variance_ratio_

In [None]:
# Explained variance per principal component, numbered from 1.
x_axis = np.arange(1, pca_model.explained_variance_.shape[0] + 1)

plt.plot(x_axis, pca_model.explained_variance_, '*-')
plt.title('Explained Variance by Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance')
plt.show()

In [None]:
# Keep only the first two principal components, labelled PC1 and PC2.
df_pca = pd.DataFrame(df_components[:, :2], columns=['PC1', 'PC2'])
df_pca.head()

In [None]:
# Cluster the 2-D PCA projection into 10 groups.
# K-Means starts from random centroids, so random_state is pinned to make the
# resulting clusters reproducible from run to run.
kmeans_model_rock = KMeans(n_clusters=10, max_iter=500000, random_state=42)
kmeans_model_rock.fit(df_pca)

In [None]:
# Read the centroids and the labels from the SAME fitted model. Calling
# fit_predict here would refit K-Means after the centroids were captured, so
# the plotted centroids could belong to a different solution than the labels.
centroids = kmeans_model_rock.cluster_centers_
clusters = kmeans_model_rock.labels_

# Visualize the clusters in the PC1/PC2 plane.
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue=clusters, style=clusters, palette='Set2')

# Overlay every centroid in a single call.
plt.scatter(x=centroids[:, 0], y=centroids[:, 1], c='black', marker='X')
plt.title('K-Means clusters on the first two principal components')
plt.show()

## Hierarchical Clustering

In [None]:
# Restrict the standardized data to the four features used for hierarchical
# clustering: price, bathrooms, bedrooms and square_feet.
columns_to_keep = ['price', 'bathrooms', 'bedrooms', 'square_feet']
df_scaled_hc = df_scaled.loc[:, columns_to_keep]

# df_scaled_hc now holds only those columns.
print(df_scaled_hc.head())

In [None]:
# Work with just the first 1000 rows of df_scaled_hc.
df_scaled_hc_first_1000 = df_scaled_hc.iloc[:1000]
df_scaled_hc_first_1000.shape

In [None]:
# Condensed matrix of pairwise Euclidean distances between the sampled rows.
df_dist_matrix = pdist(df_scaled_hc_first_1000, 'euclidean')

In [None]:
# Display the condensed distance vector returned by pdist.
df_dist_matrix

### Complete Linkage

In [None]:
# Build the complete-linkage hierarchy and draw the full, untruncated tree.
complete_linkage = linkage(df_dist_matrix, 'complete')
dendrogram(complete_linkage, leaf_rotation=90);

In [None]:
# Complete linkage, truncated to at most 10 levels of the tree.
complete_p10 = dendrogram(complete_linkage, truncate_mode='level', p=10, leaf_rotation=90)

In [None]:
# Complete linkage, truncated to at most 5 levels of the tree.
complete_p5 = dendrogram(complete_linkage, truncate_mode='level', p=5, leaf_rotation=90)

In [None]:
# Complete linkage, truncated to at most 3 levels of the tree.
complete_p3 = dendrogram(complete_linkage, truncate_mode='level', p=3, leaf_rotation=90)

### Single Linkage

In [None]:
# Build the single-linkage hierarchy and draw it truncated to at most 10 levels.
single_linkage = linkage(df_dist_matrix, 'single')
single_p10 = dendrogram(single_linkage, truncate_mode='level', p=10, leaf_rotation=90)

In [None]:
# Single linkage, truncated to at most 5 levels of the tree.
single_p5 = dendrogram(single_linkage, truncate_mode='level', p=5, leaf_rotation=90)

In [None]:
# Single linkage, truncated to at most 3 levels of the tree.
single_p3 = dendrogram(single_linkage, truncate_mode='level', p=3, leaf_rotation=90)

### Centroid Linkage

In [None]:
# Build the centroid-linkage hierarchy and draw it truncated to at most 10 levels.
centroid_linkage = linkage(df_dist_matrix, 'centroid')
centroid_p10 = dendrogram(centroid_linkage, truncate_mode='level', p=10, leaf_rotation=90)

In [None]:
# Centroid linkage, truncated to at most 5 levels of the tree.
centroid_p5 = dendrogram(centroid_linkage, truncate_mode='level', p=5, leaf_rotation=90)

In [None]:
# Centroid linkage, truncated to at most 3 levels of the tree.
centroid_p3 = dendrogram(centroid_linkage, truncate_mode='level', p=3, leaf_rotation=90)

### Average Linkage

In [None]:
# Build the average-linkage hierarchy and draw it truncated to at most 10 levels.
average_linkage = linkage(df_dist_matrix, 'average')
average_p10 = dendrogram(average_linkage, truncate_mode='level', p=10, leaf_rotation=90)

In [None]:
# Average linkage, truncated to at most 5 levels of the tree.
average_p5 = dendrogram(average_linkage, truncate_mode='level', p=5, leaf_rotation=90)

In [None]:
# Average linkage, truncated to at most 3 levels of the tree.
average_p3 = dendrogram(average_linkage, truncate_mode='level', p=3, leaf_rotation=90)

### Ward Linkage

In [None]:
# Build the Ward-linkage hierarchy and draw it truncated to at most 10 levels.
ward_linkage = linkage(df_dist_matrix, 'ward')
ward_p10 = dendrogram(ward_linkage, truncate_mode='level', p=10, leaf_rotation=90)

In [None]:
# Ward linkage, truncated to at most 5 levels of the tree.
ward_p5 = dendrogram(ward_linkage, truncate_mode='level', p=5, leaf_rotation=90)

In [None]:
# Ward linkage, truncated to at most 3 levels of the tree.
ward_p3 = dendrogram(ward_linkage, truncate_mode='level', p=3, leaf_rotation=90)

### Plotting all dendrograms together for comparison

In [None]:
# Draw every linkage method at every truncation depth in one grid:
# one row per linkage method, one column per value of p.
#
# Each panel is redrawn from its linkage matrix, because dendrogram() must be
# called with ax=... to render into a given subplot. Titles and panel specs are
# generated from the two lists below so they cannot drift apart (the
# hand-written list had "Centroid_Linkage_p" with its "3" missing).

# Linkage matrices to compare, keyed by the label used in the subplot titles.
linkage_matrices = {
    "Complete": complete_linkage,
    "Single": single_linkage,
    "Centroid": centroid_linkage,
    "Average": average_linkage,
    "Ward": ward_linkage,
}

# Truncation depths (maximum number of tree levels shown) for each method.
p_values = [10, 5, 3]

# One (title, linkage matrix, p) entry per subplot, in row-major order.
plot_specs = [
    (f"{method_name}_Linkage_p{p}", method_matrix, p)
    for method_name, method_matrix in linkage_matrices.items()
    for p in p_values
]
titles = [spec_title for spec_title, spec_matrix, spec_p in plot_specs]

# Grid size: one column per p value, and enough rows to hold every panel.
num_cols = len(p_values)
num_rows = (len(plot_specs) + num_cols - 1) // num_cols

# squeeze=False guarantees a 2-D axes array, so flatten() always works.
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                         figsize=(15, 5 * num_rows), squeeze=False)
axes = axes.flatten()

for ax, (title, linkage_matrix, current_p) in zip(axes, plot_specs):
    dendrogram(linkage_matrix,
               ax=ax,
               p=current_p,            # show at most this many levels
               truncate_mode='level',
               leaf_rotation=90)
    ax.set_title(title)

# Hide any unused subplots (none when the grid is exactly filled).
for unused_ax in axes[len(plot_specs):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()