Data cleaning + EDA + feature engineering

In [None]:
# load and clean
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns

# Load the raw train/test splits from the project's data directory.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

# Cast the numeric columns used downstream to explicit dtypes in one pass.
df_train = df_train.astype(
    {
        "bedrooms": int,
        "bathrooms": float,
        "sqft_living": float,
        "price": float,
    }
)

# Keep only rows whose coordinates lie inside the valid latitude/longitude ranges.
lat_ok = df_train["lat"].between(-90, 90)
long_ok = df_train["long"].between(-180, 180)
df_train = df_train[lat_ok & long_ok]

# Drop rows priced above the 99th percentile; the percentile is computed
# after the coordinate filter, so it reflects only the retained rows.
upper = df_train["price"].quantile(0.99)
df_train = df_train[df_train["price"].le(upper)]

# Add the log-transformed target: log(1 + price).
df_train = df_train.assign(log_price=np.log1p(df_train["price"]))

# Persist both splits. Note that the test split is written out unchanged.
df_train.to_csv("data/clean_train.csv", index=False)
df_test.to_csv("data/clean_test.csv", index=False)


EDA

In [None]:
# Price distribution histogram.
# Read the cleaned training set from the same relative location the cleaning
# step writes to ("data/clean_train.csv") instead of a machine-specific
# absolute path, so the notebook also runs on other machines and checkouts.
df = pd.read_csv("data/clean_train.csv")

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["price"], bins=30, kde=True, ax=ax)
ax.set_xlabel("Price")
ax.set_title("Distribution of Property Prices")
plt.show()


In [None]:
# Distribution of the log-transformed price, log(1 + price).
df["log_price"] = df["price"].pipe(np.log1p)

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x="log_price", bins=30, kde=True, ax=ax)
ax.set_xlabel("Log(Price)")
ax.set_title("Log-Transformed Price Distribution")
plt.show()


In [None]:
# Relationship between living area and price (raw price units).
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x="sqft_living", y="price", alpha=0.5, ax=ax)
ax.set_xlabel("Sqft Living")
ax.set_ylabel("Price")
ax.set_title("Price vs Living Area")
plt.show()


In [None]:
# Price vs living area, with price expressed in lakhs (units of 100,000).
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    df["sqft_living"],
    df["price"] / 1e5,  # divide by 1e5 to convert to lakhs
    alpha=0.4,
)
ax.set_xlabel("Sqft Living")
ax.set_ylabel("Price (in Lakhs)")
ax.set_title("Price vs Living Area")
ax.grid(True)
plt.show()


In [None]:
# Price vs living area in original price units (no rescaling).
# `mtick` (matplotlib.ticker) is imported in the notebook's top import cell
# rather than here, so every dependency is declared in one place.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(df["sqft_living"], df["price"], alpha=0.4)

ax.set_xlabel("Sqft Living")
ax.set_ylabel("Price")
ax.set_title("Price vs Living Area")

# Show y ticks as whole numbers with thousands separators instead of
# matplotlib's default scientific-offset notation.
ax.yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))

plt.show()


In [None]:
# Look up and print the full record of the most expensive property.
top_price_idx = df["price"].idxmax()
highest_price_row = df.loc[top_price_idx]
print(highest_price_row)


In [None]:
# Price vs number of bedrooms.
# A scatter plot is not used here because bedrooms is a categorical/discrete
# variable: hundreds of points share the same bedroom count, which causes
# over-plotting and hides the pattern. A boxplot is used instead.
# This is also the point where using satellite imagery is justified.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(
    x=df["bedrooms"],
    y=df["price"] / 1e5,  # divide by 1e5 to convert to lakhs
    ax=ax,
)
ax.set_xlabel("Number of Bedrooms")
ax.set_ylabel("Price (in Lakhs)")
ax.set_title("Price vs Number of Bedrooms")
plt.show()


In [None]:
# Where the properties are located: longitude on x, latitude on y.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df["long"], df["lat"], s=5, alpha=0.4)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Spatial Distribution of Properties")
plt.show()


In [None]:
# High-price vs low-price clusters on the map.
# Rows at or above the 75th-percentile price are labelled "High Price",
# everything else "Low Price".
price_threshold = df["price"].quantile(0.75)
is_high_price = df["price"] >= price_threshold
df["price_category"] = np.where(is_high_price, "High Price", "Low Price")

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=df,
    x="long",
    y="lat",
    hue="price_category",
    s=15,
    alpha=0.6,
    ax=ax,
)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("High vs Low Price Spatial Clusters")
ax.legend()
plt.show()


In [None]:
# Spatial price distribution: each property is coloured by its price.
fig, ax = plt.subplots(figsize=(6, 6))
sc = ax.scatter(
    df["long"],
    df["lat"],
    c=df["price"] / 1e5,  # colour scale in lakhs (price / 1e5)
    cmap="viridis",
    s=10,
    alpha=0.6,
)
fig.colorbar(sc, ax=ax, label="Price (Lakhs)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Spatial Price Gradient")
plt.show()
