In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler


In [10]:
%pip install openpyxl


Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\saral\OneDrive\Desktop\House Price Prediction Model\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [11]:
#Load datasets
DATA_DIR = Path("datasets")
IMAGE_DIR = Path("satellite_images/train")

train_df = pd.read_excel(DATA_DIR / "train(1).xlsx")

print(train_df.shape)
train_df.head()


(16209, 21)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,9117000170,20150505T000000,268643,4,2.25,1810,9240,2.0,0,0,...,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240
1,6700390210,20140708T000000,245000,3,2.5,1600,2788,2.0,0,0,...,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605
2,7212660540,20150115T000000,200000,4,2.5,1720,8638,2.0,0,0,...,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455
3,8562780200,20150427T000000,352499,2,2.25,1240,705,2.0,0,0,...,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750
4,7760400350,20141205T000000,232000,3,2.0,1280,13356,1.0,0,0,...,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071


In [12]:
#Basic Cleaning
train_df = train_df.drop(columns=["date", "zipcode"])

train_df["is_renovated"] = (train_df["yr_renovated"] > 0).astype(int)
train_df = train_df.drop(columns=["yr_renovated"])


In [13]:
#Attach image paths
def image_path_from_id(pid):
    path = IMAGE_DIR / f"{pid}.png"
    return str(path) if path.exists() else None

train_df["image_path"] = train_df["id"].apply(image_path_from_id)

# Drop rows without images
train_df = train_df.dropna(subset=["image_path"])


In [14]:
#Separate features & target
TARGET = "price"

X = train_df.drop(columns=["price", "id", "image_path"])
y = train_df["price"]


In [15]:
# Scale tabular features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns=X.columns)


In [16]:
# Final dataset
final_df = pd.concat(
    [
        train_df[["id", "image_path"]].reset_index(drop=True),
        X_scaled.reset_index(drop=True),
        y.reset_index(drop=True)
    ],
    axis=1
)

final_df.to_csv("train_with_images.csv", index=False)
final_df.head()


Unnamed: 0,id,image_path,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,lat,long,sqft_living15,sqft_lot15,is_renovated,price
0,9117000170,satellite_images\train\9117000170.png,0.677402,0.178963,-0.290276,-0.144952,0.922943,-0.083788,-0.306964,-0.626,-0.557611,0.03072,-0.657843,-0.345663,-0.900034,0.192759,-0.473911,-0.129791,-0.20797,268643
1,6700390210,satellite_images\train\6700390210.png,-0.394132,0.505667,-0.521813,-0.311135,0.922943,-0.083788,-0.306964,0.908842,-0.557611,-0.224818,-0.657843,0.709771,-1.137139,0.192759,-0.385919,-0.339019,-0.20797,245000
2,7212660540,satellite_images\train\7212660540.png,0.677402,0.505667,-0.389506,-0.160457,0.922943,-0.083788,-0.306964,-0.626,0.29635,-0.078796,-0.657843,0.777864,-2.098571,-0.706669,-0.165941,-0.196068,-0.20797,200000
3,8562780200,satellite_images\train\8562780200.png,-1.465666,0.178963,-0.918734,-0.364787,0.922943,-0.083788,-0.306964,-0.626,-0.557611,-0.772399,-0.452638,1.288558,-0.206791,1.006527,-1.089851,-0.445025,-0.20797,352499
4,7760400350,satellite_images\train\7760400350.png,-0.394132,-0.147741,-0.874632,-0.038936,-0.918626,-0.083788,-0.306964,-0.626,-0.557611,-0.614209,-0.657843,0.777864,-1.367738,0.999388,-0.576568,-0.173196,-0.20797,232000


In [17]:
final_df.isna().sum()

id               0
image_path       0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
is_renovated     0
price            0
dtype: int64