In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
import xgboost as xgb
from tqdm import tqdm

# Paths
csv_path = r"C:\Users\sagni\Downloads\House Pricing\archive (1)\socal2.csv"
img_folder = r"C:\Users\sagni\Downloads\House Pricing\archive (1)\socal2\socal_pics"

# Fail fast on a misconfigured image directory. extract_image_features()
# substitutes an all-zero vector for every file it cannot find, so a wrong
# folder would otherwise go unnoticed and the whole run would train on
# placeholder image features.
if not os.path.isdir(img_folder):
    raise FileNotFoundError(f"Image folder not found: {img_folder}")

# Load dataset
df = pd.read_csv(csv_path)
print(f"Data shape: {df.shape}")
print(df.head())

# Drop rows with missing values (if any)
df.dropna(inplace=True)

# Encode categorical features (e.g., 'citi').
# Each object-dtype column is replaced by integer codes. The fitted encoders
# are kept in `label_encoders` (column name -> LabelEncoder) so the mapping can
# be inspected, inverted with `inverse_transform`, or reused later instead of
# being thrown away.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Split features and target.
# 'image_id' is only used to locate the photo on disk, so it is excluded from
# the tabular feature matrix together with the target column.
y = df['price']
X_tabular = df.drop(['price', 'image_id'], axis=1)

# Normalize tabular data (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed later in this cell, so test-row statistics influence the
# scaling. Fit on the training rows only if this matters for the final model.
scaler = StandardScaler()
X_tabular_scaled = scaler.fit_transform(X_tabular)

# CNN feature extraction model: ImageNet-pretrained InceptionV3 without the
# classification head; pooling='avg' makes the model output one flat feature
# vector per image.
model_cnn = InceptionV3(weights='imagenet', include_top=False, pooling='avg')

def extract_image_features(img_id):
    """Return the CNN embedding of the photo belonging to ``img_id``.

    The photo is looked up as ``<img_folder>/<img_id>.jpg``, resized to
    299x299, run through ``preprocess_input`` and passed through the
    module-level ``model_cnn``.

    Parameters
    ----------
    img_id
        Identifier used to build the file name (e.g. ``0`` -> ``0.jpg``).

    Returns
    -------
    numpy.ndarray
        1-D feature vector whose length equals the last dimension of
        ``model_cnn``'s output. If the image file does not exist, a vector of
        zeros with that same length is returned instead of raising.
    """
    # Take the vector length from the model rather than hard-coding it, so the
    # missing-image placeholder always matches the real embeddings.
    feature_dim = model_cnn.output_shape[-1]

    img_path = os.path.join(img_folder, f"{img_id}.jpg")  # images are named '0.jpg', '1.jpg', ...
    if not os.path.exists(img_path):  # Handle missing images gracefully
        return np.zeros(feature_dim, dtype=np.float32)

    img = image.load_img(img_path, target_size=(299, 299))
    img_array = np.expand_dims(image.img_to_array(img), axis=0)  # add batch axis
    img_array = preprocess_input(img_array)

    # Call the model directly instead of model.predict(): for a single-image
    # batch inside a Python loop this avoids predict()'s per-call setup
    # overhead. training=False keeps the network in inference mode.
    features = model_cnn(img_array, training=False)
    return np.asarray(features).flatten()

# Extract features for all images (one row per entry of df['image_id'], in
# DataFrame order so rows stay aligned with X_tabular_scaled and y).
# np.stack raises immediately if any vector has an unexpected shape.
image_features = np.stack([
    extract_image_features(img_id)
    for img_id in tqdm(df['image_id'], desc="Extracting CNN features")
])

# extract_image_features() returns an all-zero vector when a photo is missing.
# Surface how often that happened so the metrics below are not silently based
# on placeholder features.
n_missing = int((~image_features.any(axis=1)).sum())
if n_missing:
    print(
        f"Warning: {n_missing} of {len(image_features)} rows have all-zero "
        f"image features (image file not found)."
    )

# Combine tabular and image features
X_combined = np.hstack([X_tabular_scaled, image_features])

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# XGBoost regression
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions
y_pred = xgb_model.predict(X_test)

# Evaluation on the held-out 20 %
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")


Data shape: (15474, 8)
   image_id                 street             citi  n_citi  bed  bath  sqft  \
0         0  1317 Van Buren Avenue  Salton City, CA     317    3   2.0  1560   
1         1         124 C Street W      Brawley, CA      48    3   2.0   713   
2         2        2304 Clark Road     Imperial, CA     152    3   1.0   800   
3         3     755 Brawley Avenue      Brawley, CA      48    3   1.0  1082   
4         4  2207 R Carrillo Court     Calexico, CA      55    4   3.0  2547   

    price  
0  201900  
1  228500  
2  273950  
3  350000  
4  385100  
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 0us/step


Extracting CNN features: 100%|███████████████████████████████████████████████████████████████████████████████████████| 15474/15474 [41:01<00:00,  6.29it/s]


RMSE: 257951.43
R² Score: 0.5486
