In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121, ResNet50
from tensorflow.keras.applications.densenet import preprocess_input as densenet_preprocess
from tensorflow.keras.applications.resnet import preprocess_input as resnet_preprocess
from tensorflow.keras.preprocessing import image
import xgboost as xgb
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm

# Paths
csv_path = r"C:\Users\sagni\Downloads\House Pricing\archive (1)\socal2.csv"
img_folder = r"C:\Users\sagni\Downloads\House Pricing\archive (1)\socal2\socal_pics"
cache_file = r"C:\Users\sagni\Downloads\House Pricing\archive (1)\cnn_features.npy"

# Load dataset
df = pd.read_csv(csv_path)
print(f"Data shape: {df.shape}")

# Drop every row that has at least one missing value, so the target and all
# tabular features are fully populated from here on.
df.dropna(inplace=True)

# Integer-encode each string-typed column; a fresh encoder is fitted per
# column, so the codes of different columns are unrelated.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Target and features ('image_id' is dropped from the model inputs together
# with the target column).
y = df['price']
X_tabular = df.drop(['price', 'image_id'], axis=1)

# Scale tabular data.
# NOTE: the scaler is fitted on all rows, i.e. before the train/test split
# performed later in this script.
scaler = StandardScaler()
X_tabular_scaled = scaler.fit_transform(X_tabular)

# Select CNN model (DenseNet121 or ResNet50)
cnn_model_name = "DenseNet121"  # Change to "ResNet50" for ResNet

# Each supported backbone is paired with its own preprocessing function.
# An explicit lookup makes a misspelled name fail loudly instead of silently
# falling back to ResNet50.
_cnn_backbones = {
    "DenseNet121": (DenseNet121, densenet_preprocess),
    "ResNet50": (ResNet50, resnet_preprocess),
}
if cnn_model_name not in _cnn_backbones:
    raise ValueError(
        f"Unsupported cnn_model_name {cnn_model_name!r}; "
        f"expected one of {sorted(_cnn_backbones)}"
    )
_backbone_cls, preprocess_func = _cnn_backbones[cnn_model_name]

# include_top=False with pooling='avg' yields one pooled feature vector per
# image instead of ImageNet class scores.
cnn_model = _backbone_cls(weights='imagenet', include_top=False, pooling='avg')

# Feature extraction function
def extract_image_features(img_id):
    """Return the pooled CNN feature vector for the image named ``<img_id>.jpg``.

    The image is looked up in ``img_folder``, resized to 224x224, run through
    ``preprocess_func`` and then through ``cnn_model``.

    Args:
        img_id: Identifier used to build the file name ``f"{img_id}.jpg"``.

    Returns:
        A 1-D NumPy array of length ``cnn_model.output_shape[1]``. If the
        image file does not exist, an all-zero vector of that length is
        returned so the caller still gets one row per sample.
    """
    img_path = os.path.join(img_folder, f"{img_id}.jpg")
    if not os.path.exists(img_path):
        # Best-effort: a missing picture contributes a neutral zero vector.
        return np.zeros(cnn_model.output_shape[1])

    img = image.load_img(img_path, target_size=(224, 224))  # ResNet/DenseNet input size
    batch = np.expand_dims(image.img_to_array(img), axis=0)  # add batch axis
    batch = preprocess_func(batch)

    # Call the model directly rather than through predict(): predict() is
    # built for large batched inputs and carries noticeable per-call overhead
    # when invoked once per image inside a loop. training=False keeps layers
    # such as BatchNormalization in inference mode.
    features = cnn_model(batch, training=False)
    return np.asarray(features).flatten()

# Check for cached features.
# A cache is only trusted if it has one row per dataframe row and the feature
# width of the currently selected CNN; otherwise (e.g. the CSV or the backbone
# changed since the cache was written) the features are extracted again and
# the stale file is overwritten.
expected_shape = (len(df), cnn_model.output_shape[1])
image_features = None
if os.path.exists(cache_file):
    print("Loading cached CNN features...")
    image_features = np.load(cache_file)
    if image_features.shape != expected_shape:
        print(
            f"Cached CNN features have shape {image_features.shape}, "
            f"expected {expected_shape}; re-extracting."
        )
        image_features = None

if image_features is None:
    print("Extracting CNN features (this may take ~40 mins)...")
    image_features = np.array([
        extract_image_features(img_id)
        for img_id in tqdm(df['image_id'], desc="Extracting CNN features")
    ])
    np.save(cache_file, image_features)
    print("CNN features cached to:", cache_file)

# Combine tabular and CNN features (tabular columns first, CNN columns after).
X_combined = np.hstack([X_tabular_scaled, image_features])

# Human-readable name for every column of X_combined, in column order.
# Used only to label the feature-importance plot below.
feature_names = [str(c) for c in X_tabular.columns] + [
    f"cnn_{i}" for i in range(image_features.shape[1])
]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# XGBoost with hyperparameter tuning
param_grid = {
    'n_estimators': [200, 500],
    'max_depth': [6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# The grid search already runs its fits in parallel (n_jobs=-1), so each
# individual booster is limited to one thread to avoid thread contention
# between the two levels of parallelism.
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=1)
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print("Best XGBoost Parameters:", grid_search.best_params_)

# Predictions
y_pred = best_model.predict(X_test)

# Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

# Plot: Actual vs Predicted (the dashed red diagonal marks perfect predictions)
price_range = [y_test.min(), y_test.max()]
plt.figure(figsize=(8,8))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(price_range, price_range, 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted House Prices')
plt.show()

# Plot: XGBoost Feature Importance.
# The model was trained on a bare NumPy array, so the booster only knows the
# features as "f0", "f1", ...; translate those keys back to real names so the
# plot is readable. 'weight' is plot_importance's own default importance type.
raw_scores = best_model.get_booster().get_score(importance_type='weight')
named_scores = {
    feature_names[int(key[1:])]: score for key, score in raw_scores.items()
}
xgb.plot_importance(named_scores, max_num_features=10, height=0.5)
plt.title('Top 10 XGBoost Feature Importances')
plt.show()


Data shape: (15474, 8)
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
29084464/29084464 ━━━━━━━━━━━━━━━━━━━━ 26s 1us/step
Extracting CNN features (this may take ~40 mins)...


Extracting CNN features: 100%|███████████████████████████████████████████████████████████████████████████████████████| 15474/15474 [39:13<00:00,  6.57it/s]


CNN features cached to: C:\Users\sagni\Downloads\House Pricing\archive (1)\cnn_features.npy
Fitting 3 folds for each of 32 candidates, totalling 96 fits
