In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from torchvision.models import VGG16_Weights
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
# Read the listings table from disk.
file_path = 'data/socal.csv'
df = pd.read_csv(file_path)

# Directory holding the listing photos.
image_folder = 'data/socal_pics'


def _image_path_for(image_id):
    """Return the path of the JPEG named after `image_id` inside `image_folder`."""
    return os.path.join(image_folder, f"{image_id}.jpg")


# Add an 'image_path' column derived from each row's 'image_id'.
df['image_path'] = df['image_id'].map(_image_path_for)


In [3]:
# Preprocessing applied to every image before it reaches the network:
# resize to 224x224, convert to a tensor, then normalise each colour channel.
_preprocessing_steps = [
    transforms.Resize(size=(224, 224)),  # input size fed to VGG16
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # per-channel mean used for VGG16 normalisation
        std=[0.229, 0.224, 0.225],   # per-channel standard deviation
    ),
]
transform = transforms.Compose(_preprocessing_steps)

In [4]:
# Build VGG16 with its default pretrained weights (`VGG16_Weights.DEFAULT`) and keep
# only the `features` sub-module; the classifier head is not needed here.
model = models.vgg16(weights=VGG16_Weights.DEFAULT).features
# Put the network in evaluation mode so training-only behaviour is disabled.
# As the last expression of the cell, this also displays the module structure.
model.eval()

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [5]:
def extract_features(image_path, feature_dim=512 * 7 * 7):
    """Return the flattened VGG16 convolutional features for one image.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to load.
    feature_dim : int, optional
        Length of the zero vector returned when the image cannot be loaded.
        The default, 512 * 7 * 7 = 25088, is the flattened size of the VGG16
        `features` output for the 224x224 inputs produced by `transform`, so
        fallback rows have the same length as real feature vectors and the
        results can be stacked into a single 2-D array.

    Returns
    -------
    numpy.ndarray
        1-D feature vector. All zeros (length `feature_dim`) if the image
        could not be opened or decoded.
    """
    try:
        # The context manager releases the file handle once the pixels are decoded.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
    except Exception as e:
        # Best effort: report the problem and substitute a placeholder row
        # instead of aborting the whole extraction run.
        print(f"Error loading image {image_path}: {e}")
        return np.zeros(feature_dim)
    image = transform(image)
    image = image.unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # inference only, no gradients needed
        features = model(image)
    return features.numpy().flatten()

In [6]:
# Run every listed image through the feature extractor, preserving row order.
image_features_list = [extract_features(path) for path in df['image_path']]

In [7]:
# Gather the per-image vectors into a single numpy array.
image_features = np.asarray(image_features_list)

# Persist the array to 'image_features.npz' under the key 'image_features'.
np.savez('image_features.npz', image_features=image_features)

In [8]:
def MSE(y, yhat):
    """Mean squared error between targets `y` and predictions `yhat`.

    Both arguments may be any array-like (list, numpy array, pandas Series).
    They are converted to numpy arrays first so the comparison is always
    element-by-element in positional order; without the conversion, plain
    lists cannot be subtracted and two pandas Series would be aligned on
    their index labels rather than compared position by position.

    Returns the mean of the squared differences as a numpy scalar.
    """
    y = np.asarray(y)
    yhat = np.asarray(yhat)
    return np.mean((y - yhat) ** 2)

def RMSE(y, yhat):
    """Root mean squared error: the square root of `MSE(y, yhat)`."""
    mean_squared = MSE(y, yhat)
    return np.sqrt(mean_squared)

In [9]:
# Reload the image features cached on disk in 'image_features.npz'.
data = np.load("image_features.npz")
image_features = data['image_features']

# Tabular predictors taken from the listings table -> shape (n_samples, 4)
numerical_columns = ['sqft', 'n_citi', 'bed', 'bath']
numerical_features = df[numerical_columns].to_numpy()

# Design matrix: the four tabular columns first, the image features after them.
features = np.concatenate((numerical_features, image_features), axis=1)

# Regression target
targets = df['price']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

# Hyper-parameters for the gradient-boosted regressor.
xgb_params = dict(
    n_estimators=2000,               # upper bound on the number of boosting rounds
    learning_rate=0.1,               # shrinkage applied to each boosting step
    max_depth=6,                     # maximum depth of each tree
    reg_alpha=1,                     # L1 regularization term (lasso)
    reg_lambda=1,                    # L2 regularization term (ridge)
    subsample=0.8,                   # fraction of training rows sampled per tree
    colsample_bytree=0.8,            # fraction of features sampled per tree
    objective='reg:squarederror',    # squared-error regression objective
    early_stopping_rounds=100,       # stop after 100 rounds without validation improvement
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; the validation split drives early stopping.
# NOTE(review): the same validation split is later used to report the final
# metrics, so those numbers may be optimistic - confirm whether a separate
# test split is wanted.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True,  # print the validation metric each round
)

# Predictions on both splits, used for evaluation further down.
y_train_pred = xgb_model.predict(X_train)
y_val_pred = xgb_model.predict(X_val)

# Show per-feature importances for further analysis.
feature_importances = xgb_model.feature_importances_
print("Feature Importances:", feature_importances)


[0]	validation_0-rmse:369739.42073
[1]	validation_0-rmse:357837.77848
[2]	validation_0-rmse:348390.43760
[3]	validation_0-rmse:339438.66674
[4]	validation_0-rmse:332501.95256
[5]	validation_0-rmse:325952.91093
[6]	validation_0-rmse:320488.07200
[7]	validation_0-rmse:315988.26677
[8]	validation_0-rmse:311900.92154
[9]	validation_0-rmse:308500.93247
[10]	validation_0-rmse:306099.17755
[11]	validation_0-rmse:303290.28814
[12]	validation_0-rmse:301148.66910
[13]	validation_0-rmse:299934.50107
[14]	validation_0-rmse:298068.92982
[15]	validation_0-rmse:296604.82368
[16]	validation_0-rmse:295376.01047
[17]	validation_0-rmse:293689.89669
[18]	validation_0-rmse:292292.25544
[19]	validation_0-rmse:291018.80407
[20]	validation_0-rmse:290454.07042
[21]	validation_0-rmse:289443.79651
[22]	validation_0-rmse:287399.02615
[23]	validation_0-rmse:286396.30157
[24]	validation_0-rmse:285226.80806
[25]	validation_0-rmse:284648.58625
[26]	validation_0-rmse:284070.62550
[27]	validation_0-rmse:283579.19612
[2

In [10]:
# Training-set error, computed with the notebook's MSE / RMSE helpers.
train_mse = MSE(y_train, y_train_pred)
train_rmse = RMSE(y_train, y_train_pred)  # square root of the MSE above

print("Training MSE:", train_mse)
print("Training RMSE:", train_rmse)

Training MSE: 88863007.72171761
Training RMSE: 9426.717759735762


In [11]:
# Score the held-out validation split with the fitted model.
y_pred = xgb_model.predict(X_val)

# Validation error, computed with the notebook's MSE / RMSE helpers.
mse = MSE(y_val, y_pred)
rmse = RMSE(y_val, y_pred)  # square root of the MSE above
print("Validation MSE:", mse)
print("Validation RMSE:", rmse)

Validation MSE: 61265448174.18379
Validation RMSE: 247518.58147255084


In [12]:
# Write the fitted model to disk as 'xgb_model.json'.
model_output_path = "xgb_model.json"
xgb_model.save_model(model_output_path)