In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from torchvision.models import VGG16_Weights

In [2]:
# Locations of the listings table and of the directory holding the listing photos.
file_path = 'data/socal.csv'
image_folder = 'data/socal_pics'

# Read the listings table into a DataFrame.
df = pd.read_csv(file_path)


def _image_path_for(image_id):
    """Build the path of the JPEG inside `image_folder` named after `image_id`."""
    return os.path.join(image_folder, f"{image_id}.jpg")


# Add an 'image_path' column derived from each row's 'image_id'.
df['image_path'] = df['image_id'].apply(_image_path_for)

In [3]:
# Per-channel statistics used to normalise images before they are fed to VGG16.
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]

# Preprocessing pipeline applied to every image:
#   1. resize to the 224x224 input size used for VGG16,
#   2. convert the PIL image to a tensor,
#   3. normalise each channel with the statistics above.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=normalize_mean, std=normalize_std),
])

In [4]:
# Build VGG16 with its default pretrained weights (`weights=VGG16_Weights.DEFAULT`)
# and keep only the convolutional `features` stack; the classifier head is discarded
# because the network is used purely as a feature extractor.
model = models.vgg16(weights=VGG16_Weights.DEFAULT).features
# Switch to evaluation (inference) mode. As the cell's last expression, this also
# displays the layer stack.
model.eval()

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [5]:
# Length of the flattened output of torchvision's VGG16 `features` stack for a single
# 224x224 image: 512 channels x 7 x 7 spatial positions (224 halved by five max-pools).
FEATURE_DIM = 512 * 7 * 7  # 25088


def extract_features(image_path):
    """Return the flattened VGG16 convolutional features for one image.

    The image is opened, converted to RGB, preprocessed with the notebook-level
    `transform`, and passed through the notebook-level `model` without tracking
    gradients.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to read.

    Returns
    -------
    numpy.ndarray
        1-D feature vector. If the image cannot be opened or decoded, an error
        message is printed and an all-zero vector of length `FEATURE_DIM` is
        returned instead, so that every row has the same length and the vectors
        can be stacked into a single 2-D array.
    """
    try:
        image = Image.open(image_path).convert('RGB')
    except Exception as e:
        # Best effort: report the problem and fall back to a placeholder.
        print(f"Error loading image {image_path}: {e}")
        # The placeholder must have the same length as a real feature vector
        # (25088, not 4096); otherwise stacking the per-image vectors fails.
        # float32 avoids upcasting the stacked array (torch's default dtype).
        return np.zeros(FEATURE_DIM, dtype=np.float32)
    image = transform(image)
    image = image.unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # Inference only; no autograd graph needed
        features = model(image)
    return features.numpy().flatten()

In [None]:
# Run every listing's photo through the feature extractor, keeping row order,
# and collect one feature vector per row.
image_features_list = [extract_features(path) for path in df['image_path']]

In [None]:
# Regression target: the listing price.
y = df['price'].values

# Design matrix: one row per listing, the image feature vector first,
# followed by the tabular columns (bed, bath, sqft).
image_features = np.array(image_features_list)
numerical_features = df[['bed', 'bath', 'sqft']].values
X = np.hstack([image_features, numerical_features])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a linear regression on the training split (`fit` returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics.
print(f"Mean Squared Error (MSE): {mse}")
print(f"R^2 Score: {r2}")
