In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from torchvision.models import VGG16_Weights

In [2]:
# Read the listings table from disk.
file_path = 'data/socal.csv'
df = pd.read_csv(file_path)

# Directory that holds the listing photos, one "<image_id>.jpg" per row.
image_folder = 'data/socal_pics'

# Derive each row's photo location from its 'image_id' and store it in a new
# 'image_path' column so later cells can open the file directly.
df['image_path'] = df['image_id'].map(
    lambda image_id: os.path.join(image_folder, f"{image_id}.jpg")
)

Unnamed: 0,image_id,street,citi,n_citi,bed,bath,sqft,price,image_path
0,0,1317 Van Buren Avenue,"Salton City, CA",317,3,2.0,1560,201900,data/socal_pics\0.jpg
1,1,124 C Street W,"Brawley, CA",48,3,2.0,713,228500,data/socal_pics\1.jpg
2,2,2304 Clark Road,"Imperial, CA",152,3,1.0,800,273950,data/socal_pics\2.jpg
3,3,755 Brawley Avenue,"Brawley, CA",48,3,1.0,1082,350000,data/socal_pics\3.jpg
4,4,2207 R Carrillo Court,"Calexico, CA",55,4,3.0,2547,385100,data/socal_pics\4.jpg


In [3]:
# Preprocessing applied to every photo before it reaches the network:
#   1. resize to 224x224 (the VGG16 input size),
#   2. convert the PIL image to a tensor,
#   3. standardise each RGB channel with the VGG16 normalization constants.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [4]:
# Build the dataset and a shuffled mini-batch loader over it.
#
# DataLoader is imported here because the notebook's import cell does not
# bring it into scope; without this line the cell raises NameError on a
# fresh kernel.
from torch.utils.data import DataLoader

# NOTE(review): RealEstateDataset is not defined in any cell visible in this
# notebook -- it must be defined (or imported) before this cell can run.
# NOTE(review): neither `dataset` nor `dataloader` is read by the later cells
# shown here -- confirm they are still needed.
csv_file = 'data/socal.csv'
image_folder = 'data/socal_pics'
dataset = RealEstateDataset(csv_file, image_folder, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Instantiate VGG16 with its default pretrained weights and keep only the
# `features` sub-module; the classifier part is not needed for extraction.
model = models.vgg16(weights=VGG16_Weights.DEFAULT).features

# Switch to evaluation mode so training-only behaviour (e.g. dropout) is off.
model.eval()

In [5]:
# Length of the flattened `model` output for a single 224x224 image:
# 512 channels x 7 x 7 spatial positions from VGG16's `features` stack.
VGG16_FEATURE_DIM = 512 * 7 * 7


def extract_features(image_path):
    """Return the flattened feature vector of one image file.

    The image is opened, converted to RGB, passed through the module-level
    `transform`, given a leading batch dimension, and run through the
    module-level `model` with gradients disabled.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A 1-D NumPy array holding the flattened model output. If the image
        cannot be loaded, an all-zero vector of length `VGG16_FEATURE_DIM`
        is returned instead, so every row has the same length and the
        results can be stacked into one matrix.
    """
    try:
        # The context manager closes the underlying file once the pixel data
        # has been copied into the RGB image, instead of leaving one open
        # handle per processed photo.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
    except Exception as e:
        # Deliberate best-effort: report the problem and keep going.
        print(f"Error loading image {image_path}: {e}")
        # The placeholder must match the real feature length (the previous
        # value of 4096 did not); float32 is intended to match the network's
        # default output dtype -- TODO confirm if the model dtype is changed.
        return np.zeros(VGG16_FEATURE_DIM, dtype=np.float32)
    batch = transform(image).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():
        features = model(batch)
    return features.numpy().flatten()

Epoch [1/10], Loss: 78756610048.0000
Epoch [2/10], Loss: 45900976128.0000
Epoch [3/10], Loss: 136016019456.0000
Epoch [4/10], Loss: 71221051392.0000
Epoch [5/10], Loss: 60624424960.0000
Epoch [6/10], Loss: 71590608896.0000
Epoch [7/10], Loss: 72826494976.0000
Epoch [8/10], Loss: 67494244352.0000
Epoch [9/10], Loss: 171699077120.0000
Epoch [10/10], Loss: 91075543040.0000


In [None]:
# Run every listing photo through `extract_features`, in DataFrame row order,
# collecting one feature vector per row.
image_features_list = [
    extract_features(photo_path) for photo_path in df['image_path']
]

In [None]:
# Stack the per-image vectors into a single array (one row per listing).
image_features = np.asarray(image_features_list)

# Tabular predictors taken straight from the listings table.
numerical_features = df[['bed', 'bath', 'sqft']].to_numpy()

# Design matrix: image features first, followed by the tabular columns.
X = np.concatenate([image_features, numerical_features], axis=1)

# Regression target.
y = df['price'].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Fit a linear regression on the training split (`fit` returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R^2 Score: {r2}")
