In [2]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from torchvision.models import VGG16_Weights
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the listings table from disk.
file_path = 'data/socal.csv'
df = pd.read_csv(file_path)

# Folder where the images are stored; files are looked up as `<image_id>.jpg`.
image_folder = 'data/socal_pics'


def _image_path_for(image_id):
    """Return the path of the JPEG inside `image_folder` named after `image_id`."""
    return os.path.join(image_folder, f"{image_id}.jpg")


# Add an 'image_path' column derived element-wise from 'image_id'.
df['image_path'] = df['image_id'].map(_image_path_for)

In [3]:
# Preprocessing applied to every image before it is passed to VGG16.
# Per-channel mean / std used by the Normalize step.
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

resize_step = transforms.Resize((224, 224))  # VGG16 input size
to_tensor_step = transforms.ToTensor()
normalize_step = transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD)

# Steps run in list order: resize -> tensor -> normalize.
transform = transforms.Compose([resize_step, to_tensor_step, normalize_step])

In [4]:
# Build VGG16 with its default pretrained weights (`VGG16_Weights.DEFAULT`) and
# keep only the `.features` sub-module; the classifier head is not needed.
model = models.vgg16(weights=VGG16_Weights.DEFAULT).features
# Put the extractor in evaluation mode for inference. As the last expression of
# the cell, this call also displays the layer stack.
model.eval()

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [5]:
# Length of the flattened `vgg16.features` output for one 224x224 RGB image:
# 512 channels x 7 x 7 spatial positions. (4096 is the width of VGG16's
# classifier layers, which this notebook discards.)
VGG16_FEATURE_DIM = 512 * 7 * 7  # 25088


def extract_features(image_path):
    """Return the flattened VGG16 convolutional features for one image.

    Relies on the module-level `transform` and `model` (the VGG16 feature
    extractor) at call time, so it must be called while `model` still refers
    to that extractor.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file to load.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector. If the image cannot be loaded, the error is printed
        and an all-zero vector of length `VGG16_FEATURE_DIM` is returned, so
        every row has the same length and the results can be stacked into a
        rectangular array.
    """
    try:
        # The context manager closes the file handle once the pixel data has
        # been copied into the converted image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
    except Exception as e:
        # Deliberate best-effort: report the problem and keep going with a
        # placeholder that matches the shape and dtype of real features.
        print(f"Error loading image {image_path}: {e}")
        return np.zeros(VGG16_FEATURE_DIM, dtype=np.float32)
    batch = transform(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # inference only; no autograd graph needed
        features = model(batch)
    return features.numpy().flatten()

In [6]:
# Run every image referenced in `df['image_path']` through the extractor,
# keeping the results in row order.
image_features_list = [extract_features(path) for path in df['image_path']]

In [7]:
# Gather the per-image vectors into one numpy array
# (one row per image, provided all vectors have the same length).
image_features = np.asarray(image_features_list)

# Cache the array on disk under the key 'image_features' so it can be reloaded
# with np.load instead of re-running the extraction.
np.savez('image_features.npz', image_features=image_features)

In [8]:
# Build the design matrix and hold out a test split inside this cell so it runs
# on a fresh kernel; previously X_train / X_test / y_train / y_test were not
# defined by any earlier cell.
# NOTE(review): the cell that originally produced these splits is not in the
# notebook. This mirrors the 80/20 split (random_state=42) used for the XGBoost
# model later in the notebook -- confirm it matches the intended experiment.
numerical_features = df[['sqft', 'n_citi', 'bed', 'bath']].values
features = np.hstack((numerical_features, image_features))
X_train, X_test, y_train, y_test = train_test_split(
    features, df['price'], test_size=0.2, random_state=42
)

# Train linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the held-out rows and evaluate the model
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {np.sqrt(mse)}")

Mean Squared Error (MSE): 8.67146494922878e+26
Root Mean Squared Error (RMSE): 29447351237808.777


In [9]:
# Create PyTorch dataset
class CreateDataset(Dataset):
    """Dataset that pairs each feature row with its target as float32 tensors."""

    def __init__(self, features, targets):
        # Convert both inputs to float32 tensors once, up front.
        as_float = torch.float32
        self.features = torch.tensor(features, dtype=as_float)
        self.targets = torch.tensor(targets, dtype=as_float)

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at `idx`."""
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label
    

# Define the neural network model
class NeuralNetRegressor(nn.Module):
    """Fully connected regressor: Linear -> ReLU -> Linear."""

    def __init__(self, input_size=4, hidden_size=64, output_size=2):
        super().__init__()
        # Layers are registered in forward order: fc1, relu, fc2.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch `x` through the hidden layer and return the raw outputs."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [8]:
def MSE(y, yhat):
    """Mean squared error between targets `y` and predictions `yhat`.

    Both inputs are converted to numpy arrays first, so values are compared by
    position (pandas index labels are ignored) and plain Python sequences are
    accepted. When the two arrays hold the same number of elements but differ
    in shape, e.g. `(n,)` targets against `(n, 1)` model outputs, `yhat` is
    reshaped to match `y` instead of being broadcast into an `(n, n)` grid.
    Other shape combinations, such as a scalar prediction, follow normal numpy
    broadcasting.

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    y_arr = np.asarray(y)
    yhat_arr = np.asarray(yhat)
    if y_arr.shape != yhat_arr.shape and y_arr.size == yhat_arr.size:
        # Same element count, different layout: line the values up one-to-one.
        yhat_arr = yhat_arr.reshape(y_arr.shape)
    return np.mean((y_arr - yhat_arr) ** 2)

def RMSE(y, yhat):
    """Root mean squared error: the square root of `MSE(y, yhat)`."""
    mean_sq_err = MSE(y, yhat)
    return np.sqrt(mean_sq_err)

In [4]:
# Fix the torch RNG so weight initialisation and batch shuffling are repeatable
# (42 matches the random_state used elsewhere in this notebook).
torch.manual_seed(42)

# Extract numerical features
numerical_features = df[['sqft', 'n_citi', 'bed', 'bath']].values  # Shape: (n_samples, 4)

# Combine numerical features with image features
features = np.hstack((numerical_features, image_features))  # Shape: (n_samples, total_feature_dim)

# Extract targets
targets = df['price']

# Create PyTorch dataset
X = features
Y = targets.to_numpy()
dataset = CreateDataset(X, Y)
data_loader = DataLoader(dataset, batch_size=200, shuffle=True)

# Determine the input size based on the combined features
input_size = features.shape[1]

# Create the model instance. It is named `nn_model` so the VGG16 extractor bound
# to `model` (used by `extract_features`) is not overwritten by this cell.
nn_model = NeuralNetRegressor(input_size=input_size, hidden_size=32, output_size=1)

# Define the loss function and the optimizer
cost_function = nn.MSELoss()  # Mean Squared Error Loss for regression
optimizer = optim.Adam(nn_model.parameters(), lr=0.0001)

num_epochs = 2400
for epoch in range(num_epochs):
    # Batch variables get their own names so the full-data X / Y stay intact.
    for X_batch, Y_batch in data_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        Yh_batch = nn_model(X_batch)

        # Compute the loss; targets are (batch,) so add a column axis to match
        # the (batch, 1) model output.
        loss = cost_function(Yh_batch, torch.unsqueeze(Y_batch, 1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Progress report: loss of the last batch of the epoch.
    if epoch % 480 == 0:
        print('epoch', epoch, 'loss', loss.detach().numpy())

# Neural network RMSE on the same rows it was trained on (in-sample error).
with torch.no_grad():  # evaluation only; skip building the autograd graph
    Yh = nn_model(torch.tensor(features, dtype=torch.float32))
Yh = Yh.numpy().flatten()  # (n_samples, 1) -> (n_samples,) to match Y
RMSE(Y, Yh)

NameError: name 'df' is not defined

###### epoch 0 loss 530075580000.0
#epoch 480 loss 71383040000.0
#epoch 960 loss 79980945000.0
#epoch 1440 loss 81245790000.0
#epoch 1920 loss 69038590000.0
#278979.65244980756

In [4]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load the cached image features. The context manager closes the .npz archive
# once the array has been read into memory.
with np.load("image_features.npz") as data:
    # The file is expected to contain an array under the key 'image_features'
    image_features = data['image_features']

# Every row of the table needs exactly one feature vector; fail early with a
# clear message if the cache was built from a different table.
assert image_features.shape[0] == len(df), (
    f"image_features has {image_features.shape[0]} rows but df has {len(df)}"
)

# Extract numerical features
numerical_features = df[['sqft', 'n_citi', 'bed', 'bath']].values  # Shape: (n_samples, 4)

# Combine numerical features with image features
features = np.hstack((numerical_features, image_features))  # Shape: (n_samples, total_feature_dim)

# Extract targets
targets = df['price']

# Split the data into training and validation sets (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=42)

# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor(
    n_estimators=3000,       # Number of boosting rounds
    learning_rate=0.1,      # Step size shrinkage used to prevent overfitting
    max_depth=5,            # Maximum depth of a tree
    random_state=42,        # Seed for reproducibility
    objective='reg:squarederror'  # Objective function for regression
)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = xgb_model.predict(X_train)

Training MSE: 2060929.6903489055
Training RMSE: 1435.593845887097


In [11]:
# Training-set error of the fitted XGBoost model, via the notebook's helpers.
train_rmse = RMSE(y_train, y_train_pred)  # square root of the MSE below
train_mse = MSE(y_train, y_train_pred)

print("Training MSE:", train_mse)
print("Training RMSE:", train_rmse)

Training MSE: 2060929.6903489055
Training RMSE: 1435.593845887097


In [9]:
# Score the fitted XGBoost model on the held-out validation rows.
y_pred = xgb_model.predict(X_val)

# Validation error: RMSE is the square root of the MSE.
rmse = RMSE(y_val, y_pred)
mse = MSE(y_val, y_pred)
print("Validation MSE:", mse)
print("Validation RMSE:", rmse)

Validation MSE: 59710451393.06767
Validation RMSE: 244357.22087359658


In [12]:
# Persist the trained regressor to "xgb_model.json" using XGBoost's own
# save_model (JSON output, chosen by the file extension).
# NOTE(review): the earlier hint about using "xgb_model.bin" refers to XGBoost's
# legacy binary format, which recent releases deprecate -- confirm against the
# installed XGBoost version before switching formats.
xgb_model.save_model("xgb_model.json")