# Install required libraries

In [None]:
!pip install tensorflow

In [None]:
!pip install opencv-python

# PATHS

In [None]:
import numpy as np
# Point `np.object` at the builtin `object` (workaround for the FutureWarning).
setattr(np, "object", object)

import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Default dataset folder, relative to the notebook's working directory.
dataset_path = "Houses-dataset/HousesDataset/"
# Expected location of HousesInfo.txt inside that folder.
info_file = os.path.join(dataset_path, "HousesInfo.txt")

print(" Target file: " + info_file)

In [None]:
import os
import pandas as pd

# 1. Search for the file automatically
def find_houses_info():
    """Locate ``HousesInfo.txt`` somewhere below the current working directory.

    Returns:
        The path of the first match encountered by ``os.walk(".")``, or
        ``None`` when no directory in the tree contains the file.
    """
    target = "HousesInfo.txt"
    matches = (
        os.path.join(folder, target)
        for folder, _subdirs, filenames in os.walk(".")
        if target in filenames
    )
    return next(matches, None)

info_file = find_houses_info()

if not info_file:
    # Nothing was found anywhere below the working directory.
    print("‚ùå Error: HousesInfo.txt is still missing.")
    print("Verify that your folder isn't empty (sometimes 'No space left' results in empty folders).")
else:
    print(f"‚úÖ Found data at: {info_file}")
    # Remember the containing folder so the images can be located later.
    dataset_path = os.path.dirname(info_file)

    # The file is read without a header row, so column names are supplied here.
    cols = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
    df = pd.read_csv(info_file, sep=" ", header=None, names=cols)
    print("‚úÖ Data Loaded Successfully!")
    print(df.head())

# 1. LOAD TABULAR DATA

In [None]:
import cv2
import matplotlib.pyplot as plt

# Folder that contains the discovered HousesInfo.txt
dataset_path = os.path.split(info_file)[0]

def load_and_show_samples(df, path, count=5):
    """Show the ``<n>_main.jpg`` images of the first rows next to their prices.

    Args:
        df: DataFrame with a ``price`` column. The row at position ``i`` is
            paired with the image file named ``f"{i+1}_main.jpg"``.
        path: Directory in which the image files are looked up.
        count: Maximum number of leading rows to show. Values larger than
            ``len(df)`` are clamped instead of raising ``IndexError``.

    Images that cannot be read are reported on stdout and skipped.
    """
    count = min(count, len(df))
    if count <= 0:
        print("No rows to display.")
        return

    # Read prices from the column so each value keeps the column's dtype;
    # row-wise access can upcast it to float when other columns are floats.
    prices = df["price"]

    plt.figure(figsize=(15, 5))
    for i in range(count):
        # Filenames are built with a 1-based house number.
        img_name = f"{i+1}_main.jpg"
        img_path = os.path.join(path, img_name)

        img = cv2.imread(img_path)
        if img is None:
            print(f"Skipping {img_name}: Not found in {path}")
            continue

        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(1, count, i + 1)
        plt.imshow(img)
        plt.title(f"Price: ${prices.iloc[i]:,}")
        plt.axis("off")
    plt.show()

# Preview the first few houses using the default sample count.
load_and_show_samples(df=df, path=dataset_path)

# Normalize

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# 1. Scale Tabular Features
# Bedrooms, bathrooms and area are the model's tabular inputs; each column is
# rescaled by MinMaxScaler.
# NOTE(review): the scaler and max_price are fitted on all rows, before the
# train/test split performed later in the notebook - confirm this is intended.
feature_columns = ["bedrooms", "bathrooms", "area"]
scaler = MinMaxScaler()
tabular_features = scaler.fit_transform(df[feature_columns])

# 2. Scale the Target (Price)
# Dividing by the largest price maps that price to exactly 1.
prices = df["price"]
max_price = prices.max()
target_prices = prices.values / max_price

print(f"‚úÖ Tabular data normalized. Max price for scaling: ${max_price:,}")

# 2. LOAD IMAGES 

In [None]:
import os
# Peek at the first 10 entries of the dataset folder
files = os.listdir(dataset_path)
print("Actual files found in folder:", files[:10], sep="\n")

In [None]:
import cv2
import numpy as np

def load_all_images_robust(df, path):
    """Load one 64x64 image per DataFrame row, trying several filename patterns.

    For index label ``i`` the names ``{i+1}_main.jpg``, ``{i+1}_frontal.jpg``
    and ``{i+1}.jpg`` are tried in that order inside ``path``. The first one
    that exists and that ``cv2.imread`` reads (non-``None``) is used.

    Args:
        df: DataFrame whose index labels support ``i + 1`` and ``i < 5``.
        path: Directory containing the image files.

    Returns:
        A tuple ``(images, valid_indices)``: the NumPy array built from the
        resized images (pixel values divided by 255.0) and the list of index
        labels for which an image was loaded.
    """
    suffixes = ("_main.jpg", "_frontal.jpg", ".jpg")
    loaded = []
    kept = []

    print("Scanning for images...")
    for i in df.index:
        house_no = i + 1
        matched = False

        for suffix in suffixes:
            candidate = os.path.join(path, f"{house_no}{suffix}")
            if not os.path.exists(candidate):
                continue
            pixels = cv2.imread(candidate)
            if pixels is None:
                continue
            loaded.append(cv2.resize(pixels, (64, 64)) / 255.0)
            kept.append(i)
            matched = True
            break

        # Report only the first few misses to keep the output short.
        if not matched and i < 5:
            print(f"Still can't find image for row {house_no}")

    return np.array(loaded), kept

# Run the loader over every row of the table
images_data, valid_indices = load_all_images_robust(df=df, path=dataset_path)

# Keep only the rows / features / targets whose image was actually loaded,
# so the three collections stay aligned with images_data.
df_filtered = df.iloc[valid_indices]
tabular_features_filtered, target_prices_filtered = (
    tabular_features[valid_indices],
    target_prices[valid_indices],
)

print(f"‚úÖ Successfully matched {len(images_data)} images with tabular data.")

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, concatenate

# BRANCH 1: small MLP over the three tabular features
tab_input = Input(shape=(3,))  # bedrooms, bathrooms, area
x = Dense(units=16, activation="relu")(tab_input)
x = Dense(units=8, activation="relu")(x)

# BRANCH 2: single-convolution CNN over the 64x64 3-channel image
img_input = Input(shape=(64, 64, 3))
y = Conv2D(filters=16, kernel_size=(3, 3), padding="same", activation="relu")(img_input)
y = MaxPooling2D(pool_size=(2, 2))(y)
y = Flatten()(y)
y = Dense(units=16, activation="relu")(y)

# MERGE: join both branch outputs into one feature vector
combined = concatenate([x, y])

# REGRESSION HEAD: one linear unit produces the price prediction
z = Dense(units=4, activation="relu")(combined)
z = Dense(units=1, activation="linear")(z)

model = Model(inputs=[tab_input, img_input], outputs=z)
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

print("‚úÖ Multimodal Model Architecture Ready.")

In [None]:
import cv2
import os
import numpy as np

def find_image_folder():
    """Find the first directory below "." that holds a matching image file.

    A file matches when its name ends with ``.jpg`` and contains ``_``.
    The directory is printed when found.

    Returns:
        The directory path as yielded by ``os.walk(".")``, or ``None`` when
        no directory contains a matching file.
    """
    for folder, _subdirs, filenames in os.walk("."):
        if any(name.endswith(".jpg") and "_" in name for name in filenames):
            print(f"‚úÖ Found images in: {folder}")
            return folder
    return None

def load_images_any_name(df, path):
    """Load one 64x64 image per DataFrame row from ``path``.

    For index label ``i`` every ``.jpg`` file whose name starts with
    ``f"{i+1}_"`` is a candidate. Candidates are tried in alphabetical order
    and the first one ``cv2.imread`` can read is used, so the choice does not
    depend on the arbitrary order returned by ``os.listdir``.

    Args:
        df: DataFrame whose index labels support ``i + 1``.
        path: Directory containing the image files.

    Returns:
        A tuple ``(images, valid_indices)``: the NumPy array built from the
        resized images (pixel values divided by 255.0) and the list of index
        labels for which an image was loaded.
    """
    # Scan the folder once and group the .jpg files by the text before the
    # first underscore, instead of rescanning the listing for every row.
    files_by_house = {}
    for filename in sorted(os.listdir(path)):
        if "_" in filename and filename.endswith(".jpg"):
            house_key = filename.split("_", 1)[0]
            files_by_house.setdefault(house_key, []).append(filename)

    all_images = []
    valid_indices = []
    for i in df.index:
        for filename in files_by_house.get(f"{i+1}", ()):
            image = cv2.imread(os.path.join(path, filename))
            if image is None:
                # Unreadable file: fall through to the next candidate.
                continue
            image = cv2.resize(image, (64, 64))
            all_images.append(image / 255.0)
            valid_indices.append(i)
            break

    return np.array(all_images), valid_indices


image_dir = find_image_folder()

if not image_dir:
    print("‚ùå ERROR: No .jpg files found. Please check if your dataset download finished.")
else:
    images_data, valid_indices = load_images_any_name(df, image_dir)
    # Keep the tabular features and targets aligned with the loaded images.
    tabular_features_filtered = tabular_features[valid_indices]
    target_prices_filtered = target_prices[valid_indices]
    print(f"Final Count - Images: {len(images_data)}, Tabular: {len(tabular_features_filtered)}")

# Split and Train

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, concatenate

# 1. SPLIT: 75% for training, 25% for testing
# The three arrays are split together so rows stay aligned across them.
(trainTab, testTab, trainImg, testImg, trainY, testY) = train_test_split(
    tabular_features_filtered, images_data, target_prices_filtered,
    test_size=0.25, random_state=42
)

# 2. DEFINE MULTIMODAL ARCHITECTURE
# Tabular Branch
tab_in = Input(shape=(3,))
x = Dense(16, activation="relu")(tab_in)
x = Dense(8, activation="relu")(x)

# Image Branch (CNN)
img_in = Input(shape=(64, 64, 3))
y = Conv2D(16, (3, 3), activation='relu', padding="same")(img_in)
y = MaxPooling2D(pool_size=(2, 2))(y)
y = Flatten()(y)
y = Dense(16, activation='relu')(y)

# Merge the outputs of both branches
combined = concatenate([x, y])

# Final prediction layers
z = Dense(4, activation="relu")(combined)
z = Dense(1, activation="linear")(z)

model = Model(inputs=[tab_in, img_in], outputs=z)
model.compile(loss="mse", optimizer="adam", metrics=["mae"])

# 3. START TRAINING
# Keep the History object returned by fit() so the loss curves of this
# training run are available without fitting the model a second time.
# The test split doubles as the validation set here.
print("Model training in progress...")
history = model.fit(x=[trainTab, trainImg], y=trainY,
                    validation_data=([testTab, testImg], testY),
                    epochs=20, batch_size=8)

# 4. CALCULATE REAL-WORLD ERROR
preds = model.predict([testTab, testImg])
# Targets are prices divided by max_price; multiply back to get dollars
mae_dollars = np.mean(np.abs(preds.flatten() - testY)) * max_price

print("\n" + "="*30)
print("üöÄ TASK COMPLETE!")
print(f"Final Prediction Error: ${mae_dollars:,.2f}")
print("="*30)

In [None]:
# Ensure the "history =" part is there before model.fit
# NOTE: `model` keeps its current weights between fit() calls, so running this
# cell trains for 20 further epochs and `history` covers only those epochs.
history = model.fit(
    x=[trainTab, trainImg],
    y=trainY,
    validation_data=([testTab, testImg], testY),
    epochs=20,
    batch_size=8,
)

In [None]:
import matplotlib.pyplot as plt

# Draw the per-epoch training and validation loss on one set of axes
plt.figure(figsize=(10, 5))
for key, label in (("loss", "Train Loss"), ("val_loss", "Val Loss")):
    plt.plot(history.history[key], label=label)
plt.title("Model Learning Curve (Loss)")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

# Final Results for GitHub

In [None]:
# evaluate() returns the loss followed by the compiled metrics: here MSE, MAE.
mse, mae = model.evaluate([testTab, testImg], testY)
# Both values are measured on targets that were divided by max_price, so they
# must be multiplied by max_price before being reported as dollar amounts.
print(f"Final MAE (Mean Absolute Error): ${mae * max_price:,.2f}")
print(f"Final RMSE: ${np.sqrt(mse) * max_price:,.2f}")