In [1]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Drive folder path; the training feature-extraction cell hard-codes the same
# path as image_dir.
# NOTE(review): the TEST_DF download cell later rebinds OUT_DIR to
# "data/satellite_images_test".
OUT_DIR = "/content/drive/MyDrive/satellite_property_valuation/satellite_images"


In [4]:
# Load cdc_train.csv into df; the cells below add columns to this frame and
# rescale it in place.
df = pd.read_csv("/content/cdc_train.csv")

In [5]:
# Parse 'date' strings of the form YYYYMMDDTHHMMSS into pandas datetimes.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%dT%H%M%S')


In [6]:
# Natural log of price, computed with one vectorized NumPy call on the whole
# column instead of a Python-level lambda applied row by row.
df['log_price'] = np.log(df['price'])

In [7]:
# Display the column index of df (now including the derived 'log_price').
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'log_price'],
      dtype='object')

In [8]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardized (same names, same order).
numeric_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat',
    'long', 'sqft_living15', 'sqft_lot15',
]

# Fit the scaler on df, then overwrite those columns with the scaled values.
# The fitted `scaler` is reused on test_df in a later cell.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [9]:
# Load cdc_ds_test.csv into test_df.
test_df= pd.read_csv('/content/cdc_ds_test.csv')

In [10]:
# Apply the already-fitted scaler (transform only, no refit) to the same
# columns of test_df.
# NOTE(review): the test feature-extraction cell later re-reads test_df from
# CSV, which replaces this scaled frame.
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])


# **Model**

In [11]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm

In [12]:
# ResNet-18 with pretrained weights, used as an image feature extractor.
# `pretrained=True` is deprecated in torchvision >= 0.13; the `weights` enum
# selects the same IMAGENET1K_V1 checkpoint (resnet18-f37072fd.pth).
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
# Replace the final fully-connected layer with a pass-through so the forward
# pass returns the features that would have fed the classifier.
model.fc = nn.Identity()
# Switch to evaluation mode; as the cell's last expression this also
# displays the module tree.
model.eval()



Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 166MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [13]:
# Per-channel mean/std passed to Normalize (the standard ImageNet values used
# with torchvision's pretrained models).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Pipeline: resize to 224x224 -> convert to tensor -> normalize each channel.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
preprocess = transforms.Compose(_preprocess_steps)

In [14]:
def extract_features(image_path):
    """Return the model's feature vector for one image file.

    Relies on the notebook globals ``preprocess``, ``model`` and ``device``
    being defined by the time the function is called.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A flattened NumPy array of the model output. On any failure (missing
        file, unreadable image, model error) the error is printed and a
        512-element float32 zero vector is returned so the caller's loop can
        continue.
    """
    try:
        # The context manager closes the file handle promptly; convert()
        # returns an in-memory copy, so it stays usable after the file closes.
        with Image.open(image_path) as img:
            rgb = img.convert('RGB')
        img_t = preprocess(rgb).unsqueeze(0).to(device)
        with torch.no_grad():
            features = model(img_t)
        return features.cpu().numpy().flatten()
    except Exception as e:
        # Deliberate best-effort: report the problem instead of aborting a
        # long extraction run.
        print(f"Actual Error: {e}")  # Add this to see the hidden problem
        # float32 matches torch's default output dtype, so a single failed
        # image does not upcast the whole feature table to float64.
        return np.zeros(512, dtype=np.float32)

In [20]:
import os

folder_path = "/content/drive/MyDrive/House_Price_Project_Final"

# Guard clause first: report a bad path, otherwise preview up to three entries.
if not os.path.exists(folder_path):
    print("Folder path is incorrect. Please check the spelling.")
else:
    files = os.listdir(folder_path)
    print("Files found in folder:", files[:3])

Files found in folder: ['images', 'features', 'models']


In [None]:
import os
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# 1. Configuration
# Same Drive folder path as OUT_DIR defined near the top of the notebook.
image_dir = "/content/drive/MyDrive/satellite_property_valuation/satellite_images"
# CSV file that receives both the periodic checkpoints and the final table.
save_path = "/content/drive/MyDrive/satellite_property_valuation/image_features_FINAL.csv"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Preparation
model.to(device)
model.eval()
# One feature vector is appended per id, in the same order as house_ids.
all_features = []
house_ids = df['id'].tolist()

if os.path.exists(image_dir):
    print(f"Starting extraction for {len(house_ids)} images...")

    # Column labels shared by the checkpoint tables and the final table.
    feat_cols = [f'img_feat_{j}' for j in range(512)]

    # 4. The Loop
    for i, hid in enumerate(tqdm(house_ids)):
        # Images are looked up as "<id>.png" inside image_dir.
        all_features.append(extract_features(os.path.join(image_dir, f"{hid}.png")))

        # --- EMERGENCY SAVE EVERY 1000 IMAGES ---
        # Each checkpoint rewrites save_path with everything collected so far.
        if (i + 1) % 1000 == 0:
            temp_df = pd.DataFrame(all_features, columns=feat_cols)
            temp_df['id'] = house_ids[:len(all_features)]
            temp_df.to_csv(save_path, index=False)
            print(f" | Progress saved to Drive at image {i+1}")

    # 5. Final Create and Save
    image_feat_df = pd.DataFrame(all_features, columns=feat_cols).assign(id=house_ids)

    # FINAL SAVE TO DRIVE
    image_feat_df.to_csv(save_path, index=False)

    print("\nExtraction Complete and Saved to Drive!")
    print(f"DataFrame Shape: {image_feat_df.shape}")
else:
    print(f"CRITICAL ERROR: Folder not found at {image_dir}")

Starting extraction for 16209 images...


  0%|          | 0/16209 [00:00<?, ?it/s]

 | Progress saved to Drive at image 1000
 | Progress saved to Drive at image 2000
 | Progress saved to Drive at image 3000
 | Progress saved to Drive at image 4000
 | Progress saved to Drive at image 5000
 | Progress saved to Drive at image 6000
 | Progress saved to Drive at image 7000
 | Progress saved to Drive at image 8000
 | Progress saved to Drive at image 9000
 | Progress saved to Drive at image 10000
 | Progress saved to Drive at image 11000
 | Progress saved to Drive at image 12000
 | Progress saved to Drive at image 13000
 | Progress saved to Drive at image 14000
 | Progress saved to Drive at image 15000
 | Progress saved to Drive at image 16000

Extraction Complete and Saved to Drive!
DataFrame Shape: (16209, 513)


In [None]:
# Check for any "dead" extractions: a row counts as dead only when every one
# of its first 512 columns is exactly zero. Testing the values directly avoids
# the row-sum shortcut, which would also flag rows whose non-zero values
# happen to cancel out.
zero_rows = image_feat_df.iloc[:, :512].eq(0).all(axis=1).sum()
print(f"Rows with zero features: {zero_rows}")

# Look at the first few rows to ensure variety
display(image_feat_df.head())

Rows with zero features: 0


Unnamed: 0,img_feat_0,img_feat_1,img_feat_2,img_feat_3,img_feat_4,img_feat_5,img_feat_6,img_feat_7,img_feat_8,img_feat_9,...,img_feat_503,img_feat_504,img_feat_505,img_feat_506,img_feat_507,img_feat_508,img_feat_509,img_feat_510,img_feat_511,id
0,0.0186,0.144798,1.311415,0.09811,0.403078,0.616295,0.909187,1.496928,1.968919,1.119215,...,0.157892,0.532916,0.560708,0.0,0.210917,0.703232,1.010655,0.664332,0.0,9117000170
1,0.472351,0.135211,3.182259,0.246961,0.07265,0.065367,0.62535,0.898638,1.593249,2.361856,...,0.319023,0.390403,1.706257,0.0,1.016085,1.792019,2.009611,0.178773,0.393116,6700390210
2,0.420289,0.310904,1.3261,0.389269,0.651462,0.44913,0.279591,1.420183,2.154287,1.26913,...,1.525771,0.520503,2.366577,0.114988,2.148061,0.528476,1.505134,0.181216,0.133635,7212660540
3,0.042957,0.426265,1.759102,0.172925,0.227874,0.401043,1.491914,0.761423,1.569707,1.30895,...,0.958589,0.638155,1.24396,0.085733,2.25353,0.013246,1.573162,0.881866,0.056145,8562780200
4,0.144827,0.198062,0.813048,0.010523,0.046097,0.662091,0.856346,1.604122,1.597418,1.217113,...,0.627851,0.00811,1.552384,0.115803,1.206524,0.439638,1.19021,0.331199,0.00033,7760400350


In [None]:
# Number of feature vectors currently held in all_features.
print(len(all_features))

16209


In [None]:
import cv2
import numpy as np
import os
from tqdm.auto import tqdm

def extract_color_features(img_path):
    """Measure how much of an image falls inside two HSV colour bands.

    Args:
        img_path: Path of the image file to read with OpenCV.

    Returns:
        (green_pct, grey_pct): the fraction of pixels inside the "green" band
        and inside the "grey/concrete" band. If OpenCV cannot read the file
        the result is (0, 0).
    """
    bgr = cv2.imread(img_path)
    if bgr is None:
        return 0, 0

    # OpenCV loads images as BGR; thresholds below are expressed in HSV.
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)

    # (lower, upper) HSV bounds for each band.
    green_bounds = (np.array([35, 40, 40]), np.array([85, 255, 255]))
    grey_bounds = (np.array([0, 0, 50]), np.array([180, 50, 200]))

    height, width = bgr.shape[:2]
    total_pixels = height * width

    def band_fraction(bounds):
        # inRange marks in-band pixels with a non-zero value.
        mask = cv2.inRange(hsv, bounds[0], bounds[1])
        return (mask > 0).sum() / total_pixels

    return band_fraction(green_bounds), band_fraction(grey_bounds)

# --- APPLY TO YOUR DATAFRAME ---
# NOTE(review): combined_df is not created in any cell visible here; it must
# already exist in the kernel before this cell runs.
tqdm.pandas()
# Image files are looked up as "<id>.png" inside image_dir; int() strips any
# float formatting from the id before it is placed in the file name.
results = combined_df['id'].progress_apply(
    lambda x: extract_color_features(os.path.join(image_dir, f"{int(x)}.png"))
)

# Split the (green, grey) tuples into two new columns
combined_df[['green_score', 'concrete_score']] = pd.DataFrame(results.tolist(), index=combined_df.index)

# Status message with its check-mark character restored (it had been
# mis-decoded into three garbage characters).
print("✅ New Semantic Features Extracted!")

  0%|          | 0/16407 [00:00<?, ?it/s]

✅ New Semantic Features Extracted!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Define your save path (Change 'My Drive/Project' to your actual folder)
# NOTE: this rebinds save_path, which the feature-extraction cell used for a
# CSV file path, to a directory path.
save_path = '/content/drive/MyDrive/house_price_project/'
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create gap of a separate os.path.exists() test.
os.makedirs(save_path, exist_ok=True)

# 1. Save as CSV (Easy to view)
combined_df.to_csv(os.path.join(save_path, 'combined_data_final_semantic.csv'), index=False)

# 2. Save as Pickle (Best for Python/Data Types)
# Only load this pickle back from a location you trust: unpickling can execute code.
combined_df.to_pickle(os.path.join(save_path, 'combined_data_final_semantic.pkl'))

# Status message with its folder character restored (it had been mis-decoded
# into four garbage characters).
print(f"📂 Data safely saved to: {save_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📂 Data safely saved to: /content/drive/MyDrive/house_price_project/


**TEST_DF**

In [21]:
import os
import time
import requests
import pandas as pd
import cv2

# ---------------- CONFIG ----------------
# The Mapbox access token is a credential, so it is read from the environment
# instead of being committed to the notebook. Set it before running this cell,
# e.g. os.environ["MAPBOX_TOKEN"] = "<your token>".
# The token that used to be hard-coded here should be treated as exposed and
# rotated in the Mapbox account.
MAPBOX_TOKEN = os.environ.get("MAPBOX_TOKEN", "")
if not MAPBOX_TOKEN:
    print("MAPBOX_TOKEN is not set; image downloads will be rejected with HTTP 401.")
ZOOM = 16
IMG_SIZE = 256
# Updated to point to your test CSV
CSV_PATH = "/content/cdc_ds_test.csv"
# Updated directory name to distinguish test images
OUT_DIR = "data/satellite_images_test"

# Ensure output directory exists
os.makedirs(OUT_DIR, exist_ok=True)

def _get_with_retry(session, url, max_retries, retry_wait):
    """GET ``url``, sleeping and retrying while the server answers HTTP 429.

    Returns the last response received, which still has status 429 when every
    attempt was rate-limited.
    """
    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        response = session.get(url, timeout=15)
        if response.status_code != 429 or attempt == attempts - 1:
            return response
        print(f"Rate limit reached. Sleeping for {retry_wait} seconds...")
        time.sleep(retry_wait)


def download_images(max_retries=3, retry_wait=10):
    """Download one Mapbox satellite tile per row of the CSV at CSV_PATH.

    Reads the notebook globals CSV_PATH, OUT_DIR, ZOOM, IMG_SIZE and
    MAPBOX_TOKEN. Each image is written to ``OUT_DIR/<id>.png``; rows whose
    file already exists are skipped, so the function can be re-run to resume.

    Args:
        max_retries: Extra attempts made for a row after an HTTP 429 before
            giving up on it (previously a rate-limited row was skipped after
            a single sleep).
        retry_wait: Seconds to sleep after each HTTP 429.

    Returns:
        None. Progress and errors are reported with print().
    """
    # 1. Load Data
    try:
        df = pd.read_csv(CSV_PATH)
        print(f"Successfully loaded {len(df)} rows from {CSV_PATH}")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # 2. Use a Session for better performance
    with requests.Session() as session:
        for i, row in df.iterrows():
            pid = row["id"]
            lat = row["lat"]
            lon = row["long"]

            out_path = os.path.join(OUT_DIR, f"{pid}.png")

            # Skip if image already exists
            if os.path.exists(out_path):
                continue

            # Mapbox Static API URL
            url = (
                f"https://api.mapbox.com/styles/v1/mapbox/satellite-v9/static/"
                f"{lon},{lat},{ZOOM},0,0/{IMG_SIZE}x{IMG_SIZE}"
                f"?access_token={MAPBOX_TOKEN}"
            )

            try:
                response = _get_with_retry(session, url, max_retries, retry_wait)
                status = response.status_code

                if status == 200:
                    # Write to a temporary name and rename afterwards, so an
                    # interrupted write never leaves a truncated "<id>.png"
                    # that the skip-if-exists check would accept on re-run.
                    tmp_path = out_path + ".part"
                    with open(tmp_path, "wb") as f:
                        f.write(response.content)
                    os.replace(tmp_path, out_path)
                elif status == 401:
                    # A bad token fails every request; stop the whole run.
                    print("Error: Invalid Mapbox Token.")
                    break
                else:
                    # Includes 429 when all retries were rate-limited.
                    print(f"Row {i} (ID {pid}): HTTP {status}")

            except Exception as e:
                # Best-effort: report the failure and move on to the next row.
                print(f"Failed to download ID {pid}: {e}")

            # Progress monitoring
            if i % 100 == 0 and i > 0:
                print(f"Progress: {i} images processed...")

            # Small pause between requests.
            time.sleep(0.1)

    print("\nExtraction process complete.")

if __name__ == "__main__":
    download_images()

    # Verify the results
    files = os.listdir(OUT_DIR)
    print("Total test images downloaded:", len(files))
    if len(files) > 0:
        sample = cv2.imread(os.path.join(OUT_DIR, files[0]))
        # cv2.imread returns None (it does not raise) for a file it cannot
        # decode, so guard before touching .shape.
        if sample is None:
            print("Sample image could not be read:", files[0])
        else:
            print("Sample image shape:", sample.shape)

Successfully loaded 5404 rows from /content/cdc_ds_test.csv
Progress: 100 images processed...
Progress: 200 images processed...
Progress: 300 images processed...
Progress: 400 images processed...
Progress: 500 images processed...
Progress: 600 images processed...
Progress: 700 images processed...
Progress: 800 images processed...
Progress: 900 images processed...
Progress: 1000 images processed...
Progress: 1100 images processed...
Progress: 1200 images processed...
Progress: 1300 images processed...
Progress: 1400 images processed...
Progress: 1500 images processed...
Progress: 1600 images processed...
Progress: 1700 images processed...
Progress: 1800 images processed...
Progress: 1900 images processed...
Progress: 2000 images processed...
Progress: 2100 images processed...
Progress: 2200 images processed...
Progress: 2300 images processed...
Progress: 2400 images processed...
Progress: 2500 images processed...
Progress: 2600 images processed...
Progress: 2700 images processed...
Prog

In [22]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm.auto import tqdm

# ---------------- CONFIGURATION ----------------
# Use the directory where you just downloaded the test images
image_dir = "data/satellite_images_test"
# Output path for test features
save_path = "/content/drive/MyDrive/satellite_property_valuation/image_features_TEST.csv"
# Ensure the test CSV is loaded to get the IDs
# (re-reading rebinds test_df, replacing the scaled frame built near the top
# of the notebook)
test_df = pd.read_csv("/content/cdc_ds_test.csv")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------- PREPARATION ----------------
model.to(device)
model.eval()
# Reset the accumulator: one feature vector is appended per id, in the same
# order as house_ids.
all_features = []
house_ids = test_df['id'].tolist()

def extract_features(image_path):
    """Return the model's feature vector for one image file.

    Relies on the notebook globals ``preprocess``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A flattened NumPy array of the model output. If the image is missing
        or corrupt (or anything else fails) the error is printed and a
        512-element float32 zero vector is returned so the caller's loop can
        continue.
    """
    try:
        # The context manager closes the file handle promptly; convert()
        # returns an in-memory copy, so it stays usable after the file closes.
        with Image.open(image_path) as img:
            rgb = img.convert('RGB')
        img_t = preprocess(rgb).unsqueeze(0).to(device)
        with torch.no_grad():
            features = model(img_t)
        return features.cpu().numpy().flatten()
    except Exception as e:
        # Report the failure rather than swallowing it silently, using the
        # same message format as the earlier definition of this function.
        print(f"Actual Error: {e}")
        # float32 matches torch's default output dtype, so a single failed
        # image does not upcast the whole feature table to float64.
        return np.zeros(512, dtype=np.float32)

# ---------------- THE EXTRACTION LOOP ----------------
if os.path.exists(image_dir):
    print(f"Starting extraction for {len(house_ids)} test images...")

    # Column labels shared by the checkpoint tables and the final table.
    feat_cols = [f'img_feat_{j}' for j in range(512)]

    for i, hid in enumerate(tqdm(house_ids)):
        # Images are looked up as "<id>.png" inside image_dir.
        all_features.append(extract_features(os.path.join(image_dir, f"{hid}.png")))

        # Emergency save every 500 images for test set
        # (each checkpoint rewrites save_path with everything collected so far)
        if (i + 1) % 500 == 0:
            temp_df = pd.DataFrame(all_features, columns=feat_cols)
            temp_df['id'] = house_ids[:len(all_features)]
            temp_df.to_csv(save_path, index=False)
            print(f" | Progress saved at image {i+1}")

    # ---------------- FINAL SAVE ----------------
    image_feat_df = pd.DataFrame(all_features, columns=feat_cols).assign(id=house_ids)

    # Final Save to Drive
    image_feat_df.to_csv(save_path, index=False)

    print("\nTest Feature Extraction Complete!")
    print(f"Test DataFrame Shape: {image_feat_df.shape}")
else:
    print(f"CRITICAL ERROR: Folder not found at {image_dir}")

Starting extraction for 5404 test images...


  0%|          | 0/5404 [00:00<?, ?it/s]

 | Progress saved at image 500
 | Progress saved at image 1000
 | Progress saved at image 1500
 | Progress saved at image 2000
 | Progress saved at image 2500
 | Progress saved at image 3000
 | Progress saved at image 3500
 | Progress saved at image 4000
 | Progress saved at image 4500
 | Progress saved at image 5000

Test Feature Extraction Complete!
Test DataFrame Shape: (5404, 513)


In [23]:
import os
import pandas as pd

# 1. Define Paths
target_folder = "/content/drive/MyDrive/House_Price_Project_Final"
target_filename = "test_image_features.csv"
full_save_path = os.path.join(target_folder, target_filename)
# The frame written out is the one produced by the feature-extraction cell.
source_data = image_feat_df

# 2. Create the destination folder when it is missing
folder_missing = not os.path.exists(target_folder)
if folder_missing:
    os.makedirs(target_folder)
    print(f"Created new directory: {target_folder}")

# 3. Save the CSV, reporting (not raising) any failure
try:
    source_data.to_csv(full_save_path, index=False)
except Exception as e:
    print(f"An error occurred while saving: {e}")
else:
    print("--- SUCCESS ---")
    print(f"File saved to: {full_save_path}")
    print(f"Total rows saved: {len(source_data)}")

# 4. Verify the file is actually there
if os.path.exists(full_save_path):
    print("Verification: File exists in Drive.")

--- SUCCESS ---
File saved to: /content/drive/MyDrive/House_Price_Project_Final/test_image_features.csv
Total rows saved: 5404
Verification: File exists in Drive.
