# Imports 

In [1]:
# 1. Load Data
# import time
import json
import pandas as pd
import os
import config
import pandas as pd
import numpy as np
from datetime import datetime
import re

# File name of the raw appraisals JSON dataset.
# NOTE(review): load_appraisals_data() defaults to config.RAW_DATA_FILE rather
# than this notebook-level constant -- confirm the two values agree.
RAW_DATA_FILE = "appraisals_dataset.json"

# Load Files - Data Loader

In [5]:
import json
import pandas as pd
import os
import config
import utils


def load_appraisals_data(file_name=config.RAW_DATA_FILE):
    """Read the appraisals JSON file that sits beside ``config.py``.

    The file may hold either a bare list of appraisals or a dictionary that
    wraps that list under an ``'appraisals'`` key; the wrapper is removed
    when present. Progress and problems are reported with ``print`` rather
    than raised.

    Args:
        file_name (str, optional): Name of the data file, joined onto the
            directory that contains ``config.py``. Defaults to
            ``config.RAW_DATA_FILE``.

    Returns:
        The list of appraisals on success, or ``None`` when the file is
        missing, is not valid JSON, does not contain a list, or any other
        error occurs while loading.
    """
    # Anchor on config.py's own directory so the resolved path does not
    # depend on the caller's working directory.
    data_path = os.path.join(os.path.dirname(config.__file__), file_name)
    print(f"Loading {data_path}...")

    try:
        with open(data_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Some exports nest the list one level down under 'appraisals'.
        is_wrapped = isinstance(payload, dict) and 'appraisals' in payload
        if is_wrapped:
            payload = payload['appraisals']
            print("Note: Data was unwrapped from an 'appraisals' key in the JSON structure.")

        if not isinstance(payload, list):
            # Unexpected top-level shape: describe what was found and give up.
            problem = f"Error: Loaded data from {data_path} is not a list of appraisals as expected. Type found: {type(payload)}."
            problem += (
                f" Number of top-level elements: {len(payload)}."
                if hasattr(payload, '__len__')
                else " Data does not have a defined length."
            )
            print(problem)
            return None

        print(f"Successfully loaded {len(payload)} appraisals.")
        return payload

    except FileNotFoundError:
        print(f"Error: The file {data_path} was not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: The file {data_path} is not a valid JSON file.")
        return None
    except Exception as exc:  # Last-resort net for any other file/JSON failure
        print(f"An unexpected error occurred while loading {data_path}: {exc}")
        return None



In [6]:
def perform_initial_eda(appraisals_data):
    """Print a first-pass exploratory summary of the loaded appraisals.

    Reports the record count, the key layout of the first appraisal (its
    subject, chosen comps and candidate properties), size statistics for the
    ``properties`` and ``comps`` lists across all appraisals, and how many
    subjects lack a usable GLA, lot size, or latitude/longitude.

    Args:
        appraisals_data: Sequence of appraisal dictionaries. A falsy value
            (``None`` or empty) prints a notice and returns immediately.

    Returns:
        None. Everything is written to standard output.
    """
    if not appraisals_data:
        print("No data to perform EDA on.")
        return

    def subject_field_missing(record, field):
        # "Missing" means utils.safe_float gives back something pandas
        # treats as NA for that subject field.
        return pd.isna(utils.safe_float(record.get('subject', {}).get(field)))

    print("\n--- Exploratory Data Analysis ---")
    total = len(appraisals_data)
    print(f"Total number of appraisals: {total}")

    # --- Shape of the first record -------------------------------------
    sample = appraisals_data[0]
    print("\nStructure of the first appraisal (keys):")
    print(list(sample.keys()))

    subject = sample.get('subject', {})
    print("\nKeys in the 'subject' property of the first appraisal:")
    print(list(subject.keys()))
    print(f"Subject property address: {subject.get('address')}")
    print(
        f"Subject GLA: {subject.get('gla')}, "
        f"Beds: {subject.get('num_beds')}, "
        f"Baths: {subject.get('num_baths')}"
    )

    chosen = sample.get('comps', [])
    print(f"\nNumber of chosen comparables (comps) in the first appraisal: {len(chosen)}")
    if chosen:
        first_comp = chosen[0]
        print("Keys in the first chosen 'comp':")
        print(list(first_comp.keys()))
        print(f"First chosen comp address: {first_comp.get('address')}")

    candidates = sample.get('properties', [])
    print(f"\nNumber of potential comparables ('properties') in the first appraisal: {len(candidates)}")
    if candidates:
        print("Keys in the first 'property' from the potential list:")
        print(list(candidates[0].keys()))

    # --- Candidate-pool size across all appraisals ---------------------
    pool_sizes = [len(record.get('properties', [])) for record in appraisals_data]
    if pool_sizes:
        print("\nStatistics for 'properties' list per appraisal:")
        pool_series = pd.Series(pool_sizes)
        print(f"  Min: {pool_series.min()}")
        print(f"  Max: {pool_series.max()}")
        print(f"  Avg: {pool_series.mean():.2f}")
        print(f"  Median: {pool_series.median()}")

    # --- How many comps were chosen per appraisal ----------------------
    comp_counts = [len(record.get('comps', [])) for record in appraisals_data]
    if comp_counts:
        print("\nDistribution of number of chosen 'comps' per appraisal:")
        print(pd.Series(comp_counts).value_counts())

    # --- Missingness of key subject fields -----------------------------
    gla_missing = sum(
        1 for record in appraisals_data if subject_field_missing(record, 'gla')
    )
    print(f"\nNumber of appraisals with missing subject_gla: {gla_missing} out of {total}")

    lot_missing = sum(
        1 for record in appraisals_data
        if subject_field_missing(record, 'lot_size_sf')
    )
    print(f"Number of appraisals with missing subject_lot_sf: {lot_missing} out of {total}")

    # A subject counts as lacking coordinates when either component is missing.
    coords_missing = sum(
        1 for record in appraisals_data
        if subject_field_missing(record, 'latitude')
        or subject_field_missing(record, 'longitude')
    )
    print(f"Number of subjects missing direct lat/lon: {coords_missing} out of {total}")

    # Closing separator (note the trailing space, kept from the original script)
    print("\n--- End of Initial EDA --- ")

In [7]:
# Load the appraisal records using the loader's default file name.
# load_appraisals_data() returns None (after printing an error) if loading fails.
appraisals_data = load_appraisals_data()

Loading /Users/haroon/Projects /Headstarter/CompRecommendation/src/appraisals_dataset.json...
Note: Data was unwrapped from an 'appraisals' key in the JSON structure.
Successfully loaded 88 appraisals.


# Feature Engineering 

In [8]:
import geocoding_utils
import feature_engineering
# 3. Load Geocoding Cache
# No file name is passed: per the original note, the loader takes its default
# file name from config and builds the path itself.
geocoding_cache = geocoding_utils.load_geocoding_cache()
    

Loaded geocoding cache from /Users/haroon/Projects /Headstarter/CompRecommendation/src/geocoding_cache.json with 88 entries.


In [9]:
# 4. Feature Engineering
# Build the feature DataFrame from the appraisals. The geocoding cache is
# passed in and rebound from the return value, because the call hands back a
# (potentially updated) cache alongside the features.
df_features, geocoding_cache = feature_engineering.create_feature_dataframe(
    appraisals_data, geocoding_cache)


Processing appraisals to create features...
Value counts for is_chosen_comp directly after DataFrame creation:
is_chosen_comp
0    9717
1     103
Name: count, dtype: int64

Created DataFrame with 9820 rows and 38 columns.

Imputing medians for specific diff features and creating missingness indicators...
  Processed column: bed_diff. Missing: 386, Median used: 1.0
  Processed column: room_diff. Missing: 169, Median used: 2.0
Processing column: distance_to_subject
  Created 'distance_to_subject_missing' indicator. Median used: 1.381468883835596
Processing column: bath_diff
  Created 'bath_diff_missing' indicator column. Median used: 0.5
Processing column: age_diff
  Created 'age_diff_missing' indicator column. Median used: 13.0

Checking and imputing interaction/polynomial features...
  Imputed 2937 NaNs in 'distance_squared' with 0.0.
  Imputed 4150 NaNs in 'age_diff_squared' with 0.0.
  Imputed 5467 NaNs in 'dist_X_age_diff' with 0.0.

--- End of Feature Engineering ---


In [10]:
# 5. Save updated geocoding cache
# Persist the cache returned by feature engineering. No file name is passed:
# per the original note, the default comes from config and the path is built
# inside the function.
geocoding_utils.save_geocoding_cache(geocoding_cache)

Saved geocoding cache to /Users/haroon/Projects /Headstarter/CompRecommendation/src/geocoding_cache.json with 88 entries.


In [11]:
# 6. Describe Engineered Features (Optional)
# Prints a summary-statistics table (count/mean/std/min/quartiles/max) for the
# engineered columns; the return value is not used.
feature_engineering.describe_engineered_features(df_features)


--- Engineered Features Description (Post-Imputation) ---
                              count        mean          std        min        25%         50%         75%           max
days_since_sale              9820.0   41.321181    32.467724   0.000000   15.00000   33.000000   64.000000    180.000000
bed_diff                     9820.0    1.141039     1.293070   0.000000    0.00000    1.000000    1.000000      7.000000
bath_diff                    9820.0    0.593941     0.405004   0.000000    0.50000    0.500000    0.500000      8.500000
age_diff                     9820.0   18.742770    19.394029   0.000000   10.00000   13.000000   17.000000    120.000000
room_diff                    9820.0    3.157637     2.752957   0.000000    1.00000    2.000000    4.000000     20.000000
distance_to_subject          9820.0    1.506970     1.074992   0.003466    1.10727    1.381469    1.615361     18.350098
gla_diff                      139.0  508.410072   420.287735  56.000000  256.00000  256.000000

# Model Training and Evaluation

In [12]:
import model_pipeline

## XGBoost

In [13]:
# XGBoost
# Train and evaluate an XGBoost model on the engineered features. The return
# value is discarded; the split shapes, classification report, AUPRC and
# feature importances are reported through the printed output.
model_pipeline.train_evaluate_model(df_features, model_name='XGBoost')




--- Starting XGBoost Model Training and Evaluation ---

Shape of X (features): (9820, 12)
Shape of y (target): (9820,)

Performing stratified train/test split...
Shape of X_train: (7856, 12), y_train: (7856,)
Shape of X_test: (1964, 12), y_test: (1964,)
Distribution of target in y_train:
is_chosen_comp
0    98.956212
1     1.043788
Name: proportion, dtype: float64
Distribution of target in y_test:
is_chosen_comp
0    98.930754
1     1.069246
Name: proportion, dtype: float64

Scaling features using StandardScaler...
Calculated scale_pos_weight: 94.80

Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model training complete.

Evaluating XGBoost model on the test set (default 0.5 threshold)...

Classification Report (0.5 threshold):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1943
           1       0.44      0.33      0.38        21

    accuracy                           0.99      1964
   macro avg       0.72      0.66      0.69      1964
weighted avg       0.99      0.99      0.99      1964


Average Precision Score (AUPRC) for XGBoost: 0.2987

Feature Importances (XGBoost - Gain/Weight):
prop_sold_after_eff            0.480706
fsa_match                      0.175935
distance_to_subject            0.054616
bath_diff                      0.050126
age_diff                       0.049035
bed_diff_missing               0.032142
distance_to_subject_missing    0.030815
days_since_sale                0.028803
age_diff_missing               0.028596
bed_diff                       0.027028
struct_type_match              0

## LightGBM

In [14]:
# LightGBM
# Same pipeline entry point as the other models, selecting LightGBM by name.
# The return value is discarded; results are reported through printed output.
model_pipeline.train_evaluate_model(df_features, model_name='LightGBM')



--- Starting LightGBM Model Training and Evaluation ---

Shape of X (features): (9820, 12)
Shape of y (target): (9820,)

Performing stratified train/test split...
Shape of X_train: (7856, 12), y_train: (7856,)
Shape of X_test: (1964, 12), y_test: (1964,)
Distribution of target in y_train:
is_chosen_comp
0    98.956212
1     1.043788
Name: proportion, dtype: float64
Distribution of target in y_test:
is_chosen_comp
0    98.930754
1     1.069246
Name: proportion, dtype: float64

Scaling features using StandardScaler...
Calculated scale_pos_weight: 94.80

Training LightGBM model...
[LightGBM] [Info] Number of positive: 82, number of negative: 7774
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 558
[LightGBM] [Info] Number of data points in the train set: 7856, number of used features

## Logistic Regression

In [15]:

# Logistic Regression
# Same pipeline entry point as the other models, selecting logistic regression
# by name. The return value is discarded; results are reported through
# printed output.
model_pipeline.train_evaluate_model(
df_features, model_name='LogisticRegression')




--- Starting LogisticRegression Model Training and Evaluation ---

Shape of X (features): (9820, 12)
Shape of y (target): (9820,)

Performing stratified train/test split...
Shape of X_train: (7856, 12), y_train: (7856,)
Shape of X_test: (1964, 12), y_test: (1964,)
Distribution of target in y_train:
is_chosen_comp
0    98.956212
1     1.043788
Name: proportion, dtype: float64
Distribution of target in y_test:
is_chosen_comp
0    98.930754
1     1.069246
Name: proportion, dtype: float64

Scaling features using StandardScaler...
Calculated scale_pos_weight: 94.80

Training Logistic Regression model...
LogisticRegression model training complete.

Evaluating LogisticRegression model on the test set (default 0.5 threshold)...

Classification Report (0.5 threshold):
              precision    recall  f1-score   support

           0       1.00      0.67      0.80      1943
           1       0.03      0.90      0.06        21

    accuracy                           0.68      1964
   macro a

## KNN 

In [16]:
# KNN
# Same pipeline entry point as the other models, selecting KNN by name. Per
# the recorded output, the pipeline applies SMOTE to the training data for
# this model before scaling. The return value is discarded.
model_pipeline.train_evaluate_model(df_features, model_name='KNN')



--- Starting KNN Model Training and Evaluation ---

Shape of X (features): (9820, 12)
Shape of y (target): (9820,)

Performing stratified train/test split...
Shape of X_train: (7856, 12), y_train: (7856,)
Shape of X_test: (1964, 12), y_test: (1964,)
Distribution of target in y_train:
is_chosen_comp
0    98.956212
1     1.043788
Name: proportion, dtype: float64
Distribution of target in y_test:
is_chosen_comp
0    98.930754
1     1.069246
Name: proportion, dtype: float64

Attempting SMOTE for KNN training data...
Applying SMOTE with k_neighbors=5 for KNN.
Shape of X_train after SMOTE: (15548, 12)
Distribution of target in y_train after SMOTE:
is_chosen_comp
0    50.0
1    50.0
Name: proportion, dtype: float64

Scaling features using StandardScaler...
Calculated scale_pos_weight: 94.80

Training KNN model...
KNN model training complete.

Evaluating KNN model on the test set (default 0.5 threshold)...

Classification Report (0.5 threshold):
              precision    recall  f1-score   