# Data Exploration for Comp Recommendation System

This notebook explores the appraisals dataset to understand the structure and prepare for model development.

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add src to path
# The src/ directory is located relative to the current working directory
# (one level up, then "src"), so this assumes the kernel was started from a
# directory directly below the project root (e.g. notebooks/) — TODO confirm.
sys.path.append(str(Path.cwd().parent / "src"))

# Project-local loader; presumably resolved through the src/ entry added to
# sys.path above, so it has to be imported after that line.
from utils.data_utils import load_appraisals_data

# IPython magic selecting matplotlib's inline backend for this kernel.
%matplotlib inline
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set_style("whitegrid")

In [None]:
# Load the data
# The dataset path is built relative to the current working directory
# (../data/appraisals_dataset.json) and handed to the project loader as a
# plain string.
data_path = Path.cwd().parent.joinpath("data", "appraisals_dataset.json")
df = load_appraisals_data(str(data_path))

# Report the frame's dimensions, then show the first rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Display column names
# One bullet per column, in the order the columns appear in the frame.
print("Columns in dataset:")
for column_name in df.columns:
    print(f"  - {column_name}")

In [None]:
# Check data types
# Bare last expression: the per-column dtypes are rendered as the cell output.
df.dtypes

In [None]:
# Basic statistics for numeric columns
# describe() is called with its defaults and rendered as the cell output.
# NOTE(review): pandas' default restricts the summary to numeric columns only
# when the frame has at least one — confirm against the loaded schema.
df.describe()

In [None]:
# Examine subject property fields
# Collect the columns whose names start with the "subject." prefix.
subject_cols = list(filter(lambda name: name.startswith("subject."), df.columns))
print(f"\nSubject property fields ({len(subject_cols)}):")
# Preview only the first 10 names to keep the output short.
for field_name in subject_cols[:10]:
    print(f"  - {field_name}")

In [None]:
# Examine a sample subject property
# Show the first appraisal's value for each of the first five subject fields.
print("Sample subject property (first appraisal):")
for field_name in subject_cols[:5]:
    first_value = df[field_name].iloc[0]
    print(f"{field_name}: {first_value}")

In [None]:
# Examine properties structure
# Look at the "properties" entry of the first appraisal: its Python type, and,
# when it is a non-empty list, the first few key/value pairs of its first item.
#
# The lookup is hoisted into a local so no quoted key has to appear inside an
# f-string replacement field; the earlier `df[\"properties\"]` form there was a
# SyntaxError and prevented the cell from running.
first_properties = df["properties"].iloc[0]

print("Properties field (first appraisal):")
print(f"Type: {type(first_properties)}")
if isinstance(first_properties, list):
    print(f"Number of properties: {len(first_properties)}")
    if first_properties:
        print("\nSample property (first one):")
        sample = first_properties[0]
        if isinstance(sample, dict):
            # Only the first five fields, to keep the preview readable.
            for key, value in list(sample.items())[:5]:
                print(f"  {key}: {value}")

In [None]:
# Examine comps structure
# Look at the "comps" entry of the first appraisal: its Python type, and, when
# it is a non-empty list, the first few key/value pairs of its first item.
#
# The lookup is hoisted into a local so no quoted key has to appear inside an
# f-string replacement field; the earlier `df[\"comps\"]` form there was a
# SyntaxError and prevented the cell from running.
first_comps = df["comps"].iloc[0]

print("Comps field (first appraisal):")
print(f"Type: {type(first_comps)}")
if isinstance(first_comps, list):
    print(f"Number of comps selected: {len(first_comps)}")
    if first_comps:
        print("\nSample comp (first one):")
        sample = first_comps[0]
        if isinstance(sample, dict):
            # Only the first five fields, to keep the preview readable.
            for key, value in list(sample.items())[:5]:
                print(f"  {key}: {value}")

In [None]:
# Distribution of number of properties per appraisal
# Per-appraisal count of entries in "properties"; any value that is not a
# list is counted as zero.
num_properties = df["properties"].apply(
    lambda candidates: 0 if not isinstance(candidates, list) else len(candidates)
)

# Histogram of the counts, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(num_properties, bins=20, edgecolor="black", alpha=0.7)
ax.set_xlabel("Number of Properties")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Number of Properties per Appraisal")
ax.grid(axis="y", alpha=0.3)
plt.show()

# Summary figures to accompany the plot.
print(f"Average number of properties: {num_properties.mean():.2f}")
print(f"Min: {num_properties.min()}, Max: {num_properties.max()}")

In [None]:
# Distribution of number of comps selected per appraisal
# Per-appraisal count of entries in "comps"; any value that is not a list is
# counted as zero.
num_comps = df["comps"].apply(
    lambda selected: 0 if not isinstance(selected, list) else len(selected)
)

# Histogram of the counts, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(num_comps, bins=20, edgecolor="black", alpha=0.7, color="orange")
ax.set_xlabel("Number of Comps Selected")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Number of Comps Selected per Appraisal")
ax.grid(axis="y", alpha=0.3)
plt.show()

# Summary figures to accompany the plot.
print(f"Average number of comps selected: {num_comps.mean():.2f}")
print(f"Min: {num_comps.min()}, Max: {num_comps.max()}")

In [None]:
# Selection rate: comps / properties
# num_properties is 0 for appraisals whose "properties" entry is empty or not
# a list, and the rate is undefined there. Dividing by zero would put inf/NaN
# into the result, which makes the histogram's automatic bin range non-finite
# (plt.hist raises) and corrupts the mean. Mask those denominators so the rate
# is NaN for such appraisals; selection_rate keeps the same index as its inputs.
selection_rate = num_comps / num_properties.where(num_properties > 0)

# Only defined rates are plotted and averaged.
valid_rates = selection_rate.dropna()

plt.figure(figsize=(10, 6))
plt.hist(valid_rates, bins=20, edgecolor="black", alpha=0.7, color="green")
plt.xlabel("Selection Rate (Comps / Properties)")
plt.ylabel("Frequency")
plt.title("Distribution of Comp Selection Rate")
plt.grid(axis="y", alpha=0.3)
plt.show()

print(f"Average selection rate: {valid_rates.mean():.2%}")

# Make the exclusion visible so the average is not misread as covering
# every appraisal.
num_undefined = len(selection_rate) - len(valid_rates)
if num_undefined:
    print(f"Appraisals excluded (no properties): {num_undefined}")