In [3]:
# In a notebook cell add ! if running here, or run in terminal
!pip install numpy pandas scikit-learn matplotlib tensorflow




In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [5]:
# UPDATE these paths to your files
TRAIN_CSV = r"C:\path\to\zillow\cleaned\city_time_series_train.csv"
TEST_CSV  = r"C:\path\to\zillow\cleaned\city_time_series_test.csv"

TARGET = 'zhvi_allhomes'
FEATURES = [
    'zhvi_middletier', 'zhvi_singlefamilyresidence', 'zhvi_toptier',
    'zhvipersqft_allhomes', 'zhvi_bottomtier', 'zhvi_4bedroom',
    'zhvi_3bedroom', 'zhvi_5bedroomormore', 'zhvi_2bedroom', 'zhvi_condocoop',
    'zri_allhomes', 'zri_allhomesplusmultifamily', 'zri_singlefamilyresidencerental',
    'pricetorentratio_allhomes', 'zripersqft_allhomes', 'year',
    'month', 'pctofhomesdecreasinginvalues_allhomes', 'pctofhomesincreasinginvalues_allhomes',
    'inventoryraw_allhomes', 'inventoryseasonallyadjusted_allhomes'
]

TIMESTEPS = 12   # 12 months -> predict next month
BATCH_SIZE = 128
EPOCHS = 30
MODEL_DIR = r"C:\Users\HP\Desktop\Reality_AI\zillow\lstm_model"
os.makedirs(MODEL_DIR, exist_ok=True)


In [6]:
train = pd.read_csv(TRAIN_CSV, parse_dates=['date'])
test  = pd.read_csv(TEST_CSV, parse_dates=['date'])

print("Train:", train.shape, "Test:", test.shape)

# Sort by region and date ‚Äî essential for sequence creation
train = train.sort_values(['regionname','date']).reset_index(drop=True)
test  = test.sort_values(['regionname','date']).reset_index(drop=True)


Train: (2888110, 29) Test: (857820, 29)


In [7]:
# ‚ö†Ô∏è Work with only a few regions first to avoid memory crash
sample_regions = train['regionname'].unique()[:10]  # first 10 regions
train_sample = train[train['regionname'].isin(sample_regions)]
test_sample  = test[test['regionname'].isin(sample_regions)]

print("Train sample shape:", train_sample.shape)
print("Test sample shape:", test_sample.shape)


Train sample shape: (1465, 29)
Test sample shape: (520, 29)


In [8]:
# This function converts time-series data into sequences
def create_sequences(df, features, target, timesteps):
    X_list, y_list = [], []
    groups = df.groupby('regionname')
    for region, g in groups:
        g = g.sort_values('date')
        vals = g[features].values
        targets = g[target].values
        n = len(g)
        if n <= timesteps:
            continue
        for i in range(n - timesteps):
            X_window = vals[i:i+timesteps]
            y_val = targets[i+timesteps]
            if np.isnan(X_window).any() or np.isnan(y_val):
                continue
            X_list.append(X_window)
            y_list.append(y_val)
    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.float32)
    return X, y


In [9]:
TIMESTEPS = 12  # 12 months per sequence
TARGET = 'zhvi_allhomes'

X_train_seq, y_train_seq = create_sequences(train_sample, FEATURES, TARGET, TIMESTEPS)
X_test_seq, y_test_seq = create_sequences(test_sample, FEATURES, TARGET, TIMESTEPS)

print("Train sequences:", X_train_seq.shape)
print("Test sequences:", X_test_seq.shape)


Train sequences: (1345, 12, 21)
Test sequences: (400, 12, 21)


In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
n_features = X_train_seq.shape[2]

# Fit scaler on training data
X_train_flat = X_train_seq.reshape(-1, n_features)
scaler.fit(X_train_flat)

# Apply scaling to train and test
def scale_windows(X, scaler):
    s = X.reshape(-1, X.shape[2])
    s = scaler.transform(s)
    return s.reshape(X.shape)

X_train_scaled = scale_windows(X_train_seq, scaler)
X_test_scaled = scale_windows(X_test_seq, scaler)

print("Scaled train shape:", X_train_scaled.shape)
print("Scaled test shape:", X_test_scaled.shape)


Scaled train shape: (1345, 12, 21)
Scaled test shape: (400, 12, 21)


In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# LSTM model
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(12, 21)),
    Dropout(0.2),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 12, 128)           76800     
                                                                 
 dropout (Dropout)           (None, 12, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 128321 (501.25 KB)
Trainable params: 128

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Scale target (zhvi_allhomes)
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train_seq.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test_seq.reshape(-1, 1))

In [13]:
history = model.fit(
    X_train_scaled, y_train_scaled,
    validation_data=(X_test_scaled, y_test_scaled),
    epochs=20,
    batch_size=32,
    verbose=1
)


Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
y_pred_scaled = model.predict(X_test_scaled)
y_pred = y_scaler.inverse_transform(y_pred_scaled)
y_true = y_scaler.inverse_transform(y_test_scaled)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R¬≤ Score: {r2:.4f}")

RMSE: 7193.78
MAE: 5144.57
R¬≤ Score: 0.8933


In [15]:
# Clear memory from previous training
import gc
del X_train_seq, y_train_seq, X_test_seq, y_test_seq, X_train_scaled, X_test_scaled, model
gc.collect()

2326

In [None]:
#main training code

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os
import gc

# Optional: prettier plots
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")





In [6]:
# Load cleaned train/test data
train = pd.read_csv(r'C:\path\to\zillow\cleaned\city_time_series_train.csv', parse_dates=['date'])
test = pd.read_csv(r'C:\path\to\zillow\cleaned\city_time_series_test.csv', parse_dates=['date'])

print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (2888110, 29)
Test shape: (857820, 29)


In [7]:
target = 'zhvi_allhomes'
features = [
    'zhvi_middletier', 'zhvi_singlefamilyresidence', 'zhvi_toptier',
    'zhvipersqft_allhomes', 'zhvi_bottomtier', 'zhvi_4bedroom',
    'zhvi_3bedroom', 'zhvi_5bedroomormore', 'zhvi_2bedroom', 'zhvi_condocoop',
    'zri_allhomes', 'zri_allhomesplusmultifamily', 'zri_singlefamilyresidencerental',
    'pricetorentratio_allhomes', 'zripersqft_allhomes', 'year',
    'month', 'pctofhomesdecreasinginvalues_allhomes', 'pctofhomesincreasinginvalues_allhomes',
    'inventoryraw_allhomes', 'inventoryseasonallyadjusted_allhomes'
]

TIMESTEPS = 12


In [9]:
def sequence_generator(df, features, target, feature_scaler, target_scaler, timesteps, batch_size=64):
    regions = df['regionname'].unique()
    X_batch, y_batch = [], []
    
    while True:  # Infinite generator for model.fit
        for region in regions:
            region_df = df[df['regionname'] == region].sort_values('date')
            vals = region_df[features].values
            targets = region_df[target].values
            n = len(region_df)
            
            for i in range(n - timesteps):
                X_window = vals[i:i+timesteps]
                y_val = targets[i+timesteps]
                
                # Skip if any NaN
                if np.isnan(X_window).any() or np.isnan(y_val):
                    continue
                
                # Scale features
                X_window_scaled = feature_scaler.transform(X_window)
                
                # Scale target
                y_scaled = target_scaler.transform([[y_val]])[0]
                
                X_batch.append(X_window_scaled)
                y_batch.append(y_scaled)
                
                if len(X_batch) == batch_size:
                    yield np.array(X_batch, dtype=np.float32), np.array(y_batch, dtype=np.float32)
                    X_batch, y_batch = [], []


In [10]:
n_features = len(features)

model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(TIMESTEPS, n_features)),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 12, 128)           76800     
                                                                 
 dropout (Dropout)           (None, 12, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 128321 (501.25 KB)
Trainable params: 128

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Feature scaler
feature_scaler = StandardScaler()
feature_scaler.fit(train[features].values)

# Target scaler
target_scaler = MinMaxScaler()
target_scaler.fit(train[target].values.reshape(-1,1))

print("‚úÖ Scalers created and fitted!")


‚úÖ Scalers created and fitted!


In [12]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Define callbacks
checkpoint = ModelCheckpoint(
    filepath=r"C:\Users\HP\Desktop\Reality_AI\zillow\zillow_lstm_best.keras",
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,  # stop if no improvement for 5 epochs
    verbose=1
)

# Training parameters
steps_per_epoch = 2000      # Adjust depending on your system
validation_steps = 200
epochs = 20
batch_size = 64

# Create generators
train_gen = sequence_generator(train, features, target, feature_scaler, target_scaler, TIMESTEPS, batch_size)
val_gen   = sequence_generator(test, features, target, feature_scaler, target_scaler, TIMESTEPS, batch_size)

# Train model with callbacks
history = model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    epochs=epochs,
    callbacks=[checkpoint, early_stop],
    verbose=1
)


Epoch 1/20


Epoch 1: val_loss improved from inf to 0.00041, saving model to C:\Users\HP\Desktop\Reality_AI\zillow\zillow_lstm_best.keras
Epoch 2/20
Epoch 2: val_loss did not improve from 0.00041
Epoch 3/20
Epoch 3: val_loss improved from 0.00041 to 0.00033, saving model to C:\Users\HP\Desktop\Reality_AI\zillow\zillow_lstm_best.keras
Epoch 4/20
Epoch 4: val_loss did not improve from 0.00033
Epoch 5/20
Epoch 5: val_loss improved from 0.00033 to 0.00023, saving model to C:\Users\HP\Desktop\Reality_AI\zillow\zillow_lstm_best.keras
Epoch 6/20
Epoch 6: val_loss did not improve from 0.00023
Epoch 7/20
Epoch 7: val_loss improved from 0.00023 to 0.00013, saving model to C:\Users\HP\Desktop\Reality_AI\zillow\zillow_lstm_best.keras
Epoch 8/20
Epoch 8: val_loss did not improve from 0.00013
Epoch 9/20
Epoch 9: val_loss did not improve from 0.00013
Epoch 10/20
Epoch 10: val_loss did not improve from 0.00013
Epoch 11/20
Epoch 11: val_loss improved from 0.00013 to 0.00011, saving model to C:\Users\HP

In [13]:
from tensorflow.keras.models import load_model

model.save("zillow_lstm_best.keras", save_format='tf')

print("‚úÖ Best model loaded")

‚úÖ Best model loaded


In [34]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# üîπ Function to predict using generator in batches
def batch_predict_generator(model, generator, steps):
    y_pred_scaled = []
    y_true_scaled = []
    for i, (X_batch, y_batch) in enumerate(generator):
        if i >= steps:  # stop after 'steps' batches
            break
        y_batch_pred = model.predict(X_batch, verbose=0)
        y_pred_scaled.append(y_batch_pred)
        y_true_scaled.append(y_batch)
    return np.vstack(y_pred_scaled), np.vstack(y_true_scaled)

# üîπ Predict from your generator
y_pred_scaled, y_true_scaled = batch_predict_generator(model, val_gen, validation_steps)

# üîπ Inverse-transform
y_pred = target_scaler.inverse_transform(y_pred_scaled)
y_true = target_scaler.inverse_transform(y_true_scaled)

# üîπ Metrics
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R¬≤ Score: {r2:.4f}")


RMSE: 72190.36
MAE: 62374.30
R¬≤ Score: 0.7962


In [21]:
# Save the model for later use
model.save(r"C:\Users\HP\Desktop\Reality_AI\zillow\zillow_lstm_final.keras")
