## PyTorch training notebook:

- Loads a small sample of the transformed dataset (from `default.flights_2006_transformed` or parquet path),
- Converts to pandas and then to torch tensors,
- Trains a tiny single-layer linear model (`nn.Linear`) to predict `ArrDelay` (regression),
- Reports the per-epoch training MSE and the test RMSE.

In [1]:
import sys
# Run pip through the kernel's own interpreter so the packages are installed for that interpreter.
# NOTE(review): the index URL below is the PyTorch "cpu" wheel index, so the installed
# build is presumably CPU-only — confirm before relying on the CUDA checks further down.
!{sys.executable} -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [2]:
# Notebook execution/conversion tooling; neither package is imported by the cells visible here.
pip install --no-cache-dir nbconvert nbclient

Note: you may need to restart the kernel to use updated packages.


In [3]:
import sys, importlib
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import torch
# GPU diagnostics. NOTE(review): the install cell pulls wheels from the CPU index, so
# these are expected to report "no CUDA" unless a GPU build is installed — confirm.
print(torch.version.cuda)          # CUDA version the build was compiled against (None on CPU-only builds)
print(torch.cuda.is_available())   # True only when a CUDA build and a usable device are both present
print(torch.cuda.device_count())   # Number of GPUs visible

In [6]:
import os, io, traceback
import torch
import torch.nn as nn

# Checkpoint location used by the loaders below; relative, so it resolves against the
# kernel's current working directory.
local_path = 'models/pytorch_small_regressor.pth'

def try_load_local(path):
    """Load a checkpoint from the local filesystem onto the CPU.

    Args:
        path: Filesystem path of the checkpoint file.

    Returns:
        Whatever object was stored in the file (a plain state dict, or a dict that
        wraps one together with extra entries such as a fitted scaler).

    Raises:
        FileNotFoundError: If ``path` does not exist; the path is the exception argument.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    print('Loading from local path:', path)
    # The training cell of this notebook stores a pickled scikit-learn scaler next to
    # the weights. Since torch 2.6 `torch.load` defaults to weights_only=True, which
    # rejects such objects, so full unpickling has to be requested explicitly.
    # SECURITY: full unpickling can execute arbitrary code — only load checkpoints
    # that you produced yourself.
    return torch.load(path, map_location='cpu', weights_only=False)

def try_load_via_spark_copy(local_target=local_path, hdfs_src='/data/models/pytorch_small_regressor.pth'):
    """Copy the checkpoint from HDFS to the local disk through Spark's JVM, then load it.

    Args:
        local_target: Local path the file is copied to (and then loaded from).
        hdfs_src: Path of the checkpoint on the Hadoop filesystem.

    Returns:
        The object returned by ``try_load_local(local_target)``.

    Raises:
        Exception: Any failure (Spark unavailable, copy error, load error) is printed
            with its traceback and re-raised unchanged.
    """
    try:
        from pyspark.sql import SparkSession
        # getOrCreate() returns the already-running session when there is one. The
        # former `try: spark / except NameError` probe never saw the notebook's global:
        # assigning `spark` in this function makes the name local, so the probe always
        # raised UnboundLocalError (a NameError subclass) and fell through to here.
        spark = SparkSession.builder.enableHiveSupport().getOrCreate()
        sc = spark.sparkContext
        jconf = sc._jsc.hadoopConfiguration()
        fs_default = jconf.get('fs.defaultFS')
        if not fs_default or fs_default.startswith('file://'):
            # NOTE: this writes to the Hadoop configuration object obtained from the
            # SparkContext, not to a private copy.
            jconf.set('fs.defaultFS', 'hdfs://namenode:8020')
        fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(jconf)
        Path = sc._jvm.org.apache.hadoop.fs.Path
        src = Path(hdfs_src)
        # Resolve once: a bare file name has an empty dirname, and os.makedirs('')
        # raises FileNotFoundError; the absolute form always has a real parent.
        local_abs = os.path.abspath(local_target)
        dst = Path('file://' + local_abs)
        os.makedirs(os.path.dirname(local_abs), exist_ok=True)
        print('Copying from HDFS to local:', hdfs_src, '->', local_target)
        # First argument False = keep the source file on HDFS.
        fs.copyToLocalFile(False, src, dst)
        return try_load_local(local_target)
    except Exception:
        print('Spark-based HDFS copy failed:')
        traceback.print_exc()
        raise

def try_load_via_pyarrow(hdfs_src='/data/models/pytorch_small_regressor.pth'):
    """Read the checkpoint bytes straight from HDFS with pyarrow and load them on the CPU.

    Uses ``pyarrow.fs.HadoopFileSystem`` when available and falls back to the legacy
    ``pyarrow.hdfs.connect`` API (absent from recent pyarrow releases) otherwise.

    Args:
        hdfs_src: Path of the checkpoint on the Hadoop filesystem.

    Returns:
        The object stored in the checkpoint file.

    Raises:
        Exception: Any failure is printed with its traceback and re-raised unchanged.
    """
    try:
        import pyarrow as pa
        print('Attempting to read from HDFS via pyarrow:', hdfs_src)
        try:
            from pyarrow import fs as pafs
            hdfs = pafs.HadoopFileSystem('namenode', 8020)
        except ImportError:
            # pyarrow without the new filesystem layer / HDFS bindings.
            hdfs = None
        if hdfs is not None:
            with hdfs.open_input_stream(hdfs_src) as f:
                data_bytes = f.read()
        else:
            legacy_fs = pa.hdfs.connect('namenode', 8020)
            with legacy_fs.open(hdfs_src, 'rb') as f:
                data_bytes = f.read()
        # The whole file is buffered in memory so torch.load gets a seekable stream.
        # weights_only=False: this notebook's checkpoints may carry a pickled scaler,
        # which the torch >= 2.6 default (weights_only=True) rejects.
        # SECURITY: full unpickling can execute arbitrary code — trusted files only.
        return torch.load(io.BytesIO(data_bytes), map_location='cpu', weights_only=False)
    except Exception:
        print('pyarrow HDFS read failed:')
        traceback.print_exc()
        raise

# Loading strategies in order of preference: (label, loader, positional arguments).
# Each loader gets its own argument tuple because their first parameters differ:
# try_load_local / try_load_via_spark_copy take a LOCAL path, whereas
# try_load_via_pyarrow takes an HDFS path — passing `local_path` to it (as the old
# `fn(local_path)` call did) made it look for the local relative path on HDFS.
loader_sequence = [
    ('local', try_load_local, (local_path,)),
    ('spark_copy', try_load_via_spark_copy, (local_path,)),
    ('pyarrow', try_load_via_pyarrow, ()),  # uses its default HDFS source path
]

loaded = None
for name, fn, fn_args in loader_sequence:
    try:
        loaded = fn(*fn_args)
        print('Loaded checkpoint via:', name)
        break
    except FileNotFoundError:
        print('Not found locally for loader:', name)
        continue
    except Exception as e:
        # Best effort: report and move on to the next strategy.
        print('Loader', name, 'failed with:', type(e).__name__, e)
        continue

if loaded is None:
    raise RuntimeError('All loading strategies failed; cannot load model checkpoint.')

print('\nCheckpoint top-level keys:', list(loaded.keys()) if isinstance(loaded, dict) else 'non-dict checkpoint')

# Use the nested 'model_state_dict' entry when present; otherwise treat the
# checkpoint object itself as the state dict.
state = loaded.get('model_state_dict') if isinstance(loaded, dict) and 'model_state_dict' in loaded else loaded
if not isinstance(state, dict):
    raise RuntimeError('Checkpoint does not contain a state dict.')

# Find a weight tensor to infer the input dimension. Only 2-D tensors qualify:
# the input size is read from shape[1], which does not exist on 1-D entries.
weight_tensor = None
for k, v in state.items():
    if 'weight' in k and hasattr(v, 'shape') and len(v.shape) == 2:
        weight_tensor = v
        print('Found weight key:', k, 'shape:', tuple(v.shape))
        break

if weight_tensor is None:
    raise RuntimeError('Could not find weight tensor in state dict to infer model shape')

in_features = weight_tensor.shape[1]
print('Inferred input features =', in_features)

# Reconstruct a single Linear layer and load state (for simple models)
model = nn.Linear(in_features, 1)
model_state = {k.replace('module.', ''): v for k, v in state.items()}  # handle possible DataParallel prefix
try:
    model.load_state_dict(model_state)
    print('State dict loaded into nn.Linear')
except Exception:
    print('Full load failed; attempting to load matching keys only')
    ms = model.state_dict()
    matched = {k: v for k, v in model_state.items() if k in ms and ms[k].shape == v.shape}
    if not matched:
        # Nothing in the checkpoint fits this layer: continuing would "succeed" with
        # randomly initialised weights and make the smoke test below meaningless.
        raise RuntimeError(
            f'No checkpoint tensors match nn.Linear({in_features}, 1); '
            'refusing to continue with randomly initialised weights.'
        )
    ms.update(matched)
    model.load_state_dict(ms)
    print('Partial state loaded (matched keys).')

# Smoke test inference: push two random rows through the model to confirm that
# the forward pass runs and produces one output per row.
model.eval()
with torch.no_grad():
    import numpy as np
    x = torch.from_numpy(np.random.randn(2, in_features).astype('float32'))
    y = model(x)
    print('Smoke forward ok, output shape =', tuple(y.shape), 'sample outputs =', y.reshape(-1).tolist())

print('\nIf you want to inspect scaler in the checkpoint (if present):')
if isinstance(loaded, dict) and 'scaler' in loaded:
    print('scaler present in checkpoint (object type):', type(loaded['scaler']))

print('\nDone.')

Loading from local path: models/pytorch_small_regressor.pth
Loaded checkpoint via: local

Checkpoint top-level keys: ['weight', 'bias']
Found weight key: weight shape: (1, 6)
Inferred input features = 6
State dict loaded into nn.Linear
Smoke forward ok, output shape = (2, 1) sample outputs = [0.8670730590820312, 0.35125476121902466]

If you want to inspect scaler in the checkpoint (if present):

Done.


In [9]:
from sklearn.model_selection import train_test_split
import warnings

# Initialize or reuse existing Spark session
try:
    spark  # bare name lookup: raises NameError when no `spark` variable exists yet
except NameError:
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Configuration
# Both values are passed to load_small_sample(): the metastore table is tried first,
# the Parquet path is the fallback source.
tbl_name = 'default.flights_2006_transformed'
parquet_path = 'hdfs://namenode:8020/data/parquet/flights_2006_transformed'

def load_small_sample(spark, tbl_name, parquet_path, sample_fraction=0.0005, max_rows=3000, seed=42):
    """Return a small Spark DataFrame sample, preferring the metastore table over Parquet.

    Strategies, tried in order until one yields at least one row:
      1. sample ``sample_fraction`` of the metastore table ``tbl_name``;
      2. sample ``max(sample_fraction * 20, 0.001)`` of the Parquet data;
      3. take the first ``max_rows`` rows of the Parquet data.
    Every returned DataFrame is capped at ``max_rows`` rows.

    Args:
        spark: Session object providing ``table()`` and ``read.parquet()``.
        tbl_name: Metastore table name.
        parquet_path: Parquet location used as fallback.
        sample_fraction: Fraction passed to ``DataFrame.sample`` for the metastore table.
        max_rows: Upper bound on the number of rows in the returned DataFrame.
        seed: Sampling seed.

    Returns:
        A Spark DataFrame with between 1 and ``max_rows`` rows, or ``None`` when no
        strategy produced any row.

    Raises:
        RuntimeError: If the Parquet fallback cannot be read (original error chained).
        Exception: Errors raised while sampling/counting the metastore table propagate.
    """
    df_spark = None

    # Strategy 1: metastore table. Failing to open it is not fatal.
    try:
        df_spark = spark.table(tbl_name)
        print('Loaded table from metastore:', tbl_name)
    except Exception as e:
        warnings.warn(f'Metastore load failed: {e}; will try parquet fallback')

    if df_spark is not None:
        df_sample = df_spark.sample(withReplacement=False, fraction=sample_fraction, seed=seed)
        cnt = df_sample.count()
        print('Sample rows (from metastore sampling):', cnt)
        if cnt > 0:
            # Cap here as well: previously only the Parquet branches honoured max_rows,
            # so a large table could hand an oversized sample to toPandas().
            return df_sample.limit(max_rows)
        warnings.warn('Metastore sampling returned 0 rows; will try parquet fallback with larger fraction/limit')

    # Strategy 2 and 3 both need the Parquet data.
    try:
        df_par = spark.read.parquet(parquet_path)
        print('Loaded parquet fallback:', parquet_path)
    except Exception as e:
        raise RuntimeError(f'Failed to read parquet fallback {parquet_path}: {e}') from e

    # Strategy 2: a larger fraction than the metastore attempt, so the result is less likely to be empty.
    larger_fraction = max(sample_fraction * 20, 0.001)
    try:
        df_sample = df_par.sample(withReplacement=False, fraction=larger_fraction, seed=seed)
        cnt = df_sample.count()
        print('Sample rows (from parquet sampling, fraction=', larger_fraction, '):', cnt)
        if cnt > 0:
            return df_sample.limit(max_rows)
    except Exception as e:
        warnings.warn(f'Parquet sampling failed: {e}; will try direct limit()')

    # Strategy 3: no sampling at all, just the first max_rows rows.
    try:
        df_direct = df_par.limit(max_rows)
        cnt = df_direct.count()
        print('Sample rows (direct limit from parquet):', cnt)
        if cnt > 0:
            return df_direct
    except Exception as e:
        warnings.warn(f'Direct parquet limit() failed: {e}')

    # Nothing found
    return None


# === Main script ===
if __name__ == "__main__":
    # Pull a small Spark sample (metastore first, Parquet fallback).
    df_sample = load_small_sample(spark, tbl_name, parquet_path,
                                  sample_fraction=0.0005, max_rows=3000, seed=42)

    if df_sample is None:
        print("\nERROR: No rows returned from metastore or parquet fallback.\n")
        print("Actions to investigate:")
        print(" - Verify the Hive metastore table exists: spark.sql('SHOW TABLES IN default').show()")
        print(" - Inspect the parquet path: hadoop fs -ls hdfs://namenode:8020/data/parquet/flights_2006_transformed")
        print(" - Ensure fs.defaultFS is correctly configured for your SparkSession")
        raise RuntimeError("Could not obtain any sample rows from metastore or parquet fallback")

    # Convert to pandas safely (sample should be small)
    df_pd = df_sample.toPandas()
    print('Sample rows (pandas):', len(df_pd))

    if len(df_pd) == 0:
        raise RuntimeError('Pandas sample is empty after all fallbacks — aborting to avoid scaler errors')

    # The regression target must really be there. The previous code fabricated an
    # all-NaN 'ArrDelay' column when it was missing, which only deferred the failure.
    if 'ArrDelay' not in df_pd.columns:
        raise ValueError('ArrDelay column not found in sample')

    numeric = df_pd.select_dtypes(include=[np.number]).copy()
    if 'ArrDelay' not in numeric.columns:
        # The target exists but is not numeric (e.g. object dtype): coerce it.
        numeric['ArrDelay'] = pd.to_numeric(df_pd['ArrDelay'], errors='coerce')

    # Rows without a usable target cannot be used for training or evaluation.
    numeric = numeric.dropna(subset=['ArrDelay'])
    if len(numeric) == 0:
        raise RuntimeError('No rows with a numeric ArrDelay value in the sample.')

    # Prepare feature and target arrays: at most 8 numeric columns, taken in column order.
    # NOTE(review): missing values in the feature columns are not handled here.
    top_k = min(8, max(1, numeric.shape[1] - 1))
    feat_cols = [c for c in numeric.columns if c != 'ArrDelay'][:top_k]
    X = numeric[feat_cols].copy()
    y = numeric['ArrDelay'].values.astype(np.float32)

    if len(X) == 0 or X.shape[1] == 0:
        raise RuntimeError('No numeric features available after fallbacks. Check the transformed dataset contents.')

    # Memory guard: keep at most 3000 rows, chosen with a fixed seed.
    if len(X) > 3000:
        sampled_idx = np.random.RandomState(42).choice(len(X), size=3000, replace=False)
        X = X.iloc[sampled_idx].reset_index(drop=True)
        y = y[sampled_idx]

    print('Features shape:', X.shape)

    # Split, scale, and prepare numpy arrays. The scaler is fitted on the training
    # split only and then applied to the test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # float32 so the arrays can be handed to torch.from_numpy without a dtype mismatch.
    X_train_scaled = X_train_scaled.astype(np.float32)
    X_test_scaled = X_test_scaled.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

    print('Prepared numpy arrays:')
    print('  X_train:', X_train_scaled.shape)
    print('  y_train:', y_train.shape)

Loaded table from metastore: default.flights_2006_transformed
Sample rows (from metastore sampling): 0




Loaded parquet fallback: hdfs://namenode:8020/data/parquet/flights_2006_transformed
Sample rows (from parquet sampling, fraction= 0.01 ): 70163
Sample rows (pandas): 3000
Features shape: (3000, 8)
Prepared numpy arrays:
  X_train: (2400, 8)
  y_train: (2400,)


In [10]:
from sklearn.metrics import mean_squared_error, r2_score

# -------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------
# Checkpoint file read by load_checkpoint() below (relative to the working directory).
local_ckpt = 'models/pytorch_small_regressor.pth'
# Data sources handed to get_small_sample(): metastore table first, Parquet as fallback.
tbl_name = 'default.flights_2006_transformed'
parquet_path = 'hdfs://namenode:8020/data/parquet/flights_2006_transformed'


# -------------------------------------------------------------------
# Utility functions
# -------------------------------------------------------------------
def load_checkpoint(path):
    """Load a PyTorch checkpoint file onto the CPU.

    Args:
        path: Filesystem path of the checkpoint.

    Returns:
        The object stored in the file (a state dict, or a dict wrapping one together
        with extras such as a fitted scaler).

    Raises:
        FileNotFoundError: If ``path`` does not exist; the path is the exception argument.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    print('Loading checkpoint from', path)
    # The training cell stores a pickled scikit-learn scaler alongside the weights;
    # torch >= 2.6 defaults to weights_only=True and refuses such files, so full
    # unpickling is requested explicitly.
    # SECURITY: full unpickling can execute arbitrary code — trusted files only.
    return torch.load(path, map_location='cpu', weights_only=False)


def get_small_sample(spark, tbl_name, parquet_path, sample_fraction=0.0005, max_rows=3000, seed=42):
    """Return a small Spark DataFrame sample: metastore table first, Parquet as fallback.

    Strategies, tried in order until one yields at least one row:
      1. sample ``sample_fraction`` of the metastore table (any error here is
         printed and treated as "try the next strategy");
      2. sample ``max(sample_fraction * 20, 0.001)`` of the Parquet data;
      3. take the first ``max_rows`` rows of the Parquet data.
    Every returned DataFrame is capped at ``max_rows`` rows.

    Args:
        spark: Session object providing ``table()`` and ``read.parquet()``.
        tbl_name: Metastore table name.
        parquet_path: Parquet location used as fallback.
        sample_fraction: Fraction used for the metastore sample.
        max_rows: Upper bound on the number of rows returned.
        seed: Sampling seed.

    Returns:
        A Spark DataFrame, or ``None`` when every strategy came back empty.

    Raises:
        Exception: Errors from reading the Parquet path, or from the final
            ``limit()``/``count()``, propagate to the caller.
    """
    # Strategy 1: metastore
    try:
        df_spark = spark.table(tbl_name)
        df_sample = df_spark.sample(withReplacement=False, fraction=sample_fraction, seed=seed)
        if df_sample.count() > 0:
            print('Loaded sample from metastore (sampled)')
            # Cap here too: previously only the Parquet branches honoured max_rows.
            return df_sample.limit(max_rows)
        print('Metastore sampling returned 0 rows; trying parquet fallback')
    except Exception as e:
        print('Metastore read failed, will try parquet fallback:', e)

    # Strategy 2: Parquet with a larger fraction, so the result is less likely to be empty
    df_par = spark.read.parquet(parquet_path)
    try:
        larger_fraction = max(sample_fraction * 20, 0.001)
        df_sample = df_par.sample(withReplacement=False, fraction=larger_fraction, seed=seed)
        if df_sample.count() > 0:
            print('Loaded sample from parquet (sampled)')
            return df_sample.limit(max_rows)
    except Exception as e:
        print('Parquet sampling failed or empty, will try direct limit:', e)

    # Strategy 3: last resort, first max_rows rows without sampling
    df_direct = df_par.limit(max_rows)
    if df_direct.count() > 0:
        print('Loaded sample via direct parquet.limit()')
        return df_direct

    return None


# -------------------------------------------------------------------
# Initialize Spark
# -------------------------------------------------------------------
try:
    spark  # bare name lookup: raises NameError when no session variable exists yet
except NameError:
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()


# -------------------------------------------------------------------
# 1. Load checkpoint
# -------------------------------------------------------------------
try:
    ckpt = load_checkpoint(local_ckpt)
except FileNotFoundError as exc:
    raise RuntimeError(f'Local checkpoint not found at {local_ckpt}. '
                       'Copy it from HDFS first or run the training step.') from exc

# Extract state dict: nested under 'model_state_dict' when present, otherwise the
# checkpoint object itself.
state = ckpt.get('model_state_dict') if isinstance(ckpt, dict) and 'model_state_dict' in ckpt else ckpt
if not isinstance(state, dict):
    raise RuntimeError('Checkpoint did not contain a state dict')

# Strip a possible DataParallel 'module.' prefix, as the checkpoint-inspection cell
# does, so that such checkpoints load here too.
state = {k.replace('module.', ''): v for k, v in state.items()}

# Infer input dimension from the first 2-D weight tensor (shape[1] is read below,
# which does not exist on 1-D entries).
weight_tensor = None
for k, v in state.items():
    if 'weight' in k and hasattr(v, 'shape') and len(v.shape) == 2:
        weight_tensor = v
        break
if weight_tensor is None:
    raise RuntimeError('Could not find weight tensor in checkpoint state dict')
in_features = weight_tensor.shape[1]
print('Inferred model input features =', in_features)

# Build model
model = nn.Linear(in_features, 1)
try:
    model.load_state_dict(state)
    print('Loaded full state dict into model')
except Exception:
    ms = model.state_dict()
    matched = {k: v for k, v in state.items() if k in ms and ms[k].shape == v.shape}
    if not matched:
        # Without a single matching tensor the model would keep its random
        # initialisation and every metric computed below would be meaningless.
        raise RuntimeError(
            f'No checkpoint tensors match nn.Linear({in_features}, 1); '
            'refusing to evaluate a randomly initialised model.'
        )
    ms.update(matched)
    model.load_state_dict(ms)
    print('Loaded partial/matched state into model')
model.eval()


# -------------------------------------------------------------------
# 2. Load sample data
# -------------------------------------------------------------------
df_sample = get_small_sample(spark, tbl_name, parquet_path, sample_fraction=0.0005, max_rows=3000, seed=42)
if df_sample is None:
    raise RuntimeError('No rows found in metastore or parquet fallback — cannot compute metrics')

df_pd = df_sample.toPandas()
print('Pandas sample rows:', len(df_pd))


# -------------------------------------------------------------------
# 3. Prepare numeric features and target
# -------------------------------------------------------------------
# The target must really exist. The previous code fabricated an all-NaN 'ArrDelay'
# column when it was missing, which only surfaced later as a NaN error in the metrics.
if 'ArrDelay' not in df_pd.columns:
    raise ValueError('ArrDelay column not found in sample')

numeric = df_pd.select_dtypes(include=[np.number]).copy()
if 'ArrDelay' not in numeric.columns:
    # Target present but not numeric (e.g. object dtype): coerce it.
    numeric['ArrDelay'] = pd.to_numeric(df_pd['ArrDelay'], errors='coerce')

# Rows without a target value cannot be scored.
numeric = numeric.dropna(subset=['ArrDelay'])

# NOTE(review): features are picked by column position (the first `in_features`
# numeric columns). Nothing here verifies that these are the columns the
# checkpoint was trained on — confirm against the training run.
feat_cols = [c for c in numeric.columns if c != 'ArrDelay'][:in_features]
if len(feat_cols) < in_features:
    raise RuntimeError(f'Not enough numeric features ({len(feat_cols)}) to match model input ({in_features})')

X = numeric[feat_cols].astype(np.float32).reset_index(drop=True)
y = numeric['ArrDelay'].values.astype(np.float32)

if len(X) == 0:
    raise RuntimeError('Pandas sample is empty after robust loading — cannot compute metrics')


# -------------------------------------------------------------------
# 4. Split and scale data
# -------------------------------------------------------------------
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Use the scaler stored with the checkpoint when there is one; otherwise fit a
# StandardScaler on the training split only.
if not (isinstance(ckpt, dict) and 'scaler' in ckpt):
    print('No scaler in checkpoint; fitting StandardScaler on training split')
    scaler = StandardScaler()
    scaler.fit(X_train.values)
else:
    scaler = ckpt['scaler']
    print('Using scaler loaded from checkpoint (type:', type(scaler), ')')

X_train_scaled = scaler.transform(X_train.values).astype(np.float32)
X_test_scaled = scaler.transform(X_test.values).astype(np.float32)


# -------------------------------------------------------------------
# 5. Run inference and compute metrics
# -------------------------------------------------------------------
# Gradients are not needed for scoring; flatten the (n, 1) output to shape (n,).
with torch.no_grad():
    Xt = torch.from_numpy(X_test_scaled)
    preds = model(Xt).cpu().numpy().reshape(-1)

rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
r2 = float(r2_score(y_test, preds))

print(
    '\nPyTorch baseline metrics:',
    f' RMSE = {rmse:.4f}',
    f' R2   = {r2:.4f}',
    sep='\n',
)


# -------------------------------------------------------------------
# 6. Compare with previous PySpark baselines
# -------------------------------------------------------------------
# Reference metrics of the earlier PySpark models; hard-coded here, not recomputed.
prev_lr_rmse, prev_lr_r2 = 11.2483, 0.9056
prev_rf_rmse, prev_rf_r2 = 17.0213, 0.7837

print(
    '\nPrevious PySpark baselines:',
    f' LinearRegression RMSE={prev_lr_rmse:.4f}, R2={prev_lr_r2:.4f}',
    f' RandomForest     RMSE={prev_rf_rmse:.4f}, R2={prev_rf_r2:.4f}',
    sep='\n',
)

print('\nComparison: lower RMSE and higher R² indicate better performance.')


# -------------------------------------------------------------------
# 7. Show a few prediction samples
# -------------------------------------------------------------------
# Actual target and model prediction side by side for the test split.
res_df = pd.DataFrame({'ArrDelay': y_test.flatten(), 'prediction': preds})
print('\nSample predictions:')
print(res_df.head(10).to_string(index=False))

print('\nDone — use these metrics to compare with your earlier models.')

Loading checkpoint from models/pytorch_small_regressor.pth
Inferred model input features = 6
Loaded full state dict into model
Metastore sampling returned 0 rows; trying parquet fallback
Loaded sample from parquet (sampled)
Pandas sample rows: 3000
No scaler in checkpoint; fitting StandardScaler on training split

PyTorch baseline metrics:
 RMSE = 31.4780
 R2   = -0.0723

Previous PySpark baselines:
 LinearRegression RMSE=11.2483, R2=0.9056
 RandomForest     RMSE=17.0213, R2=0.7837

Comparison: lower RMSE and higher R² indicate better performance.

Sample predictions:
 ArrDelay  prediction
     -4.0    0.060429
    -22.0    0.743498
      2.0    0.907675
     47.0    0.745521
     -3.0    0.381357
     73.0    0.833490
    -15.0    0.293262
     -4.0   -0.274133
     87.0   -0.096472
    -26.0    0.011505

Done — use these metrics to compare with your earlier models.


## 3 Models Performance Findings:

Among the three models tested, Linear Regression performed best with the lowest RMSE (11.25) and highest R² (0.91), outperforming both the Random Forest and PyTorch models in predicting flight delays.

**Reasons:**
1. Data is mostly linear in nature – The relationship between predictors (like departure time, day, and weather) and arrival delay might be largely linear, making Linear Regression a natural fit.

2. Feature scaling and preprocessing – Since Linear Regression is sensitive to feature scaling (and you standardized the data), it likely benefited more from clean, normalized inputs than the Random Forest or PyTorch models.

3. Limited feature interactions or complexity – If the dataset doesn’t have strong nonlinear relationships or high-order interactions, more complex models (like Random Forest or neural nets) can overfit or perform worse.

4. Model simplicity and interpretability – Linear Regression has fewer parameters and assumptions, so it generalizes better when the dataset is moderate in size or not highly variable.

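5. Training configuration differences – The PyTorch model is almost certainly undertrained: the training cell runs a single epoch of Adam at a learning rate of 1e-3, and the evaluation above shows predictions that stay close to zero with a negative R² (−0.07). The evaluation cell also fitted a fresh `StandardScaler` because the checkpoint contained none, so the inputs may not have been scaled the way they were during training. The Random Forest, for its part, might not have been optimally tuned for depth or number of trees.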

## 2. Load a small sample and prepare tensors
We attempt to load `default.flights_2006_transformed` from the Hive metastore first,
then fall back to the Parquet path. We keep only numeric columns (except the target `ArrDelay`)
and convert a small sample to pandas for training a tiny PyTorch model.

In [None]:
from pyspark.sql import SparkSession
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# reuse existing SparkSession if available
try:
    spark  # bare name lookup: raises NameError when no session variable exists yet
except NameError:
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Try metastore table first, then parquet fallback
tbl_name = 'default.flights_2006_transformed'
parquet_path = 'hdfs://namenode:8020/data/parquet/flights_2006_transformed'

try:
    df_spark = spark.table(tbl_name)
    print('Loaded from metastore:', tbl_name)
except Exception as e:
    print('Metastore load failed, trying parquet fallback:', e)
    df_spark = spark.read.parquet(parquet_path)
    print('Loaded from parquet:', parquet_path)

# take a very small sample that fits in memory (prefer speed over accuracy)
sample_fraction = 0.0005  # very small sample fraction for fastest runs
df_sample = df_spark.sample(withReplacement=False, fraction=sample_fraction, seed=42)
n_sampled = df_sample.count()
print('Sample rows (spark):', n_sampled)

# A fraction this small can legitimately come back empty (or the metastore table
# itself may hold no rows). Retry once against the Parquet data with a larger
# fraction, capped at 3000 rows so the pandas conversion below stays small.
if n_sampled == 0:
    fallback_fraction = max(sample_fraction * 20, 0.001)
    print('Sample is empty; retrying from parquet with fraction', fallback_fraction)
    df_sample = (
        spark.read.parquet(parquet_path)
        .sample(withReplacement=False, fraction=fallback_fraction, seed=42)
        .limit(3000)
    )

# convert to pandas (careful: keep sample small)
df_pd = df_sample.toPandas()
print('Sample rows (pandas):', len(df_pd))

if len(df_pd) == 0:
    raise RuntimeError('Sample is empty for both the metastore table and the parquet fallback.')

# Ensure target exists
if 'ArrDelay' not in df_pd.columns:
    raise ValueError('ArrDelay column not found in sample')

# Select numeric features and drop target from features
numeric = df_pd.select_dtypes(include=[np.number]).copy()
if 'ArrDelay' not in numeric.columns:
    # sometimes ArrDelay is object; cast if possible
    numeric['ArrDelay'] = pd.to_numeric(df_pd['ArrDelay'], errors='coerce')

# Drop rows with NA in ArrDelay
numeric = numeric.dropna(subset=['ArrDelay'])
if len(numeric) == 0:
    raise RuntimeError('No rows with a numeric ArrDelay value in the sample.')

# Reduce the feature set to a small number of numeric features to save memory/time
top_k = min(8, max(1, numeric.shape[1]-1))  # pick up to 8 features (excluding target)
feat_cols = [c for c in numeric.columns if c != 'ArrDelay'][:top_k]
X = numeric[feat_cols].copy()
y = numeric['ArrDelay'].values.astype(np.float32)

# If the pandas sample is still large, downsample to at most 3000 rows for memory safety
if len(X) > 3000:
    sampled_idx = np.random.RandomState(42).choice(len(X), size=3000, replace=False)
    X = X.iloc[sampled_idx].reset_index(drop=True)
    y = y[sampled_idx]

# If there are zero features (unlikely), raise with helpful message
if X.shape[1] == 0:
    raise ValueError('No numeric features available after dropping ArrDelay. Check transformed dataset.')

print('Features shape:', X.shape)

# simple train/test split and scaling; the scaler is fitted on the training split only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# keep these for the PyTorch cells below (float32 to match torch's default dtype)
X_train_scaled = X_train_scaled.astype(np.float32)
X_test_scaled = X_test_scaled.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

print('Prepared numpy arrays: X_train', X_train_scaled.shape, 'y_train', y_train.shape)

In [None]:
# Small PyTorch model training (CPU)
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# convert numpy arrays to tensors
# Seed torch's global RNG before the model and the DataLoader are created, so that
# the weight initialisation and the shuffling order are the same on every run.
torch.manual_seed(42)

Xtr = torch.from_numpy(X_train_scaled)
Xte = torch.from_numpy(X_test_scaled)
# unsqueeze(1): targets become (n, 1) to match the (n, 1) output of the linear layer
ytr = torch.from_numpy(y_train).unsqueeze(1)
yte = torch.from_numpy(y_test).unsqueeze(1)

# smaller batch size for faster iterations in constrained environments
batch_size = 32
train_ds = TensorDataset(Xtr, ytr)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

n_features = Xtr.shape[1]
# Very simple single-layer linear model (one linear layer)
model = nn.Linear(n_features, 1)

device = torch.device('cpu')
model.to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# minimal epochs to shorten run time for a quick baseline
# NOTE(review): a single epoch at lr=1e-3 is a speed trade-off; the resulting model
# is likely far from converged — raise `epochs` for a meaningful comparison.
epochs = 1
for ep in range(1, epochs+1):
    model.train()
    running = 0.0  # sum of per-sample squared errors seen this epoch
    count = 0      # number of samples seen this epoch
    for xb, yb in train_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        opt.zero_grad()
        loss.backward()
        opt.step()
        # loss is the batch mean; weight it by batch size so the epoch average is per-sample
        running += loss.item() * xb.size(0)
        count += xb.size(0)
    avg_loss = running / max(1, count)
    print(f'Epoch {ep}/{epochs}  train_mse={avg_loss:.4f}')

In [None]:
# Evaluate on test set and save model locally
import numpy as np
from sklearn.metrics import mean_squared_error

# Score the held-out split without tracking gradients; flatten both sides to 1-D.
model.eval()
with torch.no_grad():
    preds = model(Xte.to(device)).cpu().numpy().reshape(-1)
    y_true = y_test.reshape(-1)

rmse = np.sqrt(mean_squared_error(y_true, preds))
print('Test RMSE:', rmse)

# Persist the weights together with the fitted scaler in one local file (adjust the
# path as needed). To persist to HDFS, copy this file with hadoop/fs commands or
# Spark write APIs.
import os
os.makedirs('models', exist_ok=True)
model_path = os.path.join('models', 'pytorch_small_regressor.pth')
torch.save(
    {
        'model_state_dict': model.state_dict(),
        'scaler': scaler,
    },
    model_path,
)
print('Saved model to', model_path)