In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib


In [2]:

# ============================================================
# 0. PATHS
# ============================================================
def _existing_or_fallback(mounted_dir, relative_dir):
    """Return `mounted_dir` if that directory exists, otherwise `relative_dir`."""
    return mounted_dir if os.path.isdir(mounted_dir) else relative_dir


# Base locations: the absolute directory wins when present, otherwise the
# sibling directory relative to the working directory is used.
DATA_BASE = _existing_or_fallback("/data", "../data")
PROCESSED_DIR = os.path.join(DATA_BASE, "processed")
MODELS_DIR = _existing_or_fallback("/models", "../models")
OUTPUT_DIR = _existing_or_fallback("/output", "../output")

# Make sure every directory written to later actually exists.
for required_dir in (PROCESSED_DIR, MODELS_DIR, OUTPUT_DIR):
    os.makedirs(required_dir, exist_ok=True)

# Input: use the full engineered-features file when it is on disk, else the sample file.
full_features_path = os.path.join(PROCESSED_DIR, "engineered_features_2023.parquet")
sample_features_path = os.path.join(PROCESSED_DIR, "engineered_features_2023_sample.parquet")
engineered_path = full_features_path if os.path.exists(full_features_path) else sample_features_path

print("Engineered features input:", engineered_path)

# Outputs: prepared feature/target tables.
X_train_path = os.path.join(PROCESSED_DIR, "X_train.parquet")
X_test_path = os.path.join(PROCESSED_DIR, "X_test.parquet")
y_train_path = os.path.join(PROCESSED_DIR, "y_train.parquet")
y_test_path = os.path.join(PROCESSED_DIR, "y_test.parquet")

# Outputs: fitted preprocessing objects.
scaler_path = os.path.join(MODELS_DIR, "scaler.pkl")
encoder_path = os.path.join(MODELS_DIR, "encoder.pkl")

# Outputs: diagnostics.
corr_matrix_path = os.path.join(OUTPUT_DIR, "correlation_matrix.csv")
split_indices_path = os.path.join(PROCESSED_DIR, "train_test_split_indices.csv")

print("Models dir:", MODELS_DIR)
print("Output dir:", OUTPUT_DIR)

# ============================================================
# 1. LOAD ENGINEERED DATA
# ============================================================
df = pd.read_parquet(engineered_path)
print("\nLoaded engineered dataset:", df.shape)

# The chronological split further down orders rows by pickup_date, so the
# column must exist and hold real datetimes.
has_pickup_date = "pickup_date" in df.columns
if has_pickup_date:
    df["pickup_date"] = pd.to_datetime(df["pickup_date"])
else:
    raise KeyError("pickup_date not found in engineered features; needed for time-based split.")

# ============================================================
# 2. TIME-BASED TRAIN / TEST SPLIT (80/20)
# ============================================================
TRAIN_FRACTION = 0.8  # earliest 80% of rows -> train, remaining 20% -> test

# A stable sort keeps rows that share a pickup_date in their input order, so
# the position of the cut does not depend on how the sort orders ties.
# The pre-sort row labels are captured before reset_index(drop=True) discards
# them; they are what lets the saved split be mapped back to the source rows.
df = df.sort_values("pickup_date", kind="mergesort")
original_row_labels = df.index.to_numpy()
df = df.reset_index(drop=True)

n_samples = len(df)
split_idx = int(TRAIN_FRACTION * n_samples)
if split_idx == 0:
    raise ValueError(
        f"Cannot build an 80/20 time-based split from {n_samples} row(s): "
        "the training partition would be empty."
    )

# NOTE: the cut is made by row position, so rows from the pickup_date at the
# boundary can end up on both sides of the split.
train_df = df.iloc[:split_idx].copy()
test_df  = df.iloc[split_idx:].copy()

print(f"\nTrain size: {train_df.shape}, Test size: {test_df.shape}")

# Save split indices (for reproducibility / reference).
#   index          - row position in the date-sorted frame
#   set            - "train" or "test"
#   original_index - row label the record had before sorting
split_indices = pd.DataFrame({
    "index": np.arange(n_samples),
    "set": ["train"] * split_idx + ["test"] * (n_samples - split_idx),
    "original_index": original_row_labels,
})
split_indices.to_csv(split_indices_path, index=False)
print("Split indices saved to:", split_indices_path)

# Drop datetime columns from features. The generic "datetime"/"datetimetz"
# selectors match any resolution and any timezone, not only [ns] / UTC.
datetime_cols = train_df.select_dtypes(include=["datetime", "datetimetz"]).columns.tolist()
print("Dropping datetime columns from features:", datetime_cols)

train_df = train_df.drop(columns=datetime_cols)
test_df  = test_df.drop(columns=datetime_cols)

# ============================================================
# 3. HANDLE MISSING VALUES
# ============================================================
target_col = "fare_amount"
if target_col not in df.columns:
    raise KeyError("Target column 'fare_amount' not found in engineered dataset.")

# Rows without a target value are removed instead of imputed: filling the
# label with a median would invent training/evaluation targets.
# Row labels are left untouched so they still line up with the positions
# recorded in the split-indices file.
missing_target_train = train_df[target_col].isna()
missing_target_test = test_df[target_col].isna()
if missing_target_train.any():
    train_df = train_df.loc[~missing_target_train].copy()
if missing_target_test.any():
    test_df = test_df.loc[~missing_target_test].copy()
if missing_target_train.any() or missing_target_test.any():
    print(
        f"\nDropped rows with missing {target_col}: "
        f"train={int(missing_target_train.sum())}, test={int(missing_target_test.sum())}"
    )

# numeric_cols still contains the target (it is numeric); it is only skipped
# in the imputation loop below.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = train_df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("\nNumeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))

for col in numeric_cols:
    if col == target_col:
        continue  # never impute the label

    # Median from TRAINING DATA ONLY to avoid leakage. Only computed when at
    # least one value is present; an all-missing column falls back to 0.0.
    if train_df[col].notna().any():
        median_val = train_df[col].median()
    else:
        median_val = 0.0

    train_df[col] = train_df[col].fillna(median_val)
    test_df[col]  = test_df[col].fillna(median_val)


for col in categorical_cols:
    # Mode from TRAINING DATA ONLY to avoid leakage.
    mode_train = train_df[col].mode(dropna=True)
    # If training data has no valid category at all, use a generic "missing" label.
    mode_val = mode_train.iloc[0] if len(mode_train) > 0 else "missing"

    for frame in (train_df, test_df):
        # A categorical column can only be filled with a declared category,
        # so register the fill value first when it is not one yet.
        if isinstance(frame[col].dtype, pd.CategoricalDtype) and mode_val not in frame[col].cat.categories:
            frame[col] = frame[col].cat.add_categories([mode_val])
        frame[col] = frame[col].fillna(mode_val)


# Final guard: nothing in the numeric/categorical columns may still be missing.
remaining_train_nans = train_df[numeric_cols + categorical_cols].isna().sum()
remaining_test_nans  = test_df[numeric_cols + categorical_cols].isna().sum()

if remaining_train_nans.sum() != 0 or remaining_test_nans.sum() != 0:
    print("\n Still found NaNs after imputation:")
    print("Train NaNs:\n", remaining_train_nans[remaining_train_nans > 0])
    print("Test NaNs:\n", remaining_test_nans[remaining_test_nans > 0])
    raise AssertionError("Train/test still have NaNs after robust imputation!")

# ============================================================
# 4. TARGET VARIABLE & REMOVE FROM FEATURES
# ============================================================
# Pull the label out of each partition, then strip it from the feature frames.
y_train, y_test = train_df[target_col].copy(), test_df[target_col].copy()
train_df, test_df = (
    frame.drop(columns=[target_col]) for frame in (train_df, test_df)
)

# Feature name lists: every numeric column except the label, and an
# independent copy of the categorical column list.
numeric_features = [name for name in numeric_cols if name != target_col]
categorical_features = list(categorical_cols)

print("\n Initial numeric features:", len(numeric_features))
print("Initial categorical features:", len(categorical_features))

# ============================================================
# 4b. LIMIT ONE-HOT ENCODING TO LOW-CARDINALITY CATEGORICALS
# ============================================================
# Number of distinct values per categorical column, measured on train only.
print("\nCategorical cardinalities:")
cardinalities = {c: train_df[c].nunique() for c in categorical_features}
for c, nuniq in cardinalities.items():
    print(f"  {c}: {nuniq}")

# Columns with more distinct values than this are not one-hot encoded.
LOW_CARD_THRESHOLD = 50

categorical_low_card = [
    c for c in categorical_features
    if cardinalities[c] <= LOW_CARD_THRESHOLD
]

categorical_high_card = [
    c for c in categorical_features
    if c not in categorical_low_card
]

print("\nWill ONE-HOT encode these (low-card):", categorical_low_card)
print("Will NOT one-hot these (high-card, excluded from X):", categorical_high_card)

# Columns removed from the frames outright, whatever their dtype.
cols_to_drop_completely = ["zone_pair_encoded"]
drop_now = [c for c in cols_to_drop_completely if c in train_df.columns]

if drop_now:
    print("Dropping extremely high-card columns:", drop_now)
    train_df = train_df.drop(columns=drop_now)
    test_df  = test_df.drop(columns=drop_now)
    # Purge the dropped names from EVERY feature list. If one of them is stored
    # with a numeric dtype it sits in numeric_features, and selecting
    # train_df[numeric_features] afterwards would raise KeyError.
    numeric_features = [c for c in numeric_features if c not in drop_now]
    categorical_low_card = [c for c in categorical_low_card if c not in drop_now]
    categorical_high_card = [c for c in categorical_high_card if c not in drop_now]

print("\nFinal low-card categoricals to one-hot:", categorical_low_card)
print("High-card categoricals (ignored in model features):", categorical_high_card)

# ============================================================
# 5. FEATURE SCALING (StandardScaler on numeric features)
# ============================================================
# Scaling statistics come from the training partition only; the same fitted
# scaler is then applied to both partitions.
scaler = StandardScaler().fit(train_df[numeric_features])

X_train_num, X_test_num = (
    scaler.transform(frame[numeric_features]) for frame in (train_df, test_df)
)

# Persist the fitted scaler next to the other model artifacts.
joblib.dump(scaler, scaler_path)
print("Scaler saved to:", scaler_path)

# ============================================================
# 6. CATEGORICAL ENCODING (OneHotEncoder, first level dropped)
#    NOTE: sklearn>=1.2 uses sparse_output instead of sparse
# ============================================================
# NOTE(review): with drop="first" plus handle_unknown="ignore", a category not
# seen during fit is presumably encoded as all zeros, i.e. the same as the
# dropped first level - confirm against the installed sklearn version.
encoder = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
)

if not categorical_low_card:
    # Nothing to encode: zero-width blocks keep the later hstack valid.
    # The encoder stays unfitted in this branch (and is still saved below).
    X_train_cat = np.empty((len(train_df), 0))
    X_test_cat  = np.empty((len(test_df), 0))
    encoded_cat_feature_names = np.array([])
else:
    # Categories are learned from the training partition only.
    encoder.fit(train_df[categorical_low_card])
    X_train_cat, X_test_cat = (
        encoder.transform(frame[categorical_low_card]) for frame in (train_df, test_df)
    )
    encoded_cat_feature_names = encoder.get_feature_names_out(categorical_low_card)

# Persist the encoder next to the scaler.
joblib.dump(encoder, encoder_path)
print("Encoder saved to:", encoder_path)

# ============================================================
# 7. COMBINE NUMERIC + CATEGORICAL INTO FINAL MATRICES
# ============================================================
# Column order: scaled numeric block first, one-hot block second.
feature_names = list(numeric_features) + encoded_cat_feature_names.tolist()

X_train = np.concatenate((X_train_num, X_train_cat), axis=1)
X_test  = np.concatenate((X_test_num, X_test_cat), axis=1)

# Wrap the raw arrays so the column names travel with the data.
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=feature_names) for matrix in (X_train, X_test)
)

# ============================================================
# 8. VALIDATION: SHAPES, NANs, INFs
# ============================================================
# Report the shape of every prepared table.
shape_report = (
    ("\nX_train shape:", X_train_df),
    ("X_test shape :", X_test_df),
    ("y_train shape:", y_train),
    ("y_test shape :", y_test),
)
for shape_label, table in shape_report:
    print(shape_label, table.shape)

# Every feature value must be a finite number (no NaN, no +/-inf).
finite_checks = (
    (X_train_df, "X_train contains NaN or inf!"),
    (X_test_df, "X_test contains NaN or inf!"),
)
for feature_table, failure_message in finite_checks:
    assert np.isfinite(feature_table.to_numpy()).all(), failure_message

# Quick look at dtypes and summary statistics of the first few features.
print("\nFeature dtypes:")
print(X_train_df.dtypes.head())
summary_report = (
    ("\nTrain feature summary (head):", X_train_df),
    ("\nTest feature summary (head):", X_test_df),
)
for summary_label, feature_table in summary_report:
    print(summary_label)
    print(feature_table.describe().T.head())

# ============================================================
# 9. SAVE PREPARED DATASETS
# ============================================================
# (label for the log, table to write, destination). The targets are Series,
# so they are turned into one-column frames named after the target first.
prepared_outputs = (
    ("X_train ->", X_train_df, X_train_path),
    ("X_test  ->", X_test_df, X_test_path),
    ("y_train ->", y_train.to_frame(name=target_col), y_train_path),
    ("y_test  ->", y_test.to_frame(name=target_col), y_test_path),
)

# Write everything first, without the index, then list what was written.
for output in prepared_outputs:
    output[1].to_parquet(output[2], index=False)

print("\nSaved:")
for output in prepared_outputs:
    print(output[0], output[2])


# ============================================================
# 10. TARGET DISTRIBUTION & HISTOGRAMS
# ============================================================
# One entry per partition: values, stats heading, plot title, output file name.
target_partitions = (
    {
        "values": y_train,
        "stats_heading": "\nTarget (fare_amount) stats – train:",
        "title": "Fare Amount – Train",
        "filename": "target_hist_train.png",
    },
    {
        "values": y_test,
        "stats_heading": "\nTarget (fare_amount) stats – test:",
        "title": "Fare Amount – Test",
        "filename": "target_hist_test.png",
    },
)

# Summary statistics for both partitions.
for partition in target_partitions:
    print(partition["stats_heading"])
    print(partition["values"].describe())

# One histogram per partition, written to OUTPUT_DIR; each figure is closed
# right after saving.
for partition in target_partitions:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(partition["values"], bins=80, alpha=0.7)
    ax.set_title(partition["title"])
    ax.set_xlabel("fare_amount")
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, partition["filename"]))
    plt.close(fig)

print("Target histograms saved to:", OUTPUT_DIR)

# ============================================================
# 11. CORRELATION ANALYSIS (on numeric features, train only)
# ============================================================
# Correlations are computed on the scaled numeric training block.
train_numeric_only = pd.DataFrame(X_train_num, columns=numeric_features)
corr_matrix = train_numeric_only.corr()
corr_matrix.to_csv(corr_matrix_path)
print("Correlation matrix saved to:", corr_matrix_path)

# Visit each unordered feature pair once: the strict upper triangle of the
# matrix, row by row. Pairs with |corr| above the threshold are collected.
threshold = 0.9
corr_values = corr_matrix.to_numpy()
upper_rows, upper_cols = np.triu_indices(len(numeric_features), k=1)
high_corr_pairs = [
    (numeric_features[row], numeric_features[col], corr_values[row, col])
    for row, col in zip(upper_rows, upper_cols)
    if abs(corr_values[row, col]) > threshold
]

print("\nHighly correlated numeric feature pairs (|corr| > 0.9):")
for left_name, right_name, pair_corr in high_corr_pairs:
    print(f"{left_name} - {right_name}: corr={pair_corr:.3f}")

# ============================================================
# 12. FINAL SUMMARY
# ============================================================
total_rows, train_rows, test_rows = len(df), len(y_train), len(y_test)

print("\n=== FINAL DATA PREP SUMMARY ===")
print(f"Total samples: {total_rows}")
print(f"Train samples: {train_rows}")
print(f"Test samples : {test_rows}")
print(f"Train/Test ratio: {train_rows / total_rows:.3f} / {test_rows / total_rows:.3f}")
print(f"Total features after encoding: {X_train_df.shape[1]}")

# Where every artifact produced by this notebook was written.
artifact_locations = (
    ("X_train:", X_train_path),
    ("X_test :", X_test_path),
    ("y_train:", y_train_path),
    ("y_test :", y_test_path),
    ("Scaler :", scaler_path),
    ("Encoder:", encoder_path),
    ("Correlation matrix:", corr_matrix_path),
)
print("\nPrepared data locations:")
for artifact_label, artifact_path in artifact_locations:
    print(artifact_label, artifact_path)

print("\n Data is ready for modeling.")


Engineered features input: ../data/processed/engineered_features_2023_sample.parquet
Models dir: ../models
Output dir: ../output

Loaded engineered dataset: (1960167, 54)

Train size: (1568133, 54), Test size: (392034, 54)
Split indices saved to: ../data/processed/train_test_split_indices.csv
Dropping datetime columns from features: ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'pickup_date']

Numeric columns: 41
Categorical columns: 10


  return np.nanmean(a, axis, out=out, keepdims=keepdims)



 Initial numeric features: 40
Initial categorical features: 10

Categorical cardinalities:
  store_and_fwd_flag: 2
  pickup_borough: 6
  pickup_zone: 246
  temp_category: 1
  dropoff_borough: 6
  dropoff_zone: 257
  zone_pair_encoded: 16955
  traffic_volume_category: 2
  time_of_day_factor: 4
  day_type: 3

Will ONE-HOT encode these (low-card): ['store_and_fwd_flag', 'pickup_borough', 'temp_category', 'dropoff_borough', 'traffic_volume_category', 'time_of_day_factor', 'day_type']
Will NOT one-hot these (high-card, excluded from X): ['pickup_zone', 'dropoff_zone', 'zone_pair_encoded']
Dropping extremely high-card columns: ['zone_pair_encoded']

Final low-card categoricals to one-hot: ['store_and_fwd_flag', 'pickup_borough', 'temp_category', 'dropoff_borough', 'traffic_volume_category', 'time_of_day_factor', 'day_type']
High-card categoricals (ignored in model features): ['pickup_zone', 'dropoff_zone']
Scaler saved to: ../models/scaler.pkl
Encoder saved to: ../models/encoder.pkl

X_trai