In [None]:
!pip install tensorflow

In [None]:
# CELL 1: Settings / Device selection
# Pick the accelerator here ("CPU", "GPU" or "TPU") and make sure the Colab
# runtime type matches before running the rest of the notebook.
DEVICE = "TPU"   # "CPU" | "GPU" | "TPU"

# --- Input / output locations (edit once Drive is mounted) ---
CSV_TRAIN_PATH = "/content/drive/MyDrive/train.csv"   # training CSV in Drive
CSV_TEST_PATH = "/content/drive/MyDrive/test.csv"     # test CSV in Drive
RESULTS_DIR = "/content/drive/MyDrive/taxi_results_" + DEVICE   # one results folder per device

# --- Data loading ---
SAMPLE_SIZE = 500_000   # default number of training rows to load
CHUNK_SIZE = 500_000    # default rows per chunk when streaming a CSV
TEST_SPLIT = 0.1        # fraction of the loaded sample held out for validation

# --- Training ---
RANDOM_SEED = 42
EPOCHS = 25
BATCH_SIZE = 1024      # increase for GPU/TPU; TPU may require multiples of 8/128

import os, time, math
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Seed NumPy and TensorFlow with the shared notebook seed.
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Device hints: one status line per supported device; anything else is rejected.
_DEVICE_HINTS = {
    "CPU": "Running CPU-only.",
    "GPU": "Running on GPU. Ensure Colab runtime -> GPU is selected.",
    "TPU": "TPU requested. Ensure Colab runtime -> TPU is selected.",
}
if DEVICE not in _DEVICE_HINTS:
    raise ValueError("DEVICE must be one of 'CPU','GPU','TPU'.")
if DEVICE == "CPU":
    os.environ["CUDA_VISIBLE_DEVICES"] = ""   # hide GPU
print(_DEVICE_HINTS[DEVICE])


In [None]:
# CELL 2: Mount Drive
# Mounts Google Drive at /content/drive (forcing a remount) and makes sure the
# results folder exists before later cells write into it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
os.makedirs(RESULTS_DIR, exist_ok=True)  # exist_ok: re-running this cell is not an error
print("Results directory:", RESULTS_DIR)


In [None]:
# CELL 3: TPU init (run but harmless on non-TPU)
# Chooses the tf.distribute strategy later used to build the models.
# Start from whatever strategy TensorFlow currently reports; it is replaced
# only when DEVICE == "TPU" and every initialisation step below succeeds.
strategy = tf.distribute.get_strategy()
if DEVICE == "TPU":
    try:
        # Resolve the TPU, connect to it, initialise it, then wrap it in a strategy.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.TPUStrategy(resolver)
        print("TPU initialized. Strategy:", type(strategy))
        # adjust batch size recommendation
        if BATCH_SIZE < 2048:
            print("Consider increasing BATCH_SIZE for TPU (e.g., 4096)")
    except Exception as e:
        # Best-effort: report the failure and continue on the default strategy.
        # NOTE(review): DEVICE stays "TPU" in this case, so results are still
        # written under the TPU-labelled names — confirm that is intended.
        print("TPU init failed, falling back to default strategy:", e)
        strategy = tf.distribute.get_strategy()
else:
    strategy = tf.distribute.get_strategy()
print("Using strategy:", type(strategy))


In [None]:
# === CELL 4: minimal preprocess_df (no filtering, no row dropping) ===

def preprocess_df(df):
    """Convert raw taxi-trip rows into model features and a fare target.

    Every input row is kept. Values that cannot be parsed, non-finite values
    and columns that are absent altogether are replaced with 0 instead of
    dropping rows or raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Columns read when present: ``pickup_datetime``,
        ``pickup_longitude``, ``pickup_latitude``, ``dropoff_longitude``,
        ``dropoff_latitude``, ``passenger_count`` and ``fare_amount``.
        The caller's frame is not modified (a copy is made first).

    Returns
    -------
    X : pandas.DataFrame
        Eleven feature columns in a fixed order: hour, dow, month, the four
        coordinates, distance_km, abs_lon_diff, abs_lat_diff, passenger_count.
    y : pandas.Series
        ``fare_amount``; all zeros when the input has no such column
        (e.g. test data).
    """
    df = df.copy()

    # --- 1. Datetime features (unparseable timestamps become 0) ---
    if 'pickup_datetime' in df.columns:
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
        pickup = df['pickup_datetime'].dt
        df['hour'] = pickup.hour.fillna(0).astype(int)
        df['dow'] = pickup.dayofweek.fillna(0).astype(int)
        df['month'] = pickup.month.fillna(0).astype(int)
    else:
        df['hour'] = 0
        df['dow'] = 0
        df['month'] = 0

    # --- 2. Numeric columns: coerce, never drop rows ---
    num_cols = ['pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude',
                'passenger_count', 'fare_amount']
    for c in num_cols:
        if c in df.columns:
            values = pd.to_numeric(df[c], errors='coerce')
            # +/-inf survives fillna but breaks the int cast and the trig
            # below, so treat it like any other missing value.
            df[c] = values.replace([np.inf, -np.inf], np.nan).fillna(0)
        else:
            # Absent column (e.g. fare_amount in test data): 0 placeholder.
            df[c] = 0

    # --- 3. Haversine distance in km ---
    R = 6371.0
    lat1 = np.radians(df['pickup_latitude'])
    lat2 = np.radians(df['dropoff_latitude'])
    dlat = np.radians(df['dropoff_latitude'] - df['pickup_latitude'])
    dlon = np.radians(df['dropoff_longitude'] - df['pickup_longitude'])
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1]; clip so the square
    # roots below never see a negative number (which would yield NaN).
    a = a.clip(0.0, 1.0)
    df['distance_km'] = R * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # --- 4. Simple additional diffs ---
    df['abs_lon_diff'] = (df['pickup_longitude'] - df['dropoff_longitude']).abs()
    df['abs_lat_diff'] = (df['pickup_latitude'] - df['dropoff_latitude']).abs()

    # --- 5. Passenger count, truncated to int and limited to 0..10 ---
    df['passenger_count'] = df['passenger_count'].astype(int).clip(0, 10)

    # --- 6. Feature matrix (all rows kept) and target ---
    features = [
        'hour', 'dow', 'month',
        'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude',
        'distance_km', 'abs_lon_diff', 'abs_lat_diff',
        'passenger_count'
    ]
    X = df[features]
    y = df['fare_amount']  # always present: created above when missing

    return X, y


In [None]:
# CELL 5: Load sampled training data (stream chunks)
def load_sampled_dataframe(csv_path, sample_size=SAMPLE_SIZE, chunk_size=CHUNK_SIZE):
    """Stream a CSV in chunks, preprocess each chunk, and collect up to `sample_size` rows.

    Chunks are taken in file order. Only the chunk that would overshoot the
    quota is randomly sub-sampled (seeded with RANDOM_SEED); earlier chunks
    are kept whole.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    sample_size : int
        Maximum number of rows to return. A value <= 0 loads every row.
    chunk_size : int
        Number of CSV rows read per chunk.

    Returns
    -------
    (X, y) : (pandas.DataFrame, pandas.Series)
        Concatenated features and targets from `preprocess_df`, re-indexed from 0.

    Raises
    ------
    ValueError
        If no rows were loaded.
    """
    feats, tars = [], []
    total = 0
    # The context manager closes the underlying file even when we stop early.
    with pd.read_csv(csv_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            Xc, yc = preprocess_df(chunk)
            if len(Xc) == 0:
                continue
            if sample_size > 0:
                remaining = sample_size - total
                if len(Xc) > remaining:
                    # Last chunk needed: draw exactly the rows still missing.
                    Xc = Xc.sample(n=remaining, random_state=RANDOM_SEED)
                    yc = yc.loc[Xc.index]
                feats.append(Xc)
                tars.append(yc)
                total += len(Xc)
                if total >= sample_size:
                    # Quota reached: stop before reading/preprocessing another chunk.
                    break
            else:
                feats.append(Xc)
                tars.append(yc)
    if len(feats) == 0:
        raise ValueError("No data loaded - check CSV path and columns.")
    X = pd.concat(feats, ignore_index=True)
    y = pd.concat(tars, ignore_index=True)
    print(f"Loaded {len(X)} rows from train (sample_size={sample_size}).")
    return X, y

print("Loading training sample (this may take some minutes)...")
X_full, y_full = load_sampled_dataframe(CSV_TRAIN_PATH)

# Hold out TEST_SPLIT of the loaded sample for validation (seeded split).
X_train_df, X_val_df, y_train_s, y_val_s = train_test_split(
    X_full,
    y_full,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)
print("Train rows:", len(X_train_df), "Val rows:", len(X_val_df))


In [None]:
# CELL 6: Scaling + tf.data datasets
num_cols = X_train_df.columns.tolist()

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train_df[num_cols])
X_train = scaler.transform(X_train_df[num_cols]).astype(np.float32)
X_val = scaler.transform(X_val_df[num_cols]).astype(np.float32)
y_train = y_train_s.values.astype(np.float32)
y_val = y_val_s.values.astype(np.float32)

# Training pipeline: seeded shuffle (100000-element buffer) -> batch -> prefetch.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(100000, seed=RANDOM_SEED)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

# Validation pipeline: same batching, no shuffling.
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_ds = val_ds.batch(BATCH_SIZE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

# Save scaler for later testing/inference
print(len(X_train))
import joblib
scaler_path = os.path.join(RESULTS_DIR, f"scaler_{DEVICE}.joblib")
joblib.dump(scaler, scaler_path)
print("Saved scaler to Drive.")


In [None]:
# CELL 7: Model builder + epoch timer
def build_model(input_dim, size="small"):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    size : str
        "small" -> hidden layers of 64, 32 units; "medium" -> 128, 128, 64;
        any other value -> the "large" layout 256, 128, 64, 32.

    Returns
    -------
    keras.Model
        ReLU hidden layers followed by one linear output unit, compiled with
        Adam (learning rate 1e-3), MSE loss and RMSE / MAE metrics.
    """
    if size == "small":
        hidden_units = (64, 32)
    elif size == "medium":
        hidden_units = (128, 128, 64)
    else:  # large
        hidden_units = (256, 128, 64, 32)

    inp = keras.Input(shape=(input_dim,))
    x = inp
    for units in hidden_units:
        x = layers.Dense(units, activation='relu')(x)
    out = layers.Dense(1, activation='linear')(x)

    model = keras.Model(inp, out)
    tracked_metrics = [
        keras.metrics.RootMeanSquaredError(name='rmse'),
        keras.metrics.MeanAbsoluteError(name='mae'),
    ]
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss='mse',
        metrics=tracked_metrics,
    )
    return model

class EpochTimer(callbacks.Callback):
    """Callback that records how many seconds each training epoch took.

    `epoch_times` holds one elapsed-seconds value per finished epoch and is
    reset whenever training begins.
    """

    def on_train_begin(self, logs=None):
        self.epoch_times = list()

    def on_epoch_begin(self, epoch, logs=None):
        # Remember when this epoch started.
        self._epoch_started_at = time.time()

    def on_epoch_end(self, epoch, logs=None):
        elapsed = time.time() - self._epoch_started_at
        self.epoch_times.append(elapsed)


In [None]:
# CELL 8: Train models (small/medium/large), save models & metrics
# For every size: build the model inside the distribution strategy scope, fit
# with early stopping on val_rmse, time the epochs, score on the validation
# arrays, then save the model, the raw predictions and a summary row.
sizes = ['small','medium','large']
histories = {}          # size -> Keras history dict
epoch_times_all = {}    # size -> list of per-epoch seconds
metrics_summary = []    # one dict per trained model

for size in sizes:
    print(f"\n--- Training {size} on {DEVICE} ---")
    with strategy.scope():
        model = build_model(X_train.shape[1], size=size)
    model.summary()
    timer_cb = EpochTimer()
    early = callbacks.EarlyStopping(monitor='val_rmse', patience=5, restore_best_weights=True)
    start_time = time.time()
    hist = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=[timer_cb, early], verbose=2)
    total_train_time = time.time() - start_time

    histories[size] = hist.history
    epoch_times_all[size] = timer_cb.epoch_times
    epochs_trained = len(timer_cb.epoch_times)
    # NaN (not None) when no epoch ran: it still formats with ':.3f' below,
    # can be plotted, and is written to CSV as an empty field just like None.
    avg_epoch_time = float(np.mean(timer_cb.epoch_times)) if epochs_trained>0 else float("nan")

    # Evaluate on validation arrays (fast)
    preds_val = model.predict(X_val, batch_size=BATCH_SIZE).ravel()
    rmse = math.sqrt(mean_squared_error(y_val, preds_val))
    mae  = float(np.mean(np.abs(y_val - preds_val)))
    r2   = float(r2_score(y_val, preds_val))

    # Save model and metrics
    model_path = os.path.join(RESULTS_DIR, f"model_{DEVICE}_{size}.h5")
    model.save(model_path)
    np.savez(os.path.join(RESULTS_DIR, f"metrics_{DEVICE}_{size}.npz"),
             rmse=rmse, mae=mae, r2=r2, preds=preds_val, y_val=y_val)
    metrics_summary.append({
        'device': DEVICE, 'size': size, 'epochs_trained': epochs_trained,
        'avg_epoch_time_s': avg_epoch_time, 'total_train_time_s': float(total_train_time),
        'final_val_rmse': float(rmse), 'final_val_mae': float(mae), 'final_val_r2': float(r2)
    })
    print(f"{size} done: epochs={epochs_trained}, avg_epoch_s={avg_epoch_time:.3f}, total_s={total_train_time:.1f}, rmse={rmse:.4f}, r2={r2:.4f}")

# Save metrics summary
metrics_df = pd.DataFrame(metrics_summary)
metrics_df.to_csv(os.path.join(RESULTS_DIR, f"metrics_summary_{DEVICE}.csv"), index=False)
print("Saved metrics summary CSV.")


In [None]:
# CELL 9: Plotting (epoch times, loss/RMSE curves, summary)

# --- Seconds per epoch, one line per model size ---
plt.figure(figsize=(10,6))
for size in sizes:
    times = epoch_times_all.get(size, [])
    if len(times) == 0:
        continue
    epoch_numbers = range(1, len(times) + 1)
    plt.plot(epoch_numbers, times, marker='o', label=size)
plt.xlabel("Epoch")
plt.ylabel("Epoch time (s)")
plt.title(f"Epoch time per epoch ({DEVICE})")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, f"epoch_time_{DEVICE}.png"))
plt.show()

# --- Loss and RMSE curves: one two-panel figure per model size ---
for size in sizes:
    h = histories.get(size, {})
    if not h:
        continue
    train_loss = h.get('loss', [])
    ep = range(1, len(train_loss) + 1)
    plt.figure(figsize=(12,4))

    # Left panel: training vs validation loss.
    plt.subplot(1,2,1)
    plt.plot(ep, train_loss, label='train_loss')
    plt.plot(ep, h.get('val_loss',[]), label='val_loss')
    plt.title(f"{size} Loss ({DEVICE})")
    plt.legend()

    # Right panel: training vs validation RMSE.
    plt.subplot(1,2,2)
    plt.plot(ep, h.get('rmse',[]), label='train_rmse')
    plt.plot(ep, h.get('val_rmse',[]), label='val_rmse')
    plt.title(f"{size} RMSE ({DEVICE})")
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, f"loss_rmse_{DEVICE}_{size}.png"))
    plt.show()

# Summary bar charts
avg_epoch_times = [m['avg_epoch_time_s'] for m in metrics_summary]
final_rmse = [m['final_val_rmse'] for m in metrics_summary]
plt.figure(figsize=(10,4))

plt.subplot(1,2,1)
plt.bar(sizes, avg_epoch_times)
plt.title('Avg epoch time (s)')
plt.ylabel('s')

plt.subplot(1,2,2)
plt.bar(sizes, final_rmse)
plt.title('Final val RMSE')
plt.ylabel('RMSE')

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, f"summary_bar_{DEVICE}.png"))
plt.show()
print("Saved all plots to Drive.")


In [None]:
# ============================================================
#  CELL 10: FINAL FULL TEST PREDICTION CODE (PRINT LOOP INCLUDED)
# ============================================================

# 1. Feature list (used by BOTH train + test)
#    Assembled from named groups; the resulting order is what matters.
_TIME_FEATURES = ['hour', 'dow', 'month']
_COORDINATE_FEATURES = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
_DERIVED_FEATURES = ['distance_km', 'abs_lon_diff', 'abs_lat_diff']

FEATURE_COLUMNS = (
    _TIME_FEATURES
    + _COORDINATE_FEATURES
    + _DERIVED_FEATURES
    + ['passenger_count']
)


# 2. Final safe test prediction function
def predict_test_for_model(model_path, scaler, size_tag):
    """Predict fares for every row of the test CSV and write them to a CSV file.

    The test file (CSV_TEST_PATH) is streamed in chunks of CHUNK_SIZE rows;
    each chunk is preprocessed, scaled, scored, and written to the output
    file straight away.

    Parameters
    ----------
    model_path : str
        Path of the saved Keras model to load (loaded with compile=False).
    scaler :
        Fitted scaler exposing ``transform``; applied to the feature columns.
    size_tag : str
        Model size label, used in log lines and in the output file name.

    Returns
    -------
    str
        Path of the predictions CSV (columns: key, predicted_fare).
    """
    print(f"\n Predicting test for model size: {size_tag}")

    # load model
    model = keras.models.load_model(model_path, compile=False)

    # output path in Drive
    out_file = os.path.join(RESULTS_DIR, f"predictions_{DEVICE}_{size_tag}.csv")

    header_written = False
    total_rows = 0

    # stream test.csv in chunks; the context manager closes the file on any exit
    with pd.read_csv(CSV_TEST_PATH, chunksize=CHUNK_SIZE) as reader:
        for chunk_idx, chunk in enumerate(reader, start=1):
            if len(chunk) == 0:
                # Nothing to score; scaler.transform rejects zero-row input.
                continue

            # detect ID column safely
            if 'key' in chunk.columns:
                ids = chunk['key']
            elif 'id' in chunk.columns:
                ids = chunk['id']
            else:
                # No id column: number rows consecutively across the whole
                # file, continuing from the rows already written, so that
                # fallback keys stay unique from one chunk to the next.
                ids = pd.Series(np.arange(total_rows, total_rows + len(chunk)))

            # preprocess WITHOUT filtering
            Xc, _ = preprocess_df(chunk)

            # select ONLY the model features, in training order
            Xc = Xc[FEATURE_COLUMNS]

            # scale
            Xc_sc = scaler.transform(Xc).astype(np.float32)

            # predict
            preds = model.predict(Xc_sc, batch_size=2048, verbose=0).ravel()

            # limit ids length
            ids_aligned = ids.iloc[:len(preds)].values

            out_df = pd.DataFrame({
                'key': ids_aligned,
                'predicted_fare': preds
            })

            # First write creates the file with a header; later writes append.
            if not header_written:
                out_df.to_csv(out_file, index=False, mode='w')
                header_written = True
            else:
                out_df.to_csv(out_file, index=False, header=False, mode='a')

            total_rows += len(preds)
            print(f"   Chunk {chunk_idx}: wrote {len(preds)} rows (total = {total_rows})")

    print(f"\n Saved: {out_file}")
    print(f" Total predictions generated: {total_rows}")

    return out_file



# 3. === RUN FOR ALL MODEL SIZES (prints output) ===

print(" Starting prediction on TEST.csv ...")

# One prediction pass per model recorded in the metrics summary.
for model_size in [entry['size'] for entry in metrics_summary]:
    model_file = os.path.join(RESULTS_DIR, f"model_{DEVICE}_{model_size}.h5")
    predict_test_for_model(model_file, scaler, model_size)

print("\n All test predictions completed successfully!")


In [None]:
# ==========================================================
#  CELL 11: Accuracy Metrics + Plots for ALL model sizes
# ==========================================================
# For every model listed in metrics_summary: reload its saved validation
# predictions, print RMSE / R² / MAPE, and save a predicted-vs-actual scatter
# plot and a residual histogram.

from sklearn.metrics import r2_score, mean_absolute_percentage_error

print("Generating accuracy metrics for all model sizes...\n")

for m in metrics_summary:
    size_tag = m['size']

    metrics_path = os.path.join(RESULTS_DIR, f"metrics_{DEVICE}_{size_tag}.npz")
    if not os.path.exists(metrics_path):
        print(f"Metrics file missing for model size: {size_tag}. Skipping.")
        continue

    # Only the numeric 'preds' and 'y_val' arrays are read, so pickle support
    # stays off (numpy's default); the context manager closes the archive.
    with np.load(metrics_path) as data:
        preds = data['preds']
        yval  = data['y_val']

    rmse = np.sqrt(np.mean((yval - preds)**2))
    r2 = r2_score(yval, preds)
    mape = mean_absolute_percentage_error(yval, preds) * 100

    print(f"\n===== Model: {DEVICE}-{size_tag} =====")
    print(f"RMSE  : {rmse:.4f}")
    print(f"R²    : {r2:.4f}")
    print(f"MAPE  : {mape:.2f}%")
    print("========================================\n")

    # Pred vs Actual
    plt.figure(figsize=(8,8))
    plt.scatter(yval, preds, s=2, alpha=0.25)
    mx = max(yval.max(), preds.max())
    plt.plot([0, mx], [0, mx], 'r--', linewidth=2)  # y = x reference line
    plt.xlabel('Actual Fare', fontsize=12)
    plt.ylabel('Predicted Fare', fontsize=12)
    plt.title(f'Pred vs Actual ({DEVICE}-{size_tag})', fontsize=15)
    plt.grid(True, alpha=0.2)
    plt.ylim(0, 100)

    out1 = os.path.join(RESULTS_DIR, f"pred_vs_actual_{DEVICE}_{size_tag}.png")
    plt.savefig(out1, dpi=120, bbox_inches='tight')
    plt.show()

    # Residual distribution
    resid = yval - preds
    plt.figure(figsize=(12,4))
    plt.hist(resid, bins=80, color='steelblue', edgecolor='black', alpha=0.8)
    plt.title(f'Residuals ({DEVICE}-{size_tag})', fontsize=15)
    plt.xlabel('Residual (Actual - Predicted)', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.grid(True, alpha=0.2)

    out2 = os.path.join(RESULTS_DIR, f"residuals_{DEVICE}_{size_tag}.png")
    plt.savefig(out2, dpi=120, bbox_inches='tight')
    plt.show()

    print(f"Saved plots:\n  - {out1}\n  - {out2}\n")

print("\nCompleted accuracy reporting for all models.")


In [None]:
# CELL 12: Final summary
# Re-read the metrics summary CSV from the results folder and print it,
# followed by a completion message naming the device.
metrics_df = pd.read_csv(os.path.join(RESULTS_DIR, f"metrics_summary_{DEVICE}.csv"))
print(metrics_df)
print("Notebook finished for device:", DEVICE)
