In [3]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [4]:


# ============================================================
# 0. PATH SETUP
# ============================================================
# Data base
# Each working directory resolves to its root-level location when that
# directory already exists, and to the relative "../" location otherwise.
# Whatever is chosen is then created if it is missing.
DATA_BASE = "/data" if os.path.isdir("/data") else "../data"

PROCESSED_DIR = os.path.join(DATA_BASE, "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Destination for the serialized model
MODELS_DIR = "/models" if os.path.isdir("/models") else "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Destination for CSV tables, the text report and the figures
OUTPUT_DIR = "/output" if os.path.isdir("/output") else "../output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def pick_path(filename_sample, filename_full):
    """Choose between a sample file and its full-size counterpart.

    Both file names are resolved inside PROCESSED_DIR.

    Parameters
    ----------
    filename_sample : str
        File name of the sample version; preferred when it exists on disk.
    filename_full : str
        File name of the full version; used as the fallback.

    Returns
    -------
    str
        Path to the sample file if it exists, otherwise the path to the full
        file. The fallback path is returned without an existence check.
    """
    candidate = os.path.join(PROCESSED_DIR, filename_sample)
    if not os.path.exists(candidate):
        return os.path.join(PROCESSED_DIR, filename_full)
    return candidate

# Input splits: the "*_sample" parquet is used when present, else the full file.
X_train_path = pick_path("X_train_sample.parquet", "X_train.parquet")
X_test_path = pick_path("X_test_sample.parquet", "X_test.parquet")
y_train_path = pick_path("y_train_sample.parquet", "y_train.parquet")
y_test_path = pick_path("y_test_sample.parquet", "y_test.parquet")

print(
    "Using these data paths:",
    f"X_train: {X_train_path}",
    f"X_test : {X_test_path}",
    f"y_train: {y_train_path}",
    f"y_test : {y_test_path}",
    sep="\n",
)

# Serialized model
model_out_path = os.path.join(MODELS_DIR, "linear_regression_model.pkl")

# Tables and text report
coefficients_out_path = os.path.join(OUTPUT_DIR, "linear_regression_coefficients.csv")
predictions_out_path = os.path.join(OUTPUT_DIR, "linear_regression_predictions.csv")
evaluation_out_path = os.path.join(OUTPUT_DIR, "linear_regression_evaluation.txt")

# Diagnostic figures
residual_hist_path = os.path.join(OUTPUT_DIR, "linear_regression_residuals_hist.png")
residual_vs_pred_path = os.path.join(OUTPUT_DIR, "linear_regression_residuals_vs_pred.png")
actual_vs_pred_path = os.path.join(OUTPUT_DIR, "linear_regression_actual_vs_pred.png")

# ============================================================
# 1. LOAD TRAIN/TEST DATA
# ============================================================
# Feature matrices (read in train, test order)
X_train, X_test = (
    pd.read_parquet(parquet_file) for parquet_file in (X_train_path, X_test_path)
)

# Targets are stored as one-column frames; keep the frames and take the
# first column of each as the target Series.
y_train_df = pd.read_parquet(y_train_path)
y_test_df = pd.read_parquet(y_test_path)
y_train, y_test = y_train_df.iloc[:, 0], y_test_df.iloc[:, 0]

print(
    "\nShapes:",
    f"X_train: {X_train.shape}",
    f"X_test : {X_test.shape}",
    f"y_train: {y_train.shape}",
    f"y_test : {y_test.shape}",
    sep="\n",
)

# Column order of the training matrix; reused to label the coefficients
feature_names = X_train.columns.to_list()

# ============================================================
# 2. MODEL INITIALIZATION & TRAINING
# ============================================================
# Ordinary least squares with scikit-learn defaults
model = LinearRegression()

# Wall-clock time of the fit call only (the estimator is built beforehand)
start_time = time.time()
model.fit(X_train, y_train)
train_duration = time.time() - start_time

print(
    f"\nTraining completed in {train_duration:.2f} seconds."
)

# ============================================================
# 3. PREDICTIONS
# ============================================================
# Score both splits with the fitted model
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# Long-format table: every train row first, then every test row. The three
# columns are built in that same order so they line up row by row.
pred_df = pd.DataFrame(
    {
        "set": ["train"] * len(y_train) + ["test"] * len(y_test),
        "actual": pd.concat([y_train, y_test], ignore_index=True),
        "predicted": np.concatenate([y_pred_train, y_pred_test]),
    }
)
pred_df.to_csv(predictions_out_path, index=False)
print("Predictions saved to:", predictions_out_path)

# ============================================================
# 4. METRICS FUNCTIONS
# ============================================================
def rmse(y_true, y_pred):
    """Root mean squared error: square root of sklearn's mean_squared_error."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

def mae(y_true, y_pred):
    """Mean absolute error; thin wrapper around sklearn's mean_absolute_error."""
    absolute_error_mean = mean_absolute_error(y_true, y_pred)
    return absolute_error_mean

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Entries whose true value is exactly zero are left out of the average
    because the relative error is undefined there.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values; converted with ``np.asarray``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the non-zero
        targets, or ``np.nan`` when there are no non-zero targets.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    nonzero = actual != 0
    if not nonzero.any():
        return np.nan

    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.mean(np.abs(relative_error)) * 100.0

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error.

    Negative values in either input are raised to 0 before ``log1p`` is
    applied, so the logarithm is always defined.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values; converted with ``np.asarray``.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2))`` on the clipped
        inputs.
    """
    actual = np.clip(np.asarray(y_true), 0, None)
    predicted = np.clip(np.asarray(y_pred), 0, None)

    log_gap = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(log_gap ** 2))

# ============================================================
# 5. MODEL EVALUATION (TRAIN & TEST)
# ============================================================
# metrics[split][metric_name] -> value, filled for "train" then "test"
metrics = {}

for split_name, y_true, y_pred in zip(
    ("train", "test"),
    (y_train, y_test),
    (y_pred_train, y_pred_test),
):
    split_metrics = {
        "RMSE":  rmse(y_true, y_pred),
        "MAE":   mae(y_true, y_pred),
        "MAPE":  mape(y_true, y_pred),
        "R2":    r2_score(y_true, y_pred),
        "RMSLE": rmsle(y_true, y_pred),
    }
    metrics[split_name] = split_metrics

print("\n=== Linear Regression Performance ===")
# The dict keeps insertion order, so this reports train first, then test
for split_name, m in metrics.items():
    print(
        f"\n{split_name.upper()} METRICS:",
        f"  RMSE : {m['RMSE']:.4f}",
        f"  MAE  : {m['MAE']:.4f}",
        f"  MAPE : {m['MAPE']:.2f}%",
        f"  R^2  : {m['R2']:.4f}",
        f"  RMSLE: {m['RMSLE']:.4f}",
        sep="\n",
    )

# ============================================================
# 6. RESIDUAL ANALYSIS (TEST SET)
# ============================================================
# Residual = actual - predicted, so a negative value is an over-prediction
residuals = y_test - y_pred_test

res_mean, res_std = residuals.mean(), residuals.std()
res_min, res_max = residuals.min(), residuals.max()

print(
    "\n=== Residual Analysis (Test) ===",
    f"Mean residual      : {res_mean:.4f}",
    f"Std of residuals   : {res_std:.4f}",
    f"Min residual       : {res_min:.4f}",
    f"Max residual       : {res_max:.4f}",
    sep="\n",
)

# Figure 1: histogram of the residuals
plt.figure(figsize=(8, 5))
plt.hist(residuals, bins=80, alpha=0.7)
plt.gca().set(
    title="Residual Distribution (Test)",
    xlabel="Residual (actual - predicted)",
    ylabel="Count",
)
plt.tight_layout()
plt.savefig(residual_hist_path)
plt.close()

# Figure 2: residuals against predictions, with a dashed zero-error line
plt.figure(figsize=(8, 5))
plt.scatter(y_pred_test, residuals, s=5)
plt.axhline(0, linestyle="--")
plt.gca().set(
    title="Residuals vs Predicted (Test)",
    xlabel="Predicted fare_amount",
    ylabel="Residual (actual - predicted)",
)
plt.tight_layout()
plt.savefig(residual_vs_pred_path)
plt.close()

print(
    "Residual plots saved to:",
    f"  Histogram : {residual_hist_path}",
    f"  Res vs Pred: {residual_vs_pred_path}",
    sep="\n",
)

# ============================================================
# 7. MODEL COEFFICIENTS / FEATURE IMPORTANCE
# ============================================================
coefs = model.coef_
intercept = model.intercept_

# One row per feature, largest |coefficient| first.
# NOTE(review): raw coefficient magnitudes are only comparable across features
# if the inputs share a common scale - confirm against the preprocessing step.
coef_df = pd.DataFrame({
    "feature": feature_names,
    "coefficient": coefs,
    "abs_coefficient": np.abs(coefs),
}).sort_values("abs_coefficient", ascending=False)

coef_df.to_csv(coefficients_out_path, index=False)
print("Coefficient table saved to:", coefficients_out_path)
print(f"Intercept: {intercept:.4f}")

top_n = 15
top_coef = coef_df.head(top_n)
# head() returns fewer rows when the model has fewer than top_n features, so
# the labels below use the number of bars actually drawn instead of a
# hard-coded "15".
n_shown = len(top_coef)

# Build the figure path once so the saved file and the message cannot drift
top_features_path = os.path.join(OUTPUT_DIR, "linear_regression_top_features.png")

plt.figure(figsize=(10, 6))
# Reversed so the largest coefficient ends up at the top of the horizontal bars
plt.barh(top_coef["feature"][::-1], top_coef["coefficient"][::-1])
plt.title(f"Top {n_shown} Features by Coefficient Magnitude (Linear Regression)")
plt.xlabel("Coefficient value")
plt.tight_layout()
plt.savefig(top_features_path)
plt.close()

print(f"Top-{n_shown} feature importance plot saved to:", top_features_path)

# ============================================================
# 8. OVERFITTING CHECK
# ============================================================
# Pull the two headline metrics for each split so they can be shown side by side
train_rmse, test_rmse = metrics["train"]["RMSE"], metrics["test"]["RMSE"]
train_r2, test_r2 = metrics["train"]["R2"], metrics["test"]["R2"]

print(
    "\n=== Overfitting Check ===",
    f"Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}",
    f"Train R^2 : {train_r2:.4f}, Test R^2 : {test_r2:.4f}",
    sep="\n",
)

# ============================================================
# 9. SAVE MODEL
# ============================================================
# Persist the fitted estimator to MODELS_DIR with joblib
joblib.dump(value=model, filename=model_out_path)
print("\nModel saved to:", model_out_path)

# ============================================================
# 10. PREDICTIONS ANALYSIS (TEST SET)
# ============================================================
# Positional (index-free) table of test targets and predictions
test_pred_df = pd.DataFrame({
    "actual":    y_test.values,
    "predicted": y_pred_test,
})
test_pred_df["abs_error"] = np.abs(test_pred_df["actual"] - test_pred_df["predicted"])

# Actual vs predicted scatter with a dashed y = x reference line spanning the
# combined range of both columns
plt.figure(figsize=(8, 8))
plt.scatter(test_pred_df["actual"], test_pred_df["predicted"], s=5)
min_val = min(test_pred_df["actual"].min(), test_pred_df["predicted"].min())
max_val = max(test_pred_df["actual"].max(), test_pred_df["predicted"].max())
plt.plot([min_val, max_val], [min_val, max_val], linestyle="--")
plt.title("Actual vs Predicted Fare (Test)")
plt.xlabel("Actual fare_amount")
plt.ylabel("Predicted fare_amount")
plt.tight_layout()
plt.savefig(actual_vs_pred_path)
plt.close()

print("Actual vs predicted plot saved to:", actual_vs_pred_path)

largest_errors = test_pred_df.sort_values("abs_error", ascending=False).head(20)
print("\nTop 20 largest absolute prediction errors (test):")
print(largest_errors)

bins = [0, 10, 20, 40, 80, np.inf]
labels = ["0-10", "10-20", "20-40", "40-80", "80+"]

# include_lowest=True closes the first interval on the left, so a fare of
# exactly 0 lands in "0-10" instead of becoming NaN and silently dropping out
# of the statistics. Negative fares still fall outside every bin.
test_pred_df["fare_range"] = pd.cut(
    test_pred_df["actual"],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# observed=False is stated explicitly: it keeps every fare range in the table
# (even empty ones) and avoids relying on the deprecated implicit default for
# categorical groupers.
range_stats = (
    test_pred_df
    .groupby("fare_range", observed=False)["abs_error"]
    .agg(["mean", "median", "count"])
)
print("\nAbsolute error by fare range (test):")
print(range_stats)

# ============================================================
# 11. PERFORMANCE REPORT (TEXT FILE)
# ============================================================
# The report contains non-ASCII text (the en dash in the title), so the
# encoding is fixed to UTF-8 instead of depending on the platform default.
with open(evaluation_out_path, "w", encoding="utf-8") as f:
    # "\u2013" is an en dash; written as an escape so the title cannot be
    # corrupted by a mis-decoded source file. The underline matches its width.
    f.write("Linear Regression Model \u2013 Fare Prediction\n")
    f.write("=========================================\n\n")
    f.write(f"Training duration: {train_duration:.2f} seconds\n\n")

    for split_name in ["train", "test"]:
        m = metrics[split_name]
        f.write(f"{split_name.upper()} METRICS:\n")
        f.write(f"  RMSE : {m['RMSE']:.4f}\n")
        f.write(f"  MAE  : {m['MAE']:.4f}\n")
        f.write(f"  MAPE : {m['MAPE']:.2f}%\n")
        f.write(f"  R^2  : {m['R2']:.4f}\n")
        f.write(f"  RMSLE: {m['RMSLE']:.4f}\n\n")

    f.write("Overfitting check:\n")
    f.write(f"  Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}\n")
    f.write(f"  Train R^2 : {train_r2:.4f}, Test R^2 : {test_r2:.4f}\n\n")

    f.write("Residual analysis (test):\n")
    f.write(f"  Mean residual    : {res_mean:.4f}\n")
    f.write(f"  Std residual     : {res_std:.4f}\n")
    f.write(f"  Min residual     : {res_min:.4f}\n")
    f.write(f"  Max residual     : {res_max:.4f}\n\n")

    # Header reflects the number of rows actually listed below
    f.write(f"Top {len(top_coef)} features by |coefficient|:\n")
    for _, row in top_coef.iterrows():
        f.write(f"  {row['feature']}: coef={row['coefficient']:.6f}, "
                f"|coef|={row['abs_coefficient']:.6f}\n")

    f.write("\nAbsolute error by fare range (test):\n")
    f.write(range_stats.to_string())
    f.write("\n")

print("\nEvaluation report saved to:", evaluation_out_path)
print("\n Linear Regression training & evaluation completed.")


Using these data paths:
X_train: ../data/processed/X_train.parquet
X_test : ../data/processed/X_test.parquet
y_train: ../data/processed/y_train.parquet
y_test : ../data/processed/y_test.parquet

Shapes:
X_train: (1568133, 57)
X_test : (392034, 57)
y_train: (1568133,)
y_test : (392034,)

Training completed in 2.41 seconds.
Predictions saved to: ../output/linear_regression_predictions.csv

=== Linear Regression Performance ===

TRAIN METRICS:
  RMSE : 5.3076
  MAE  : 2.1167
  MAPE : 21.45%
  R^2  : 0.9101
  RMSLE: 0.1882

TEST METRICS:
  RMSE : 12.5087
  MAE  : 2.1964
  MAPE : 18.63%
  R^2  : 0.3874
  RMSLE: 0.1909

=== Residual Analysis (Test) ===
Mean residual      : 0.1608
Std of residuals   : 12.5077
Min residual       : -7279.0647
Max residual       : 348.5867
Residual plots saved to:
  Histogram : ../output/linear_regression_residuals_hist.png
  Res vs Pred: ../output/linear_regression_residuals_vs_pred.png
Coefficient table saved to: ../output/linear_regression_coefficients.csv
In

  range_stats = test_pred_df.groupby("fare_range")["abs_error"].agg(["mean", "median", "count"])
