In [1]:
"""
Oasis Infobyte Internship — Task 5: Advertising Dataset (Sales Prediction)

This is a runnable Python script to train and evaluate models that predict
product Sales from advertising budgets on TV, Radio, and Newspaper.

Assumptions:
- The dataset is a CSV named 'advertising.csv' with columns: TV, Radio, Newspaper, Sales
- Place this CSV in the same folder as this script or update DATA_PATH

What the script does:
1. Loads data and performs quick EDA
2. Visualizes relationships (saves plots to outputs/)
3. Prepares features and target
4. Trains several regression models using pipelines:
   - Linear Regression
   - Ridge Regression (with CV)
   - Lasso Regression (with CV)
   - Random Forest Regressor
5. Hyperparameter tuning for Ridge/Lasso/RF using GridSearchCV
6. Evaluates using RMSE, MAE, R2 and cross-validation
7. Picks the best model and saves it
8. Provides a `predict_single` helper to predict on new budgets

Run:
    python oasis_infobyte_task5_advertising.py

Requirements:
    pip install pandas numpy matplotlib scikit-learn joblib

"""

import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# ----------------------
# Config
# ----------------------
RANDOM_STATE = 42
DATA_PATH = "advertising.csv"  # change if needed
OUTPUT_DIR = Path("outputs")
MODEL_DIR = Path("models")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------
# Utils
# ----------------------

def load_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # Ensure expected columns exist
    expected = {"TV", "Radio", "Newspaper", "Sales"}
    if not expected.issubset(set(df.columns)):
        raise ValueError(f"CSV must contain columns: {expected}. Found: {df.columns}")
    return df


def quick_eda(df: pd.DataFrame):
    print("Dataset shape:", df.shape)
    print(df.head())
    print(df.describe())
    print("Missing values:\n", df.isnull().sum())

    # Correlation
    corr = df.corr()
    print("Correlation with Sales:\n", corr["Sales"].sort_values(ascending=False))

    # Scatter plots
    features = ["TV", "Radio", "Newspaper"]
    for feat in features:
        plt.figure(figsize=(6, 4))
        plt.scatter(df[feat], df["Sales"], alpha=0.6)
        plt.xlabel(feat)
        plt.ylabel("Sales")
        plt.title(f"Sales vs {feat}")
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / f"sales_vs_{feat.lower()}.png")
        plt.close()

    # Pairwise matrix (simple)
    pd.plotting.scatter_matrix(df, figsize=(9, 9))
    plt.suptitle("Pairwise scatter matrix")
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / "pairwise_matrix.png")
    plt.close()


def prepare_data(df: pd.DataFrame):
    X = df[["TV", "Radio", "Newspaper"]].values
    y = df["Sales"].values
    return train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)


def evaluate_regression_model(name, model, X_test, y_test):
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"--- {name} ---")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE : {mae:.4f}")
    print(f"R2  : {r2:.4f}")
    print()
    return {"rmse": rmse, "mae": mae, "r2": r2}


def main():
    if not os.path.exists(DATA_PATH):
        print(f"Data file '{DATA_PATH}' not found. Please place your CSV in the working directory or update DATA_PATH.")
        return

    df = load_data(DATA_PATH)
    quick_eda(df)

    X_train, X_test, y_train, y_test = prepare_data(df)

    # ------------------
    # Baseline: Linear Regression (with scaling)
    # ------------------
    lr_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LinearRegression())
    ])
    lr_pipe.fit(X_train, y_train)
    lr_metrics = evaluate_regression_model("LinearRegression", lr_pipe, X_test, y_test)

    # Cross-validated scores for baseline
    cv_scores = cross_val_score(lr_pipe, np.vstack((X_train, X_test)), np.concatenate((y_train, y_test)), cv=5, scoring="r2")
    print("LinearRegression CV R2 (5-fold):", cv_scores.mean())
    print()

    # ------------------
    # Ridge Regression with GridSearchCV
    # ------------------
    ridge_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(random_state=RANDOM_STATE))
    ])
    ridge_params = {"ridge__alpha": [0.01, 0.1, 1, 10, 50, 100]}
    ridge_grid = GridSearchCV(ridge_pipe, ridge_params, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    ridge_grid.fit(X_train, y_train)
    print("Best Ridge params:", ridge_grid.best_params_)
    ridge_best = ridge_grid.best_estimator_
    ridge_metrics = evaluate_regression_model("Ridge", ridge_best, X_test, y_test)

    # ------------------
    # Lasso Regression with GridSearchCV
    # ------------------
    lasso_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("lasso", Lasso(random_state=RANDOM_STATE, max_iter=10000))
    ])
    lasso_params = {"lasso__alpha": [0.001, 0.01, 0.1, 1, 5]}
    lasso_grid = GridSearchCV(lasso_pipe, lasso_params, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    lasso_grid.fit(X_train, y_train)
    print("Best Lasso params:", lasso_grid.best_params_)
    lasso_best = lasso_grid.best_estimator_
    lasso_metrics = evaluate_regression_model("Lasso", lasso_best, X_test, y_test)

    # ------------------
    # Random Forest Regressor (heavier model)
    # ------------------
    rf_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("rf", RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1))
    ])
    rf_params = {
        "rf__n_estimators": [100, 200],
        "rf__max_depth": [None, 5, 10]
    }
    rf_grid = GridSearchCV(rf_pipe, rf_params, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
    rf_grid.fit(X_train, y_train)
    print("Best RF params:", rf_grid.best_params_)
    rf_best = rf_grid.best_estimator_
    rf_metrics = evaluate_regression_model("RandomForest", rf_best, X_test, y_test)

    # ------------------
    # Compare models (by RMSE on test)
    # ------------------
    results = {
        "LinearRegression": lr_metrics,
        "Ridge": ridge_metrics,
        "Lasso": lasso_metrics,
        "RandomForest": rf_metrics,
    }
    print("Summary of RMSE on test set:")
    for name, m in results.items():
        print(f"{name:15s} RMSE = {m['rmse']:.4f}")

    # Choose best model by RMSE
    best_name = min(results.keys(), key=lambda k: results[k]["rmse"])
    best_model = {
        "LinearRegression": lr_pipe,
        "Ridge": ridge_best,
        "Lasso": lasso_best,
        "RandomForest": rf_best,
    }[best_name]

    print(f"\nSelected best model: {best_name}")

    # Save best model
    joblib.dump(best_model, MODEL_DIR / "best_advertising_model.joblib")
    print(f"Saved best model to {MODEL_DIR / 'best_advertising_model.joblib'}")

    # Save simple feature importance (for RF) or coefficients (for linear models)
    if best_name == "RandomForest":
        # Extract feature importances
        rf = best_model.named_steps["rf"]
        importances = rf.feature_importances_
        feats = ["TV", "Radio", "Newspaper"]
        fi_df = pd.DataFrame({"feature": feats, "importance": importances}).sort_values(by="importance", ascending=False)
        fi_df.to_csv(OUTPUT_DIR / "feature_importances.csv", index=False)
        print("Saved feature importances to outputs/feature_importances.csv")
    else:
        # Linear model coefficients
        coef = best_model.named_steps[list(best_model.named_steps.keys())[-1]].coef_
        feats = ["TV", "Radio", "Newspaper"]
        coef_df = pd.DataFrame({"feature": feats, "coefficient": coef}).sort_values(by="coefficient", ascending=False)
        coef_df.to_csv(OUTPUT_DIR / "coefficients.csv", index=False)
        print("Saved model coefficients to outputs/coefficients.csv")

    # ------------------
    # Demo predictions
    # ------------------
    def predict_single(tv, radio, newspaper):
        arr = np.array([[tv, radio, newspaper]])
        pred = best_model.predict(arr)[0]
        return float(pred)

    demo = [
        (230.1, 37.8, 69.2),
        (44.5, 39.3, 45.1),
        (30.0, 10.0, 5.0),
    ]
    print("\nExample predictions (TV, Radio, Newspaper) -> Sales")
    for t, r, n in demo:
        print(f"{t, r, n} -> {predict_single(t, r, n):.3f}")

    print("\nAll outputs saved in the 'outputs/' folder and model in 'models/'.")


if __name__ == "__main__":
    main()


Dataset shape: (200, 5)
   Unnamed: 0     TV  Radio  Newspaper  Sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
3           4  151.5   41.3       58.5   18.5
4           5  180.8   10.8       58.4   12.9
       Unnamed: 0          TV       Radio   Newspaper       Sales
count  200.000000  200.000000  200.000000  200.000000  200.000000
mean   100.500000  147.042500   23.264000   30.554000   14.022500
std     57.879185   85.854236   14.846809   21.778621    5.217457
min      1.000000    0.700000    0.000000    0.300000    1.600000
25%     50.750000   74.375000    9.975000   12.750000   10.375000
50%    100.500000  149.750000   22.900000   25.750000   12.900000
75%    150.250000  218.825000   36.525000   45.100000   17.400000
max    200.000000  296.400000   49.600000  114.000000   27.000000
Missing values:
 Unnamed: 0    0
TV            0
Radio         0
Newspaper     0
Sales         0
dtype: in