## ML Model

In [0]:
import os
import joblib
import boto3
import datetime
import warnings
import numpy as np
import pandas as pd
import json
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
# All tunables for this notebook live here; nothing below should need editing.

# -- AWS --
# Secret scope and key names for the AWS credentials, plus the region used for
# the boto3 session and the AWS_DEFAULT_REGION environment variable.
SECRET_SCOPE = "aws_scope"
AWS_KEY_SECRET = "aws-access-key-id"
AWS_SECRET_SECRET = "aws-secret-access-key"
AWS_REGION = "ap-south-1"

# -- S3 destination --
# S3_PREFIX ends with "/" because object keys are built by plain concatenation.
S3_BUCKET = "smart-enterprise-modernization-data"
S3_PREFIX = "ML_Model_Output_2/"

# -- MLflow --
MLFLOW_EXPERIMENT = "/Users/bharatshruti02@gmail.com/smart-enterprise-random-forest-model"
# NOTE(review): not referenced anywhere in the visible code - presumably meant
# for a model-registry step; confirm before removing.
REGISTERED_MODEL_NAME = "smart_enterprise_random_forest_model"

# -- Training data / reproducibility --
GOLD_TABLE = "enterprise_modernization.gold.gold_car_sales_analytics"  # read via spark.table
RANDOM_STATE = 42  # shared seed for the train/test split and the random forest

# -- Local scratch space --
# Directory for the pickled model (and any other temporary artifacts).
LOCAL_TMP_DIR = "/tmp"
os.makedirs(LOCAL_TMP_DIR, exist_ok=True)

# ---------- FETCH AWS CREDS FROM SECRET SCOPE ----------
# Look the credentials up through the CONFIG constants so the scope and key
# names are defined in exactly one place (they were previously duplicated here
# as string literals, leaving the constants unused).
# NOTE(review): `dbutils` is not defined in this file - presumably injected by
# the Databricks notebook runtime; this cell will not run outside it.
access_key = dbutils.secrets.get(scope=SECRET_SCOPE, key=AWS_KEY_SECRET)
secret_key = dbutils.secrets.get(scope=SECRET_SCOPE, key=AWS_SECRET_SECRET)

# Export to env vars so MLflow & boto3 both pick them up
os.environ["AWS_ACCESS_KEY_ID"] = access_key
os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key
os.environ["AWS_DEFAULT_REGION"] = AWS_REGION

# ---------- INIT BOTO3 SESSION ----------
# Explicit session built from the credentials fetched above; `session` and
# `s3` are reused later for the model and metrics uploads.
session = boto3.Session(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=AWS_REGION,
)
s3 = session.resource("s3")

# Test connection.
# boto3 collections are lazy: no request is sent until the collection is
# iterated. Materialise the one-object listing *before* reporting success so
# that bad credentials or a missing bucket raise here instead of after a
# misleading "Connected" message.
sample_objects = list(s3.Bucket(S3_BUCKET).objects.limit(1))
print("✅ Connected to S3 bucket:", S3_BUCKET)
for obj in sample_objects:
    print("Sample object:", obj.key)

# ---------- LOAD GOLD TABLE ----------
# Pull the whole gold table onto the driver as a pandas DataFrame; everything
# downstream (scikit-learn) works on in-memory pandas data.
# NOTE(review): `spark` is not defined in this file - presumably the session
# provided by the notebook runtime.
print("Loading gold table:", GOLD_TABLE)
gold_df = (
    spark
    .table(GOLD_TABLE)
    .toPandas()
)
print("Loaded rows:", gold_df.shape[0])

# ---------- FEATURE SELECTION ----------
# Column the model learns to predict.
target_col = "crm_Price"

# Columns routed to the numeric branch of the preprocessor.
num_features = [
    "crm_Engine_Size",
    "crm_Vehicle_Age",
    "crm_Mileage",
    "sap_Net_Sale",
    "fleet_Maintenance_Cost",
    "fleet_Fuel_Consumption",
    "fleet_Accidents_Count",
]

# Columns routed to the categorical (one-hot) branch of the preprocessor.
cat_features = [
    "crm_Manufacturer",
    "crm_Model",
    "crm_Fuel_type",
    "sap_Region",
    "sap_Payment_Mode",
    "fleet_Fleet_Type",
]

# Fail fast, naming every absent column, rather than hitting a KeyError later.
missing_cols = [
    col
    for col in (*num_features, *cat_features, target_col)
    if col not in gold_df.columns
]
if missing_cols:
    raise ValueError(f"Missing columns in gold table: {missing_cols}")

# Rows without a target value cannot be used for training or evaluation.
has_target = gold_df[target_col].notna()
gold_df = gold_df.loc[has_target].copy()

# Feature matrix (numeric columns first, then categorical) and target vector.
X = gold_df[[*num_features, *cat_features]]
y = gold_df[target_col]

# ---------- TRAIN / TEST SPLIT ----------
# Hold out 20% of the rows for the final evaluation; seeding with RANDOM_STATE
# makes the split repeatable across runs on the same data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=0.2)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ---------- PREPROCESSING PIPELINE ----------
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Compatible across sklearn versions: the dense-output flag is spelled
# `sparse_output` in newer releases and `sparse` in older ones, and an
# unrecognised keyword raises TypeError at construction time.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical branch: missing values become the literal category "missing",
# then one-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", onehot_encoder),
    ]
)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# ---------- MODEL + GRID ----------
# Preprocessing and the forest are wrapped in one pipeline so that every CV
# fold fits its own imputers/scaler/encoder on that fold's training part only.
rf = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("estimator", rf),
    ]
)

# The "estimator__" prefix addresses the forest step inside the pipeline.
# 2 x 2 x 2 = 8 candidate settings.
param_grid = {
    "estimator__n_estimators": [100, 200],
    "estimator__max_depth": [10, 20],
    "estimator__min_samples_split": [2, 5],
}

# 3-fold CV scored by R^2. The search itself runs one fit at a time (n_jobs=1)
# while each forest is allowed to use every core (n_jobs=-1 above).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    n_jobs=1,
    verbose=1,
)

# ---------- MLflow Experiment Setup ----------
client = MlflowClient()

# Artifact root passed to MLflow when the experiment has to be created.
# It is not applied to an experiment that already exists.
artifact_location = f"s3://{S3_BUCKET}/{S3_PREFIX}"

# Reuse the experiment when it is already there; otherwise create it with the
# S3 artifact location.
experiment = client.get_experiment_by_name(MLFLOW_EXPERIMENT)
if experiment is not None:
    exp_id = experiment.experiment_id
else:
    print(f"Creating new MLflow experiment: {MLFLOW_EXPERIMENT}")
    exp_id = mlflow.create_experiment(
        name=MLFLOW_EXPERIMENT,
        artifact_location=artifact_location,
    )

# Make it the active experiment for the run started below.
mlflow.set_experiment(MLFLOW_EXPERIMENT)


# ---------- TRAIN & LOG ----------
# Fits the grid search inside one MLflow run, evaluates the best pipeline on
# the held-out test set, logs metrics/params to MLflow, writes the model and a
# metrics JSON to LOCAL_TMP_DIR, and uploads both to S3 on a best-effort basis.

# The S3 keys are fixed up front so the closing summary can never hit an
# undefined name, whatever happens during the upload. Both keys are constant,
# so every run writes to the same two objects.
s3_key_latest = f"{S3_PREFIX}random_forest_model_latest.pkl"
metrics_s3_key = f"{S3_PREFIX}model_metrics.json"
s3_upload_ok = False  # flipped only after BOTH uploads have succeeded

with mlflow.start_run() as run:
    run_id = run.info.run_id
    print("MLflow run id:", run_id)

    # Train
    print("Starting training with GridSearchCV...")
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print("Best params:", grid.best_params_)

    # Evaluate on the held-out test set
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"R2: {r2:.4f} | MAE: {mae:.2f} | RMSE: {rmse:.2f}")

    # Log metrics and params
    mlflow.log_metrics({"r2": r2, "mae": mae, "rmse": rmse})
    for k, v in grid.best_params_.items():
        mlflow.log_param(k, v)
    mlflow.log_param("gold_table", GOLD_TABLE)

    # ---------- SAVE LOCAL MODEL ----------
    # Aware UTC timestamp; datetime.utcnow() is deprecated and the formatted
    # string is the same.
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    local_model_path = os.path.join(LOCAL_TMP_DIR, f"rf_model_{run_id}_{ts}.pkl")
    joblib.dump(best_model, local_model_path)
    print("✅ Model saved locally at:", local_model_path)

    # ---------- SAVE LOCAL METRICS ----------
    # Plain floats so json can serialise them (the metric values are numpy scalars).
    metrics = {"rmse": float(rmse), "mae": float(mae), "r2": float(r2)}
    metrics_local_path = os.path.join(LOCAL_TMP_DIR, f"model_metrics_{ts}.json")
    with open(metrics_local_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f)
    print("✅ Metrics saved locally at:", metrics_local_path)

    # ---------- UPLOAD TO S3 ----------
    # Best effort: an upload failure is reported but does not fail the run.
    # Only the S3 calls sit inside the try, so a local-disk or MLflow error is
    # no longer mislabelled as an S3 failure.
    try:
        s3.Bucket(S3_BUCKET).upload_file(local_model_path, s3_key_latest)
        print("✅ Uploaded model to S3:")
        print(f" - Latest: s3://{S3_BUCKET}/{s3_key_latest}")

        s3_client = session.client("s3")
        s3_client.upload_file(metrics_local_path, S3_BUCKET, metrics_s3_key)
        print("✅ Uploaded metrics JSON to S3:")
        print(f" - s3://{S3_BUCKET}/{metrics_s3_key}")
    except Exception as e:
        print("❌ Failed to upload to S3:", e)
    else:
        s3_upload_ok = True
        # Record the S3 locations on the run only when the objects really exist.
        mlflow.log_param("s3_model_latest_uri", f"s3://{S3_BUCKET}/{s3_key_latest}")
        mlflow.log_param("s3_metrics_uri", f"s3://{S3_BUCKET}/{metrics_s3_key}")

print("🎯 Training complete. Artifacts & metrics logged in MLflow.")
if s3_upload_ok:
    print(f"📦 S3 model path (latest): s3://{S3_BUCKET}/{s3_key_latest}")
    print(f"📊 Metrics JSON path: s3://{S3_BUCKET}/{metrics_s3_key}")
else:
    # Do not advertise S3 paths that this run did not (fully) write.
    print("❌ S3 upload did not complete; artifacts from this run are available locally only:")
    print(f" - Model: {local_model_path}")
    print(f" - Metrics: {metrics_local_path}")