## ML Model

In [0]:
import mlflow
import boto3
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the AWS key pair from the Delta table it was uploaded to.
# NOTE(review): long-lived AWS keys in a queryable table are risky; a Databricks
# secret scope (dbutils.secrets, already used by the serving cell in this
# notebook) would be a safer home -- confirm and migrate.
creds_df = spark.table("delta_lake.default.shruti_hackathon_access_keys")

# Debugging aid: show column names/types only. Calling creds_df.show() here
# would render the access key ID and secret in plain text in the cell output,
# which is then saved and shared with the notebook.
creds_df.printSchema()

# Extract credentials as local Python variables
creds_row = creds_df.first()
if creds_row is None:
    # first() returns None for an empty table; fail with a clear message
    # instead of a TypeError on the subscripts below.
    raise ValueError(
        "No credentials found in delta_lake.default.shruti_hackathon_access_keys"
    )
access_key = creds_row['Access key ID']
secret_key = creds_row['Secret access key']

# Create a session with the extracted credentials
session = boto3.Session(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

s3 = session.resource('s3')
bucket = s3.Bucket('smart-enterprise-modernization-data')

# Connectivity smoke test: print only the first few keys. Iterating
# bucket.objects.all() would page through (and print) every object in the bucket.
for obj in bucket.objects.limit(20):
    print(obj.key)


# Build the modelling frame: select the feature and target columns from the
# gold table, drop incomplete rows in Spark, then convert to pandas.
gold_df = spark.table("enterprise_modernization.gold.customer_vehicle_fleet")
features = ["price", "engine_size", "mileage", "fault_count", "avg_odometer"]
target = "sales"
gold_pd = (
    gold_df
    .select(*features, target)
    .dropna()
    .toPandas()
)
# Cast every feature column to float; the target column is left as-is.
gold_pd = gold_pd.astype({column: float for column in features})
X, y = gold_pd[features], gold_pd[target]

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest (fixed seed) on the training split, then
# score it on the held-out split.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both hold-out metrics rounded to two decimals.
print(f"Test MSE: {mse:.2f}")
print(f"Test R^2 Score: {r2:.2f}")

# Select the MLflow experiment, creating it on first use.
# The S3 artifact location is only passed when the experiment is created here;
# an experiment that already exists is reused as-is.
experiment_name = "/Users/bharatshruti02@gmail.com/vehicle_sales_prediction"
artifact_location = "s3://smart-enterprise-modernization-data/ML_Model_Output/vehicle_sales_prediction_model"
client = MlflowClient()
experiment = client.get_experiment_by_name(experiment_name)
experiment_id = (
    experiment.experiment_id
    if experiment is not None
    else mlflow.create_experiment(
        name=experiment_name,
        artifact_location=artifact_location,
    )
)
# Make it the active experiment for the runs started below.
mlflow.set_experiment(experiment_name)

# Record the fitted model and its hold-out metrics in one MLflow run, then
# register that run's model artifact in the Model Registry.
with mlflow.start_run() as run:
    # The first training row, as a {column: value} dict, is the input example.
    input_example = X_train.head(1).to_dict(orient="records")[0]
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="random_forest_model",
        input_example=input_example,
    )
    for metric_name, metric_value in (("mse", mse), ("r2_score", r2)):
        mlflow.log_metric(metric_name, metric_value)

    # Registering under an existing name adds a new version of that model.
    mlflow.register_model(
        model_uri=f"runs:/{run.info.run_id}/random_forest_model",
        name="vehicle_sales_prediction_model",
    )


In [0]:
import mlflow.pyfunc
import pandas as pd

# Local import so this cell does not depend on the training cell's imports.
from mlflow.tracking import MlflowClient

model_name = "vehicle_sales_prediction_model"

# Resolve the newest registered version instead of pinning version 1: the
# training cell registers a new version on every run, so a hard-coded 1 would
# keep scoring with the very first model ever registered.
registered_versions = MlflowClient().search_model_versions(f"name='{model_name}'")
if not registered_versions:
    raise ValueError(f"No registered versions found for model '{model_name}'")
# ModelVersion.version is a string; compare numerically so "10" beats "9".
model_version = max(int(mv.version) for mv in registered_versions)

model_uri = f"models:/{model_name}/{model_version}"
# NOTE: from here on `model` is the generic pyfunc wrapper loaded from the
# registry, not the scikit-learn estimator fitted in the training cell.
model = mlflow.pyfunc.load_model(model_uri)

# Load new data
new_data_df = spark.read.table("enterprise_modernization.gold.customer_vehicle_fleet")

selected_features = ["price", "engine_size", "mileage", "fault_count", "avg_odometer"]

# Drop incomplete rows in Spark (as the training cell does) so they are never
# collected to the driver and the pandas index stays contiguous.
input_df = (
    new_data_df
    .select(selected_features)
    .dropna()
    .toPandas()
    .astype(float)
)

# Predict
predictions = model.predict(input_df)

# Reuse the input index so each prediction lines up with the row it came from.
predictions_df = pd.DataFrame(
    predictions,
    columns=["predicted_sales"],
    index=input_df.index,
)

display(predictions_df)

In [0]:
import requests
import json


# Score the same feature rows through the deployed serving endpoint over REST.
# The workspace URL and access token come from a secret scope, so neither
# appears in the notebook source.
instance_url = dbutils.secrets.get("my_scope", "databricks_instance_url").rstrip("/")
pat_token = dbutils.secrets.get("my_scope", "databricks_pat_token")
endpoint_name = "vehicle_sales_prediction_endpoint"

endpoint_url = f"{instance_url}/serving-endpoints/{endpoint_name}/invocations"
headers = {
    "Authorization": f"Bearer {pat_token}",
    "Content-Type": "application/json"
}

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely and a stalled endpoint would hang the notebook. The read timeout
# is generous in case the endpoint needs time to start up -- tune as needed.
request_timeout = (10, 300)

# Use `selected_features`, which is defined next to `new_data_df`, so this cell
# only depends on the batch-scoring cell. Incomplete rows are dropped in Spark
# before collecting to pandas.
input_df = (
    new_data_df
    .select(selected_features)
    .dropna()
    .toPandas()
    .astype(float)
)

if not input_df.empty:
    # One JSON object per row, keyed by column name.
    payload = {"dataframe_records": input_df.to_dict(orient="records")}
    response = requests.post(
        endpoint_url,
        headers=headers,
        data=json.dumps(payload),
        timeout=request_timeout,
    )
    # Surface HTTP errors (bad token, missing endpoint, server error) as an
    # exception rather than displaying the error body as if it were predictions.
    response.raise_for_status()
    predictions = response.json()
    display(predictions)
else:
    print("Input data is empty. No predictions to display.")


