In [1]:
#!pip uninstall azure-ai-ml azure-core msrest -y
#!pip install azure-ai-ml==1.28.1 azure-identity --upgrade


In [2]:
import json
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.identity import AzureCliCredential
from azure.ai.ml import MLClient

credential = AzureCliCredential()  # uses az login session

# ---------------------------------------------------------
# Load workspace credentials from config.json
# ---------------------------------------------------------
with open("config.json") as f:
    config = json.load(f)

subscription_id = config["subscription_id"]
resource_group = config["resource_group"]
workspace_name = config["workspace_name"]

# ---------------------------------------------------------
# Connect to ML Workspace
# ---------------------------------------------------------
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)


In [None]:
# ===============================
# Dataset Preparation
# ===============================
# Load dataset from local or Azure datastore
import pandas as pd
data_path = './walmart_sales_dataset_kaggle/train.csv'  # e.g., Favorita or Walmart dataset
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [5]:

# ===============================
# Feature Engineering

# ===============================
df.columns = df.columns.str.strip().str.lower()
# Now columns are: 'store', 'dept', 'date', 'weekly_sales', 'isholiday'

# ===============================
# Convert 'date' to datetime safely
# ===============================
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])  # drop rows without valid dates

# ===============================
# Basic feature engineering
# ===============================
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)

# ===============================
# Rolling features (7-day and 30-day)
# ===============================
df = df.sort_values(['store', 'dept', 'date'])
df['rolling_7'] = df.groupby(['store', 'dept'])['weekly_sales'].transform(lambda x: x.rolling(7, min_periods=1).mean())
df['rolling_30'] = df.groupby(['store', 'dept'])['weekly_sales'].transform(lambda x: x.rolling(30, min_periods=1).mean())

# ===============================
# Handle remaining NaNs
# ===============================
df.fillna(0, inplace=True)
df.to_csv("./walmart_data/walmart.csv", index=False)

df.head()

Unnamed: 0,store,dept,date,weekly_sales,isholiday,day_of_week,month,year,week_of_year,rolling_7,rolling_30
0,1,1,2010-02-05,24924.5,False,4,2,2010,5,24924.5,24924.5
1,1,1,2010-02-12,46039.49,True,4,2,2010,6,35481.995,35481.995
2,1,1,2010-02-19,41595.55,False,4,2,2010,7,37519.846667,37519.846667
3,1,1,2010-02-26,19403.54,False,4,2,2010,8,32990.77,32990.77
4,1,1,2010-03-05,21827.9,False,4,3,2010,9,30758.196,30758.196


In [None]:
# ---------------------------------------------------------
# Upload Walmart Dataset 
# ---------------------------------------------------------

from azure.ai.ml.entities import Data
# Register dataset
walmart_data = ml_client.data.create_or_update(
    Data(
        
        name="walmart-dataset",
        description="Feature engineered Walmart Kaggle dataset",
        path="./walmart_data",   # 👈 folder with MLTable
        type="mltable",
    )
)

print("Registered dataset:", walmart_data.name, walmart_data.version)

Registered dataset: walmart-dataset 3


In [None]:
# ---------------------------------------------------------
# Configure AutoML Regression Job for Free Tier
# ---------------------------------------------------------
from azure.ai.ml import MLClient
from azure.ai.ml import Input
from azure.ai.ml.automl import regression
from azure.ai.ml import automl
# Use dataset by name and version
training_data_input = Input(
    type="mltable",
    path="azureml:walmart-dataset:3"  # Registered MLTable dataset
)

regression_job = regression(
    compute="cpu-cluster",
    experiment_name="walmart-automl-regression",
    training_data=training_data_input,  # ✅ pass Input object, not Data object
    target_column_name="weekly_sales",
    primary_metric="r2_score",
    n_cross_validations=3,
    outputs={"best_model": {"type": "mlflow_model"}}  # ✅ register best model
)

regression_job.set_limits(
    timeout_minutes=30,
    max_trials=4,
    max_concurrent_trials=1
)


# Submit job
returned_job = ml_client.jobs.create_or_update(regression_job)
ml_client.jobs.stream(returned_job.name)



In [17]:
# Get the AutoML parent job
automl_job = ml_client.jobs.get(returned_job.name)

# List child jobs (trials)
children = list(ml_client.jobs.list(parent_job_name=automl_job.name))

# Find best trial
best_child = max(children, key=lambda x: float(x.properties.get("score", "-inf")))
print("Best child run:", best_child.name, "Score:", best_child.properties["score"])


Best child run: brave_clock_vn26600n18_2 Score: 0.9805759765144004


In [None]:
# ---------------------------------------------------------
# 5. Retrieve Best Model
# ---------------------------------------------------------
#best_child_run = ml_client.jobs.get(returned_job.name).properties["best_child_run_id"]
#print("Best run ID:", best_child_run)

best_model = ml_client.models.get(name="best",version=1)
print(best_model)

In [25]:
# Download best model locally
ml_client.models.download(
    name="best",
    version=1,
    download_path="./local_model"
)


Downloading the model ExperimentRun/dcid.brave_clock_vn26600n18_2/outputs/mlflow-model at ./local_model\best\mlflow-model



In [31]:
import mlflow


# Load MLflow model
model = mlflow.pyfunc.load_model("./local_model/best/mlflow-model")

# Example: use a row from your engineered walmart.csv
sample = pd.read_csv("./walmart_data/walmart.csv").head(10).drop(columns=["weekly_sales"])
# Cast columns to match MLTable schema
sample ['store'] = sample ['store'].astype(str)      # AutoML expects string
sample ['dept'] = sample ['dept'].astype(int)        # AutoML expects long (int64)
sample['isholiday'] = sample['isholiday'].astype(bool)

# Dates already handled as datetime
sample['date'] = pd.to_datetime(sample['date'], errors='coerce')

# Save clean dataset
sample.to_csv("walmart.csv", index=False)

# Predict
prediction = model.predict(sample)
print("Predicted Weekly Sales:", prediction)


 - asttokens (current: uninstalled, required: asttokens==3.0.0)
 - attrs (current: 21.4.0, required: attrs==25.3.0)
 - azure-core (current: 1.35.0, required: azure-core==1.33.0)
 - azure-identity (current: 1.24.0, required: azure-identity==1.21.0)
 - azure-mgmt-containerregistry (current: 13.0.0, required: azure-mgmt-containerregistry==10.3.0)
 - azure-mgmt-core (current: 1.6.0, required: azure-mgmt-core==1.5.0)
 - azure-mgmt-keyvault (current: 11.0.0, required: azure-mgmt-keyvault==10.3.1)
 - azure-mgmt-network (current: 29.0.0, required: azure-mgmt-network==28.1.0)
 - azure-mgmt-resource (current: 24.0.0, required: azure-mgmt-resource==23.3.0)
 - azure-mgmt-storage (current: 23.0.0, required: azure-mgmt-storage==22.1.1)
 - azure-storage-blob (current: 12.26.0, required: azure-storage-blob==12.19.0)
 - azure-storage-queue (current: 12.13.0, required: azure-storage-queue==12.12.0)
 - azureml-core (current: 1.60.0.post1, required: azureml-core==1.60.0)
 - azureml-inference-server-http (

Predicted Weekly Sales: [24294.40945699 40575.95483054 37210.49598947 27877.87220515
 27499.04705998 26364.52126504 25515.23061713 28077.24454628
 55138.10967538 36689.71842458]


In [33]:
import mlflow

# Point to your local model folder or Azure ML artifact URI
model_uri = "./local_model/best/mlflow-model"

# Get environment spec
env = mlflow.pyfunc.get_model_dependencies(model_uri)

# Save environment to file
with open("conda.yaml", "w") as f:
    f.write(env)


2025/08/22 23:33:52 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r C:\Users\jnare\OneDrive\Documents\sowmya\github_projects\local_model\best\mlflow-model\requirements.txt'.


In [35]:
%pip install -r C:\Users\jnare\OneDrive\Documents\sowmya\github_projects\local_model\best\mlflow-model\requirements.txt

Collecting asttokens==3.0.0
  Downloading asttokens-3.0.0-py3-none-any.whl (26 kB)
Collecting attrs==25.3.0
  Downloading attrs-25.3.0-py3-none-any.whl (63 kB)
     ---------------------------------------- 63.8/63.8 kB 3.6 MB/s eta 0:00:00
Collecting azure-core==1.33.0
  Downloading azure_core-1.33.0-py3-none-any.whl (207 kB)
     -------------------------------------- 207.1/207.1 kB 6.3 MB/s eta 0:00:00
Collecting azure-identity==1.21.0
  Downloading azure_identity-1.21.0-py3-none-any.whl (189 kB)
     -------------------------------------- 189.2/189.2 kB 3.8 MB/s eta 0:00:00
Collecting azure-mgmt-containerregistry==10.3.0
  Downloading azure_mgmt_containerregistry-10.3.0-py3-none-any.whl (2.3 MB)
     ---------------------------------------- 2.3/2.3 MB 10.5 MB/s eta 0:00:00
Collecting azure-mgmt-core==1.5.0
  Using cached azure_mgmt_core-1.5.0-py3-none-any.whl (30 kB)
Collecting azure-mgmt-keyvault==10.3.1
  Using cached azure_mgmt_keyvault-10.3.1-py3-none-any.whl (901 kB)
Collecting

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\jnare\\anaconda3\\Lib\\site-packages\\~mq\\backend\\cython\\_zmq.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [2]:
import mlflow
import pandas as pd

# Load MLflow model
model = mlflow.pyfunc.load_model("./local_model/best/mlflow-model")

# Example: use a row from your engineered walmart.csv
sample = pd.read_csv("./walmart_data/walmart.csv").head(10).drop(columns=["weekly_sales"])
# Cast columns to match MLTable schema
sample ['store'] = sample ['store'].astype(str)      # AutoML expects string
sample ['dept'] = sample ['dept'].astype(int)        # AutoML expects long (int64)
sample['isholiday'] = sample['isholiday'].astype(bool)

# Dates already handled as datetime
sample['date'] = pd.to_datetime(sample['date'], errors='coerce')

# Save clean dataset
sample.to_csv("walmart.csv", index=False)

# Predict
prediction = model.predict(sample)
print("Predicted Weekly Sales:", prediction)



Predicted Weekly Sales: [24294.40945699 40575.95483054 37210.49598947 27877.87220515
 27499.04705998 26364.52126504 25515.23061713 28077.24454628
 55138.10967538 36689.71842458]
