<a href="https://colab.research.google.com/github/rajinikanthvadla-ai/AWS-Sagemaker-MLOPS-Lab-01/blob/main/sagemaker_mlops_lab_1ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q --upgrade pip setuptools wheel
!pip install -q "numpy==1.26.4" "pandas==2.1.4" "scikit-learn==1.3.2" "pyarrow==14.0.2" "sagemaker==2.224.0" "datasets==2.16.1"

In [None]:
# Cell 2: Imports and SageMaker Configuration
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from datasets import load_dataset

# SageMaker session, execution role and region used by every later cell
sess = sagemaker.Session()
role = get_execution_role()
region = sess.boto_region_name

# S3 location for this lab: everything is stored under s3://<bucket>/<prefix>/
prefix = "hyd-house-price"
bucket = "sagemaker-hyd-house-rajini-2026"

print(f"Environment ready in {region}")
print(f"Using Role: {role}")

In [None]:
# Cell 3: Load and Clean Dataset
# Fetch the dataset from Hugging Face and turn its "train" split into a DataFrame
ds = load_dataset("Saathwik56/houseprice")
df = ds["train"].to_pandas()

# Normalise column names: trim surrounding whitespace, lower-case,
# and replace inner spaces with underscores
df = df.rename(columns=lambda col: col.strip().lower().replace(" ", "_"))

print(f"Dataset loaded with shape: {df.shape}")
df.head()

In [None]:
# Cell 4: Data Cleaning and Splitting
import os
from sklearn.model_selection import train_test_split

# Identify target column: "price" when present, otherwise the last column
target = "price" if "price" in df.columns else df.columns[-1]

# Drop rows where target is missing
df = df.dropna(subset=[target])

# Convert text columns that actually hold numbers (like "1,200,000").
for c in df.select_dtypes(include="object").columns:
    # Strip thousands separators from string cells only, so non-string cells
    # in a mixed column keep their value instead of becoming NaN.
    without_commas = df[c].map(lambda v: v.replace(",", "") if isinstance(v, str) else v)
    as_number = pd.to_numeric(without_commas, errors="coerce")
    # All-or-nothing: replace the column only when every non-missing value
    # parsed as a number. Genuine text columns are left exactly as loaded
    # (commas included) rather than being stored in a half-cleaned state.
    if as_number.notna().sum() == df[c].notna().sum():
        df[c] = as_number

# Split the data (fixed seed so the split is reproducible)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create local directory and save files
os.makedirs("data", exist_ok=True)
train_df.to_csv("data/train.csv", index=False)
test_df.to_csv("data/test.csv", index=False)

print(f"Files saved locally: data/train.csv ({len(train_df)} rows), data/test.csv ({len(test_df)} rows)")


In [None]:
# Cell 5: Upload to S3
# Each split goes under its own key prefix inside the lab bucket;
# upload_data returns the S3 URI of what was uploaded.
train_key_prefix = f"{prefix}/train"
test_key_prefix = f"{prefix}/test"

train_path = sess.upload_data("data/train.csv", bucket=bucket, key_prefix=train_key_prefix)
test_path = sess.upload_data("data/test.csv", bucket=bucket, key_prefix=test_key_prefix)

for split_label, s3_uri in (("Train", train_path), ("Test", test_path)):
    print(f"{split_label} data uploaded to: {s3_uri}")

In [None]:
%%writefile train.py

import pandas as pd
import os
import joblib
import argparse
from sklearn.ensemble import RandomForestRegressor

if __name__ == "__main__":
    # 1. Handle SageMaker Paths
    # Using environment variables is the 'best practice' way for SageMaker
    train_dir = os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    # 2. Load Data
    train_path = os.path.join(train_dir, "train.csv")
    df = pd.read_csv(train_path)

    # 3. Identify Target and Features
    target = "price" if "price" in df.columns else df.columns[-1]

    # Random Forest only accepts numbers.
    # We drop columns that are still objects (text) for a quick fix.
    X = df.drop(columns=[target]).select_dtypes(include=['number'])
    y = df[target]

    print(f"Training with features: {list(X.columns)}")

    # 4. Train Model
    model = RandomForestRegressor(n_estimators=200, random_state=42)
    model.fit(X, y)

    # 5. Save Model
    # SageMaker automatically picks up anything in model_dir and saves it to S3
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
    print("Model saved successfully!")

In [None]:
%%writefile train.py

import pandas as pd
import os
import joblib
from sklearn.ensemble import RandomForestRegressor

# --- Training Logic ---
if __name__ == "__main__":
    train_dir = os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/train")
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    train_path = os.path.join(train_dir, "train.csv")
    df = pd.read_csv(train_path)

    target = "price" if "price" in df.columns else df.columns[-1]
    X = df.drop(columns=[target]).select_dtypes(include=['number'])
    y = df[target]

    model = RandomForestRegressor(n_estimators=200, random_state=42)
    model.fit(X, y)

    joblib.dump(model, os.path.join(model_dir, "model.joblib"))

# --- Inference Logic (REQUIRED for Deployment) ---
def model_fn(model_dir):
    """Load the model from the model_dir"""
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    return model

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Settings for the scikit-learn training job: train.py is the entry point and
# the job output is written under the lab prefix in the bucket.
estimator_settings = dict(
    entry_point="train.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version="1.2-1",
    py_version="py3",
    base_job_name="hyd-house-training",
    output_path=f"s3://{bucket}/{prefix}/output",
)
estimator = SKLearn(**estimator_settings)

# Launch the job. The "train" channel points at the S3 URI returned by the
# upload cell (train_path).
training_inputs = {"train": train_path}
estimator.fit(training_inputs)

In [None]:
import boto3

# Best-effort removal of a leftover endpoint before deploying.
# NOTE(review): this name differs from the endpoint deployed later in the
# notebook ("hyd-house-endpoint-rajini") - confirm which one is meant.
sm = boto3.client('sagemaker')
try:
    sm.delete_endpoint(EndpointName='hyd-house-endpoint')
    print("Deleted endpoint: hyd-house-endpoint")
except sm.exceptions.ClientError as err:
    # A missing endpoint is reported as a ValidationException; that is not an
    # error for a cleanup step, so a fresh "Run All" must not stop here.
    if err.response.get("Error", {}).get("Code") != "ValidationException":
        raise
    print(f"Nothing to delete: {err}")

In [None]:
# Cell 9: Deploy the model
# Hosting settings for the real-time endpoint created from the trained estimator
deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.t2.medium",  # picked as the lower-cost option for this lab
    "endpoint_name": "hyd-house-endpoint-rajini",
}
predictor = estimator.deploy(**deploy_settings)

print("\n--- Success! ---")
print(f"Endpoint Created: {predictor.endpoint_name}")

In [None]:
import numpy as np

# 1. Grab a sample row from the test data (excluding the target column).
# Mirror train.py exactly: drop the target first, then keep numeric columns.
# `target` is the column chosen in the cleaning cell, so this also works when
# the dataset has no "price" column or when the target itself is non-numeric.
sample_data = test_df.drop(columns=[target]).select_dtypes(include=['number']).iloc[0:1]

print("Sending features to endpoint:")
print(sample_data)

# 2. Get prediction
try:
    prediction = predictor.predict(sample_data.values)

    print("\n--- Prediction Result ---")
    print(f"Predicted House Price: {prediction[0]:,.2f}")
    print(f"Actual House Price: {test_df[target].iloc[0]:,.2f}")

except Exception as e:
    print(f"Error during prediction: {e}")

In [None]:
import boto3

# 1. Initialize the SageMaker client
sm_client = boto3.client("sagemaker")

# 2. Define the name you used (the endpoint configuration is deleted under the
#    same name as the endpoint)
endpoint_name = "hyd-house-endpoint-rajini"

# 3. Delete each resource in its own try block: if the endpoint is already
#    gone, the endpoint configuration must still be removed instead of being
#    skipped and left behind.
try:
    sm_client.delete_endpoint(EndpointName=endpoint_name)
    print(f"Successfully deleted endpoint: {endpoint_name}")
except Exception as e:
    print(f"Error: {e}")

try:
    sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
    print(f"Successfully deleted endpoint configuration: {endpoint_name}")
except Exception as e:
    print(f"Error: {e}")

In [None]:
# 1. Delete the Model from SageMaker dashboard
try:
    # The model created by estimator.deploy() gets a name generated at deploy
    # time, so it does not equal the training job name. Find it instead by
    # listing models that carry the estimator's base job name and keeping the
    # ones whose artifact is the output of this training job.
    artifact_uri = estimator.model_data
    matching_models = []
    model_pages = sm_client.get_paginator("list_models").paginate(
        NameContains=estimator.base_job_name
    )
    for page in model_pages:
        for summary in page["Models"]:
            details = sm_client.describe_model(ModelName=summary["ModelName"])
            container = details.get("PrimaryContainer") or {}
            if container.get("ModelDataUrl") == artifact_uri:
                matching_models.append(summary["ModelName"])

    # Delete after the listing is complete so pagination is not disturbed.
    for model_name in matching_models:
        sm_client.delete_model(ModelName=model_name)
        print(f"Successfully deleted model: {model_name}")

    if not matching_models:
        print("Model cleanup skipped or failed: no model found for this training job")
except Exception as e:
    print(f"Model cleanup skipped or failed: {e}")

# 2. Remove local data folder
import shutil
if os.path.exists("data"):
    shutil.rmtree("data")
    print("Local 'data' folder removed.")

# 3. Remove the python script
if os.path.exists("train.py"):
    os.remove("train.py")
    print("train.py removed.")