#### What We Will Learn

1. S3 Buckets - Boto3
2. IAM Roles and Users
3. Complete Infrastructure of AWS SageMaker - Training, Endpoints

In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Low-level SageMaker API client; used later to describe training jobs and
# delete the endpoint.
sm_boto3 = boto3.client("sagemaker")
# High-level SageMaker session; used below to upload the CSVs to S3.
sess = sagemaker.Session()
region = sess.boto_session.region_name
# S3 bucket that receives the training and test data.
bucket = "jailer8"
# The original concatenation printed "Using bucketjailer8" (missing space).
print(f"Using bucket {bucket}")


In [None]:
# Show the AWS region taken from the SageMaker session's boto session.
print(region)

In [None]:
# Load the training CSV from the working directory and preview the first rows.
df=pd.read_csv("mob_price_classification_train.csv")
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Number of rows for each price_range value.
df['price_range'].value_counts()

In [None]:
# Number of distinct price_range values.
df['price_range'].nunique()

In [None]:
# The distinct price_range values themselves.
df['price_range'].unique()

In [None]:
# Start with every column name; the last one is popped off as the label in the next cell.
features=list(df.columns)
features

In [None]:
# Remove the last column name from `features` and use it as the label.
# NOTE: pop() mutates `features`, so re-running this cell removes another column.
label = features.pop(-1)
label

In [None]:
# Column names left in `features` after the label was popped.
features

In [157]:
# Separate the feature columns (x) from the label column (y).
x=df[features]
y=df[label]

In [158]:
# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Print the shape of each split, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [160]:
# Build the frames that get written to CSV: features plus the label as the
# last column. An explicit copy is taken because pd.DataFrame(existing_frame)
# does not guarantee an independent object, so adding the label column could
# otherwise leak into X_train / X_test.
trainX = X_train.copy()
trainX[label] = y_train

testX = X_test.copy()
testX[label] = y_test

In [None]:
# Inspect the training frame with the label column attached.
trainX

In [None]:
# Inspect X_train for comparison with trainX.
X_train

In [163]:
# Write both splits to local CSV files without the index column.
for frame, filename in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(filename, index=False)

In [None]:
# Show the S3 bucket name used for the uploads below.
bucket

In [None]:
# Stage both CSVs in S3; SageMaker takes its training data from there.
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
upload_target = {"bucket": bucket, "key_prefix": sk_prefix}

trainpath = sess.upload_data(path="train-V-1.csv", **upload_target)
testpath = sess.upload_data(path="test-V-1.csv", **upload_target)

# upload_data returns the S3 location of each uploaded file.
for s3_uri in (trainpath, testpath):
    print(s3_uri)

In [None]:
import boto3

# Confirm that boto3 can resolve AWS credentials WITHOUT echoing secrets:
# notebook output is saved with the file and is easily shared or committed.
session = boto3.Session()
creds = session.get_credentials()
if creds is None:
    print("No AWS credentials found")
else:
    # Only the last four characters of the access key ID are shown.
    print("Access key ID ends with:", creds.access_key[-4:])
    print("Secret key loaded:", bool(creds.secret_key))
    # A session token is only present when using temporary credentials.
    print("Using temporary credentials:", creds.token is not None)


#### Script Used by AWS SageMaker to Train Models

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
import sklearn
import joblib
import argparse
import os
import numpy as np
import pandas as pd

def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` in *model_dir*."""
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

def parse_args():
    """Parse hyperparameters and input/output locations from the command line.

    The model directory and the train/test data directories default to the
    SM_MODEL_DIR, SM_CHANNEL_TRAIN and SM_CHANNEL_TEST environment variables.
    Unrecognised arguments are ignored.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters forwarded to RandomForestClassifier.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=None)
    parser.add_argument("--random_state", type=int, default=0)

    # Output directory, input directories and input file names.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    return args


def feature_label_names(frame):
    """Return ``(feature_columns, label_column)``; the label is the last column."""
    columns = list(frame.columns)
    label = columns.pop(-1)
    return columns, label


def main():
    """Cross-validate, train, save and evaluate a random forest classifier."""
    args = parse_args()

    print("SKLearn Version:", sklearn.__version__)
    print("Joblib Version:", joblib.__version__)

    print("[INFO] Reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # Column roles come from the training file and are applied to both splits.
    features, label = feature_label_names(train_df)

    X_train = train_df[features]
    y_train = train_df[label]
    X_test = test_df[features]
    y_test = test_df[label]

    print("Training RandomForest Model with Cross-Validation...")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        random_state=args.random_state,
        n_jobs=-1
    )

    # Perform 5-fold cross-validation on the training split only.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    mean_cv_accuracy = np.mean(cv_scores)

    print("Cross-validation scores:", cv_scores)
    # This log line is what a SageMaker metric-definition regex can match.
    print("Mean CV accuracy:", mean_cv_accuracy)

    # Train the final model on the full training set.
    model.fit(X_train, y_train)

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at", model_path)

    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print("\n---- METRICS RESULTS FOR TESTING DATA ----")
    print("Total Rows:", X_test.shape[0])
    print('[TESTING] Model Accuracy:', test_acc)
    print('[TESTING] Testing Report:')
    print(test_rep)

    # Keep the mean CV score alongside the model file for later inspection.
    # NOTE: SageMaker collects training metrics from the job's log output via
    # metric-definition regexes; it does not read this file.
    with open(os.path.join(args.model_dir, 'cv_accuracy.txt'), 'w') as f:
        f.write(str(mean_cv_accuracy))


if __name__ == "__main__":
    main()





### AWS SageMaker Entry Point to Execute the Training Script

In [204]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import IntegerParameter, HyperparameterTuner

# Version of the SageMaker scikit-learn container used for training and serving.
FRAMEWORK_VERSION = "0.23-1"

# Estimator that runs script.py in the scikit-learn container.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::054037137868:role/sagemakeraccess",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn-cv",
    hyperparameters={
        "random_state": 0
    },
    use_spot_instances=True,
    max_run=3600,
    # Managed spot training needs a total wait budget (>= max_run) that also
    # covers time spent waiting for spot capacity.
    max_wait=7200,
)

# Integer search space explored by the tuner.
hyperparameter_ranges = {
    'n_estimators': IntegerParameter(50, 150),
    'max_depth': IntegerParameter(3, 15)
}

objective_metric_name = 'cv:accuracy'
objective_type = 'Maximize'

# For a custom training script the objective metric has to be extracted from
# the job logs; this regex matches the "Mean CV accuracy: <value>" line that
# script.py prints.
metric_definitions = [
    {"Name": objective_metric_name, "Regex": r"Mean CV accuracy: ([0-9\.]+)"}
]

tuner = HyperparameterTuner(
    sklearn_estimator,
    objective_metric_name,
    hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=20,
    max_parallel_jobs=3,
    objective_type=objective_type,
)



In [None]:

# Launch the hyperparameter tuning job and block until it finishes.
tuner.fit({"train": trainpath, "test": testpath}, wait=True)

# After tuning completes, look up the winning training job by name ...
best_training_job = tuner.best_training_job()
# ... and get an estimator attached to it. HyperparameterTuner exposes this as
# best_estimator(); it has no attach_to_best_training_job method.
best_model = tuner.best_estimator()

print(f"Best training job: {best_training_job}")
print(f"Best hyperparameters: {best_model.hyperparameters()}")

### To get the model from S3

In [None]:
# Only the tuner was fitted, so sklearn_estimator.latest_training_job is never
# set. Wait for the tuning job, then read the artifact location of its best
# training job instead.
tuner.wait()
artifact = sm_boto3.describe_training_job(
    TrainingJobName=tuner.best_training_job()
)["ModelArtifacts"]["S3ModelArtifacts"]

In [None]:
# S3 location of the trained model artifact.
artifact

### Deploy the Model to an Endpoint

In [110]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime,strftime


# A UTC timestamp suffix gives each run its own model name.
timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"Custom-sklearn-model-{timestamp}"

# Wrap the trained artifact so it can be deployed; script.py supplies model_fn.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::054037137868:role/sagemakeraccess",
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
# Display the SKLearnModel object.
model

In [None]:
## Endpoint deployment
# The endpoint name carries its own UTC timestamp so repeated runs do not clash.
endpoint_suffix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = f"Custom-sklearn-model-{endpoint_suffix}"
print(f"EndpointName={endpoint_name}")

# Deploy the model to a single ml.m4.xlarge instance.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

In [None]:
# Display the predictor object returned by model.deploy().
predictor

In [None]:
# Preview the feature columns of the first two test rows.
testX[features][0:2]

In [None]:
# Send the first two test rows (as nested Python lists) to the endpoint and print the response.
print(predictor.predict(testX[features][:2].values.tolist()))

In [None]:
# Tear down the endpoint deployed above once it is no longer needed.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)