In [None]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

In [None]:
# S3 bucket used for the uploads performed with the session below.
bucket = "sagemakerprojectaianit"

# Low-level SageMaker client and the SDK session; the region name is read
# from the boto session wrapped by the SDK session.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

In [None]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv("diabetes_data.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of columns in the loaded frame.
df.shape[1]

In [None]:
# Percentage of missing values per column.
df.isna().mean().mul(100)

In [None]:
# The last column is treated as the target; every other column is a feature.
features = df.columns.tolist()
labels = features.pop()

x = df.loc[:, features]
y = df.loc[:, labels]

In [None]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Shapes of the four split pieces, one per line.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")

In [None]:
# Re-attach the target column to each split so that every exported CSV holds
# the features followed by the label.
#
# pd.concat builds brand-new frames. The previous pd.DataFrame(x_train) call
# does not copy its input by default, so assigning the label column could
# write into (or warn about) data shared with x_train / x_test and made the
# cell depend on how often it had been run.
trainX = pd.concat([x_train, y_train], axis=1)
testX = pd.concat([x_test, y_test], axis=1)

In [None]:
# Shapes of the combined train and test frames, one per line.
print(trainX.shape, testX.shape, sep="\n")

In [None]:
# Write both frames to CSV in the working directory, without the index column.
trainX.to_csv(path_or_buf="train-V-1.csv", index=False)
testX.to_csv(path_or_buf="test-V-1.csv", index=False)

In [None]:
# Key prefix under which both CSV files are uploaded to the bucket.
sk_prefix = "sagemaker"

# Upload the train and test CSVs; the values returned by upload_data are kept
# for later use.
# NOTE(review): `tarinpath` looks like a typo of `trainpath`; the name is kept
# as-is because other cells may reference it.
tarinpath = sess.upload_data(key_prefix=sk_prefix, bucket=bucket, path="train-V-1.csv")
testpath = sess.upload_data(key_prefix=sk_prefix, bucket=bucket, path="test-V-1.csv")

In [None]:
%%writefile script.py

import argparse
import joblib
import pathlib
from io import StringIO
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import sklearn
import boto3
import pandas as pd


def model_fn(model_dir):
    """Load the persisted model from ``model_dir``.

    Args:
        model_dir: Directory that contains the ``model.joblib`` file.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)


if __name__ == "__main__":
    # Training entry point: parse hyperparameters and data locations, fit a
    # random forest on the training CSV, print absolute-error percentiles on
    # the test CSV, and persist the fitted model as model.joblib.
    print("[INFO]Extracting Arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)
    # This option was read from `args` below without ever being declared,
    # which raised AttributeError. The default of 1 is scikit-learn's default.
    parser.add_argument("--min_samples_leaf", type=int, default=1)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")
    # Accepted on the command line but not consumed below: the feature and
    # target columns are derived from the training file's header instead.
    parser.add_argument("--features", type=str)
    parser.add_argument("--target", type=str)

    args, _ = parser.parse_known_args()

    # Fail fast with a readable message instead of a TypeError from
    # os.path.join (for --model-dir that would only surface after training).
    for dest in ("model_dir", "train", "test"):
        if getattr(args, dest) is None:
            parser.error(
                "--" + dest.replace("_", "-")
                + " was not given and its SM_* environment variable is not set"
            )

    print("reading data")
    # Read from the channel directories rather than the working directory,
    # honouring the --train-file / --test-file options.
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training file is the target; the rest are features.
    # (The notebook-level `df` does not exist inside this script.)
    features = list(train_df.columns)
    labels = features.pop(-1)

    print("building training and testing datasets")
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[labels]
    y_test = test_df[labels]

    # train
    print("training model")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        # Forward the parsed seed so that training runs are reproducible.
        random_state=args.random_state,
        n_jobs=-1,
    )

    model.fit(X_train, y_train)

    # print abs error
    # NOTE(review): subtracting predictions from labels assumes the target
    # column is numeric - confirm against the dataset.
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    print(args.min_samples_leaf)