In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [2]:
!wget https://raw.githubusercontent.com/silverstone1903/hr-attrition-mlops/master/data/HR-Employee-Attrition.csv 

--2023-12-24 11:44:57--  https://raw.githubusercontent.com/silverstone1903/hr-attrition-mlops/master/data/HR-Employee-Attrition.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 226506 (221K) [text/plain]
Saving to: ‘HR-Employee-Attrition.csv.1’


2023-12-24 11:44:57 (40.5 MB/s) - ‘HR-Employee-Attrition.csv.1’ saved [226506/226506]



In [3]:
# Local file name of the downloaded HR attrition CSV; passed to read_data further down.
data_name = "HR-Employee-Attrition.csv"

In [11]:
# Github: https://github.com/silverstone1903/hr-attrition-mlops/tree/master/ml
class config:
    """Feature configuration: the input columns kept from the HR attrition CSV."""

    # Feature columns retained by read_data; the "Attrition" target is appended there.
    columns = [
        "BusinessTravel",
        "DailyRate",
        "Department",
        "DistanceFromHome",
        "Education",
        "EducationField",
        "EnvironmentSatisfaction",
        "Gender",
        "JobInvolvement",
        "JobLevel",
        "JobRole",
        "JobSatisfaction",
        "MaritalStatus",
        "MonthlyIncome",
        "NumCompaniesWorked",
        "OverTime",
        "StockOptionLevel",
        "WorkLifeBalance",
    ]


def read_data(path):
    """Load the attrition CSV and reduce it to the configured features plus target.

    Steps:
      1. Read the CSV at ``path``.
      2. Encode the ``Attrition`` column as 1 ("Yes") / 0 ("No").
      3. Drop every column that holds a single distinct value.
      4. Keep only the columns listed in ``config.columns`` and ``Attrition``,
         in the order they appear in the file.

    Returns:
        A DataFrame containing the selected columns.
    """
    raw = pd.read_csv(path)
    raw["Attrition"] = raw["Attrition"].map({"Yes": 1, "No": 0})

    # Columns with exactly one distinct value carry no information.
    distinct_counts = raw.nunique()
    single_valued = distinct_counts[distinct_counts == 1].index.tolist()
    trimmed = raw.drop(columns=single_valued)

    wanted = set(config.columns) | {"Attrition"}
    selected = [name for name in trimmed.columns if name in wanted]

    return trimmed[selected]


def replace_categories(df, var, target):
    """Ordinal-encode ``df[var]`` in place, ordering categories by mean ``target``.

    The category with the lowest mean of ``target`` becomes 0, the next one 1, and
    so on. Categories whose means are equal keep the (sorted) order produced by
    ``groupby``, because a stable sort is used; this makes the encoding reproducible.

    Args:
        df: DataFrame holding both columns; ``df[var]`` is overwritten with the codes.
        var: Name of the categorical column to encode.
        target: Name of the numeric column whose per-category mean defines the order.

    Returns:
        dict mapping each original category value to its integer code.
    """
    mean_by_category = df.groupby(var)[target].mean()
    # mergesort is stable, so ties between categories are broken deterministically.
    ordered_labels = mean_by_category.sort_values(kind="mergesort").index
    ordinal_label = {label: rank for rank, label in enumerate(ordered_labels)}
    # Values absent from the mapping (e.g. missing categories) become NaN.
    df[var] = df[var].map(ordinal_label)

    return ordinal_label


def preprocessing(data):
    """Return an encoded copy of ``data`` together with the category mappings.

    Works on a copy, so ``data`` itself is not modified. Infinite values are
    replaced with NaN, and every object-dtype column is ordinal-encoded with
    ``replace_categories`` against the ``Attrition`` target.

    Returns:
        (encoded DataFrame, dict of column name -> {category: code}).
    """
    target = "Attrition"
    categorical = data.select_dtypes("O").columns.tolist()

    encoded = data.copy().replace([np.inf, -np.inf], np.nan)

    mappings = {}
    for column in categorical:
        # replace_categories rewrites encoded[column] in place and returns its mapping.
        mappings[column] = replace_categories(encoded, column, target)

    return encoded, mappings


def data_split(data):
    """Split ``data`` into stratified train / test / validation sets.

    First 20% of the rows are held out as the test set; then 20% of the remaining
    rows are held out as the validation set. Both splits are stratified on the
    ``Attrition`` target and use ``random_state=2023``.

    Returns:
        (x_train, x_test, x_val, y_train, y_test, y_val, train_cols, target)
        -- note that the test set comes before the validation set.
    """
    target = "Attrition"
    train_cols = [name for name in data.columns if name != target]
    split_kwargs = {"test_size": 0.2, "random_state": 2023}

    features = data[train_cols]
    labels = data[target]

    # Hold out the test set.
    x_rest, x_test, y_rest, y_test = train_test_split(
        features, labels, stratify=labels, **split_kwargs
    )

    # Carve the validation set out of what is left.
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest[train_cols], y_rest, stratify=y_rest, **split_kwargs
    )

    return x_train, x_test, x_val, y_train, y_test, y_val, train_cols, target

In [12]:
# Load the raw file, ordinal-encode the categorical columns, then split.
# NOTE(review): preprocessing orders categories by the mean of the target over the
# full dataset *before* data_split runs, so the encoding has seen test/validation
# labels. Consider fitting the mapping on the training split only.
df = read_data(data_name)
df, labels = preprocessing(df)
x_train, x_test, x_val, y_train, y_test, y_val, train_cols, target = data_split(df)

In [13]:
# (rows, columns) of the train, test and validation feature frames.
x_train.shape, x_test.shape, x_val.shape

((940, 18), (294, 18), (236, 18))

In [14]:
# S3 bucket that receives the data and model output.
# NOTE(review): "bucket_path" looks like a placeholder — presumably to be replaced
# with a real bucket name before running.
bucket = "bucket_path"
# Key prefix under which the train/validation CSVs and the training output are stored.
prefix = "raw_data"

In [15]:
import sagemaker

# Session reused below for the S3 uploads and passed to the estimator.
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [None]:
# Write the training split with the label as the first column and no header or index
# (the training log reports label_column=0), then upload it under <prefix>/train.
pd.concat([y_train, x_train], axis=1).to_csv("train.csv", index=False, header=False)
sess.upload_data(path="train.csv", bucket=bucket, key_prefix=prefix + "/train")

In [None]:
# Same layout as the training file (label first, no header or index), uploaded
# under <prefix>/validation.
pd.concat([y_val, x_val], axis=1).to_csv("validation.csv", index=False, header=False)
sess.upload_data(
    path="validation.csv", bucket=bucket, key_prefix=prefix + "/validation"
)

In [18]:
from sagemaker.inputs import TrainingInput

# Both channels point at an S3 prefix holding CSV data. Each prefix ends with "/" so
# that it matches only objects inside that folder; ".../train" without the slash would
# also match sibling keys such as ".../train_old/...".
s3_input_train = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train/", content_type="csv"
)

s3_input_validation = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/validation/", content_type="csv"
)

In [19]:
import boto3

# Region of the default boto3 session; used to look up the container image.
region = boto3.Session().region_name
# Execution role passed to the estimator.
role = sagemaker.get_execution_role()
# Instance type used for both the training job and the hosted endpoint.
instance_type = "ml.m4.xlarge"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [20]:
# Look up the XGBoost container image URI for the current region.
# NOTE(review): version="latest" is not a pinned release — confirm which XGBoost
# version it resolves to and consider pinning one so reruns use the same image.
container = sagemaker.image_uris.retrieve(
    framework="xgboost", region=region, version="latest"
)

In [21]:
from sagemaker.estimator import Estimator

# Managed spot training: the SDK keyword is `use_spot_instances`. The previous
# `train_use_spot_instance` spelling is not an Estimator parameter, so it did not
# enable spot training. Spot jobs also need `max_wait` (>= `max_run`), which bounds
# the total job time including time spent waiting for spot capacity.
xgb = Estimator(
    container,
    role,
    sagemaker_session=sess,
    instance_count=1,
    instance_type=instance_type,
    input_mode="File",
    output_path=f"s3://{bucket}/{prefix}/output",
    use_spot_instances=True,
    max_run=3600,  # upper bound on training time, in seconds
    max_wait=7200,  # upper bound on training time plus spot waiting, in seconds
)

In [22]:
# Hyperparameters for the training job: binary logistic objective, 150 boosting
# rounds, learning rate 0.1, with row and column subsampling at 0.8.
xgb.set_hyperparameters(
    eta=0.1,
    max_depth=7,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    num_round=150,
)

In [23]:
%%time
# Launch the training job with the "train" and "validation" channels defined above;
# %%time reports how long the cell takes.
xgb.fit({'train': s3_input_train, 
         'validation': s3_input_validation})

INFO:sagemaker:Creating training-job with name: xgboost-2023-12-24-11-48-15-310


2023-12-24 11:48:15 Starting - Starting the training job......
2023-12-24 11:48:50 Starting - Preparing the instances for training......
2023-12-24 11:49:59 Downloading - Downloading input data...
2023-12-24 11:50:29 Downloading - Downloading the training image......
2023-12-24 11:51:40 Training - Training image download completed. Training in progress.
2023-12-24 11:51:40 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2023-12-24:11:51:33:INFO] Running standalone xgboost training.[0m
[34m[2023-12-24:11:51:33:INFO] File size need to be processed in the node: 0.05mb. Available memory size in the node: 8536.98mb[0m
[34m[2023-12-24:11:51:33:INFO] Determined delimiter of CSV input is ','[0m
[34m[11:51:33] S3DistributionType set as FullyReplicated[0m
[34m[11:51:33] 940x18 matrix with 16920 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-12-24:11:51:33:INFO] Determined delimiter of CSV input is ','[0m
[

### Deployment

In [25]:
# One name is shared by the SageMaker model and the endpoint (the deploy log also
# shows the endpoint config created under this name).
model_name = "sagemaker-hr-attrition"

# Host the trained model on a single instance behind a real-time endpoint.
# NOTE(review): no endpoint cleanup call is visible in this section — confirm the
# endpoint is deleted once it is no longer needed.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    model_name=model_name,
    endpoint_name=model_name,
)

INFO:sagemaker:Creating model with name: sagemaker-hr-attrition
INFO:sagemaker:Creating endpoint-config with name sagemaker-hr-attrition
INFO:sagemaker:Creating endpoint with name sagemaker-hr-attrition


------!

### Evaluation

In [27]:
from sagemaker.serializers import CSVSerializer

# Send request payloads to the endpoint as CSV.
xgb_predictor.serializer = CSVSerializer()

In [28]:
# Score the test features through the endpoint. The response bytes are decoded as
# UTF-8 and parsed as comma-separated floats (presumably one score per test row —
# confirm against the container's response format).
predictions = xgb_predictor.predict(x_test.values)
y_pred = np.fromstring(predictions.decode("utf-8"), sep=",")

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Round the predicted scores once to get 0/1 labels, then reuse them for both metrics.
y_pred_label = y_pred.round()
conf_matrix = confusion_matrix(y_test, y_pred_label)
accuracy = accuracy_score(y_test, y_pred_label)

print(conf_matrix)
print("Accuracy:", round(accuracy, 6))

[[241   6]
 [ 28  19]]
Accuracy: 0.884354


### Endpoint Predictor

In [32]:
from sagemaker.predictor import Predictor

# Attach to the deployed endpoint by name rather than reusing xgb_predictor, and
# send requests as CSV.
predictor_ep = Predictor(endpoint_name=model_name)
predictor_ep.serializer = CSVSerializer()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [34]:
# Same scoring and parsing as for xgb_predictor, this time through the predictor
# that was attached by endpoint name.
predictions_ep = predictor_ep.predict(x_test.values)
y_pred_ep = np.fromstring(predictions_ep.decode("utf-8"), sep=",")

In [41]:
# Render the first test row as a single CSV line without header or index.
x_test.iloc[[0]].to_csv(header=False, index=False)

'1,474,0,3,3,2,1,0,3,1,4,4,1,2061,1,0,0,3\n'