### Import Libraries

In [2]:
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score

import pandas as pd
import datetime

In [1]:
print(np.__version__)

NameError: name 'np' is not defined

### Code

The following functions are used for creating binary classifier<br/>
1. **transform_data** - Transforms Data using a MinMaxScaler, fits the data and saves the scaler
2. **train_model** - Performs Test/Train Split, trains a KNN Classifier. Returns Model and metrics.
3. **scale_data** - Scale the given input as per the given scaler.

In [3]:
def transform_data(data, scaler_name):
    X = data.drop(columns=["label"], axis=1)
    y = data[["label"]]

    scaler = None

    try:
        with open("./"+scaler_name+".pkl", "rb") as fp:
            scaler = pickle.load(fp)
    except FileNotFoundError as e:
        print("Scaler Not Found! Will create a new one.")
        scaler = MinMaxScaler()

    scaler.partial_fit(X)
    X = scaler.transform(X)

    scale = {
        "min": scaler.data_min_,
        "max": scaler.data_max_
    }

    with open("./"+scaler_name+".pkl", "wb") as fp:
        print('Saving Scaler...', scale)
        pickle.dump(scaler, fp)

    return X, y["label"]

In [4]:
def train_model(X, y, model):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "cv_score": cross_val_score(model, X, y, cv=10),
        "class_report": classification_report(y_test, y_pred, output_dict=True),
        "confusion_matrix": confusion_matrix(y_test, y_pred)
    }
    return metrics

In [5]:
def scale_data(scaler_name,datadf):
    scaler = None
    with open("./"+scaler_name+".pkl", "rb") as fp:
        scaler = pickle.load(fp)
    data = scaler.transform(datadf)
    return data

---

### Training and Tracking  - KNN

Train model on individual data and then test the model on testdata

In [6]:
import mlflow
from mlflow.models.signature import infer_signature
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Red-Blue-Experiment")

2024/10/23 20:16:27 INFO mlflow.tracking.fluent: Experiment with name 'Red-Blue-Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/774240891014511625', creation_time=1729707387233, experiment_id='774240891014511625', last_update_time=1729707387233, lifecycle_stage='active', name='Red-Blue-Experiment', tags={}>

In [7]:
datas = ["./data/sample"+str(i)+".csv" for i in range(1,11)]
print(datas)

['./data/sample1.csv', './data/sample2.csv', './data/sample3.csv', './data/sample4.csv', './data/sample5.csv', './data/sample6.csv', './data/sample7.csv', './data/sample8.csv', './data/sample9.csv', './data/sample10.csv']


In [8]:
model = KNeighborsClassifier(n_neighbors=7,leaf_size=40)
params = {"neighbours":7,"leaf_size":40}

In [9]:
for i,data in enumerate(datas):
    print("Training on: ",data)
    
    df = pd.read_csv(data)
    X, y = transform_data(df,"scaler-knn")  
    
    tag = {"data":"sample"+str(i), "model": "KNN"}
    runname = "knn-test-run-" + str(datetime.datetime.now()).replace(" ","T")
    with mlflow.start_run(run_name=runname) as run:
        mlflow.set_tags(tag)                                    # Tags to help in tracking

        metrics = train_model(X, y, model)                      # Training the model
        mlflow.log_params(params)                               # Log params/hyperparameters used in experiement
        
        mlflow.log_metric("Avg CV",sum(metrics["cv_score"])/10) # Log metrics of the experiement
        mlflow.log_metric("Accuracy", metrics["accuracy"])
        
        signature = infer_signature(X, model.predict(X))
        mlflow.sklearn.log_model(model, artifact_path="models", signature=signature) # Log model created
    mlflow.end_run()
print("Training Complete.")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Training on:  ./data/sample1.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:27.262208 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/6b871b9703274e6abaf8be6612b2f89d.
2024/10/23 20:16:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample2.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:28.367909 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/4f5371d0228d48a4a58d947769fe6402.
2024/10/23 20:16:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample3.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:29.148514 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/c500f46437494830816edf98cc0f74d8.
2024/10/23 20:16:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample4.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:29.927748 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/48b48b56557344fc822fbb65619557a0.
2024/10/23 20:16:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample5.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:30.713438 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/e5b0074eafec4488afc29623c23d6855.
2024/10/23 20:16:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample6.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:31.488127 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/9a3ed3fe687b45a3bf4d425925a67517.
2024/10/23 20:16:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample7.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:32.255574 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/8ca979c61f85405bac517073848c59f9.
2024/10/23 20:16:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample8.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:33.037001 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/c707be33052c4701b8cc7c66174ff25d.
2024/10/23 20:16:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample9.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:33.824849 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/7e1bdd04c46d478391c375bb20fade56.
2024/10/23 20:16:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample10.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:16:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run knn-test-run-2024-10-23T20:16:34.592652 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/a5723987a6b243229e246e0ef444ecf8.
2024/10/23 20:16:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training Complete.


### Test Prediction

In [10]:
testdata = pd.read_csv("./data/testdata.csv")
testdata1 = testdata[:10].copy(deep=True)
testdata1.drop(columns=["label"],axis=1,inplace=True)

In [11]:
testdata1

Unnamed: 0,Coord_X,Coord_Y
0,-12.118184,-4.776587
1,-9.893147,0.342965
2,-10.483853,-9.362718
3,-9.926457,-5.337223
4,6.855357,9.917814
5,-7.032794,-2.027453
6,-8.412141,-3.723905
7,-10.343032,-5.507503
8,-11.132471,-3.226087
9,9.028458,4.500849


In [12]:
testdata[:10]

Unnamed: 0,Coord_X,Coord_Y,label
0,-12.118184,-4.776587,Blue
1,-9.893147,0.342965,Blue
2,-10.483853,-9.362718,Blue
3,-9.926457,-5.337223,Blue
4,6.855357,9.917814,Red
5,-7.032794,-2.027453,Blue
6,-8.412141,-3.723905,Blue
7,-10.343032,-5.507503,Blue
8,-11.132471,-3.226087,Blue
9,9.028458,4.500849,Red


In [13]:
scaleddata = scale_data("scaler-knn",testdata1)



In [14]:
scaleddata

array([[0.14124111, 0.33147147],
       [0.21711604, 0.50240871],
       [0.19697265, 0.17834464],
       [0.21598018, 0.31275232],
       [0.78824895, 0.82210437],
       [0.31465563, 0.42326257],
       [0.26761915, 0.36661955],
       [0.20177475, 0.30706681],
       [0.17485446, 0.38324128],
       [0.86235285, 0.64123676]])

In [16]:
logged_model = 'runs:/4f5371d0228d48a4a58d947769fe6402/models'
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model.predict(scaleddata)

array(['Blue', 'Blue', 'Blue', 'Blue', 'Red', 'Blue', 'Blue', 'Blue',
       'Blue', 'Red'], dtype=object)

---

### Training and Tracking - Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
model = RandomForestClassifier(n_estimators=250,max_depth=7)
params = {"estimators":250,"max_depth":7}

In [19]:
for i,data in enumerate(datas):
    print("Training on: ",data)
    df = pd.read_csv(data)
    X, y = transform_data(df,"scaler-rfc")  
    tag = {"data":"sample"+str(i), "model": "RandomForestClassifier", }
    with mlflow.start_run(run_name="random-forest-test-run"+str(i)):
        mlflow.set_tags(tag)
        metrics = train_model(X, y, model)
        mlflow.log_params(params)

        mlflow.log_metric("Avg CV",sum(metrics["cv_score"])/10)
        mlflow.log_metric("Accuracy", metrics["accuracy"])
        mlflow.sklearn.log_model(model, artifact_path="models")
    mlflow.end_run()

Training on:  ./data/sample1.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
2024/10/23 20:17:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run0 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/9b662285b6ae45f8bc34fec4dc711e8a.
2024/10/23 20:17:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample2.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:17:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run1 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/f977b014f9b843ef9854c5b2bb4c7233.
2024/10/23 20:17:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample3.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:17:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run2 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/2042b0ee50a346f09c46af32a88c2160.
2024/10/23 20:17:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample4.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:17:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run3 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/0d51be4793f147af993f0a85ee9bd7e0.
2024/10/23 20:17:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample5.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:17:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run4 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/4e438e966b5a4c178db0d21d9ecb3def.
2024/10/23 20:17:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample6.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:17:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run5 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/a8661f3aeaeb4c72aab77ab8176a6595.
2024/10/23 20:17:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample7.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:18:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run6 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/602e6991b63d4e8386947a4bd1a0def7.
2024/10/23 20:18:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample8.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:18:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run7 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/327a8de4b1294f6f9473ddf2b7ec94cd.
2024/10/23 20:18:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample9.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:18:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run8 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/d1015458e0a44aa095d1db02caa55c42.
2024/10/23 20:18:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


Training on:  ./data/sample10.csv
Saving Scaler... {'min': array([-16.26008744, -14.70411995]), 'max': array([13.06496848, 15.24576771])}


2024/10/23 20:18:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run random-forest-test-run9 at: http://127.0.0.1:5000/#/experiments/774240891014511625/runs/249efa07dc3a41f4a5ec6a37c1c3f1db.
2024/10/23 20:18:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/774240891014511625.


### Test Prediction

In [20]:
testdata = pd.read_csv("./data/testdata.csv")
testdata1 = testdata[30:40].copy(deep=True)
testdata1.drop(columns=["label"],axis=1,inplace=True)

In [21]:
testdata1

Unnamed: 0,Coord_X,Coord_Y
30,6.622613,5.391746
31,-10.323982,-2.31744
32,4.245213,7.159069
33,5.033033,7.05928
34,-9.004518,-1.801242
35,-10.162748,-3.043643
36,8.347467,3.078762
37,4.891885,4.554404
38,-8.968798,-5.160931
39,3.005623,9.855042


In [22]:
testdata[30:40]

Unnamed: 0,Coord_X,Coord_Y,label
30,6.622613,5.391746,Red
31,-10.323982,-2.31744,Blue
32,4.245213,7.159069,Red
33,5.033033,7.05928,Red
34,-9.004518,-1.801242,Blue
35,-10.162748,-3.043643,Blue
36,8.347467,3.078762,Red
37,4.891885,4.554404,Red
38,-8.968798,-5.160931,Blue
39,3.005623,9.855042,Red


In [23]:
scaleddata = scale_data("scaler-rfc",testdata1)



In [24]:
logged_model = 'runs:/602e6991b63d4e8386947a4bd1a0def7/models'
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model.predict(scaleddata)

array(['Red', 'Blue', 'Red', 'Red', 'Blue', 'Blue', 'Red', 'Red', 'Blue',
       'Red'], dtype=object)