# Let's play around with nearest neighbor and deploy it
MLflow tracking is now sitting pretty up in the cloud, connected to Postgres and S3 for artifact storage. My last serverless deployment did not work because there was no conda. Hopefully it works this time.

In [65]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from joblib import dump
from mlflow.pyfunc import PythonModel
from mlflow.models import infer_signature

In [2]:
# Read the point samples (coordinates, elevation and country id) from the local parquet file.
data_path = '/home/ryan/mlops-countries/data/data.parquet'
df = pd.read_parquet(data_path)

In [3]:
# Display the freshly loaded frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,OBJECTID,CID,Longitude,Latitude,Elev
0,1,252,-174.051758,-76.946091,0
1,2,252,-166.497513,-74.885017,0
2,3,252,-170.586243,-76.785477,0
3,4,252,-176.382172,-76.520096,0
4,5,252,-166.488525,-72.629555,0
...,...,...,...,...,...
260994,260995,252,172.614502,87.737358,0
260995,260996,252,177.343994,81.780121,0
260996,260997,252,173.388092,78.165291,0
260997,260998,252,169.568558,89.440628,0


In [4]:
# Shuffle the rows. The seed pins the shuffled order: without it the seeded
# train_test_split further down still receives a different row order on every
# kernel restart, so the reported scores would not be reproducible.
df = df.sample(frac=1, random_state=42)

In [5]:
# Display the frame again to confirm the rows are now in shuffled order.
df

Unnamed: 0,OBJECTID,CID,Longitude,Latitude,Elev
18181,18182,10,-62.647358,-29.964741,76
206702,206703,96,-2.537722,49.472713,1
11799,11800,252,-169.961044,-16.951956,0
155601,155602,145,158.153061,6.895745,18
249520,249521,114,144.167709,43.637287,323
...,...,...,...,...,...
21617,21618,190,-5.697285,-15.914624,161
98494,98495,195,-61.202251,13.195112,558
130707,130708,45,19.150009,5.998329,517
103066,103067,93,-61.670029,16.189217,181


In [6]:
# Model inputs: position plus elevation. The label is the country id (CID).
feature_columns = ["Longitude", "Latitude", "Elev"]
features = df.loc[:, feature_columns].to_numpy()
# Kept as a single-column 2-D array here; it is flattened when the split is made.
target = df.loc[:, ["CID"]].to_numpy()

In [7]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target.ravel(),
    test_size=0.2,
    random_state=42,
)

In [8]:
# Standardise the features, then classify by majority vote among the 11 nearest neighbours.
clf = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=11),
)

In [9]:
# Fit the scaler + 11-nearest-neighbour pipeline on the training split.
clf.fit(X_train, y_train)

In [10]:
# Predict country ids for the held-out test split.
y_pred = clf.predict(X_test)

In [13]:
# Macro-averaged F1: each country class contributes equally to the score.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
f1

0.9228738333445369

Oh my... that is much, much better than LSVC...

In [14]:
# Put the held-out features, the true country ids and the model's predictions
# side by side so individual hits and misses can be eyeballed.
feature_names = ['lon', 'lat', 'elev']
predictions = clf.predict(X_test)

result = pd.DataFrame(X_test, columns=feature_names).assign(
    actual_class=y_test,
    predicted_class=predictions,
)

# First ten rows are enough for a spot check.
result[:10]

Unnamed: 0,lon,lat,elev,actual_class,predicted_class
0,95.778572,-27.574751,0.0,252,252
1,73.837952,38.089542,3947.0,222,222
2,-91.832222,14.493006,76.0,95,95
3,-3.95426,8.924938,245.0,57,57
4,50.563255,26.12225,41.0,18,18
5,-77.554466,23.758369,4.0,17,17
6,15.044183,48.31115,658.0,14,14
7,-16.542316,13.232882,24.0,84,84
8,-36.376877,-54.5411,303.0,211,211
9,-64.713821,32.334091,4.0,25,25


I don't know... this model is not going to be very fun to interact with...

How about Naive Bayes?

In [55]:
# Replace the estimator with Gaussian Naive Bayes (no scaling step), reusing the name `clf`.
clf = GaussianNB()

In [56]:
# Fit Gaussian Naive Bayes on the same training split.
clf.fit(X_train, y_train)

In [57]:
# Predict country ids for the held-out test split with the Naive Bayes model.
y_pred = clf.predict(X_test)

In [58]:
# Macro-averaged F1 for the Naive Bayes predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
f1

0.9108835990451875

Well, thanks to Ozy for suggesting that one! Still a little too good...

In [59]:
# Same scaled k-NN pipeline, but voting over a much larger neighbourhood (k=201).
clf = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=201),
)

In [60]:
# Fit the scaler + 201-nearest-neighbour pipeline on the training split.
clf.fit(X_train, y_train)

In [61]:
# Predict country ids for the held-out test split with the k=201 pipeline.
y_pred = clf.predict(X_test)

In [62]:
# Macro-averaged F1 for the k=201 pipeline.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
f1

0.739468857385424

In [63]:
# Side-by-side view of held-out features, true country ids and the k=201
# pipeline's predictions.
feature_names = ['lon', 'lat', 'elev']
predictions = clf.predict(X_test)

result = pd.DataFrame(X_test, columns=feature_names).assign(
    actual_class=y_test,
    predicted_class=predictions,
)

# First ten rows are enough for a spot check.
result[:10]

Unnamed: 0,lon,lat,elev,actual_class,predicted_class
0,95.778572,-27.574751,0.0,252,252
1,73.837952,38.089542,3947.0,222,222
2,-91.832222,14.493006,76.0,95,69
3,-3.95426,8.924938,245.0,57,57
4,50.563255,26.12225,41.0,18,18
5,-77.554466,23.758369,4.0,17,17
6,15.044183,48.31115,658.0,14,62
7,-16.542316,13.232882,24.0,84,84
8,-36.376877,-54.5411,303.0,211,211
9,-64.713821,32.334091,4.0,25,25


That seems to be okay... it should be wrong some of the time...

In [66]:
# Serialise the fitted pipeline so it can be attached to the custom pyfunc as an artifact.
# Despite its name, model_directory holds the path of a single joblib file.
model_directory = "/tmp/sklearn_model.joblib"
dump(value=clf, filename=model_directory)

['/tmp/sklearn_model.joblib']

In [67]:
class ModelWrapper(PythonModel):
    """Custom MLflow pyfunc model that answers with a country name instead of a country id.

    The wrapped estimator is loaded from the ``"model_path"`` artifact with
    joblib and predicts integer ids; ``predict`` translates the id of the
    first input row into its display name via ``COUNTRY_NAMES``.
    """

    # Integer class id -> display name. Defined once on the class so the table
    # is not rebuilt on every ``predict`` call.
    COUNTRY_NAMES = {
        1: 'Afghanistan',
        2: 'Albania',
        3: 'Algeria',
        4: 'American Samoa',
        5: 'Andorra',
        6: 'Angola',
        7: 'Anguilla',
        8: 'Antarctica',
        9: 'Antigua and Barbuda',
        10: 'Argentina',
        11: 'Armenia',
        12: 'Aruba',
        13: 'Australia',
        14: 'Austria',
        15: 'Azerbaijan',
        16: 'Azores',
        17: 'Bahamas',
        18: 'Bahrain',
        19: 'Bangladesh',
        20: 'Barbados',
        21: 'Belarus',
        22: 'Belgium',
        23: 'Belize',
        24: 'Benin',
        25: 'Bermuda',
        26: 'Bhutan',
        27: 'Bolivia',
        28: 'Bonaire',
        29: 'Bosnia and Herzegovina',
        30: 'Botswana',
        31: 'Bouvet Island',
        32: 'Brazil',
        33: 'British Indian Ocean Territory',
        34: 'British Virgin Islands',
        35: 'Brunei Darussalam',
        36: 'Bulgaria',
        37: 'Burkina Faso',
        38: 'Burundi',
        39: 'Cabo Verde',
        40: 'Cambodia',
        41: 'Cameroon',
        42: 'Canada',
        43: 'Canarias',
        44: 'Cayman Islands',
        45: 'Central African Republic',
        46: 'Chad',
        47: 'Chile',
        48: 'China',
        49: 'Christmas Island',
        50: 'Cocos Islands',
        51: 'Colombia',
        52: 'Comoros',
        53: 'Congo',
        54: 'Congo DRC',
        55: 'Cook Islands',
        56: 'Costa Rica',
        57: "Côte d'Ivoire",
        58: 'Croatia',
        59: 'Cuba',
        60: 'Curacao',
        61: 'Cyprus',
        62: 'Czech Republic',
        63: 'Denmark',
        64: 'Djibouti',
        65: 'Dominica',
        66: 'Dominican Republic',
        67: 'Ecuador',
        68: 'Egypt',
        69: 'El Salvador',
        70: 'Equatorial Guinea',
        71: 'Eritrea',
        72: 'Estonia',
        73: 'Eswatini',
        74: 'Ethiopia',
        75: 'Falkland Islands',
        76: 'Faroe Islands',
        77: 'Fiji',
        78: 'Finland',
        79: 'France',
        80: 'French Guiana',
        81: 'French Polynesia',
        82: 'French Southern Territories',
        83: 'Gabon',
        84: 'Gambia',
        85: 'Georgia',
        86: 'Germany',
        87: 'Ghana',
        88: 'Gibraltar',
        89: 'Glorioso Islands',
        90: 'Greece',
        91: 'Greenland',
        92: 'Grenada',
        93: 'Guadeloupe',
        94: 'Guam',
        95: 'Guatemala',
        96: 'Guernsey',
        97: 'Guinea',
        98: 'Guinea-Bissau',
        99: 'Guyana',
        100: 'Haiti',
        101: 'Heard Island and McDonald Islands',
        102: 'Honduras',
        103: 'Hungary',
        104: 'Iceland',
        105: 'India',
        106: 'Indonesia',
        107: 'Iran',
        108: 'Iraq',
        109: 'Ireland',
        110: 'Isle of Man',
        111: 'Israel',
        112: 'Italy',
        113: 'Jamaica',
        114: 'Japan',
        115: 'Jersey',
        116: 'Jordan',
        117: 'Juan De Nova Island',
        118: 'Kazakhstan',
        119: 'Kenya',
        120: 'Kiribati',
        121: 'Kuwait',
        122: 'Kyrgyzstan',
        123: 'Laos',
        124: 'Latvia',
        125: 'Lebanon',
        126: 'Lesotho',
        127: 'Liberia',
        128: 'Libya',
        129: 'Liechtenstein',
        130: 'Lithuania',
        131: 'Luxembourg',
        132: 'Madagascar',
        133: 'Madeira',
        134: 'Malawi',
        135: 'Malaysia',
        136: 'Maldives',
        137: 'Mali',
        138: 'Malta',
        139: 'Marshall Islands',
        140: 'Martinique',
        141: 'Mauritania',
        142: 'Mauritius',
        143: 'Mayotte',
        144: 'Mexico',
        145: 'Micronesia',
        146: 'Moldova',
        147: 'Monaco',
        148: 'Mongolia',
        149: 'Montenegro',
        150: 'Montserrat',
        151: 'Morocco',
        152: 'Mozambique',
        153: 'Myanmar',
        154: 'Namibia',
        155: 'Nauru',
        156: 'Nepal',
        157: 'Netherlands',
        158: 'New Caledonia',
        159: 'New Zealand',
        160: 'Nicaragua',
        161: 'Niger',
        162: 'Nigeria',
        163: 'Niue',
        164: 'Norfolk Island',
        165: 'North Korea',
        166: 'North Macedonia',
        167: 'Northern Mariana Islands',
        168: 'Norway',
        169: 'Oman',
        170: 'Pakistan',
        171: 'Palau',
        172: 'Palestinian Territory',
        173: 'Panama',
        174: 'Papua New Guinea',
        175: 'Paraguay',
        176: 'Peru',
        177: 'Philippines',
        178: 'Pitcairn',
        179: 'Poland',
        180: 'Portugal',
        181: 'Puerto Rico',
        182: 'Qatar',
        183: 'Réunion',
        184: 'Romania',
        185: 'Russian Federation',
        186: 'Rwanda',
        187: 'Saba',
        188: 'Saint Barthelemy',
        189: 'Saint Eustatius',
        190: 'Saint Helena',
        191: 'Saint Kitts and Nevis',
        192: 'Saint Lucia',
        193: 'Saint Martin',
        194: 'Saint Pierre and Miquelon',
        195: 'Saint Vincent and the Grenadines',
        196: 'Samoa',
        197: 'San Marino',
        198: 'Sao Tome and Principe',
        199: 'Saudi Arabia',
        200: 'Senegal',
        201: 'Serbia',
        202: 'Seychelles',
        203: 'Sierra Leone',
        204: 'Singapore',
        205: 'Sint Maarten',
        206: 'Slovakia',
        207: 'Slovenia',
        208: 'Solomon Islands',
        209: 'Somalia',
        210: 'South Africa',
        211: 'South Georgia and South Sandwich Islands',
        212: 'South Korea',
        213: 'South Sudan',
        214: 'Spain',
        215: 'Sri Lanka',
        216: 'Sudan',
        217: 'Suriname',
        218: 'Svalbard',
        219: 'Sweden',
        220: 'Switzerland',
        221: 'Syria',
        222: 'Tajikistan',
        223: 'Tanzania',
        224: 'Thailand',
        225: 'Timor-Leste',
        226: 'Togo',
        227: 'Tokelau',
        228: 'Tonga',
        229: 'Trinidad and Tobago',
        230: 'Tunisia',
        231: 'Turkiye',
        232: 'Turkmenistan',
        233: 'Turks and Caicos Islands',
        234: 'Tuvalu',
        235: 'Uganda',
        236: 'Ukraine',
        237: 'United Arab Emirates',
        238: 'United Kingdom',
        239: 'United States',
        240: 'United States Minor Outlying Islands',
        241: 'Uruguay',
        242: 'US Virgin Islands',
        243: 'Uzbekistan',
        244: 'Vanuatu',
        245: 'Vatican City',
        246: 'Venezuela',
        247: 'Vietnam',
        248: 'Wallis and Futuna',
        249: 'Yemen',
        250: 'Zambia',
        251: 'Zimbabwe',
        252: 'Ocean'
    }

    def __init__(self):
        # Filled in by load_context(); stays None until the artifacts are loaded.
        self.model = None

    def load_context(self, context):
        """Load the serialised estimator from the ``"model_path"`` artifact.

        Args:
            context: MLflow-provided context; ``context.artifacts["model_path"]``
                is read as the location of the joblib file.
        """
        # Not referenced directly; presumably imported so a missing scikit-learn
        # surfaces here rather than inside joblib's unpickling — TODO confirm.
        import sklearn
        from joblib import load

        self.model = load(context.artifacts["model_path"])

    # Return the country string, not the country id from the prediction method
    def predict(self, context, model_input):
        """Return the country name predicted for the first row of ``model_input``.

        Args:
            context: MLflow-provided context (unused here).
            model_input: Rows passed straight to the wrapped model's ``predict``;
                the notebook calls this with a one-row 2-D numpy array.

        Returns:
            The name for the first row's predicted id. This handles one
            prediction at a time: any rows after the first are ignored.

        Raises:
            KeyError: If the predicted id has no entry in ``COUNTRY_NAMES``.
        """
        key = self.model.predict(model_input)
        return self.COUNTRY_NAMES[key[0]]


In [68]:
# Artifacts bundled with the custom pyfunc: the joblib file, exposed to
# ModelWrapper.load_context under the key "model_path".
artifacts = dict(model_path=model_directory)

# Input-only model signature, inferred from the training features.
signature = infer_signature(model_input=X_train)


In [81]:
# Descriptive settings for the pipeline model.
# NOTE(review): no cell visible in this notebook reads `params` (it is never
# passed to an MLflow logging call) — confirm whether it was meant to be logged.
params = {"pipeline":True, "scaler": "standard"}

In [69]:
# Point the MLflow client at the remote tracking server; every run and model
# logged after this goes there.
tracking_uri = "http://ec2-54-226-109-218.compute-1.amazonaws.com:8080"
mlflow.set_tracking_uri(uri=tracking_uri)

In [71]:
# Log the wrapped model as a custom pyfunc and register it as "countries_name".
# The `with` block closes the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run() as run:
    mlflow.pyfunc.log_model(
        python_model=ModelWrapper(),
        # A few rows are enough to illustrate the input layout; passing all of
        # X_train would store the entire training set next to the model.
        input_example=X_train[:5],
        signature=signature,
        artifacts=artifacts,
        # "scikit-learn" is the installable distribution name; "sklearn" is only
        # the import name and does not install the library.
        pip_requirements=["joblib", "scikit-learn"],
        artifact_path="countries_name",
        registered_model_name="countries_name",
    )

Successfully registered model 'countries_name'.
2023/12/08 13:22:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: countries_name, version 1
Created version '1' of model 'countries_name'.


In [72]:
# Artifact URI copied from the MLflow UI; load the logged model back as a generic pyfunc.
model_uri = 'mlflow-artifacts:/0/89cbbffe4129441db9f5f2bf4c269be1/artifacts/countries_name'
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [73]:
# Look at one held-out row to see the input layout (longitude, latitude, elevation).
X_test[0]

array([ 95.77857208, -27.5747509 ,   0.        ])

In [75]:
# Query the reloaded model with one hand-written (lon, lat, elev) row.
query_point = np.array([[-98.0, 36.0, 55.0]])
prediction = loaded_model.predict(query_point)
prediction

'United States'

In [76]:
# Query the origin (lon 0, lat 0, elevation 0).
query_point = np.array([[0.0, 0.0, 0.0]])
prediction = loaded_model.predict(query_point)
prediction

'Sao Tome and Principe'

So this is nice, but adding elevation is going to make inference more complicated, as I will have to have something query the DEM for the elevation wherever the user clicks. It is just easier to do away with elevation entirely.

In [77]:
# Display the shuffled frame again before rebuilding the features without elevation.
df

Unnamed: 0,OBJECTID,CID,Longitude,Latitude,Elev
18181,18182,10,-62.647358,-29.964741,76
206702,206703,96,-2.537722,49.472713,1
11799,11800,252,-169.961044,-16.951956,0
155601,155602,145,158.153061,6.895745,18
249520,249521,114,144.167709,43.637287,323
...,...,...,...,...,...
21617,21618,190,-5.697285,-15.914624,161
98494,98495,195,-61.202251,13.195112,558
130707,130708,45,19.150009,5.998329,517
103066,103067,93,-61.670029,16.189217,181


In [78]:
# Model inputs are now position only (no elevation). The label is still the country id.
feature_columns = ["Longitude", "Latitude"]
features = df.loc[:, feature_columns].to_numpy()
# Kept as a single-column 2-D array here; it is flattened when the split is made.
target = df.loc[:, ["CID"]].to_numpy()

In [79]:
# Redo the 80/20 split on the two-column feature matrix, with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target.ravel(),
    test_size=0.2,
    random_state=42,
)

In [80]:
# Turn autologging on before the run starts so the fit below is captured.
mlflow.autolog()

# Scaled 11-nearest-neighbour pipeline on longitude/latitude only, tracked as
# one MLflow run. The `with` block closes the run on exit, so no explicit
# mlflow.end_run() is needed afterwards.
with mlflow.start_run() as run:
    clf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=11))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Macro-averaged F1 on the held-out split.
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f1)

2023/12/08 16:21:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


0.9759764486918437


Oh wow. Model does even better without elevation... Let's see how Gaussian Naive Bayes does...

In [81]:
# Turn autologging on before the run starts so the fit below is captured.
mlflow.autolog()

# Gaussian Naive Bayes on longitude/latitude only, tracked as one MLflow run.
# The `with` block closes the run on exit, so no explicit mlflow.end_run() is
# needed afterwards.
with mlflow.start_run() as run:
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Macro-averaged F1 on the held-out split.
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f1)

2023/12/08 16:30:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))


0.9201669956380213


So it's nice having these guys autologged and tracked so that you can look at the MLflow GUI and compare apples to apples... but I need to wrap these guys up and override the predict method to make something useful... So much for the CI/CD pipeline: this thing is completely busted and I need to do everything manually...

In [82]:
# Serialise the current estimator so it can be attached to the custom pyfunc as an artifact.
# NOTE(review): at this point `clf` is the GaussianNB fitted in the preceding
# cell, not the k-NN pipeline — confirm this is the model meant for deployment.
# Despite its name, model_directory holds the path of a single joblib file.
model_directory = "/tmp/sklearn_model.joblib"
dump(value=clf, filename=model_directory)

['/tmp/sklearn_model.joblib']

In [83]:
# Artifacts bundled with the custom pyfunc: the joblib file, exposed to
# ModelWrapper.load_context under the key "model_path".
artifacts = dict(model_path=model_directory)

# Input-only model signature, inferred from the two-column training features.
signature = infer_signature(model_input=X_train)


In [84]:
# Log the wrapped longitude/latitude model as a custom pyfunc (not registered this time).
# The `with` block closes the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run() as run:
    mlflow.pyfunc.log_model(
        python_model=ModelWrapper(),
        # A few rows are enough to illustrate the input layout; passing all of
        # X_train would store the entire training set next to the model.
        input_example=X_train[:5],
        signature=signature,
        artifacts=artifacts,
        artifact_path="countries_name",
    )

In [85]:
# Load the longitude/latitude model back as a generic pyfunc from its artifact URI.
model_uri = 'mlflow-artifacts:/0/51c28246ab5d47b5a6a6e9e94d094996/artifacts/countries_name'
loaded_model = mlflow.pyfunc.load_model(model_uri)

In [86]:
# Look at one held-out row to see the input layout (longitude, latitude).
X_test[0]

array([ 95.77857208, -27.5747509 ])

In [87]:
# Query the reloaded model with one hand-written (lon, lat) row.
query_point = np.array([[-98.0, 36.0]])
prediction = loaded_model.predict(query_point)
prediction

'United States'

In [88]:
# Query the origin (lon 0, lat 0).
query_point = np.array([[0.0, 0.0]])
prediction = loaded_model.predict(query_point)
prediction

'Ocean'

Okie dokie. Let's see if we can deploy. At this point I grab a shell, navigate to that directory on the MLflow server and get

`sudo mlflow sagemaker build-and-push-container`

to work.

Update: Yeah don't do this on a micro instance. 

In [None]:
from mlflow.deployments import get_deploy_client

# Settings for an instance-backed SageMaker endpoint.
# NOTE(review): the empty strings look like values blanked out before sharing —
# supply the role ARN, bucket and image URL (and the model URI below) before running.
config = {
    "execution_role_arn": "",
    "bucket_name": "",
    "image_url": "",
    "region_name": "us-east-1",
    "archive": False,
    "instance_count": 1,
    "synchronous": True,
    "timeout_seconds": 420,
    "variant_name": "prod-variant-1",
    # "env": {"DISABLE_NGINX": "true", "GUNICORN_CMD_ARGS": '"--timeout 60"'},
    "tags": {"training_timestamp": "2023-12-07T05:12:26"},
    "instance_type": "ml.t2.large",
    # Serverless variant, left disabled:
    # "serverless_config": {
    #     "MemorySizeInMB": 1024,
    #     "MaxConcurrency": 1,
    # },
}

# Create the endpoint through MLflow's SageMaker deployment client.
client = get_deploy_client("sagemaker")
client.create_deployment(
    name="my-deployment3",
    model_uri="",
    flavor="python_function",
    config=config,
)


Lovely. Looks like serverless sagemaker deployments with the python_function flavor are bugged: https://github.com/mlflow/mlflow/issues/9808

I'll try to deploy the regular one...

aaand this whole MLFlow thing is looking like a mistake. I should have just stuck with SageMaker.