# Install Requirements

In [1]:

%pip install mlrun scikit-learn pandas numpy

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


## Create an MLRun project and configuration

In [2]:
from os import path
import mlrun

# Base project name; with user_project=True the resolved project name gets
# the current user appended (see the printed value below).
project_name_base = 'pmt-app'

# Configure the MLRun environment and keep the resolved project name and the
# artifact path that later cells and functions rely on.
project_name, artifact_path = mlrun.set_environment(project=project_name_base, user_project=True)

print(f'Project name: {project_name}')
print(f'Artifact path: {artifact_path}')

Project name: pmt-app-jovyan
Artifact path: /home/jovyan/data


In [4]:
from os import path
import numpy as np 
import pandas as pd
import datetime as dt
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from pickle import dumps
from sklearn.ensemble import RandomForestClassifier

In [5]:
def clean_yes_no_column(serie, train=True, train_mean=None):
    """Convert a mixed "yes"/"no"/numeric column into a purely numeric one.

    "no" becomes 0.0, every other value except "yes" is passed through
    ``float``, and "yes" is replaced by a fill value.

    Args:
        serie: pandas Series whose values are "yes", "no", or something
            ``float`` can parse.
        train: When True, the fill value is the mean of the values in
            ``serie`` that are not "yes". When False, ``train_mean`` is used.
        train_mean: Fill value for "yes" when ``train`` is False (typically
            the mean computed on the training data).

    Returns:
        A new Series with every "yes" replaced by the fill value.

    Raises:
        ValueError: If ``train`` is False, ``train_mean`` is None and the
            column contains at least one "yes". Previously this silently
            injected missing values into the column.
    """
    def _to_number(value):
        # Keep "yes" as a marker so it can be excluded from the mean below.
        if value == "no":
            return 0.0
        if value == "yes":
            return value
        return float(value)

    converted = serie.apply(_to_number)
    is_yes = converted == "yes"

    if train:
        fill_value = converted[~is_yes].mean()
    else:
        if train_mean is None and is_yes.any():
            raise ValueError(
                'train_mean must be provided when train=False and the column contains "yes"'
            )
        fill_value = train_mean

    return converted.apply(lambda value: fill_value if value == "yes" else value)

In [6]:
def fetch_data(context : MLClientCtx, pmt_records_path: DataItem):
    """Load the PMT records and log them as the 'pmt-app-dataset' artifact.

    Args:
        context: MLRun execution context, used for logging and for storing
            the dataset artifact.
        pmt_records_path: Data item pointing at the records; it is read into
            a DataFrame with ``as_df()``.
    """
    records_df = pmt_records_path.as_df()

    # Datasets are stored in a 'data' folder under the run's artifact path.
    dataset_dir = path.join(context.artifact_path, 'data')
    save_message = 'Saving datasets to {} ...'.format(dataset_dir)
    context.logger.info(save_message)

    # Store the data set in the artifacts database as CSV, without the index.
    context.log_dataset(
        'pmt-app-dataset',
        df=records_df,
        format='csv',
        index=False,
        artifact_path=dataset_dir,
    )
      

In [7]:
def transform_dataset(context : MLClientCtx, pmt_records_path: DataItem):
    """Impute missing values, clean mixed-type columns and log the result.

    The transformed frame is logged as the 'pmt-app-dataset-transformed'
    artifact (CSV) in a 'data' folder under the run's artifact path.

    Args:
        context: MLRun execution context, used for logging and for storing
            the dataset artifact.
        pmt_records_path: Data item pointing at the raw records; it is read
            into a DataFrame with ``as_df()``.
    """
    context.logger.info('Begin datasets transform')

    train_data = pmt_records_path.as_df()

    # Fill missing values: medians for continuous columns, 0 for tablet count.
    fill_dict = {"v2a1": train_data["v2a1"].median(), #Monthly rent payment
                 "v18q1": 0, #number of tablets household owns
                 "rez_esc": train_data["rez_esc"].median(), #Years behind in school
                 "meaneduc": train_data["meaneduc"].median(), #average years of education for adults (18+)
                }
    train_data = train_data.fillna(fill_dict)

    # SQBmeaned is the *square* of meaneduc (the sample inference row in this
    # notebook sends meaneduc=10 together with SQBmeaned=100). The column was
    # previously overwritten with np.sqrt, which made the training feature
    # inconsistent with the values sent at inference time.
    train_data["SQBmeaned"] = np.square(train_data["meaneduc"])

    # These columns mix numeric strings with "yes"/"no"; make them numeric.
    # Bracket assignment is used because attribute assignment only works for
    # columns that already exist.
    for column in ("dependency", "edjefe", "edjefa"):
        train_data[column] = clean_yes_no_column(train_data[column])

    # Save dataset to artifact
    target_path = path.join(context.artifact_path, 'data')
    context.log_dataset('pmt-app-dataset-transformed', df=train_data, artifact_path=target_path, format='csv')

    context.logger.info('End dataset transform')

In [8]:
def train_model(context: MLClientCtx, input_ds: DataItem, random_state: int = 42):
    """Train a random-forest classifier on the transformed dataset and log it.

    Every column except "Target", "Id" and "idhogar" is used as a feature;
    "Target" is the label. The fitted model is pickled and logged as the
    'ModelPMT' model artifact.

    Args:
        context: MLRun execution context, used for logging and for storing
            the model artifact.
        input_ds: Data item pointing at the transformed dataset; it is read
            into a DataFrame with ``as_df()``.
        random_state: Seed passed to ``RandomForestClassifier`` so repeated
            runs on the same data produce the same model.
    """
    context.logger.info('Begin training')

    train_data = input_ds.as_df()
    non_feature_cols = ["Target", "Id", "idhogar"]
    feature_cols = [col for col in train_data.columns if col not in non_feature_cols]
    X = train_data[feature_cols]
    y = train_data["Target"]

    # Seed the forest: an unseeded forest yields a different model each run.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X, y)

    context.log_model('ModelPMT',
                      body=dumps(model),
                      artifact_path=context.artifact_subpath("models"),
                      model_file="ModelPMT.pkl")

    context.logger.info('End training')

In [9]:
# mlrun: end-code

## Set Input Paths

In [10]:
# Remote CSV passed as the `pmt_records_path` input of the `fetch_data` runs.
pmt_records_csv_path = 'https://pmt-data.herokuapp.com/train.csv'


## Convert Code to a Function

In [11]:
# Wrap this notebook's code as an MLRun 'job' function named 'model_pmt'.
# The listed requirements are pip-installed on top of the base image when the
# function image is built (see the deploy log further down).
model_pmt_func = mlrun.code_to_function(name='model_pmt',
                                   kind='job',
                                   image='mlrun/mlrun',
                                   requirements=['scikit-learn', 'numpy','pandas'])

## Run `fetch_data` Locally

We can test our code locally by calling the function with the `local` parameter set to `True`.

In [12]:
# Run the `fetch_data` handler locally (local=True), feeding it the CSV URL
# as the `pmt_records_path` input.
fetch_data_run = model_pmt_func.run(handler='fetch_data',
                               inputs={'pmt_records_path': pmt_records_csv_path},
                               local=True)

> 2021-06-27 12:15:53,354 [info] starting run model-pmt-fetch_data uid=c9c1eb7518124cb2aff6512e835ecced DB=http://mlrun-api:8080
> 2021-06-27 12:16:20,025 [info] Saving datasets to /home/jovyan/data/data ...


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
pmt-app-jovyan,...835ecced,0,Jun 27 12:15:53,completed,model-pmt-fetch_data,kind=owner=jovyanhost=mlrun-kit-jupyter-6879c4d97f-ksbvf,pmt_records_path,,,pmt-app-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run c9c1eb7518124cb2aff6512e835ecced --project pmt-app-jovyan , !mlrun logs c9c1eb7518124cb2aff6512e835ecced --project pmt-app-jovyan
> 2021-06-27 12:16:20,896 [info] run executed, status=completed


In [13]:
# Show the outputs of the local run (artifact key -> store URI).
fetch_data_run.outputs

{'pmt-app-dataset': 'store://artifacts/pmt-app-jovyan/model-pmt-fetch_data_pmt-app-dataset:c9c1eb7518124cb2aff6512e835ecced'}

## Run on the Cluster

In [14]:
from mlrun.platforms import auto_mount
# Apply the automatic mount configuration to the function, then build its
# image remotely; deploy() returns True here once the build has finished.
model_pmt_func.apply(auto_mount())
model_pmt_func.deploy()

> 2021-06-27 12:16:28,114 [info] starting remote build, image: .abhayrpatel10/func-pmt-app-jovyan-model-pmt:latest
E0627 12:16:40.446443       1 aws_credentials.go:77] while getting AWS credentials NoCredentialProviders: no valid providers in chain. Deprecated.
	For verbose messaging see aws.Config.CredentialsChainVerboseErrors
INFO[0004] Retrieving image manifest mlrun/mlrun:0.6.4-rc7
INFO[0008] Retrieving image manifest mlrun/mlrun:0.6.4-rc7
INFO[0011] Built cross stage deps: map[]
INFO[0011] Retrieving image manifest mlrun/mlrun:0.6.4-rc7
INFO[0014] Retrieving image manifest mlrun/mlrun:0.6.4-rc7
INFO[0018] Executing 0 build triggers
INFO[0018] Unpacking rootfs as cmd RUN python -m pip install scikit-learn numpy pandas requires it.
INFO[0135] RUN python -m pip install scikit-learn numpy pandas
INFO[0135] Taking snapshot of full filesystem...
INFO

True

In [15]:
# Run `fetch_data` again, this time without local=True, so it executes as a
# job on the cluster. This rebinds `fetch_data_run` to the cluster run.
fetch_data_run = model_pmt_func.run(name='fetch_data',
                               handler='fetch_data',
                               inputs={'pmt_records_path': pmt_records_csv_path})

> 2021-06-27 12:19:39,216 [info] starting run fetch_data uid=8c767330a1854d368677abd357df704d DB=http://mlrun-api:8080
> 2021-06-27 12:19:39,284 [info] Job is running in the background, pod: fetch-data-2qdnt
> 2021-06-27 12:20:32,784 [info] Saving datasets to /home/jovyan/data/data ...
> 2021-06-27 12:20:33,496 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
pmt-app-jovyan,...57df704d,0,Jun 27 12:20:20,completed,fetch_data,kind=jobowner=jovyanhost=fetch-data-2qdnt,pmt_records_path,,,pmt-app-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 8c767330a1854d368677abd357df704d --project pmt-app-jovyan , !mlrun logs 8c767330a1854d368677abd357df704d --project pmt-app-jovyan
> 2021-06-27 12:20:40,627 [info] run executed, status=completed


In [16]:
# Store URI of the dataset logged by the cluster run; used as the input of
# the transform step.
fetch_data_run.outputs['pmt-app-dataset']

'store://artifacts/pmt-app-jovyan/fetch_data_pmt-app-dataset:8c767330a1854d368677abd357df704d'

## Transform the Dataset

In [17]:
# Run the `transform_dataset` handler on the dataset artifact produced by
# the `fetch_data` run.
transform_dataset_run = model_pmt_func.run(name='transform_dataset',
                                      handler='transform_dataset',
                                      inputs={'pmt_records_path': fetch_data_run.outputs['pmt-app-dataset']})

> 2021-06-27 12:23:10,394 [info] starting run transform_dataset uid=947e71d7c22f4c21acec73cf9cd2df3b DB=http://mlrun-api:8080
> 2021-06-27 12:23:11,019 [info] Job is running in the background, pod: transform-dataset-r9sl7
> 2021-06-27 12:23:26,323 [info] Begin datasets transform
> 2021-06-27 12:23:27,394 [info] End dataset transform
> 2021-06-27 12:23:27,421 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
pmt-app-jovyan,...9cd2df3b,0,Jun 27 12:23:26,completed,transform_dataset,kind=jobowner=jovyanhost=transform-dataset-r9sl7,pmt_records_path,,,pmt-app-dataset-transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run 947e71d7c22f4c21acec73cf9cd2df3b --project pmt-app-jovyan , !mlrun logs 947e71d7c22f4c21acec73cf9cd2df3b --project pmt-app-jovyan
> 2021-06-27 12:23:31,290 [info] run executed, status=completed


In [18]:
# Show the outputs of the transform run (artifact key -> store URI).
transform_dataset_run.outputs

{'pmt-app-dataset-transformed': 'store://artifacts/pmt-app-jovyan/transform_dataset_pmt-app-dataset-transformed:947e71d7c22f4c21acec73cf9cd2df3b'}

## Train Model

In [19]:
# Run the `train_model` handler on the transformed dataset artifact.
train_model_run = model_pmt_func.run(name='train_model',
                                handler='train_model',
                                inputs={'input_ds': transform_dataset_run.outputs['pmt-app-dataset-transformed']})

> 2021-06-27 12:23:37,274 [info] starting run train_model uid=c9e3fcddb5ed42ab85f1709c7780e2a4 DB=http://mlrun-api:8080
> 2021-06-27 12:23:37,349 [info] Job is running in the background, pod: train-model-p279l
> 2021-06-27 12:23:50,911 [info] Begin training
> 2021-06-27 12:23:55,909 [info] End training
> 2021-06-27 12:23:55,920 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
pmt-app-jovyan,...7780e2a4,0,Jun 27 12:23:50,completed,train_model,kind=jobowner=jovyanhost=train-model-p279l,input_ds,,,ModelPMT


to track results use .show() or .logs() or in CLI: 
!mlrun get run c9e3fcddb5ed42ab85f1709c7780e2a4 --project pmt-app-jovyan , !mlrun logs c9e3fcddb5ed42ab85f1709c7780e2a4 --project pmt-app-jovyan
> 2021-06-27 12:23:58,760 [info] run executed, status=completed


In [23]:
# Store URI of the trained model artifact; passed to the model server below.
train_model_run.outputs['ModelPMT']

'store://artifacts/pmt-app-jovyan/train_model_ModelPMT:c9e3fcddb5ed42ab85f1709c7780e2a4'

## Serving

In [25]:
from mlrun import import_function
from mlrun.platforms import auto_mount


# Import the `v2_model_server` function from the function hub and apply the
# automatic mount configuration to it.
serve = import_function('hub://v2_model_server').apply(auto_mount())
# Name under which the model is served; note that it differs from the
# artifact key 'ModelPMT' used when the model was logged.
model_name='PMTModel'
serve.add_model(model_name, model_path=train_model_run.outputs['ModelPMT'])
# Deploy the serving function and keep the value returned by deploy().
addr = serve.deploy()

> 2021-06-27 13:14:42,890 [info] Starting remote function deploy
2021-06-27 13:14:44  (info) Deploying function
2021-06-27 13:14:44  (info) Building
2021-06-27 13:14:44  (info) Staging files and preparing base images
2021-06-27 13:14:44  (info) Building processor image
2021-06-27 13:16:19  (info) Build complete
> 2021-06-27 13:16:39,541 [info] function deployed, address=192.168.65.4:30686


In [31]:
## Test the Model

In [30]:
import json

# A single sample row of feature values, sent as a one-element batch.
# NOTE(review): presumably the values follow the training column order with
# Id, idhogar and Target removed -- confirm against the transformed dataset.
inputs = [[190000,0,3,0,1,1,0,0,0,1,1,0,0,0,0,1,1,1,1,10,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,10,0,10,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,1,0,0,0,0,0,1,0,43,100,1849,1,100,0,1,0,100,1849]]
my_data = json.dumps({'inputs': inputs})

# Build the endpoint from `model_name` (set when the model was added to the
# server) instead of repeating the literal, so the two cannot drift apart.
serve.invoke(f'v2/models/{model_name}/infer', my_data)

{'id': '514a9556-0d45-458e-8a75-9898e90b6534',
 'model_name': 'PMTModel',
 'outputs': [4]}