# Analítica in-databse con Python y Vantage - Parte 3 (BYOM)

**Notas**
* BYOM: https://docs.teradata.com/r/Teradata-VantageTM-Bring-Your-Own-Model-User-Guide/May-2022/Bring-Your-Own-Model

**Contenido**
1. Cargar librerias
2. Preparar Datos
3. Entrenar Modelo
4. Exportar modelo a PMML
5. Cargar modelo PMML a Teradata
6. Predecir

In [1]:
import pandas as pd
import getpass

from teradataml import (
    create_context, 
    remove_context,
    get_context,
    get_connection,
    DataFrame,
    retrieve_byom,
    PMMLPredict,
    configure)

In [2]:
host="10.40.176.7"  #hostname o IP de Teradata
username="teradataml" #usuario de base de datos
password=getpass.getpass(prompt='pwd:') #password del usuario de base de datos
database="teradataml" #DB donde residen los datos
dbtmp="teradataml"  #DB de trabajo para las funciones analíticas


pwd: ········


In [6]:
context=create_context(host=host, username=username, database=database, password=password, logmech="TDNEGO")

## Cargar datos

In [14]:
from teradataml import copy_to_sql
pdf=pd.read_csv("data\pima_patient_diagnoses.csv")
copy_to_sql(df=pdf,schema_name=database,table_name="pima_patient_diagnoses",if_exists="replace")

In [16]:
pdf=pd.read_csv("data\pima_patient_features.csv")
copy_to_sql(df=pdf,schema_name=database,table_name="pima_patient_features",if_exists="replace")

## Preparar datos

In [17]:
train_pdf = DataFrame.from_query("""
SELECT 
    F.*, D.hasdiabetes 
FROM pima_patient_features F
JOIN pima_patient_diagnoses D
    ON F.patientid = D.patientid 
    WHERE F.patientid MOD 5 <> 0
""").to_pandas(all_rows=True)


# split data into X and y
features = ["NumTimesPrg", "Age", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc"]
X_train = train_pdf[features]

target = "HasDiabetes"
y_train = train_pdf[target]

## Entrenar al modelo

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

model = Pipeline([('scaler', MinMaxScaler()),
                  ('xgb', XGBClassifier(eta=0.2, max_depth=6))])
model.fit(X_train, y_train)
print("Finished training")

Finished training


## Convertir el modelo a PMML

In [21]:
from nyoka import xgboost_to_pmml

xgboost_to_pmml(pipeline=model, col_names=features, target_name=target, pmml_f_name="modelo_diagnostico_diabetes_rf.pmml")

## Cargar modelo a Teradara

In [7]:
import  datetime
from teradataml import  save_byom 

model_id = 'diagnostico_diabetes_rf'
model_file = 'modelo_diagnostico_diabetes_rf.pmml'
table_name = 'modelos_pmml'

save_byom(model_id,
          model_file,
          table_name,
          additional_columns={"Description": "XGBoost",
                              "ModelSavedTime": datetime.datetime.now(),
                              "ModelGeneratedDate":datetime.date.today(),
                              "ModelGeneratedTime": datetime.time(hour=0,minute=5,second=45,microsecond=110)
                              },
              schema_name=database
              )



Created the model table 'modelos_pmml' as it does not exist.
Model is saved.


## Consultar modelos

In [25]:
pd.options.display.max_colwidth = 250
pd.read_sql(f'SELECT TOP 2 * FROM {database}.modelos_rf', get_connection())

Unnamed: 0,model_id,model,Description,ModelSavedTime,ModelGeneratedDate,ModelGeneratedTime
0,diagnostico_diabetes_rf,"b'<?xml version=""1.0"" encoding=""UTF-8""?>\r\n<PMML xmlns=""http://www.dmg.org/PMML-4_4"" version=""4.4.1"">\r\n <Header copyright=""Copyright (c) 2021 Software AG"" description=""Default description"">\r\n <Application name=""Nyoka"" version=""5.3....",XGBoost,2022-09-07 10:26:22.150481,2022-09-07,00:05:45.000110


In [26]:
from teradataml import  list_byom
list_byom(table_name="modelos")

                                       model
model_id                                    
consumo_energia_lr  b'3C3F786D6C20766572...'


## Predecir

In [27]:
df = DataFrame('{database}."pima_patient_features"')
df.head(3)

PatientId,NumTimesPrg,PlGlcConc,BloodP,SkinThick,TwoHourSerIns,BMI,DiPedFunc,Age
2,8,183,64,0,0,23.3,0.672,32
1,1,85,66,29,0,26.6,0.351,31
0,6,148,72,35,0,33.6,0.627,50


In [28]:
modelo = retrieve_byom("diagnostico_diabetes_rf", table_name="modelos_rf")
modelo

model_id,model
diagnostico_diabetes_rf,b'3C3F786D6C20766572...'


In [30]:
configure.byom_install_location = "MLDB"
prediction = PMMLPredict(
            modeldata=modelo,
            newdata=df,
            accumulate='PatientId')

In [31]:
prediction

############ STDOUT Output ############

   PatientId prediction                                                                                           json_report
0        734          0    {"probability_0":0.9768986687837254,"probability_1":0.02310133121627458,"predicted_HasDiabetes":0}
1          0          1    {"probability_0":0.06581358523585568,"probability_1":0.9341864147641443,"predicted_HasDiabetes":1}
2         61          1    {"probability_0":0.09775174319883084,"probability_1":0.9022482568011692,"predicted_HasDiabetes":1}
3        530          0  {"probability_0":0.9932391738431502,"probability_1":0.0067608261568498276,"predicted_HasDiabetes":0}
4        591          0   {"probability_0":0.9917639473673989,"probability_1":0.008236052632601056,"predicted_HasDiabetes":0}
5        427          1   {"probability_0":0.017599795642817773,"probability_1":0.9824002043571822,"predicted_HasDiabetes":1}
6         40          1   {"probability_0":0.014731937928161631,"probability_

In [33]:
## SQL
predicciones = DataFrame.from_query("""
     SELECT * FROM MLDB.PMMLPredict (
        ON teradataml.pima_patient_features AS InputTable
        ON (SELECT * FROM teradataml.modelos_rf WHERE model_id='diagnostico_diabetes_rf') AS ModelTable DIMENSION
    USING
          Accumulate ('PatientID')
    ) AS td""").to_pandas()
predicciones

Unnamed: 0,PatientId,prediction,json_report
0,469,0,"{""probability_0"":0.8745927635194019,""probability_1"":0.12540723648059815,""predicted_HasDiabetes"":0}"
1,265,0,"{""probability_0"":0.5361277298304299,""probability_1"":0.46387227016957006,""predicted_HasDiabetes"":0}"
2,734,0,"{""probability_0"":0.9768986687837254,""probability_1"":0.02310133121627458,""predicted_HasDiabetes"":0}"
3,122,0,"{""probability_0"":0.968890386377149,""probability_1"":0.031109613622850977,""predicted_HasDiabetes"":0}"
4,0,1,"{""probability_0"":0.06581358523585568,""probability_1"":0.9341864147641443,""predicted_HasDiabetes"":1}"
...,...,...,...
763,511,0,"{""probability_0"":0.9963771121691171,""probability_1"":0.0036228878308829844,""predicted_HasDiabetes"":0}"
764,551,0,"{""probability_0"":0.9866352992696981,""probability_1"":0.013364700730301932,""predicted_HasDiabetes"":0}"
765,633,0,"{""probability_0"":0.9959932947895436,""probability_1"":0.004006705210456415,""predicted_HasDiabetes"":0}"
766,143,1,"{""probability_0"":0.08588086940182293,""probability_1"":0.9141191305981771,""predicted_HasDiabetes"":1}"


### Trabajando sin mover los datos de Teradata

In [34]:
prediccionesTD = DataFrame.from_query("""
     SELECT * FROM MLDB.PMMLPredict (
        ON teradataml.pima_patient_features AS InputTable
        ON (SELECT * FROM teradataml.modelos_rf WHERE model_id='diagnostico_diabetes_rf') AS ModelTable DIMENSION
    USING
          Accumulate ('PatientID')
    ) AS td""")

In [35]:
from teradataml import copy_to_sql
copy_to_sql(df = prediccionesTD, table_name = "prediccionDiabetes", primary_index="PatientId", if_exists="replace")

In [36]:
df = DataFrame("prediccionDiabetes")
df

PatientId,prediction,json_report
402,1,"{""probability_0"":0.23667171219591787,""probability_1"":0.7633282878040821,""predicted_HasDiabetes"":1}"
379,0,"{""probability_0"":0.9755544367443019,""probability_1"":0.024445563255698147,""predicted_HasDiabetes"":0}"
427,1,"{""probability_0"":0.017599795642817773,""probability_1"":0.9824002043571822,""predicted_HasDiabetes"":1}"
295,1,"{""probability_0"":0.1237526411534412,""probability_1"":0.8762473588465588,""predicted_HasDiabetes"":1}"
101,0,"{""probability_0"":0.9869249432557844,""probability_1"":0.013075056744215607,""predicted_HasDiabetes"":0}"
19,1,"{""probability_0"":0.13734088756892637,""probability_1"":0.8626591124310736,""predicted_HasDiabetes"":1}"
141,0,"{""probability_0"":0.8793171537908955,""probability_1"":0.12068284620910454,""predicted_HasDiabetes"":0}"
762,0,"{""probability_0"":0.9946098557121638,""probability_1"":0.005390144287836211,""predicted_HasDiabetes"":0}"
631,0,"{""probability_0"":0.9663924480605154,""probability_1"":0.033607551939484635,""predicted_HasDiabetes"":0}"
507,0,"{""probability_0"":0.9566119541857399,""probability_1"":0.043388045814260194,""predicted_HasDiabetes"":0}"


In [None]:
remove_context()