In [1]:
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

from sklearn2pmml import sklearn2pmml
from sklearn2pmml import DataFrameMapper
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.preprocessing.lightgbm import make_lightgbm_dataframe_mapper
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.decoration import Alias
from sklearn2pmml.preprocessing import ExpressionTransformer

# 1. read data

In [2]:
df=pd.read_parquet('cc_prospect_gen1_thick.parquet.gzip')

In [3]:
mono_df=pd.read_csv('thick_mono_list.csv')
mono_list_attr=mono_df['mono_list_attr'].to_list()
mono_list_constr=mono_df['mono_list_constr'].to_list()

In [4]:
df_X = df[mono_list_attr]
df_y = df['24_month_bad90_flag']

# 2. data transformation using mapper

In [5]:
mapper,_ = make_lightgbm_dataframe_mapper(df_X.dtypes, missing_value_aware = True)
mapper=DataFrameMapper([(['REV5620'],ContinuousDomain( high_value=999999990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCA2527'],ContinuousDomain( high_value=999999990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['BCC5620'],ContinuousDomain( high_value=999999990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['BCA8370'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL8325'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCC2351'],ContinuousDomain( high_value=990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['REH7120'],ContinuousDomain( high_value=990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['BCX7110'],ContinuousDomain( high_value=990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL8320'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL4520'],ContinuousDomain( high_value=90,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL7938'],ContinuousDomain( high_value=100,low_value=0,outlier_treatment='as_missing_values' )),
                        (['IQT9426'],ContinuousDomain( high_value=90,low_value=0,outlier_treatment='as_missing_values' )),
                        (['IQT9510'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['IQT9427'],ContinuousDomain( high_value=90,low_value=0,outlier_treatment='as_missing_values' )),
                        (['IQT9420'],ContinuousDomain( high_value=90,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL5072'],ContinuousDomain( high_value=999999990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL7340'],ContinuousDomain( high_value=100,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL8160'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL8164'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL8152'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL7517'],ContinuousDomain( high_value=100,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL7936'],ContinuousDomain( high_value=100,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCC4502'],ContinuousDomain( high_value=9990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCC3503'],ContinuousDomain( high_value=90,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCC2503'],ContinuousDomain( high_value=90,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALM6280'],ContinuousDomain( high_value=400,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCA3275'],ContinuousDomain( high_value=999999990,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL7519'],ContinuousDomain( high_value=100,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALM6200'],ContinuousDomain( high_value=400,low_value=0,outlier_treatment='as_missing_values' )),
                        (['ALL7518'],ContinuousDomain( high_value=100,low_value=0,outlier_treatment='as_missing_values' )),
                        (['TBCC3205'],ContinuousDomain( high_value=12,low_value=0,outlier_treatment='as_missing_values' ))])

# 3. fit model use LGBMClassifier and save to pmml

In [6]:
for i in mono_list_attr:
    df_X[i]=df_X[i].astype('float64')
df_X.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


REV5620     float64
TBCA2527    float64
BCC5620     float64
BCA8370     float64
ALL8325     float64
TBCC2351    float64
REH7120     float64
BCX7110     float64
ALL8320     float64
ALL4520     float64
ALL7938     float64
IQT9426     float64
IQT9510     float64
IQT9427     float64
IQT9420     float64
ALL5072     float64
ALL7340     float64
ALL8160     float64
ALL8164     float64
ALL8152     float64
ALL7517     float64
ALL7936     float64
TBCC4502    float64
TBCC3503    float64
TBCC2503    float64
ALM6280     float64
TBCA3275    float64
ALL7519     float64
ALM6200     float64
ALL7518     float64
TBCC3205    float64
dtype: object

In [7]:
classifier = LGBMClassifier(random_state = 13)
classifier = LGBMClassifier(objective = "binary",min_child_samples =300,boosting_type='gbdt',reg_lambda=5,
                            random_state=157,learning_rate=0.02,monotone_constraints=mono_list_constr,
                            max_depth=6, n_estimators = 1579,num_leaves=25,feature_fraction=0.8)

pipeline = PMMLPipeline([
  ("mapper",mapper),
  ("classifier", classifier)],
    predict_proba_transformer = Alias(ExpressionTransformer("X[1]"), name="Predicted_y", prefit=True))
    
pipeline.fit(df_X, df_y)



PMMLPipeline(steps=[('mapper', DataFrameMapper(drop_cols=[],
                features=[(['REV5620'],
                           ContinuousDomain(high_value=999999990, low_value=0,
                                            outlier_treatment='as_missing_values')),
                          (['TBCA2527'],
                           ContinuousDomain(high_value=999999990, low_value=0,
                                            outlier_treatment='as_missing_values')),
                          (['BCC5620'],
                           ContinuousDomain(high_value=999999990, low_value=0,
                                            outlier_treatment='as_missing_values')),
                          (...
                           ContinuousDomain(high_value=999999990, low_value=0,
                                            outlier_treatment='as_missing_values')),
                          (['ALL7519'],
                           ContinuousDomain(high_value=100, low_value=0,
                  

In [8]:
sklearn2pmml(pipeline, "cc_prospect_gen1_thick_pmml_test.xml",with_repr=True, debug=True)

python: 3.6.13
sklearn: 0.23.2
sklearn2pmml: 0.56.2
joblib: 1.0.1
sklearn_pandas: 2.0.2
pandas: 1.1.5
numpy: 1.19.5
openjdk: 1.8.0_272
Executing command:
java -cp /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/pmml-model-metro-1.4.15.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/jaxb-runtime-2.3.2.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/jakarta.xml.bind-api-2.3.2.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/slf4j-api-1.7.29.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/jpmml-xgboost-1.3.16.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/h2o-logger-3.30.0.3.jar:/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn2pmml/resources/jpmml-converter-1.3.12.jar:/home/ec2-user/anaconda3/envs/pytho