# Example 1: Build models using Python libraries

## Use case

A business user with minimal programming knowledge wants to visually apply complex business rules during the data preparation stage for different business scenarios, and then train a LightGBM model, which is not available out of the box at the time of writing.

## Build Syntax in Extension Model node

Copy and paste the code below into the Build Syntax section of the Extension Model node.

In [None]:
# Install the required Python libraries into the interpreter running this
# node. check_call raises CalledProcessError if pip exits with a non-zero
# status, which stops the script at this point.

import sys
import subprocess

required_packages = ["pandas", "ibm-watson-machine-learning", "scikit-learn", "lightgbm"]
pip_install = [sys.executable, "-m", "pip", "install"] + required_packages + ["--no-input"]
subprocess.check_call(pip_install)

import spss.pyspark.runtime
from pyspark.sql.types import *

import joblib
import numpy as np
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Fetch the node's input data from the runtime context and convert the Spark
# DataFrame to pandas so that scikit-learn / LightGBM can consume it.
cxt = spss.pyspark.runtime.getContext()
df = cxt.getSparkInputData().toPandas()

# build a simple lightgbm model

# The target column is the label; every other input column is a feature.
target = "MortgageDefault"
y = df[target]
X = df.drop(target, axis=1)

# One-hot encode the object-typed columns and pass all other columns through
# unchanged. handle_unknown="ignore" makes a category that was not present in
# the training data encode as all zeros at scoring time instead of raising.
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object)),
    remainder="passthrough"
)

# Bundle preprocessing and classifier so the scoring script only has to load
# and call a single object.
pipeline = Pipeline(steps=[("transform", ct), ("clf", lgb.LGBMClassifier(objective="binary"))])

pipeline.fit(X, y)

# save the model for scoring
# The scoring script loads the pipeline back from this same path.
joblib.dump(pipeline, "/tmp/pipeline.pkl")


## Score Syntax in Extension Model node

Copy and paste the code below into the Score Syntax section of the Extension Model node.

In [None]:
import sys
import subprocess

# Install the libraries the scoring code relies on into the interpreter
# running this node; check_call raises CalledProcessError if pip fails.
required_packages = ["pandas", "ibm-watson-machine-learning", "scikit-learn", "lightgbm"]
pip_install = [sys.executable, "-m", "pip", "install"] + required_packages + ["--no-input"]
subprocess.check_call(pip_install)

import spss.pyspark.runtime
from pyspark.sql.types import *
from pyspark.sql import SQLContext

import joblib

cxt = spss.pyspark.runtime.getContext()
sqlContext = cxt.getSparkSQLContext()

# Names of the two fields this node appends to its input.
target = "MortgageDefault"
prediction = f"$PRED-{target}"
probability = f"$PROB-{target}"

# Output schema = every input field unchanged, followed by the prediction
# and probability fields. It is set before the data-model-only check below,
# so it is declared even when no data is scored.
fieldList = [StructField(x.name, x.dataType, x.nullable) for x in cxt.getSparkInputSchema()]
fieldList.append(StructField(prediction, StringType(), nullable=False))
fieldList.append(StructField(probability, FloatType(), nullable=False))
outputSchema = StructType(fieldList)
cxt.setSparkOutputSchema(outputSchema)

if not cxt.isComputeDataModelOnly():
    df = cxt.getSparkInputData().toPandas()

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load a pipeline file written by a trusted build step.
    pipeline = joblib.load("/tmp/pipeline.pkl")

    # Score on the feature columns only. The build script trains on the input
    # with the target column removed, so remove it here too when present, and
    # take this snapshot before the output columns are added so the model is
    # never fed its own prediction.
    features = df.drop(columns=[target], errors="ignore")
    predicted = pipeline.predict(features)
    # The last predict_proba column is used as the reported probability.
    predicted_proba = pipeline.predict_proba(features)[:, -1]

    # NOTE(review): the declared schema uses StringType / FloatType for these
    # two fields, while the values keep whatever dtypes the model returns -
    # confirm they agree for your data.
    df[prediction] = predicted
    df[probability] = predicted_proba
    cxt.setSparkOutputData(sqlContext.createDataFrame(df))
    
