### 1. Install Dependencies 

In [None]:
# Show the Java version currently on the PATH before changing anything.
!java -version

In [None]:
# Install the Java 1.8 package, then remove the older 1.7 OpenJDK package.
# NOTE(review): presumably required because the PMML tooling used below runs on a JVM —
# confirm the minimum Java version against the sklearn2pmml / pypmml documentation.
!sudo yum install -y java-1.8.0
!sudo yum remove -y java-1.7.0-openjdk

In [None]:
# Re-check the Java version after the install/remove step above.
!java -version

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the environment
# of the running kernel rather than whichever `pip` happens to be first on the shell PATH.
# sklearn-pandas is listed explicitly because the imports cell uses DataFrameMapper from it.
# TODO: pin versions (pkg==x.y.z) once the known-good versions are recorded.
%pip install pypmml sklearn2pmml sklearn-pandas

### 2. Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn2pmml import sklearn2pmml
from pypmml import Model
import pandas
import numpy

### 3. Load & Prep Data

In [2]:
# Load the sample dataset; the relative path is resolved against the
# notebook's current working directory.
churn_data = pandas.read_csv("sample.csv")

In [3]:
# Preview the first rows to check the available columns and raw label values.
churn_data.head()

Unnamed: 0,CustomerID,CLVSegment,Gender,Age,ResBiz,Optout,CreditStatus,SalesChannel,ContractDuration,ContractEndMonths,ZipDemographHouseOther,ZipDemographHouseRenter,ZipDemographHouseOwner,ZipDemographAvgAge,AvgCallsIn,AvgTxtsIn,AvgCallsOut,ZipCode,ReferenceModel,Churn
0,CE-2334791,LARGE,M,25,N,N,N,THIRD PARTY SALES CHANNEL,,3.0,1.0,37.0,62.0,49.5,210.0,30.333333,395.0,89107.0,0.233302,Y
1,CE-4374827,SMALL,F,25,N,N,N,THIRD PARTY SALES CHANNEL,12.0,3.0,1.0,45.0,54.0,44.4,44.666667,0.0,106.333333,30694.0,0.540289,Y
2,CE-5126719,LARGE,M,23,N,N,N,THIRD PARTY SALES CHANNEL,12.0,3.0,0.0,0.0,100.0,48.1,65.666667,0.0,104.333333,43664.0,0.11522,Yes
3,CE-2874904,LARGE,F,22,N,N,N,THIRD PARTY SALES CHANNEL,12.0,3.0,0.0,83.0,17.0,44.1,33.666667,31.0,46.333333,29840.0,0.626136,No
4,CE-2090451,LARGE,F,31,N,N,N,THIRD PARTY SALES CHANNEL,12.0,3.0,1.0,23.0,76.0,51.8,38.666667,15.333333,88.333333,9413.0,0.660848,No


In [4]:
# Feature matrix: keep every row, restricted to the two numeric model inputs.
X = churn_data.loc[:, ["Age", "AvgCallsOut"]]

In [5]:
# Preview the selected feature columns.
X.head()

Unnamed: 0,Age,AvgCallsOut
0,25,395.0
1,25,106.333333
2,23,104.333333
3,22,46.333333
4,31,88.333333


In [6]:
# Collapse the raw Churn spellings (e.g. "Y", "Yes", "No") into two class labels:
# anything starting with "N" becomes "Loyal", everything else becomes "Churned".
# Values are trimmed and upper-cased first so variants such as "no" or " N" are
# treated the same way. The vectorised .str accessor propagates missing values
# instead of raising AttributeError on a float NaN, as a per-element
# x.startswith() call would.
is_loyal = churn_data["Churn"].str.strip().str.upper().str.startswith("N")
y = is_loyal.map({True: "Loyal", False: "Churned"})
# Fail early with a clear message rather than handing unlabeled rows to the classifier.
assert y.notna().all(), "Churn column contains missing or non-string values"
y.head()

0    Churned
1    Churned
2    Churned
3      Loyal
4      Loyal
Name: Churn, dtype: object

### 4. Create a PMML Pipeline

In [7]:
# Preprocessing step: fill missing (NaN) values in both input columns
# with the column median.
preprocessing_mapper = DataFrameMapper(
    features=[
        (
            ["Age", "AvgCallsOut"],
            [SimpleImputer(missing_values=numpy.nan, strategy="median")],
        ),
    ]
)

In [8]:
# Create a random forest classifier with 20 trees.
# random_state is fixed so the fitted forest — and therefore the exported PMML
# and the predicted probabilities — are reproducible across Restart & Run All.
churn_classifier = RandomForestClassifier(n_estimators=20, random_state=42)

In [9]:
# Chain the preprocessing mapper and the classifier into one PMML pipeline,
# so both steps are fitted together and exported as a single unit.
pipeline = PMMLPipeline(
    steps=[
        ("PRE-PROCESS", preprocessing_mapper),
        ("MODEL-TRAIN", churn_classifier),
    ]
)



In [10]:
# Fit the whole pipeline (imputation step, then the classifier) on the
# feature frame X and the label series y. As the last expression in the
# cell, the fitted pipeline's repr is displayed as the cell output.
pipeline.fit(X, y)



PMMLPipeline(steps=[('PRE-PROCESS', DataFrameMapper(features=[(['Age', 'AvgCallsOut'],
                           [SimpleImputer(strategy='median')])])),
       ('MODEL-TRAIN', RandomForestClassifier(n_estimators=20))])

### 5. Save Model

In [11]:
# Export the fitted pipeline to a PMML file in the working directory.
sklearn2pmml(
    pipeline,
    "churn_sklearn.pmml",
    with_repr=True,
)

### 6. Load the PMML Model with PyPMML

In [12]:
# Read the exported PMML file back in as a PyPMML model for scoring.
model = Model.load("churn_sklearn.pmml")

In [13]:
# List the input field names the loaded PMML model expects.
model.inputNames

['Age', 'AvgCallsOut']

In [14]:
# List the output field names the loaded PMML model produces.
model.outputNames

['probability(Churned)', 'probability(Loyal)']

In [15]:
# Score a single record passed as a plain list.
# NOTE(review): the values are presumably positional in the order of model.inputNames
# (Age, AvgCallsOut), with results in the order of model.outputNames — confirm against
# the PyPMML documentation.
model.predict([25, 395])

[0.85, 0.15]