# 3. Modélisation

Azure ML & Azure Databricks notebooks by Parashar Shah.
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.

## 1. Partitionnement des données

In [3]:
import os
import pprint
import numpy as np

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [4]:
# Locations of the Parquet datasets holding the train and test splits.
train_data_path = "AdultCensusIncomeTrain"
test_data_path = "AdultCensusIncomeTest"

# Load both splits as Spark DataFrames.
train = spark.read.parquet(train_data_path)
test = spark.read.parquet(test_data_path)

# Report (row count, column count) for each split.
train_shape = (train.count(), len(train.columns))
test_shape = (test.count(), len(test.columns))
print("Train : ({}, {})".format(*train_shape))
print("Test : ({}, {})".format(*test_shape))

# Blank separator line, then the training schema.
print()
train.printSchema()

## 2. Construction du pipeline de Machine Learning

In [6]:
# Name of the target column.
label = "income"

reg = 0.1
print("Taux de régularisation est {}.".format(reg))

# Logistic regression estimator using the regularization strength above.
lr = LogisticRegression(regParam=reg)

# Column name -> Spark type name for every column except the target.
dtypes = dict(train.dtypes)
dtypes.pop(label)

si_xvars = []     # StringIndexer stages, one per string column
ohe_xvars = []    # OneHotEncoder stages, one per string column
featureCols = []  # columns handed to the VectorAssembler, in schema order
for key, dtype in dtypes.items():
    if dtype != "string":
        # Non-string columns go to the assembler as they are.
        featureCols.append(key)
        continue

    # String columns are indexed into a temporary column, then one-hot
    # encoded into the column that the assembler consumes.
    tmpCol = "-".join([key, "tmp"])
    featureCol = "-".join([key, "encoded"])
    featureCols.append(featureCol)

    # StringIndexer.handleInvalid controls how invalid data (unseen labels or
    # NULL values) is handled: 'skip' filters out such rows, 'error' throws,
    # 'keep' puts them in an extra bucket at index numLabels (default: 'error').
    # https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/feature/StringIndexer.html
    # NOTE(review): with 'skip', rows containing unseen categories are dropped
    # at transform time as well - confirm this is intended for the test set.
    si_xvars.append(StringIndexer(inputCol=key, outputCol=tmpCol, handleInvalid="skip"))
    ohe_xvars.append(OneHotEncoder(inputCol=tmpCol, outputCol=featureCol))

# Index the string target into the numeric 'label' column.
si_label = StringIndexer(inputCol=label, outputCol='label')

# Gather every feature column into a single 'features' vector column.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

# Pipeline definition: index -> encode -> index label -> assemble -> classifier.
pipe = Pipeline(stages=[*si_xvars, *ohe_xvars, si_label, assembler, lr])

# Fit the pipeline on the training data.
model = pipe.fit(train)
print(model)

## 3. Ajustement du Pipeline

In [8]:
# Candidate regularization strengths: 0.0 up to (but excluding) 1.0 in steps of 0.2.
regs = np.arange(0.0, 1.0, 0.2)

# One parameter map per candidate regParam value.
paramGrid = ParamGridBuilder().addGrid(lr.regParam, regs).build()

# Cross-validate the whole pipeline over the grid. A fixed seed makes the
# random fold assignment - and therefore the selected model - reproducible
# from one run to the next.
cv = CrossValidator(
    estimator=pipe,
    evaluator=BinaryClassificationEvaluator(),
    estimatorParamMaps=paramGrid,
    seed=42,
)

In [9]:
# Run the cross-validation over the parameter grid on the training DataFrame.
cvModel = cv.fit(train)

In [10]:
# Retrieve the best model found by cross-validation. This rebinds `model`,
# replacing the pipeline model fitted earlier with the fixed reg value.

model = cvModel.bestModel


## 4. Evaluation du modèle de ML

In [12]:
# Score the test set with the selected model.
pred = model.transform(test)

# Keep a few input columns next to the true income and the predicted class.
output = pred.select(
    'hours_per_week',
    'age',
    'workclass',
    'marital_status',
    'income',
    'prediction',
)
display(output)

hours_per_week,age,workclass,marital_status,income,prediction
1,21,Private,Never-married,<=50K,0.0
1,23,Private,Never-married,<=50K,0.0
1,27,Private,Never-married,<=50K,0.0
1,72,?,Married-civ-spouse,<=50K,0.0
2,32,?,Never-married,<=50K,0.0
2,32,Private,Married-civ-spouse,<=50K,0.0
2,47,Private,Married-civ-spouse,>50K,1.0
2,61,?,Married-civ-spouse,>50K,0.0
2,67,?,Married-civ-spouse,<=50K,0.0
2,67,Self-emp-not-inc,Widowed,>50K,1.0


In [13]:
# Re-display the selected prediction columns (same `output` DataFrame as the previous cell).
display(output)

hours_per_week,age,workclass,marital_status,income,prediction
1,21,Private,Never-married,<=50K,0.0
1,23,Private,Never-married,<=50K,0.0
1,27,Private,Never-married,<=50K,0.0
1,72,?,Married-civ-spouse,<=50K,0.0
2,32,?,Never-married,<=50K,0.0
2,32,Private,Married-civ-spouse,<=50K,0.0
2,47,Private,Married-civ-spouse,>50K,1.0
2,61,?,Married-civ-spouse,>50K,0.0
2,67,?,Married-civ-spouse,<=50K,0.0
2,67,Self-emp-not-inc,Widowed,>50K,1.0


In [14]:
# A single evaluator is reused for both metrics; only its metric name changes.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')

# Area under the ROC curve on the test predictions.
bce.setMetricName('areaUnderROC')
au_roc = bce.evaluate(pred)

# Area under the precision-recall curve on the same predictions.
bce.setMetricName('areaUnderPR')
au_prc = bce.evaluate(pred)

print("Métriques de qualité du modèle obtenu : ")
print()
print("Area under ROC : {}".format(au_roc))
print("Area Under PR : {}".format(au_prc))

## 5. Persistance du modèle

In [16]:
# Name under which the model is saved, and the /dbfs-prefixed path reported to the user.
model_name = "AdultCensus.mml"
model_dbfs = os.path.join("/dbfs", model_name)

# Save the fitted model, replacing any copy left by a previous run.
writer = model.write().overwrite()
writer.save(model_name)
print("Sauvegarde du modèle de ML : {}".format(model_dbfs))

In [17]:
%sh

# List the files under /dbfs/AdultCensus.mml, the path printed by the save cell.
ls -la /dbfs/AdultCensus.mml/*

In [18]:
# End the notebook run, passing "success" as its exit value.
dbutils.notebook.exit("success")

success