# USE SPARK MLLIB & SPARK MSFT MLLIB
********************FLOW SERVE CLASSIFICATION MODEL BASED ON SAMPLE DATA ***********************
***********INPUT FILE, SCHEMA & DETAILS : parts_classification.data.csv*************************
PartCode -> string -> It is the part number/code
OrganizationKey -> integer -> range 1 – 5
An identifier for a part of Flowserve which combined with the PartCode produces a unique id for the part
ItemDescription -> string -> the free text description of the part; this is what is used to classify the part
EuropumpBubbleNumber -> string -> the target to predict from the ItemDescription

In [8]:
import os, urllib
import pandas as pd
import mmlspark
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

***********Import File, Parse the fields & prepare label for classification********************

In [7]:
spark

<pyspark.sql.session.SparkSession object at 0x7f8827c5b0f0>

In [9]:

#dataFile = "adl://srramhdi.azuredatalakestore.net/clusters/srramhdir/HdiNotebooks/sample.csv"
textSchema = StructType([StructField("partcode", StringType(), False),
                         StructField("orgkey", StringType(), False),
                         StructField("itemdesc",StringType(), False),
                         StructField("europumpbubbleno", StringType(), False)])



In [11]:
data = spark.createDataFrame(pd.read_csv("adl://srramhdi.azuredatalakestore.net/Files/sample.csv", sep=",", header=None), textSchema)
data.registerTempTable("data1")
df = spark.sql("SELECT itemdesc,CASE WHEN cast(europumpbubbleno as int) = 2200 THEN 1 \
                 WHEN cast(europumpbubbleno as int) = 1100 THEN 2 \
               ELSE 0 END as label FROM data1")
df.limit(2).toPandas()

File b'adl://srramhdi.azuredatalakestore.net/Files/sample.csv' does not exist
Traceback (most recent call last):
  File "/usr/bin/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py", line 562, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/usr/bin/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py", line 315, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/usr/bin/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py", line 645, in __init__
    self._make_engine(self.engine)
  File "/usr/bin/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py", line 799, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/usr/bin/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py", line 1213, in __init__
    self._reader = _parser.TextReader(src, **kwds)
  File "pandas/parser.pyx", line 358, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3427

# ******Using plain old Spark ML Lib********************************
******STEP 1 : CREATE FEATURE VECTOR FROM FREE FORM TEXT**********

In [26]:
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.feature import VectorAssembler
tokenizer = Tokenizer(inputCol="itemdesc", outputCol="tokenizedText")
tokenizedData = tokenizer.transform(df)
numFeatures = 1000
hashingScheme = HashingTF(inputCol="tokenizedText",
                          outputCol="features",
                          numFeatures=numFeatures)
featurizedData = hashingScheme.transform(tokenizedData)
processedData = featurizedData.withColumn("label", featurizedData["label"]) \
                             .select(["features", "label"])
#featurizedData.take(7)

Unnamed: 0,itemdesc,label,tokenizedText
0,CASING,0,[casing]
1,CASING,0,[casing]
2,???BRG|SS400|[*AN12 D80 T11 2]|????1||H11|?????,0,"[???brg|ss400|[*an12, d80, t11, 2]|????1||h11|..."
3,PIPE OF THE SINGLE SCREW,0,"[pipe, of, the, single, screw]"
4,???????|SUS304TP|[*PT1/2X115L SCH80 0]|????1||...,0,"[???????|sus304tp|[*pt1/2x115l, sch80, 0]|????..."
5,???????|SUS316TP|[*PT3/4X140L SCH80 0]|????1||...,0,"[???????|sus316tp|[*pt3/4x140l, sch80, 0]|????..."
6,???????|SUS316TP|[*PT3/4X400L SCH80 0]|????1||...,0,"[???????|sus316tp|[*pt3/4x400l, sch80, 0]|????..."
7,PIPE OF THE SINGLE SCREW,0,"[pipe, of, the, single, screw]"
8,???????|STPT370|[*NPT1/2X60L SCH80]|????1||H11...,0,"[???????|stpt370|[*npt1/2x60l, sch80]|????1||h..."
9,????|SS400||????1||H11|?????,0,[????|ss400||????1||h11|?????]


# Trying three different models from Spark MLLib

In [37]:
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
# Prepare data for learning
train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123)

#Train the models on the 'train' data
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [LogisticRegression(regParam = hyperParam)
                       for hyperParam in lrHyperParams]
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName="areaUnderROC")
metrics = []
models = []

# Select the best model
for learner in logisticRegressions:
    model = learner.fit(train)
    models.append(model)
    scored_data = model.transform(test)
    metrics.append(evaluator.evaluate(scored_data))
best_metric = max(metrics)
best_model = models[metrics.index(best_metric)]

# Save model
best_model.write().overwrite().save("sparkmlflowserveclassification.mmls")
# Get AUC on the validation dataset
scored_val = best_model.transform(validation)
print(evaluator.evaluate(scored_val))

numNodes =  13
depth =  3


In [48]:
##Build a Decision Tree Model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3)
dtModel = dt.fit(train)
print ("numNodes = ", dtModel.numNodes)
print ("depth = ", dtModel.depth)
# Evaluate model
evaluator = MulticlassClassificationEvaluator()
predictions = dtModel.transform(test)
evaluator.evaluate(predictions)

0.9978040890743592

# Tuning model Hyper Parameters for decision Tree

In [45]:
##Tune Hyper parameters for Decision Tree model 
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1,2,6,10])
             .addGrid(dt.maxBins, [20,40,80])
             .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
cvModel = cv.fit(train)
# Evaluate model
predictions = cvModel.transform(test)
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)

In [50]:
# Create an initial RandomForest model.
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
# Train model with Training Data
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)

0.9974381737296761

# Using MML Spark 1.Eliminate lot of boiler plate code/ Feauturize efficiently

In [51]:
#Use TextFeaturizer to generate our features column. We remove stop words, and use TF-IDF to generate 2²⁰ sparse features.
processedData = textFeaturizer.transform(df)
processedData.limit(5).toPandas()

NameError: name 'textFeaturizer' is not defined

# Text Feauturizer

In [53]:
from mmlspark.TextFeaturizer import TextFeaturizer
textFeaturizer = TextFeaturizer() \
  .setInputCol("itemdesc").setOutputCol("features") \
  .setUseStopWordsRemover(True).setUseIDF(True).setMinDocFreq(5).setNumFeatures(1 << 16).fit(df)
processedData = processedData.withColumn("label", processedData["label"]) \
                             .select(["features", "label"])
processedData.limit(5).toPandas()

Unnamed: 0,features,label
0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
4,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


Train several Logistic Regression models with different regularizations.

In [54]:
train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20])
from pyspark.ml.classification import LogisticRegression
#from pyspark.ml.classification import DecisionTreeClassifier
from mmlspark.TrainClassifier import TrainClassifier
model = TrainClassifier(model=LogisticRegression(),labelCol="label").fit(train)
model.write().overwrite().save("partsclassification.mml")
#lrHyperParams = [0.05, 0.1, 0.2, 0.4]
#logisticRegressions = [LogisticRegression(regParam = hyperParam) for hyperParam in lrHyperParams]


#lrmodels = [TrainClassifier(model=lrm, labelCol="label").fit(train) for lrm in logisticRegressions]

In [55]:
from mmlspark import ComputeModelStatistics, TrainedClassifierModel
predictionModel = TrainedClassifierModel.load("partsclassification.mml")
prediction = predictionModel.transform(test)


In [56]:
metrics = ComputeModelStatistics().transform(prediction)
metrics.limit(10).toPandas()

Unnamed: 0,evaluation_type,predicted_class_as_0.0_actual_is_0.0,predicted_class_as_0.0_actual_is_1.0,predicted_class_as_0.0_actual_is_2.0,predicted_class_as_1.0_actual_is_0.0,predicted_class_as_1.0_actual_is_1.0,predicted_class_as_1.0_actual_is_2.0,predicted_class_as_2.0_actual_is_0.0,predicted_class_as_2.0_actual_is_1.0,predicted_class_as_2.0_actual_is_2.0,accuracy,precision,recall,average_accuracy,macro_averaged_precision,macro_averaged_recall
0,Classification,9798.0,10.0,11.0,9.0,8.0,0.0,1.0,0.0,0.0,0.996849,0.996849,0.996849,0.997899,0.489483,0.481142


In [59]:
#from mmlspark import FindBestModel, BestModel
#bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)
#bestModel.write().overwrite().save("model.mml")
#loadedBestModel = BestModel.load("model.mml")

In [58]:
from mmlspark.ComputeModelStatistics import ComputeModelStatistics
predictionModel = TrainedClassifierModel.load("partsclassification.mml")
predictions = predictionModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
print("Best model's accuracy on validation set = "
      + "{0:.2f}%".format(metrics.first()["accuracy"] * 100))

Best model's accuracy on validation set = 99.80%
