In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import pandas as pd
import numpy as np
import matplotlib.pylab as plt

from datetime import datetime, timedelta, date

from matplotlib.pylab import rcParams
# Default size (width, height) applied to every matplotlib figure in this notebook.
rcParams['figure.figsize'] = (15, 6)

from time import time
import logging

In [None]:
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler


from pyspark.sql.types import *
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.util import MLUtils

In [None]:
# Get (or reuse) a Hive-enabled Spark session that runs on a local master.
spark = (
    SparkSession.builder
    .master("local")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Read the pipe-delimited CSV into a pandas frame and preview its first rows.
csv_path = 'winequality.csv'
df = pd.read_csv(csv_path, sep="|")
df.head()

In [None]:
# Count missing values per column; a per-column total is readable at a glance,
# whereas the raw boolean mask prints one True/False per cell of the frame.
df.isnull().sum()

In [None]:

# Spark schema for the frame loaded from the CSV.
# All measurement columns are declared as DoubleType so that they share one
# type and keep the 64-bit precision pandas parsed them with (FloatType is
# 32-bit and would round the values on conversion).
schema_measurement_names = [
    "fixedacidity",
    "volatileacidity",
    "citricacid",
    "residualsugar",
    "chlorides",
    "freesulfurdioxide",
    "totalsulfurdioxide",
    "density",
    "ph",
    "sulphates",
    "alcohol",
]
schema = StructType(
    [StructField(name, DoubleType(), True) for name in schema_measurement_names]
    # `quality` is the integer target column; like the others it is nullable.
    + [StructField("quality", IntegerType(), True)]
)

In [None]:
# Convert the pandas frame into a Spark DataFrame, applying the explicit schema.
dataset = spark.createDataFrame(data=df, schema=schema)

In [None]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook
# yields the same partitions and therefore comparable metrics between runs.
SPLIT_SEED = 42
train, test = dataset.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [None]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer

# Every column except the `quality` target is a candidate feature.
featuresCols = [name for name in dataset.columns if name != 'quality']
# Call print() as a function: the bare `print x` statement form is a
# SyntaxError on Python 3, and the pipeline cell already uses print(...).
print(featuresCols)

In [None]:
# Numeric input columns, in the order they are packed into the feature vector.
numericCols = [
    "fixedacidity",
    "volatileacidity",
    "citricacid",
    "residualsugar",
    "chlorides",
    "freesulfurdioxide",
    "totalsulfurdioxide",
    "density",
    "ph",
    "sulphates",
    "alcohol",
]
assemblerInputs = numericCols

# Combine the numeric columns into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)

# Start the list of pipeline stages with the assembler.
stages = [assembler]

In [None]:
# Index the `quality` column into a "label" column and queue the indexer as
# the next pipeline stage.
# NOTE(review): the classifier cell trains with labelCol="quality", so confirm
# whether this indexed "label" column is still needed. If it is, also decide
# how labels unseen during fit should be handled (see setHandleInvalid).
label_stringIdx = StringIndexer(outputCol="label", inputCol="quality")
stages.append(label_stringIdx)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest classifier (10 trees) trained directly on the raw `quality`
# column; features are read from the default "features" column.
# The seed is fixed so the trained forest is reproducible across runs.
rf = RandomForestClassifier(labelCol="quality", numTrees=10, seed=42)


In [None]:
# Queue the classifier as the final pipeline stage.
stages.append(rf)

In [None]:
# Build the pipeline from the stage objects themselves rather than from the
# `stages` list: that list is grown in place across three separate cells, so
# re-running any one of them leaves a duplicated stage in it and breaks the
# pipeline. Listing the stages here keeps this cell correct on re-runs.
pipeline = Pipeline(stages=[assembler, label_stringIdx, rf])

# fit() trains every stage on the training split;
# transform() then applies the fitted stages to the held-out split.
pipelineModel = pipeline.fit(train)
prediction = pipelineModel.transform(test)

# Show the input DataFrame's summary and a sample of the scored test rows.
print(dataset)
prediction.show()

In [None]:
# Keep only the model output and the true `quality` value, then preview them.
comparison_columns = ["prediction", "quality"]
predictionAndLabels = prediction.select(*comparison_columns)
predictionAndLabels.show()

In [None]:
# Preview the prediction/quality pairs again (first 20 rows, the default).
predictionAndLabels.show(n=20)

In [None]:
# Expose the true `quality` value under the name "label", the default label
# column of the Spark ML evaluators.
# The frame is derived from `prediction` rather than from predictionAndLabels
# itself, so the cell can be re-run: once renamed, predictionAndLabels no
# longer has a `quality` column and a self-referential select would fail.
predictionAndLabels = prediction.selectExpr("prediction", "quality as label")
predictionAndLabels.show()

In [None]:
# BinaryClassificationEvaluator stays imported in case later cells rely on it.
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# The "prediction" column holds hard class predictions on the `quality` scale,
# not 0/1 scores or probabilities, so a binary PR/ROC evaluator does not give a
# meaningful number here. Score the predictions with the multiclass evaluator.
# NOTE(review): assumes `quality` takes more than two distinct values - confirm
# against the data; the multiclass evaluator is also valid for two classes.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluator.setMetricName("f1")
evaluator.evaluate(predictionAndLabels)

In [None]:
# BinaryClassificationEvaluator stays imported in case later cells rely on it.
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# The "prediction" column holds hard class predictions on the `quality` scale,
# not 0/1 scores or probabilities, so area-under-ROC from a binary evaluator is
# not meaningful here. Report plain accuracy from the multiclass evaluator.
# NOTE(review): assumes `quality` takes more than two distinct values - confirm
# against the data; the multiclass evaluator is also valid for two classes.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluator.setMetricName("accuracy")
evaluator.evaluate(predictionAndLabels)

In [None]:
# Persist the fitted pipeline, replacing any model already saved at this path.
model_path = "hdfs://localhost/home/opentext/bda/ML_Model/RFnew"
model_writer = pipelineModel.write().overwrite()
model_writer.save(model_path)