In [54]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (ChiSqSelector, HashingTF, 
Imputer, MinMaxScaler, Normalizer, QuantileDiscretizer, 
StandardScaler, Tokenizer)
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('test').getOrCreate()

from pyspark.ml.pipeline import Estimator, Model, Pipeline
from pyspark.ml.param.shared import *
from pyspark.sql.functions import avg, stddev_samp


class HasMean(Params):

    mean = Param(Params._dummy(), "mean", "mean", 
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(HasMean, self).__init__()

    def setMean(self, value):
        return self._set(mean=value)

    def getMean(self):
        return self.getOrDefault(self.mean)
    
class HasStandardDeviation(Params):

    stddev = Param(Params._dummy(), "stddev", "stddev", 
        typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(HasStandardDeviation, self).__init__()

    def setStddev(self, value):
        return self._set(stddev=value)

    def getStddev(self):
        return self.getOrDefault(self.stddev)

class HasCenteredThreshold(Params):

    centered_threshold = Param(Params._dummy(),
            "centered_threshold", "centered_threshold",
            typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(HasCenteredThreshold, self).__init__()

    def setCenteredThreshold(self, value):
        return self._set(centered_threshold=value)

    def getCenteredThreshold(self):
        return self.getOrDefault(self.centered_threshold)

    
class NormalDeviation(Estimator, HasInputCol, 
        HasPredictionCol, HasCenteredThreshold):

    def _fit(self, dataset):
        c = self.getInputCol()
        mu, sigma = dataset.agg(avg(c), stddev_samp(c)).first()
        return (NormalDeviationModel()
            .setInputCol(c)
            .setMean(mu)
            .setStddev(sigma)
            .setCenteredThreshold(self.getCenteredThreshold())
            .setPredictionCol(self.getPredictionCol()))

class NormalDeviationModel(Model, HasInputCol, HasPredictionCol,
        HasMean, HasStandardDeviation, HasCenteredThreshold):

    def _transform(self, dataset):
        x = self.getInputCol()
        y = self.getPredictionCol()
        threshold = self.getCenteredThreshold()
        mu = self.getMean()
        sigma = self.getStddev()

        return dataset.withColumn(y, (dataset[x] - mu) > threshold * sigma)
    
df = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 0.0), (4, 99.0)], ["id", "x"])

normal_deviation = NormalDeviation().setInputCol("x").setCenteredThreshold(1.0)
model  = Pipeline(stages=[normal_deviation]).fit(df)

df = model.transform(df)
df.show()
f = df.schema.fields[0]

+---+----+----------+
| id|   x|prediction|
+---+----+----------+
|  1| 2.0|     false|
|  2| 3.0|     false|
|  3| 0.0|     false|
|  4|99.0|      true|
+---+----+----------+



'id'

In [46]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (ChiSqSelector, HashingTF, 
Imputer, MinMaxScaler, Normalizer, QuantileDiscretizer, 
StandardScaler, Tokenizer)
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('test').getOrCreate()

from pyspark.ml.pipeline import Estimator, Model, Pipeline
from pyspark.ml.param.shared import *
from pyspark.sql.functions import col, min

class HasMin(Params):
    minimum = Param(Params._dummy(), "minimum", "minimum")

    def __init__(self):
        super(HasMin, self).__init__()

    def setMin(self, value):
        return self._set(minimum=value)

    def getMin(self):
        return self.getOrDefault(self.minimum)

class MinTransformation(Estimator, HasInputCol, 
        HasOutputCol, HasMin):

    def _fit(self, dataset):
        c = self.getInputCol()
        self.minimum = dataset.agg(min(c)).first()[0]
        return (MinTransformationModel()
            .setInputCol(c)
            .setMin(self.minimum)
            .setOutputCol(self.getOutputCol()))

class MinTransformationModel(Model, HasInputCol, HasOutputCol, HasMin):

    def _transform(self, dataset):
        x = self.getInputCol()
        y = self.getOutputCol()
        minimum = self.getMin()
        meta = { 'const' : minimum}
        return dataset.withColumn(y, (col(x) - minimum).alias(y, metadata=meta))

class HasConst(Params):
    const = Param(Params._dummy(), "const", "const")

    def __init__(self):
        super(HasConst, self).__init__()

    def setConst(self, value):
        return self._set(const=value)

    def getConst(self):
        return self.getOrDefault(self.const)

class ConstTransformation(Estimator, HasInputCol, 
        HasOutputCol, HasConst):

    def _fit(self, dataset):
        c = self.getInputCol()
        self.const = c.
        return (ConstTransformationModel()
            .setInputCol(c)
            .setConst(const)
            .setOutputCol(self.getOutputCol()))

class ConstTransformationModel(Model, HasInputCol, HasOutputCol, HasConst):

    def _transform(self, dataset):
        x = self.getInputCol()
        y = self.getOutputCol()
        const = self.getConst()

        return dataset.withColumn(y, col(x) + const)
    
df = spark.createDataFrame([(1, 2.0), (2, 3.0), (3, 1.0), (4, 99.0)], ["id", "x"])

minTransformation = MinTransformation().setInputCol("x").setOutputCol("prediction")
constTransformation = ConstTransformation().setInputCol("prediction").setOutputCol("prediction_2")\
            .setConst(minTransformation.getMin())
# constTransformation = ConstTransformation().setInputCol("x").setOutputCol("prediction").setConst(20)

# pipeline = Pipeline(stages=[minTransformation, constTransformation])
# model  = pipeline.fit(df)

model1 = minTransformation.fit(df)
df = model1.transform(df)

print(model1.getMin())

model2 = constTransformation.fit(df)
df = model2.transform(df)

df.show()

KeyError: Param(parent='MinTransformation_48bf9ec018eaf3f54832', name='minimum', doc='minimum')

In [9]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

scaledData = dataFrame

scaler = MinMaxScaler(inputCol="features", outputCol="minMaxScaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(scaledData)

scaler = StandardScaler(inputCol="features", outputCol="standardScaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(dataFrame)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(scaledData)


scaledData.select("features", "standardScaledFeatures", "minMaxScaledFeatures").show()

+--------------+----------------------+--------------------+
|      features|standardScaledFeatures|minMaxScaledFeatures|
+--------------+----------------------+--------------------+
|[1.0,0.1,-1.0]|  [1.0,0.0181568259...|       [0.0,0.0,0.0]|
| [2.0,1.1,1.0]|  [2.0,0.1997250857...|       [0.5,0.1,0.5]|
|[3.0,10.1,3.0]|  [3.0,1.8338394239...|       [1.0,1.0,1.0]|
+--------------+----------------------+--------------------+



In [22]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql.functions import min, max

data = [(0.0, Vectors.dense(0.5, 10.0)),
        (0.0, Vectors.dense(1.5, 20.0)),
        (1.0, Vectors.dense(1.5, 30.0)),
        (0.0, Vectors.dense(3.5, 30.0)),
        (0.0, Vectors.dense(3.5, 40.0)),
        (1.0, Vectors.dense(3.5, 40.0))]
df = spark.createDataFrame(data, ["label", "features"])

# r = ChiSquareTest.test(df, "features", "label").head()
# print("pValues: " + str(r.pValues))
# print("degreesOfFreedom: " + str(r.degreesOfFreedom))
# print("statistics: " + str(r.statistics))

df.agg(min('label')).first()[0]

0.0

In [10]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data
data = [(2, Vectors.dense(0.5, 10.0)),
        (0, Vectors.dense(1.5, 20.0)),
        (1, Vectors.dense(1.5, 30.0)),
        (2, Vectors.dense(3.5, 30.0)),
        (1, Vectors.dense(3.5, 40.0)),
        (1, Vectors.dense(3.5, 40.0))]
data = spark.createDataFrame(data, ["label", "features"])

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
# train = splits[0]
# test = splits[1]

train = data

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [2, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

Test set accuracy = 0.5
