In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (ChiSqSelector, HashingTF, 
Imputer, MinMaxScaler, Normalizer, OneHotEncoderEstimator, QuantileDiscretizer, 
StandardScaler, Tokenizer)
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName('test').getOrCreate()

from pyspark.ml.linalg import Vectors
df = spark.createDataFrame([('one',), ('two',), ('one',)], ["input"])
ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"])
model = ohe.fit(df)
model.transform(df).head().output

IllegalArgumentException: 'requirement failed: Column input must be of type NumericType but was actually of type StringType.'

In [None]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

scaledData = dataFrame

scaler = MinMaxScaler(inputCol="features", outputCol="minMaxScaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(scaledData)

scaler = StandardScaler(inputCol="features", outputCol="standardScaledFeatures",
                        withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(dataFrame)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(scaledData)


scaledData.select("features", "standardScaledFeatures", "minMaxScaledFeatures").show()

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.sql.functions import min, max

data = [(0.0, Vectors.dense(0.5, 10.0, 10.0)),
        (0.0, Vectors.dense(1.5, 20.0, 10.0)),
        (1.0, Vectors.dense(1.5, 30.0, 20.0)),
        (0.0, Vectors.dense(3.5, 30.0, 10.0)),
        (0.0, Vectors.dense(3.5, 40.0, 10.0)),
        (1.0, Vectors.dense(3.5, 40.0, 20.0))]
df = spark.createDataFrame(data, ["label", "features"])

r = ChiSquareTest.test(df, "features", "label").head()
print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))


pValues: [0.687289278791,0.682270330336,0.0143058784354]
degreesOfFreedom: [2, 3, 1]
statistics: [0.75,1.5,6.0]


In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Load training data
data = [(2, Vectors.dense(0.5, 10.0)),
        (0, Vectors.dense(1.5, 20.0)),
        (1, Vectors.dense(1.5, 30.0)),
        (2, Vectors.dense(3.5, 30.0)),
        (1, Vectors.dense(3.5, 40.0)),
        (1, Vectors.dense(3.5, 40.0))]
data = spark.createDataFrame(data, ["label", "features"])

# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
# train = splits[0]
# test = splits[1]

train = data

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [2, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))