In [None]:
import sys

# Root directory of the course environment; datapath points at its datasets/ subdirectory.
rootpath = '/class/'
datapath = f'{rootpath}datasets/'
# Put rootpath on the module search path. This must run before the
# pyspark_helpers import below, otherwise that import cannot be resolved from rootpath.
sys.path.append(rootpath)
# NOTE(review): star import - presumably this is where initspark comes from
# (and possibly the display() used by later cells); confirm before narrowing it.
from pyspark_helpers import *
# initspark() returns three values, bound here as sc, spark and conf
# (presumably the SparkContext, SparkSession and Spark configuration - TODO confirm).
sc, spark, conf = initspark()



In [None]:
import pandas as pd
import sklearn as sk
import numpy as np
from sklearn.datasets import load_iris
# Load the iris dataset through sklearn.datasets.load_iris.
iris = load_iris()
# Show which entries the loaded object exposes, then its feature names.
print(iris.keys())
print(iris['feature_names'])

In [None]:
# Pull the 'data' and 'target' entries out of the loaded iris object.
iris_features = iris['data']
iris_label = iris['target']
# Column names used for the four data columns (and later as assembler inputs).
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Build a pandas frame with the data columns followed by a trailing 'target'
# column, then hand it to Spark.
data1 = pd.DataFrame(
    data=np.column_stack((iris_features, iris_label)),
    columns=[*features, 'target'],
)
irisDF = spark.createDataFrame(data1)
display(irisDF)

# Round-trip the Spark DataFrame back into pandas.
irisPandas = irisDF.toPandas()


In [None]:
from pyspark.ml.feature import VectorAssembler
# Combine the columns listed in the Python list `features` into one vector
# column. Note the output column is also literally named "features".
vecAssembler = VectorAssembler(inputCols=features, outputCol="features")
# dfML is irisDF run through the assembler; later cells fit their estimators on it.
dfML = vecAssembler.transform(irisDF)
display(dfML)


In [None]:
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
from pyspark.sql.functions import expr

# Number of k-means clusters. This is the single source of truth for `k` below,
# so changing it here actually changes the fitted model.
CLUSTERS = 3

# Fit k-means on the assembled 'features' vectors with an explicit seed.
kmeans = KMeans(k=CLUSTERS, seed=1)
model = kmeans.fit(dfML)
centroids = model.clusterCenters()

# Score the same rows; the result carries a 'prediction' column (selected below).
predictions = model.transform(dfML)

# Collect the (prediction, target) pairs to the driver as a list of Row objects.
x = predictions.select('prediction', 'target').collect()

# Three ways to look at the first Row: as a Row, as a plain tuple, and by
# attribute / key access on individual fields.
print(x[0])
print('-->', tuple(x[0]))
print(x[0].prediction, x[0]['target'])

# good recipe to convert list of Row objects into list of tuples
print(list(map(tuple, x)))



In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import SQLTransformer
display(irisDF)

# Add two ratio columns via Spark SQL. __THIS__ is the placeholder that
# SQLTransformer uses for the DataFrame it is applied to. The adjacent
# literals concatenate into a single statement string.
sql = (
    "SELECT *, "
    "sepal_length/sepal_width as sepal_ratio, "
    "petal_length/petal_width as petal_ratio "
    "FROM __THIS__"
)
sqlTransformer = SQLTransformer(statement=sql)

# Despite its name, sqlModel is the transformed DataFrame (the result of
# transform()), not a fitted model.
sqlModel = sqlTransformer.transform(irisDF)
display(sqlModel)


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler
# Summary statistics of the raw columns, shown next to the scaler's fitted values below.
display(irisDF.describe())

# Scale the assembled 'features' vector into a new 'scaled_features' column.
# NOTE(review): only inputCol/outputCol are passed, so every other
# StandardScaler option stays at its library default - confirm that the
# default centering/scaling behaviour is what this demo intends.
standardScaler = StandardScaler(inputCol='features', outputCol='scaled_features')
# NOTE: this rebinds the notebook-level name `model`, which the KMeans cell also assigns.
model = standardScaler.fit(dfML)
# The fitted scaler exposes the values it learned from dfML.
print(model.mean, model.std)
dfMLScaled = model.transform(dfML)
# Show the input and the scaled output one after the other for comparison.
display(dfML)
display(dfMLScaled)


In [None]:
#from pyspark.ml.linalg import Vectors
#from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import RFormula

# Describe the model R-style ('target' on the left, the four measurement
# columns on the right), fit the formula on irisDF and apply it to the same
# frame in one chain. Rform therefore holds the transformed DataFrame.
Rform = (
    RFormula(formula = 'target~sepal_length + sepal_width + petal_length + petal_width')
    .fit(irisDF)
    .transform(irisDF)
)
display(Rform)


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import RFormula

# Stand-alone demonstration: split sepal_width into two buckets, fitting and
# applying the discretizer on irisDF in a single chain.
quantile = (
    QuantileDiscretizer(inputCol='sepal_width', outputCol='sepal_width_bucket', numBuckets = 2)
    .fit(irisDF)
    .transform(irisDF)
)

display(quantile)

# Unfitted two-bucket discretizers for the pipeline, one per measurement
# column; each writes to '<column>_bucket'. The generator yields exactly four
# estimators, unpacked in column order.
quantile1, quantile2, quantile3, quantile4 = (
    QuantileDiscretizer(inputCol=column, outputCol=column + '_bucket', numBuckets=2)
    for column in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
)

# This stage reads sepal_length_bucket, so it has to come after quantile1 in
# the stage list.
sql = "SELECT *, sepal_length * sepal_length_bucket as sl FROM __THIS__"
sqlTransformer = SQLTransformer(statement=sql)

# Here Rform is the unfitted RFormula stage over the four bucket columns.
Rform = RFormula(formula = 'target~sepal_length_bucket + sepal_width_bucket + petal_length_bucket + petal_width_bucket')

# The RFormula stage goes last because it needs every bucket column to exist.
stages = [quantile1, quantile2, sqlTransformer, quantile3, quantile4, Rform]
pipeline = Pipeline(stages=stages)

# Fit all stages on irisDF, then run the fitted pipeline over the same frame.
dfPipeModel = pipeline.fit(irisDF)
dfPipe = dfPipeModel.transform(irisDF)
# Narrow view keeping only the 'label' and 'features' columns; the full
# frame is what gets displayed.
dfPipe2 = dfPipe.select('label', 'features')
display(dfPipe)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import PCA
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import RFormula

#    , 'SQL' : (SQLTransformer, {'statement':sql})
#    , 'RFormula' : (RFormula, {'formula':'target~sepal_length + sepal_width + petal_length + petal_width'})
#    , 'quantile' : (QuantileDiscretizer, {'inputOutput':True, 'numBuckets':2})


# Feature transformers to try on the assembled 'features' vector column.
# Each entry maps a short name to (estimator class, extra constructor kwargs).
scalers = {
      'standard' : (StandardScaler,{})
    , 'maxAbs' : (MaxAbsScaler,{})
    , 'minMax' : (MinMaxScaler,{})
    , 'PCA' : (PCA, {'k':2})
}

for name, (estimatorClass, extraParams) in scalers.items():
    print (name)
    # Build a fresh kwargs dict instead of update()-ing the one stored in
    # `scalers`, so the configuration table is left untouched by the loop.
    # Extra params come first, so the printed key order matches the old output.
    params = {**extraParams, 'inputCol':'features', 'outputCol':name+'_features'}
    print (params)
    scaler = estimatorClass(**params)

    # Despite the name, `model` holds the transformed DataFrame: the estimator
    # is fitted on dfML and immediately applied back to it.
    model = scaler.fit(dfML).transform(dfML)
    display(model)
