In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.util import MLUtils
from pyspark.sql.types import LongType, DoubleType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import DenseVector
from pyspark.mllib.evaluation import MulticlassMetrics
import time
from pyspark.mllib.regression import LabeledPoint

In [None]:
# Create (or reuse) the Spark session against the standalone master and keep a
# handle to the underlying SparkContext for RDD-based code.
# NOTE(review): the app name says "Naive Bayes" although this notebook imports
# and trains logistic regression -- confirm whether the name should change.
spark = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("Naive Bayes")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Column names to cast later: the label column followed by f1 .. f1024.
columns = ['labels'] + ["f" + str(idx) for idx in range(1, 1025)]
# Load the CSV using its first line as the header. No schema or inferSchema
# option is passed here; the columns are cast explicitly in a later cell.
df = (
    spark.read
    .format('csv')
    .options(header='true')
    .load('/MLInput_u/MLInput_u.csv')
)

# Quick look at the loaded frame: schema first, then the default row preview.
df.printSchema()
df.show()

In [None]:
def convertColumn(df, names, newType):
    """Cast the given columns of ``df`` to ``newType``.

    The previous implementation called ``withColumn`` once per name. Each call
    adds another projection to the query plan, and the PySpark documentation
    warns that doing this in a loop over many columns can produce very large
    plans. When the column names are straightforward, all casts are now issued
    in a single ``select`` that keeps the original column order.

    Args:
        df: DataFrame-like object supporting ``df[name]``, ``df.columns``,
            ``df.select`` and ``df.withColumn``.
        names: iterable of column names to cast.
        newType: target type handed to ``Column.cast``.

    Returns:
        ``df`` itself when ``names`` is empty, otherwise a new DataFrame in
        which every listed column has been cast to ``newType``.
    """
    wanted = list(names)
    if not wanted:
        # Nothing to cast: the original loop would also hand back ``df`` as-is.
        return df

    existing = list(df.columns)
    # The single-select path is only taken when it is unambiguous: every
    # requested name matches an existing column exactly, column names are
    # unique, and none needs quoting to be looked up with ``df[name]``.
    plain_names = (
        all(name in existing for name in wanted)
        and len(set(existing)) == len(existing)
        and not any("." in col or "`" in col for col in existing)
    )
    if not plain_names:
        # Fall back to the original per-column behaviour so name resolution
        # (and any error raised for an unknown column) is left unchanged.
        for name in wanted:
            df = df.withColumn(name, df[name].cast(newType))
        return df

    targets = set(wanted)
    projection = [
        df[col].cast(newType).alias(col) if col in targets else df[col]
        for col in existing
    ]
    return df.select(*projection)

# Cast the label column and every f1..f1024 column to LongType.
# NOTE(review): this presumably assumes integer-valued inputs -- confirm
# against the data before relying on it.
df = convertColumn(df, names=columns, newType=LongType())

In [None]:
# Reshape each row into (label, feature vector): the first field is the label
# and all remaining fields are packed into a DenseVector.
input_data = df.rdd.map(
    lambda record: (record[0], DenseVector(record[1:]))
)
df_new = spark.createDataFrame(input_data, schema=["label", "features"])

# Optional feature standardisation, currently disabled. Uncommenting the lines
# below would fit a StandardScaler on "features" and write the scaled vectors
# to a new "features_scaled" column.
# NOTE(review): the later cells read the "features" column, so enabling this
# would also require pointing them at "features_scaled".
# standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
# scaler = standardScaler.fit(df_new)
# scaled_df = scaler.transform(df_new)

# scaled_df.take(2)
# scaled_df.printSchema()
# scaled_df.show()

# Without scaling: downstream cells expect `scaled_df`, so point it at the
# unscaled frame.
scaled_df = df_new

In [None]:
# Cast the label to double, then split 70/30 with a fixed seed so the split is
# reproducible.
# Fix: the cast used to read the column from `predictions`, which is only
# defined after the model has been fitted and applied further down, so running
# the notebook top to bottom on a fresh kernel raised NameError here. The label
# column of `scaled_df` itself is the intended source.
train_data, test_data = (
    scaled_df
    .withColumn("label", scaled_df["label"].cast("double"))
    .randomSplit([.7, .3], seed=1234)
)

# RDD[LabeledPoint] views of the same splits for the RDD-based mllib API.
rddTrain = train_data.rdd.map(lambda row: LabeledPoint(row['label'], row['features'].toArray()))
rddTest = test_data.rdd.map(lambda row: LabeledPoint(row['label'], row['features'].toArray()))

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
# CrossValidator works with DataFrame-based estimators from pyspark.ml.
# Fix: the estimator used to be pyspark.mllib's LogisticRegressionWithLBFGS,
# an RDD-based trainer that takes no labelCol/featuresCol arguments and cannot
# be fitted by CrossValidator, so this cell could not run.
from pyspark.ml.classification import LogisticRegression

# Logistic regression over the "label" / "features" columns of the split frames.
lr = LogisticRegression(labelCol="label", featuresCol="features")

# No hyper-parameters are added to the grid, so cross-validation only evaluates
# the estimator's default configuration.
# (The `nb` prefixes are kept because later cells refer to these names.)
nbparamGrid = ParamGridBuilder().build()
nbevaluator = MulticlassClassificationEvaluator(metricName="accuracy")
nbcv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=nbparamGrid,
    evaluator=nbevaluator,
    numFolds=10,
)
nbcvModel = nbcv.fit(train_data)

In [None]:
# Apply the cross-validated model to the held-out split and report accuracy
# using the same evaluator that drove cross-validation.
predictions = nbcvModel.transform(test_data)
print(
    'Accuracy:',
    nbevaluator.evaluate(predictions),
)

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics consumes an RDD of (prediction, label) pairs; the label is
# cast to double before the pairs are extracted.
predictions_rdd = (
    predictions
    .withColumn("label", predictions["label"].cast("double"))
    .rdd
    .map(lambda scored: (scored.prediction, scored.label))
)
metrics = MulticlassMetrics(predictions_rdd)

In [None]:
print("Summary Stats")
# Fix: the no-argument precision()/recall()/fMeasure() calls were deprecated in
# Spark 2.0 and removed in Spark 3.0, where these methods require a label.
# In Spark 2.x all three returned the overall accuracy, so `metrics.accuracy`
# prints the same numbers there and also works on newer releases.
print("Precision = %s" % metrics.accuracy)
print("Recall = %s" % metrics.accuracy)
print("F1 Score = %s" % metrics.accuracy)
# The weighted* statistics below are properties (no call parentheses), except
# weightedFMeasure, which is a method taking an optional beta.
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

In [None]:
# Show the confusion matrix as a plain array.
print(
    metrics.confusionMatrix().toArray()
)