In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql.types import LongType, DoubleType
from pyspark.ml.linalg import DenseVector
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time

In [2]:
# Build (or reuse) the session against the standalone master, requesting
# "6gb" of memory per executor via spark.executor.memory.
spark = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("Multilayer perceptron classifier")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)

# Handle on the underlying SparkContext; stopped at the end of the notebook.
sc = spark.sparkContext

In [3]:
# Names of the columns that get cast to a numeric type in a later cell:
# 'labels' followed by f1 ... f1024.
columns = ['labels'] + ["f" + str(index) for index in range(1, 1025)]

# Read the CSV, taking column names from its header row. No schema is
# supplied here; the numeric cast happens in a later cell.
df = (
    spark.read
    .format('csv')
    .options(header='true')
    .load('/MLInput_u/MLInput_u.csv')
)

In [4]:
# from pyspark.sql import functions as F

# def sum_col(df, col):
#     return df.select(F.sum(col)).collect()[0][0]

# for i in range(1, 1025):
#     colname = "f" + str(i)
#     s = sum_col(df, colname)
    
#     if s < 1: # Equivalent to checking that s is approximately 0, while avoiding floating-point comparison errors
#         print(colname)
        
#     if i % 25 == 0:
#         print("done until " + str(i))

In [5]:
def convertColumn(df, names, newType):
    """Return `df` with every column listed in `names` cast to `newType`.

    Args:
        df: frame-like object supporting `df[name]` and `withColumn(name, col)`.
        names: iterable of column names to cast, processed in order.
        newType: target type handed to each column's `cast`.

    Returns:
        The frame produced by the last `withColumn` call, or `df` itself
        when `names` is empty.
    """
    # NOTE(review): one withColumn call per name; for very long `names` lists
    # a single select-based projection may be cheaper — confirm before changing.
    converted = df
    for column_name in names:
        recast = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, recast)
    return converted

# Cast the label column and all 1024 feature columns to LongType.
df = convertColumn(df=df, names=columns, newType=LongType())

In [6]:
# Pack each row into (label, features): the first field is the label and the
# remaining fields become one DenseVector, as expected by the ML estimators.
input_data = df.rdd.map(lambda row: (row[0], DenseVector(row[1:])))
df_new = spark.createDataFrame(input_data, ["label", "features"])

# Feature scaling is currently disabled; the original attempt is kept below.
# standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
# scaler = standardScaler.fit(df_new)
# scaled_df = scaler.transform(df_new)

# scaled_df.drop('features')

# Later cells read `scaled_df`; with scaling off it is simply the unscaled frame.
scaled_df = df_new

# A bare `scaled_df.take(2)` used to sit here. Its result was neither stored
# nor displayed (it was not the cell's last expression), so it only triggered
# an extra Spark job. The schema and sample rows below cover the same check.
scaled_df.printSchema()
scaled_df.show()

# #Not Scaling
# scaled_df = df_new
# scaled_df.printSchema()
# scaled_df.show()

root
 |-- label: long (nullable = true)
 |-- features: vector (nullable = true)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[0.0,0.0,0.0,0.0,...|
|    1|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    1|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,1.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    1|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    1|[1.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|
|    1|[0.0,0.0,0.0,0.0,...|
+-----+--------------------+
only showing top 20 rows



In [7]:
# 60/40 train/test partition, drawn with a fixed seed.
splits = scaled_df.randomSplit(weights=[0.6, 0.4], seed=1234)
train, test = splits

In [None]:
start_time = time.time()
maxIter = 100
# Fixed seed for weight initialisation and fold assignment. Without it the
# selected model and every reported metric can change from run to run.
SEED = 1234
# Candidate topologies: 1024 inputs, one hidden layer of varying width, 2 outputs.
layers = [[1024, 50, 2], [1024, 100, 2], [1024, 150, 2]]

trainer = MultilayerPerceptronClassifier(maxIter=maxIter, seed=SEED)
paramGrid = ParamGridBuilder().addGrid(trainer.layers, layers).build()

# NOTE(review): BinaryClassificationEvaluator reads the "rawPrediction" column
# by default. Confirm that the MLP model of the installed Spark version emits
# it; if not, a MulticlassClassificationEvaluator is the usual substitute.
crossval = CrossValidator(estimator=trainer,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=SEED)

# 3-fold cross-validation over the layer grid; the best setting is refit on `train`.
model = crossval.fit(train)
result = model.transform(test)
# Elapsed wall-clock time in SECONDS (difference of two time.time() readings).
timeConsumed = time.time() - start_time
predictionAndLabels = result.select("prediction", "label")

In [None]:
best_model = model.bestModel

# The only hyperparameter tuned by the grid is `layers`. CrossValidatorModel
# stores one averaged metric per entry of the param grid, in the same order,
# so the winning entry is the one with the largest metric (the evaluator's
# metric is larger-is-better).
best_index = max(range(len(paramGrid)), key=lambda i: model.avgMetrics[i])
best_layers = list(paramGrid[best_index][trainer.layers])

# The multilayer perceptron has no regParam / elasticNetParam, so the former
# `_java_obj.getRegParam()` / `getElasticNetParam()` calls raised a Py4J
# "method does not exist" error. The names are kept, as None, only because
# the results-file cell still writes them.
best_reg_param = None
best_elasticnet_param = None

# Display the selected topology as the cell output.
best_layers

In [None]:
# Start a fresh results file (mode 'w' truncates any previous run) holding the
# two best-model parameter values, one per line.
with open('Perceptron_Result', 'w') as f:
    f.writelines([
        str(best_reg_param), '\n',
        str(best_elasticnet_param), '\n',
    ])

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# Use a dedicated name here. Rebinding `columns` would clobber the CSV column
# list defined earlier, and re-running the casting cell would then fail.
metric_columns = ["prediction", "label"]
# MulticlassMetrics is fed (prediction, label) pairs; both columns are cast to double first.
predictionAndLabels = convertColumn(predictionAndLabels, metric_columns, DoubleType())
metrics = MulticlassMetrics(predictionAndLabels.rdd)

cfMatrix = metrics.confusionMatrix().toArray()

# Report precision / recall / F1 for one explicit class. The label-less
# precision(), recall() and fMeasure() are deprecated since Spark 2.0 and
# removed in 3.0, and they all return the overall accuracy, which is already
# reported separately below.
# NOTE(review): assumes label 1 is the positive class — confirm.
POSITIVE_LABEL = 1.0
precision = metrics.precision(POSITIVE_LABEL)
recall = metrics.recall(POSITIVE_LABEL)
f1Score = metrics.fMeasure(POSITIVE_LABEL)

evaluator = MulticlassClassificationEvaluator()
evaluator.setPredictionCol("prediction")

# Overall metrics, selected per call by overriding the evaluator's metricName.
accuracy = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "f1"})
weightedPrecision = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedPrecision"})
weightedRecall = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedRecall"})


In [None]:
# Append the timing and evaluation summary to the results file.
with open('Perceptron_Result', 'a') as f:
    f.write('Summary Stats:' + '\n')
    # timeConsumed is a difference of time.time() readings, i.e. seconds.
    # Convert it so the value matches the "minutes" wording of the message.
    f.write('It takes %s minutes' % (timeConsumed / 60.0))
    f.write('\n')
    f.write(str(metrics.confusionMatrix().toArray()))
    f.write('\n')
    f.write('Precision: %s \n' % precision)
    f.write('Recall: %s \n' % recall)
    f.write('f1Score: %s \n'% f1Score)
    f.write('Accuracy: %s \n' % accuracy)
    f.write('f1: %s \n' % f1)
    f.write('Weighted Precision: %s \n' % weightedPrecision)
    f.write('weighted Recall: %s \n'% weightedRecall)

# Release the cluster resources; no Spark work can run after this point.
sc.stop()

In [None]:
# middle = 60
# maxIter = 100
# metrics = {}

# start_time = time.time()

# layers = [1024, middle, 2]
# trainer = MultilayerPerceptronClassifier(maxIter=maxIter, layers=layers, blockSize=128, seed=1234)

# model = trainer.fit(train)
# result = model.transform(test)
# predictionAndLabels = result.select("prediction", "label")

In [None]:
# from pyspark.mllib.evaluation import MulticlassMetrics
# columns = ["prediction", "label"]
# predictionAndLabels = convertColumn(predictionAndLabels, columns, DoubleType())
# metrics = MulticlassMetrics(predictionAndLabels.rdd)
# print(metrics.confusionMatrix().toArray())

In [None]:
# evaluator = BinaryClassificationEvaluator()
# evaluator.setPredictionCol("prediction")

# accuracy = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "accuracy"})
# print(accuracy)

In [None]:
# f1 = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "f1"})
# print(f1)

In [None]:
# weightedPrecision = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedPrecision"})
# print(weightedPrecision)

In [None]:
# weightedRecall = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedRecall"})
# print(weightedRecall)

In [None]:
# predictionAndLabels

In [None]:
# precision = metrics.precision()
# print("Summary Stats")
# print("Precision = %s" % precision)

In [None]:
# recall = metrics.recall()
# print("Recall = %s" % recall)

In [None]:
# f1Score = metrics.fMeasure()
# print("F1 Score = %s" % f1Score)

In [None]:
# print("Weighted recall = %s" % metrics.weightedRecall)
# print("Weighted precision = %s" % metrics.weightedPrecision)
# print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())

In [None]:
# spark.stop()
# sc.stop()