In [1]:
import numpy as np
import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import DenseVector
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.mllib.evaluation import MulticlassMetrics
import time

In [2]:
conf = pyspark.SparkConf()
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '6g')
sc = pyspark.SparkContext('spark://192.168.2.84:7077','Run supervised learning - linear support vector machine')
spark = SparkSession.builder.getOrCreate()

In [3]:
columns = ['labels']
for i in range(1, 1025):
    columns.append("f" + str(i))
df = spark.read.format('csv').options(header='true').load('/MLInput_u/MLInput_u.csv') 

In [4]:
def convertColumn(df, names, newType):
    for name in names: 
        df = df.withColumn(name, df[name].cast(newType))
    return df 

df = convertColumn(df, columns, LongType())

In [5]:
input_data = df.rdd.map(lambda x: (x[0], DenseVector(x[1:])))
df_new = spark.createDataFrame(input_data, ["label", "features"])

standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(df_new)
scaled_df = scaler.transform(df_new)

scaled_df.drop('features')

scaled_df.take(2)
scaled_df.printSchema()
scaled_df.show()

# #Not Scaling
# scaled_df = df_new
# scaled_df.printSchema()
# scaled_df.show()

root
 |-- label: long (nullable = true)
 |-- features: vector (nullable = true)
 |-- features_scaled: vector (nullable = true)

+-----+--------------------+--------------------+
|label|            features|     features_scaled|
+-----+--------------------+--------------------+
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    1|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    1|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|    0|[0.0,0.0,0.0,1.0,...|[0.0,0.0,0.0,7.43...|
|    0|[0.0,0.0,0.0,0.

In [13]:
splits = scaled_df.randomSplit([0.7, 0.3], 1234)
trainData = splits[0].rdd.map(lambda row: LabeledPoint(row['label'], row['features'].toArray()))
testData = splits[1].rdd.map(lambda row: LabeledPoint(row['label'], row['features'].toArray()))

In [20]:
model = SVMWithSGD.train(trainData, iterations=100)

In [21]:
labelsAndPreds = testData.map(lambda p: (float(p.label), float(model.predict(p.features))))

In [22]:
metrics = MulticlassMetrics(labelsAndPreds)
print(metrics.confusionMatrix().toArray())

[[178951.  44370.]
 [  1196.   1780.]]


In [23]:
print(metrics.accuracy)

print(metrics.precision())

print(metrics.recall())

print(metrics.fMeasure())

print(metrics.weightedPrecision)

print(metrics.weightedRecall)

print(metrics.weightedFMeasure())

0.7986451433293416
0.7986451433293416
0.7986451433293416
0.7986451433293416
0.9808046540812909
0.7986451433293416
0.8763514975508268


In [14]:
train_data = splits[0]
test_data = splits[1]

In [17]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

svm = LinearSVC(labelCol="label", featuresCol="features_scaled")
svmparamGrid = (ParamGridBuilder().build())
svmevaluator = MulticlassClassificationEvaluator(metricName="accuracy")
svmcv = CrossValidator(estimator = svm,
                    estimatorParamMaps = svmparamGrid,
                    evaluator = svmevaluator,
                    numFolds = 10)
svmcvModel = svmcv.fit(train_data)
print(svmcvModel)

CrossValidatorModel_3c7036e93b0d


In [18]:
predictions = svmcvModel.transform(test_data)
print('Accuracy:', svmevaluator.evaluate(predictions))

Accuracy: 0.8454989269121012


In [19]:
predictions_rdd = predictions.withColumn("label", predictions["label"].cast("double")).rdd.map(lambda r: (r.prediction, r.label))
metrics = MulticlassMetrics(predictions_rdd)

In [20]:
print("Summary Stats")
print("Precision = %s" % metrics.precision())
print("Recall = %s" % metrics.recall())
print("F1 Score = %s" % metrics.fMeasure())
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

Summary Stats
Precision = 0.8454989269121012
Recall = 0.8454989269121012
F1 Score = 0.8454989269121012
Weighted recall = 0.8454989269121012
Weighted precision = 0.8430020645684446
Weighted F(1) Score = 0.8158317237228315
Weighted F(0.5) Score = 0.8195047310485137
Weighted false positive rate = 0.5548099681796149


In [21]:
print(metrics.confusionMatrix().toArray())

[[132768.   2216.]
 [ 23988.  10632.]]


In [22]:
sc.stop()