In [1]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Spark_SVM").getOrCreate()
spark.sparkContext._conf.getAll()

[('spark.history.kerberos.keytab', 'none'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.executor.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.port.maxRetries', '128'),
 ('spark.history.provider',
  'org.apache.spark.deploy.history.FsHistoryProvider'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.yarn.historyServer.address', 'hdp001.cac.queensu.ca:18081'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.filters',
  'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter'),
 ('spark.driver.cores', '1'),
 ('spark.eventLog.dir', 'hdfs:///spark2-history/'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',
  'http://hdp002.cac.queensu.ca:8088/proxy/application_1548786

In [2]:
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '65g'), ('spark.app.name', 'Spark_SVM'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','65g')])
#Stop the current Spark Session

spark.sparkContext.stop()
#Create a Spark Session

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
file_location = "/user/mie_sbetancourt/PROJECT/Data/data_reduced_reweighted_FINAL_3.csv"

from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.types import DoubleType, StringType, IntegerType

# Import the data into a Spark DataFrame with the schema 
data = spark.read.format("csv").option("header","true").option("inferSchema","true").load(file_location)


#data = spark.read.csv(file_location, header=True, inferSchema=True)
data = (data.drop("Census_FirmwareVersionIdentifier")
        .withColumn("AVProductsEnabled", when(data["AVProductsEnabled"]=="unknown", -1).otherwise(data["AVProductsEnabled"])))
    #.withColumn("classWeightCol", when(data["classWeightCol"]>.5, 0.091).otherwise(0.908)))

data_1 = (data.withColumn("AVProductsEnabled", data["AVProductsEnabled"].cast(IntegerType()))
          .withColumn("AvSigVersion_new", data["AvSigVersion_new"].cast(StringType()))
          .withColumn("Census_OSBuildNumber", data["Census_OSBuildNumber"].cast(StringType()))
          .withColumn("Census_OSBuildRevision", data["Census_OSBuildRevision"].cast(StringType()))
          .withColumn("Census_OSUILocaleIdentifier", data["Census_OSUILocaleIdentifier"].cast(StringType()))
          .withColumn("Census_OSVersion_new", data["Census_OSVersion_new"].cast(StringType()))
          .withColumn("CountryIdentifier", data["CountryIdentifier"].cast(StringType()))
          .withColumn("LocaleEnglishNameIdentifier", data["LocaleEnglishNameIdentifier"].cast(StringType()))
          .withColumn("OsBuild", data["OsBuild"].cast(StringType()))
          .withColumn("OsSuite", data["OsSuite"].cast(StringType())))
data_1 = data_1.withColumnRenamed("HasDetections","label").drop("OsBuildLab_new")

In [4]:
stringCols = []
for col in data_1.dtypes:
    if col[1] == 'string':
        stringCols.append(col[0])
stringCols.pop(0)

'MachineIdentifier'

In [5]:
import numpy as np
numericCols = np.setdiff1d(data_1.columns, stringCols).tolist()
numericCols.remove("MachineIdentifier")
numericCols.remove("classWeightCol")
numericCols.remove("label")

In [6]:
sampling_seed=1111
trainingData1 = data_1.sampleBy("label", fractions={0: .01, 1: .1}, seed=sampling_seed)
trainingData = trainingData1.sampleBy("label", fractions={0: .9, 1: .9}, seed=sampling_seed)
# Subtracting 'train' from original 'data' to get test set 
testData = trainingData1.subtract(trainingData)

In [7]:
#trainingData.groupBy("label").count().show()

In [8]:
# Import libraries
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

temp_path = "/user/mie_sbetancourt/PROJECT/"

import time
start_time = time.time()

sampling_seed=1111
# The index of string values multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c),handleInvalid="skip")
    for c in stringCols
]

# The encode of indexed vlaues multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers
]

lsvc = (LinearSVC(labelCol="label", featuresCol="features"))
      #,weightCol="classWeightCol")) maxIter=100)) #, regParam=0.1, elasticNetParam=0.5))

# Vectorizing encoded values
assembler = VectorAssembler(inputCols=([encoder.getOutputCol() for encoder in encoders] + numericCols),outputCol="features")   
      
#pipeline = Pipeline(stages=indexers + encoders+[assembler]+lr)
pipeline = Pipeline(stages=indexers + encoders+[assembler]+[lsvc])

estimatorParam = ParamGridBuilder() \
    .addGrid(lsvc.regParam, [.025, .01, .05]) \
    .addGrid(lsvc.fitIntercept, [True, False])  \
    .addGrid(lsvc.standardization, [True, False])  \
    .build()

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
  
crossval = CrossValidator(estimator=pipeline,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3,
                         parallelism=8,
                         seed=sampling_seed)

lsvc_cvmodel = crossval.fit(trainingData)      
# Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
# is areaUnderROC.

lsvc_path = temp_path + "/lsvc"
lsvc.save(lsvc_path)
model_path = temp_path + "/lsvc_model"
lsvc_cvmodel.bestModel.save(model_path)


print("--- %s seconds ---" % (time.time() - start_time))

Py4JJavaError: An error occurred while calling o182848.save.
: java.io.IOException: Path /user/mie_sbetancourt/PROJECT//lsvc already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:503)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:102)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
###Loading model
from pyspark.ml import *
lsvc_model_path = temp_path + "/lsvc_model"
lsvc_mod2 = PipelineModel.load(lsvc_model_path)
lsvc_predictions = lsvc_mod2.transform(testData)
lsvc_train_predictions = lsvc_mod2.transform(trainingData)
print("The area under ROC for train set after CV  is {}".format(evaluator.evaluate(lsvc_train_predictions)))
print("The area under ROC for test set after CV  is {}".format(evaluator.evaluate(lsvc_predictions)))
print('Best regParam: ', lsvc_mod2.stages[-1]._java_obj.getRegParam())
print('Best fitIntercept: ', lsvc_mod2.stages[-1]._java_obj.getFitIntercept())

In [23]:
'''
1s ry - 8000 secs


.addGrid(lsvc.regParam, [0.0, .025, 1]) \
.addGrid(lsvc.fitIntercept, [True, False]) 
    
The area under ROC for train set after CV  is 0.6836491277992341
The area under ROC for test set after CV  is 0.6522249067567734
Best regParam:  0.025
Best fitIntercept:  True



2nd ry - 8000 secs
'''

NameError: name 'explainParams' is not defined