# Machine Learning

In [17]:
from pyspark import SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.ml.classification import LogisticRegression

In [11]:
spark = SparkSession.builder \
    .master("local[16]") \
    .appName("machine_learning") \
    .getOrCreate()

## Loading the Data

In [24]:
data_A = spark.read.format("libsvm").option("numFeatures", 3000).load("data/model_A")
data_B = spark.read.format("libsvm").option("numFeatures", 5000).load("data/model_B")
data_C = spark.read.format("libsvm").option("numFeatures", 5000).load("data/model_C")

In [25]:
for a in data_A.take(5):
    print(a)

Row(label=1.0, features=SparseVector(3000, {109: 1.0, 938: 1.0, 971: 1.0, 1884: 1.0, 2212: 1.0, 2632: 1.0, 2805: 1.0}))
Row(label=1.0, features=SparseVector(3000, {2736: 1.0}))
Row(label=1.0, features=SparseVector(3000, {455: 1.0, 2058: 1.0}))
Row(label=1.0, features=SparseVector(3000, {387: 1.0, 637: 1.0, 2583: 1.0}))
Row(label=1.0, features=SparseVector(3000, {20: 1.0, 40: 1.0, 1866: 1.0, 2375: 1.0}))


In [42]:
lr = LogisticRegression(maxIter=10, regParam=0.3)

# Fit the model
lrModel = lr.fit(data_A)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients[:10]))
print("Intercept: " + str(lrModel.intercept))

                                                                                

Coefficients: [-0.02276131  0.04061249 -0.01800351  0.08505367 -0.05994562 -0.24979415
  0.02310766 -0.01446619 -0.14054308  0.00888738]
Intercept: 1.9085418487210157


In [39]:
# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

# Obtain the objective per iteration
# objectiveHistory = trainingSummary.objectiveHistory
# print("objectiveHistory:")
# for objective in objectiveHistory:
#     print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)

+--------------------+--------------------+
|                 FPR|                 TPR|
+--------------------+--------------------+
|                 0.0|                 0.0|
|                 0.0|9.193716120151319E-4|
|                 0.0|0.001838743224030...|
|                 0.0|0.002798305944767369|
|                 0.0|0.003788011997045...|
|3.263707571801566...|0.004737526940602565|
|6.527415143603133E-5| 0.00568201799556893|
|  9.7911227154047E-5|0.006661676270667022|
|  9.7911227154047E-5|0.007626262879994373|
|  9.7911227154047E-5|0.008580801712141232|
|  9.7911227154047E-5|0.009600651095961296|
|  9.7911227154047E-5|0.010555189928108154|
|1.305483028720626...|0.011564991534747725|
|1.631853785900783...| 0.01255469758702631|
|1.631853785900783...| 0.01350923641917317|
|1.631853785900783...|0.014508990248632246|
|2.610966057441253E-4|0.015433385749237624|
|3.590078328981723E-4|0.016418067912925962|
| 3.91644908616188E-4| 0.01738767841084356|
|4.242819843342036...|0.01840752

LogisticRegression_e90508e1ed95

In [40]:
print(lr.getThreshold())

0.7567948307093846


In [41]:
predictions = lrModel.transform(data_A)

accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())
print("Accuracy : ",accuracy)

Accuracy :  0.8772339990160608
