In [1]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SQLContext
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
import numpy as np

In [2]:
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(np.float64(values[0]), np.float64(values[1:]))

In [3]:
sc = SparkContext(appName="LogisticRegressionWithLBFGS Classification")

In [4]:
data = sc.textFile("C:\Users\Rachan\Downloads\Assignment 1\Assignment 1\winequality-white_Set1.csv")
parsedData = data.map(parsePoint)


In [5]:
print parsedData.first()

(1.0,[7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8])


In [6]:
# 60,40 split traing, tesing data
trainingData, testData = parsedData.randomSplit([3, 2], 11L)

In [7]:
print ("Training Data Count : "+str(trainingData.count()))
print ("Test Data Count : "+str(testData.count()))

Training Data Count : 2958
Test Data Count : 1940


In [9]:
# Build the Training model 
# Parameter variations
trainingModel = LogisticRegressionWithLBFGS.train(trainingData, iterations=200, tolerance=0.001, regType='l2', intercept=True)

In [10]:
trainingPredsAndLabels = trainingData.map(lambda p: (float(trainingModel.predict(p.features)),float(p.label)))

In [12]:
trainErr = trainingPredsAndLabels.filter(lambda (v, p): v != p).count() / float(trainingData.count())
print("Training Error = " + str(trainErr))
metrics = MulticlassMetrics(trainingPredsAndLabels)
binarymetrics = BinaryClassificationMetrics(trainingPredsAndLabels)
print("Training Precision = " + str(metrics.precision()))
print("Training Precision(1) = " + str(metrics.precision(1)))
print("Training Precision(0) = " + str(metrics.precision(0)))
print("Training Recall = " + str(metrics.recall()))
print("Training Recall(1) = " + str(metrics.recall(1)))
print("Training Recall(0) = " + str(metrics.recall(0)))
print("Training Area Under ROC = " + str(binarymetrics.areaUnderROC))


Training Error = 0.19506423259
Training Precision = 0.80493576741
Training Precision(1) = 0.676724137931
Training Precision(0) = 0.815847395451
Training Recall = 0.80493576741
Training Recall(1) = 0.238239757208
Training Recall(0) = 0.967377120487
Training Area Under ROC = 0.602808438848


In [13]:
testPredsAndLabels = testData.map(lambda p: (float(trainingModel.predict(p.features)),float(p.label)))

In [14]:
# Testing data metrics
testErr = testPredsAndLabels.filter(lambda (v, p): v != p).count() / float(testData.count())
print("Testing Error = " + str(testErr))
metrics = MulticlassMetrics(testPredsAndLabels)
binarymetrics = BinaryClassificationMetrics(testPredsAndLabels)
print("Training Precision = " + str(metrics.precision()))
print("Training Precision(1) = " + str(metrics.precision(1)))
print("Training Precision(0) = " + str(metrics.precision(0)))
print("Training Recall = " + str(metrics.recall()))
print("Training Recall(1) = " + str(metrics.recall(1)))
print("Training Recall(0) = " + str(metrics.recall(0)))
print("Training Area Under ROC = " + str(binarymetrics.areaUnderROC))

Testing Error = 0.192783505155
Training Precision = 0.807216494845
Training Precision(1) = 0.597222222222
Training Precision(0) = 0.824053452116
Training Recall = 0.807216494845
Training Recall(1) = 0.213930348259
Training Recall(0) = 0.962288686606
Training Area Under ROC = 0.588109517432


In [15]:
sc.stop()