In [2]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SQLContext
from pyspark.mllib.evaluation import MulticlassMetrics,BinaryClassificationMetrics
import numpy as np

In [3]:
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(np.float64(values[0]), np.float64(values[1:]))

In [4]:
sc = SparkContext(appName="LogisticRegressionWithSGD Classification")

In [5]:
data = sc.textFile("C:\Users\Rachan\Downloads\Assignment 1\Assignment 1\winequality-white_Set1.csv")
parsedData = data.map(parsePoint)


In [6]:
print parsedData.first()

(1.0,[7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8])


In [7]:
# 60,40 split traing, tesing data
trainingData, testData = parsedData.randomSplit([3, 2], 11L)

In [8]:
print ("Training Data Count : "+str(trainingData.count()))
print ("Test Data Count : "+str(testData.count()))

Training Data Count : 2958
Test Data Count : 1940


In [9]:
# Build the Training model 
# Parameter variations
trainingModel = LogisticRegressionWithSGD.train(trainingData, iterations=200, step=0.001, regType='l2', intercept=True)

In [10]:
trainingPredsAndLabels = trainingData.map(lambda p: (float(trainingModel.predict(p.features)),float(p.label)))

In [11]:
trainErr = trainingPredsAndLabels.filter(lambda (v, p): v != p).count() / float(trainingData.count())
print("Training Error = " + str(trainErr))
metrics = MulticlassMetrics(trainingPredsAndLabels)
binarymetrics = BinaryClassificationMetrics(trainingPredsAndLabels)
print("Training Precision = " + str(metrics.precision()))
print("Training Precision(1) = " + str(metrics.precision(1)))
print("Training Precision(0) = " + str(metrics.precision(0)))
print("Training Recall = " + str(metrics.recall()))
print("Training Recall(1) = " + str(metrics.recall(1)))
print("Training Recall(0) = " + str(metrics.recall(0)))
print("Training Area Under ROC = " + str(binarymetrics.areaUnderROC))

Training Error = 0.235294117647
Training Precision = 0.764705882353
Training Precision(1) = 0.025641025641
Training Precision(0) = 0.774580335731
Training Recall = 0.764705882353
Training Recall(1) = 0.00151745068285
Training Recall(0) = 0.98347107438
Training Area Under ROC = 0.492494262532


In [12]:
testPredsAndLabels = testData.map(lambda p: (float(trainingModel.predict(p.features)),float(p.label)))

In [13]:
# Testing data metrics
testErr = testPredsAndLabels.filter(lambda (v, p): v != p).count() / float(testData.count())
print("Testing Error = " + str(testErr))
metrics = MulticlassMetrics(testPredsAndLabels)
binarymetrics = BinaryClassificationMetrics(testPredsAndLabels)
print("Training Precision = " + str(metrics.precision()))
print("Training Precision(1) = " + str(metrics.precision(1)))
print("Training Precision(0) = " + str(metrics.precision(0)))
print("Training Recall = " + str(metrics.recall()))
print("Training Recall(1) = " + str(metrics.recall(1)))
print("Training Recall(0) = " + str(metrics.recall(0)))
print("Training Area Under ROC = " + str(binarymetrics.areaUnderROC))

Testing Error = 0.220618556701
Training Precision = 0.779381443299
Training Precision(1) = 0.138888888889
Training Precision(0) = 0.791491596639
Training Recall = 0.779381443299
Training Recall(1) = 0.0124378109453
Training Recall(0) = 0.979843953186
Training Area Under ROC = 0.496140882066


In [14]:
sc.stop()