In [1]:
import os

# NOTE(review): the original import order is kept on purpose — `common` is
# imported before pyspark and may prepare the Spark environment; confirm before regrouping.
from common import Common
from pyspark import *
from pyspark.streaming import *
from pyspark import SparkContext, SparkConf
import numpy as np
from pyspark.mllib.stat import Statistics
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

In [2]:
# Obtain the SparkContext and the SQL session once per kernel; do not re-run this cell.
# NOTE(review): presumably a second run would try to create another SparkContext — confirm in common.py.
common = Common()
sc, spark = common.get_spark_sql()

<SparkContext master=local[*] appName=myapp>
2.4.5


In [3]:
# Directory that holds the sample data sets. Set the SPARK_DATA_DIR environment
# variable to point somewhere else; the default is the original location.
DATA_DIR = os.environ.get('SPARK_DATA_DIR', '/home/ec2-user/data')

# Text file with one "label feature feature ..." record per line (read by the SVM section).
svm_data = os.path.join(DATA_DIR, 'sample_svm_data.txt')
# NOTE(review): presumably a LIBSVM-format sample, judging by its name; not used in the visible cells.
libsvm_data = os.path.join(DATA_DIR, 'sample_libsvm_data.txt')

## Summary Statistics

In [4]:
# Three observations with three columns each, distributed as an RDD of NumPy vectors.
mat = sc.parallelize([
    np.array(row)
    for row in ([1.0, 10.0, 100.0], [2.0, 20.0, 200.0], [3.0, 30.0, 300.0])
])

# Column-wise summary statistics over all rows of the RDD.
summary = Statistics.colStats(mat)

# Per-column mean, returned as a dense vector.
print(summary.mean())
# Per-column variance.
print(summary.variance())
# Per-column count of non-zero entries.
print(summary.numNonzeros())

[  2.  20. 200.]
[1.e+00 1.e+02 1.e+04]
[3. 3. 3.]


### Correlation

In [5]:
# Two series of equal length; the last Y value (555.0) breaks the 11x pattern of the others.
seriesX = sc.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])
seriesY = sc.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

# Pearson correlation between the two series.
pearson_xy = Statistics.corr(seriesX, seriesY, method="pearson")
print("Correlation is: " + str(pearson_xy))

# Pearson correlation matrix across the columns of an RDD of row vectors.
data = sc.parallelize([
    np.array([1.0, 10.0, 100.0]),
    np.array([2.0, 20.0, 200.0]),
    np.array([5.0, 33.0, 366.0]),
])
print(Statistics.corr(data, method="pearson"))

Correlation is: 0.8500286768773001
[[1.         0.97888347 0.99038957]
 [0.97888347 1.         0.99774832]
 [0.99038957 0.99774832 1.        ]]


## Classification - Support Vector Machine with Stochastic Gradient Descent (SVMWithSGD)

In [6]:
# Load and parse the data
def parsePoint(line):
    """Convert one text record into a LabeledPoint.

    The record is a sequence of numbers separated by whitespace: the first
    number is the label and the remaining numbers are the feature values.

    Splitting on any run of whitespace (rather than on a single space)
    tolerates leading/trailing whitespace, repeated spaces and tabs, which
    would otherwise yield empty tokens and make ``float('')`` fail.

    Args:
        line: one line of the input text file.

    Returns:
        LabeledPoint built from the label and the list of feature values.

    Raises:
        ValueError: if the line is blank or a token is not a valid number.
    """
    values = [float(token) for token in line.split()]
    if not values:
        # Keep the exception type a blank line produced before (ValueError),
        # instead of an IndexError from values[0].
        raise ValueError('cannot parse a LabeledPoint from a blank line')
    return LabeledPoint(values[0], values[1:])

In [7]:
# Read the raw text file and turn every line into a LabeledPoint.
data = sc.textFile(svm_data)
parsedData = data.map(parsePoint)

# Fetch the first record once and reuse it for the type check and the display.
first_point = parsedData.first()

# Sanity checks: RDD type, number of records, element type.
print(type(parsedData))
print('count: ', parsedData.count())
print('Datatype:', type(first_point))
first_point

<class 'pyspark.rdd.PipelinedRDD'>
count:  322
Datatype: <class 'pyspark.mllib.regression.LabeledPoint'>


LabeledPoint(1.0, [0.0,2.52078447201548,0.0,0.0,0.0,2.004684436494304,2.000347299268466,0.0,2.228387042742021,2.228387042742023,0.0,0.0,0.0,0.0,0.0,0.0])

In [8]:
# Distinct label values that occur in the data set.
distinct_labels = parsedData.map(lambda point: point.label).distinct().collect()
print('all labels: ', distinct_labels)

# Length of the feature vector, taken from the first record.
first_features = parsedData.map(lambda point: point.features).first()
print('feature length: ', len(first_features))

all labels:  [0.0, 1.0]
feature length:  16


In [9]:
# Split into roughly 70% training and 30% test data.
# A fixed seed makes the split repeatable for the same input, so the error
# figures computed in later cells can be reproduced; without a seed every run
# draws a different partition.
SPLIT_SEED = 42
(train, test) = parsedData.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Both subsets are traversed several times (counts, iterative training,
# evaluation), so keep them in memory instead of re-reading and re-parsing the file.
train.cache()
test.cache()

print('train: ', train.count())
print('test: ', test.count())

train:  231
test:  91


In [13]:
# Fit a linear SVM with stochastic gradient descent on the training subset.
NUM_ITERATIONS = 100  # number of SGD iterations
model = SVMWithSGD.train(train, iterations=NUM_ITERATIONS)
model

(weights=[-0.14417820005502557,-0.00855376490763898,-0.10906778378861458,0.6252140652717189,-0.16987795951075002,0.015591462837950515,0.02927586053245111,-0.6264710549563833,-0.04503035212088982,-0.04503035212088945,0.23312217099657367,-0.18689628973553016,0.18908928216988097,0.037783841534553414,-0.12186762393928843,0.06523934075383969], intercept=0.0)

In [15]:
# Learned parameters of the fitted model: the weight vector, then the intercept.
print(model.weights)
print('\n', model.intercept)


[-0.14417820005502557,-0.00855376490763898,-0.10906778378861458,0.6252140652717189,-0.16987795951075002,0.015591462837950515,0.02927586053245111,-0.6264710549563833,-0.04503035212088982,-0.04503035212088945,0.23312217099657367,-0.18689628973553016,0.18908928216988097,0.037783841534553414,-0.12186762393928843,0.06523934075383969]

 0.0


In [16]:
# Training error: fraction of training points whose predicted label differs from the true label.
labelsAndPreds = train.map(lambda point: (point.label, model.predict(point.features)))
misclassified = labelsAndPreds.filter(lambda pair: pair[0] != pair[1])
trainErr = misclassified.count() / float(train.count())
print("Training Error = " + str(trainErr))


Training Error = 0.3722943722943723


In [17]:
# Test error: fraction of held-out points whose predicted label differs from the true label.
labelsAndPreds = test.map(lambda point: (point.label, model.predict(point.features)))
misclassified = labelsAndPreds.filter(lambda pair: pair[0] != pair[1])
testErr = misclassified.count() / float(test.count())
print("Test Error = " + str(testErr))


Test Error = 0.4175824175824176


In [19]:
# Save and load model
# model.save(sc, <model_path>)
# sameModel = SVMModel.load(sc, <model_path>)