### Dataset

[Breast Cancer Wisconsin (Diagnostic) Data Set](http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29)

Class distribution: 357 benign "B", 212 malignant "M"

In [1]:
import os
import urllib

# urlretrieve lives in urllib on Python 2 and in urllib.request on Python 3;
# resolve whichever one this kernel provides.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# urlretrieve opens the target path for writing and does not create missing
# folders, so make sure "data/" exists before downloading.
if not os.path.isdir("data"):
    os.makedirs("data")

# f holds urlretrieve's return value (local filename and response headers).
f = urlretrieve("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data", "data/wdbc.data")

Load the data, parse each record into a labeled point, and split the result into train and test datasets

In [3]:
# Read the downloaded file as an RDD of text lines (one record per line).
# `sc` is the SparkContext supplied by the PySpark kernel/shell.
data_file = "data/wdbc.data"
raw = sc.textFile(data_file)

# Single-argument print(...) behaves the same on Python 2 and Python 3 and
# matches the print(...) calls used in the model-evaluation cells.
print("All data size is {}".format(raw.count()))

All data size is 569


In [4]:
from pyspark.mllib.regression import LabeledPoint
from numpy import array

def parse(line):
    """Convert one comma-separated record into a LabeledPoint.

    Field 1 holds the diagnosis code: 'B' maps to label 0.0 and 'M' maps to
    label 1.0. Fields 2 through 32 are converted to floats and used as the
    feature vector. Field 0 is not used.

    Args:
        line: one text line of the data file.

    Returns:
        LabeledPoint(label, features) for the record.

    Raises:
        ValueError: if the diagnosis code is neither 'B' nor 'M', or if a
            feature field cannot be converted to float.
    """
    fields = line.split(",")
    code = fields[1]

    if code == 'B':
        diagnosis = 0.0
    elif code == 'M':
        diagnosis = 1.0
    else:
        # Fail with a clear message instead of hitting an unbound local
        # variable further down.
        raise ValueError(
            "Unknown diagnosis code {!r} in record: {!r}".format(code, line))

    feature_fields = fields[2:33]
    return LabeledPoint(diagnosis, array([float(x) for x in feature_fields]))

# Apply parse() to every raw text line, giving an RDD of LabeledPoint records.
parsedData = raw.map(parse)
# Display the first two parsed records as a quick sanity check.
parsedData.take(2)

[LabeledPoint(1.0, [17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189]),
 LabeledPoint(1.0, [20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902])]

In [5]:
# Randomly split the parsed records with weights 0.7 / 0.3. The fixed seed
# makes the split repeatable. The resulting sizes are approximate rather than
# an exact 70/30 cut (see the counts printed below).
(train, test) = parsedData.randomSplit([0.7, 0.3], seed = 123)

# Single-argument print(...) behaves the same on Python 2 and Python 3 and
# matches the print(...) calls used in the model-evaluation cells.
print("Train data size is {}".format(train.count()))
print("Test data size is {}".format(test.count()))

Train data size is 411
Test data size is 158


### Logistic regression

In [8]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# Fit a logistic-regression classifier on the training split, using the
# trainer's default settings.
model = LogisticRegressionWithLBFGS.train(train)

# Evaluating the model on training data
# Each element of preds1 is a (true label, predicted label) pair.
preds1 = train.map(lambda p: (p.label, model.predict(p.features)))
# Index into the pair instead of using `lambda (v, p): ...`: tuple-parameter
# unpacking is Python-2-only syntax (removed by PEP 3113).
train_acc = preds1.filter(lambda pair: pair[0] == pair[1]).count() / float(train.count())
print("Training accuracy = " + str(round(train_acc, 3)))

# Evaluating the model on testing data
preds2 = test.map(lambda p: (p.label, model.predict(p.features)))
test_acc = preds2.filter(lambda pair: pair[0] == pair[1]).count() / float(test.count())
print("Testing accuracy = " + str(round(test_acc, 3)))

Training accuracy = 0.983
Testing accuracy = 0.93


### Decision trees

In [9]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel

# Fit a depth-3 Gini decision tree on the training split. The empty
# categoricalFeaturesInfo means every feature is treated as continuous.
model = DecisionTree.trainClassifier(train, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=3, maxBins=10)

# Evaluating the model on training data
# predict() is applied to the whole feature RDD and the result is zipped back
# with the labels, giving (true label, predicted label) pairs.
# NOTE(review): the PySpark docs say tree-model predict() cannot be called
# inside an RDD transformation, which is presumably why this differs from the
# per-record map used for logistic regression - confirm for your Spark version.
preds1 = model.predict(train.map(lambda x: x.features))
labels1 = train.map(lambda p: p.label).zip(preds1)
# Index into the pair instead of using `lambda (v, p): ...`: tuple-parameter
# unpacking is Python-2-only syntax (removed by PEP 3113).
train_acc = labels1.filter(lambda pair: pair[0] == pair[1]).count() / float(train.count())
print("Training accuracy = " + str(round(train_acc, 3)))

# Evaluating the model on testing data
preds2 = model.predict(test.map(lambda x: x.features))
labels2 = test.map(lambda p: p.label).zip(preds2)
test_acc = labels2.filter(lambda pair: pair[0] == pair[1]).count() / float(test.count())
print("Testing accuracy = " + str(round(test_acc, 3)))


Training accuracy = 0.981
Testing accuracy = 0.88


In [10]:
# Render the fitted tree's split rules as text, then print them under a heading.
tree_description = model.toDebugString()
print('Learned classification tree model:')
print(tree_description)

Learned classification tree model:
DecisionTreeModel classifier of depth 3 with 15 nodes
  If (feature 22 <= 106.4)
   If (feature 27 <= 0.1546)
    If (feature 25 <= 0.4665)
     Predict: 0.0
    Else (feature 25 > 0.4665)
     Predict: 0.0
   Else (feature 27 > 0.1546)
    If (feature 24 <= 0.1428)
     Predict: 0.0
    Else (feature 24 > 0.1428)
     Predict: 1.0
  Else (feature 22 > 106.4)
   If (feature 1 <= 16.83)
    If (feature 7 <= 0.08543)
     Predict: 0.0
    Else (feature 7 > 0.08543)
     Predict: 1.0
   Else (feature 1 > 16.83)
    If (feature 26 <= 0.1804)
     Predict: 0.0
    Else (feature 26 > 0.1804)
     Predict: 1.0

