In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [None]:
from pyspark import SparkConf

# Reuse the already-running SparkContext when this cell is executed again.
# Calling the SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once", which makes the
# notebook fail on any re-run of this cell. The application name only takes
# effect when a new context is actually created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("Breast_Cancer_Diagnosis"))

In [None]:
# Obtain the session through the documented `SparkSession.builder` entry point
# instead of instantiating the Builder class by hand. getOrCreate() returns the
# active session when one exists, so this cell is safe to re-run.
spark = SparkSession.builder.getOrCreate()

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer

In [None]:
import zipfile

# Unpack every member of the archive into the current working directory
# (extractall with no target path). The loading cell later opens "wdbc.data"
# by relative path -- presumably that file comes from this archive.
with zipfile.ZipFile("/content/wdbc.zip", mode="r") as zf:
    zf.extractall(path=None)

In [None]:
# Accumulates one (label, feature-vector) tuple per record of wdbc.data.
data = []

# Each record is a single comma-separated line. Field 0 is ignored (presumably
# a record identifier -- confirm against the dataset description), field 1 is
# the string label, and every remaining field is parsed as a float feature.
with open("wdbc.data") as infile:
  for line in infile:
    line = line.strip()
    if not line:
      # A blank line (e.g. a trailing newline at end of file) would split
      # into a single empty token and raise IndexError on tokens[1].
      continue
    tokens = line.split(",")
    y = tokens[1]
    features = Vectors.dense([float(x) for x in tokens[2:]])
    data.append((y, features))

In [None]:
# Turn the parsed (label, features) tuples into a two-column DataFrame.
inputDF = spark.createDataFrame(
    data,
    schema=["label", "features"],
)

In [None]:
# Preview the first 20 rows, with long cell values truncated for display.
inputDF.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    M|[17.99,10.38,122....|
|    M|[20.57,17.77,132....|
|    M|[19.69,21.25,130....|
|    M|[11.42,20.38,77.5...|
|    M|[20.29,14.34,135....|
|    M|[12.45,15.7,82.57...|
|    M|[18.25,19.98,119....|
|    M|[13.71,20.83,90.2...|
|    M|[13.0,21.82,87.5,...|
|    M|[12.46,24.04,83.9...|
|    M|[16.02,23.24,102....|
|    M|[15.78,17.89,103....|
|    M|[19.17,24.8,132.4...|
|    M|[15.85,23.95,103....|
|    M|[13.73,22.61,93.6...|
|    M|[14.54,27.54,96.7...|
|    M|[14.68,20.13,94.7...|
|    M|[16.13,20.68,108....|
|    M|[19.81,22.15,130....|
|    B|[13.54,14.36,87.4...|
+-----+--------------------+
only showing top 20 rows



In [None]:
# Encode the string "label" column as a numeric "labelIndexed" column, which
# is the column the classifier is later trained against. In the recorded
# prediction output, label "B" appears with index 0.0.
stringIndexer = StringIndexer(
    inputCol="label",
    outputCol="labelIndexed",
)
si_model = stringIndexer.fit(inputDF)        # learn the label -> index mapping
inputDF2 = si_model.transform(inputDF)       # append the indexed label column

In [None]:
# Random 70% / 30% train/test partition; the fixed seed keeps the split
# repeatable from run to run.
train, test = inputDF2.randomSplit(weights=[0.7, 0.3], seed=23)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Decision-tree estimator trained on the "features" vector column against the
# numeric "labelIndexed" target; all other hyper-parameters keep their
# library defaults.
decisionTree = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="labelIndexed",
)

In [None]:
# Fit the decision tree on the training split only; the test split is kept
# aside for evaluation.
dtModel = decisionTree.fit(train)

In [None]:
# Total number of nodes in the fitted tree.
dtModel.numNodes

27

In [None]:
# Depth of the fitted tree.
dtModel.depth

5

In [None]:
# Importance weight the fitted tree assigns to each input feature, returned as
# a sparse vector keyed by feature index (indices not listed carry no weight).
dtModel.featureImportances

SparseVector(30, {0: 0.0073, 1: 0.0565, 2: 0.0082, 3: 0.0131, 5: 0.0219, 10: 0.0105, 20: 0.7509, 21: 0.018, 24: 0.0134, 26: 0.0106, 27: 0.0897})

In [None]:
# Number of input features the model was trained on.
dtModel.numFeatures

30

In [None]:
# Dump the learned tree as nested If/Else rules over feature indices, with the
# predicted class index at each leaf.
print(dtModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_b738baba51c8, depth=5, numNodes=27, numClasses=2, numFeatures=30
  If (feature 20 <= 16.765)
   If (feature 27 <= 0.15635)
    If (feature 3 <= 696.25)
     If (feature 24 <= 0.179)
      Predict: 0.0
     Else (feature 24 > 0.179)
      If (feature 0 <= 9.7485)
       Predict: 0.0
      Else (feature 0 > 9.7485)
       Predict: 1.0
    Else (feature 3 > 696.25)
     If (feature 1 <= 13.295)
      Predict: 0.0
     Else (feature 1 > 13.295)
      Predict: 1.0
   Else (feature 27 > 0.15635)
    If (feature 21 <= 23.215)
     If (feature 2 <= 86.945)
      Predict: 0.0
     Else (feature 2 > 86.945)
      Predict: 1.0
    Else (feature 21 > 23.215)
     Predict: 1.0
  Else (feature 20 > 16.765)
   If (feature 1 <= 14.805)
    If (feature 5 <= 0.13035)
     Predict: 0.0
    Else (feature 5 > 0.13035)
     Predict: 1.0
   Else (feature 1 > 14.805)
    If (feature 10 <= 0.18159999999999998)
     Predict: 0.0
    Else (feature 10 > 

In [None]:
# Apply the fitted tree to the held-out test split. The result is later read
# for its "probability" and "prediction" columns.
predictions = dtModel.transform(test)

In [None]:
# Compare the true label (string and indexed form) with the model's class
# probabilities and predicted class index for the first rows of the test split.
predictions.select(
    "label",
    "labelIndexed",
    "probability",
    "prediction",
).show()

+-----+------------+--------------------+----------+
|label|labelIndexed|         probability|prediction|
+-----+------------+--------------------+----------+
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|       0.0|
|    B|         0.0|[0.97609561752988...|     

In [None]:
# Score the held-out predictions against the indexed labels.
# metricName must be set explicitly: the evaluator's default metric is F1, so
# without it `accuracy` would actually hold the F1 score and the value printed
# below would be 1 - F1 rather than the misclassification rate.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="labelIndexed",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

# Test error is the fraction of test rows that were misclassified.
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0519481
