In [1]:
from __future__ import print_function

import csv

from numpy import array
from pyspark import SparkConf
from pyspark import SparkContext
# $example on$
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel


In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# calling the SparkContext constructor a second time in the same kernel
# raises an error, which breaks re-running the notebook.
sc = SparkContext.getOrCreate(
    SparkConf().setAppName("PythonDecisionTreeRegressionExample")
)

In [3]:
def createlabeledpoints_r(f):
    """Turn one parsed CSV row into a LabeledPoint for the first model.

    Args:
        f: sequence of column values for a single row (the load cell
            passes the list of strings produced by ``split(",")``).

    Returns:
        LabeledPoint whose label is column 3 and whose features are
        columns 0-2, 5-7 and 10-11, in that order (eight values for a
        row with at least twelve columns).

    NOTE(review): the original comment named the features as batsman,
    nonstrike, bowler, home, averagescore, sr, innings -- seven names for
    eight selected columns. Confirm the column meanings against the CSV
    header.
    """
    label = f[3]
    features = []
    # Gather the selected column ranges in their original order.
    for columns in (slice(0, 3), slice(5, 8), slice(10, 12)):
        features.extend(f[columns])
    return LabeledPoint(label, features)

def createlabeledpoints_w(f):
    """Turn one parsed CSV row into a LabeledPoint for the second model.

    Args:
        f: list of column values for a single row (the load cell passes
            the list of strings produced by ``split(",")``).

    Returns:
        LabeledPoint whose label is column 4 and whose features are
        columns 0-2, column 5 and columns 8-11, in that order (eight
        values for a row with at least twelve columns).

    Raises:
        IndexError: if the row has fewer than six columns.

    NOTE(review): the original comment named the features as batsman,
    nonstrike, bowler, home, ave, sr, econ, innings -- confirm against the
    CSV header.
    """
    # Wrap column 5 in a one-element list. list(f[5]) would split a string
    # field into its characters, so a multi-character value such as "10"
    # would silently add extra features.
    features = f[0:3] + [f[5]] + f[8:12]
    return LabeledPoint(f[4], features)
    

In [4]:
# Read the CSV file as an RDD of text lines.
data = sc.textFile("decisiontree_ballinfo.csv")
# Split every line on commas into a list of string fields.
csvdata = data.map(lambda x: x.split(","))
# Treat the first record as the header and drop every row equal to it.
header = csvdata.first()
csvdata = csvdata.filter(lambda x: x != header)
# Build the training set for the first model from the remaining rows.
trainingdata1 = csvdata.map(createlabeledpoints_r)
# NOTE: collect() brings the whole RDD back to the driver and displays it;
# prefer trainingdata1.take(n) when only a preview is needed.
trainingdata1.collect()

[LabeledPoint(11.0, [1.0,2.0,2.0,1.0,36.44,143.74,7.55,0.0]),
 LabeledPoint(5.0, [1.0,2.0,2.0,1.0,36.44,143.74,7.55,0.0]),
 LabeledPoint(9.0, [1.0,2.0,4.0,1.0,36.44,143.74,7.26,0.0]),
 LabeledPoint(7.0, [2.0,1.0,2.0,1.0,28.77,134.56,7.55,0.0]),
 LabeledPoint(5.0, [2.0,1.0,4.0,1.0,28.77,134.56,7.26,0.0]),
 LabeledPoint(1.0, [2.0,1.0,4.0,1.0,28.37,119.25,8.19,0.0]),
 LabeledPoint(7.0, [2.0,1.0,2.0,1.0,28.37,119.25,7.55,0.0]),
 LabeledPoint(6.0, [1.0,2.0,4.0,1.0,36.44,143.74,6.77,0.0]),
 LabeledPoint(17.0, [2.0,1.0,4.0,1.0,28.37,119.25,8.19,0.0]),
 LabeledPoint(3.0, [2.0,1.0,4.0,1.0,25.88,129.01,6.77,0.0]),
 LabeledPoint(12.0, [1.0,2.0,2.0,1.0,36.44,143.74,7.55,0.0]),
 LabeledPoint(5.0, [1.0,2.0,4.0,1.0,36.44,143.74,7.26,0.0]),
 LabeledPoint(2.0, [2.0,0.0,4.0,1.0,25.88,129.01,6.77,0.0]),
 LabeledPoint(4.0, [1.0,0.0,2.0,1.0,28.45,144.09,7.55,0.0]),
 LabeledPoint(9.0, [0.0,1.0,4.0,1.0,25.63,120.48,6.77,0.0]),
 LabeledPoint(14.0, [1.0,0.0,2.0,1.0,28.45,144.09,7.55,0.0]),
 LabeledPoint(9.0, [

In [5]:
# Fit a decision-tree classifier on trainingdata1. categoricalFeaturesInfo maps
# a feature index to its number of categories; features not listed there are
# treated as continuous.
model1 = DecisionTree.trainClassifier(
    trainingdata1,
    numClasses=37,
    categoricalFeaturesInfo={0: 5, 1: 5, 2: 5, 3: 2, 7: 2},
    maxDepth=30,
)
# Print the heading and the full textual dump of the learned tree.
print('Classifier tree model:', model1.toDebugString(), sep='\n')

Classifier tree model:
DecisionTreeModel classifier of depth 30 with 29995 nodes
  If (feature 6 <= 7.305)
   If (feature 4 <= 16.775)
    If (feature 1 in {3.0,0.0})
     If (feature 5 <= 131.985)
      If (feature 0 in {0.0})
       If (feature 3 in {0.0})
        If (feature 7 in {0.0})
         If (feature 6 <= 6.93)
          If (feature 6 <= 6.375)
           If (feature 2 in {1.0})
            Predict: 10.0
           Else (feature 2 not in {1.0})
            If (feature 4 <= 12.620000000000001)
             If (feature 2 in {2.0})
              If (feature 5 <= 117.32499999999999)
               Predict: 3.0
              Else (feature 5 > 117.32499999999999)
               Predict: 11.0
             Else (feature 2 not in {2.0})
              Predict: 11.0
            Else (feature 4 > 12.620000000000001)
             Predict: 3.0
          Else (feature 6 > 6.375)
           If (feature 2 in {2.0})
            If (feature 6 <= 6.835)
             If (feature 4 <= 12.620000000

In [6]:
# Build the second training set from the same header-free rows.
trainingdata2 = csvdata.map(createlabeledpoints_w)
# Fit a second decision-tree classifier. categoricalFeaturesInfo maps a feature
# index to its number of categories; features not listed there are treated as
# continuous.
model2 = DecisionTree.trainClassifier(
    trainingdata2,
    numClasses=5,
    categoricalFeaturesInfo={0: 5, 1: 5, 2: 5, 3: 2, 7: 2},
    maxDepth=20,
)
# Print the heading and the full textual dump of the learned tree.
print('Classifier tree model:', model2.toDebugString(), sep='\n')

Classifier tree model:
DecisionTreeModel classifier of depth 20 with 7683 nodes
  If (feature 0 in {2.0,1.0})
   If (feature 1 in {2.0,1.0})
    If (feature 5 <= 23.9)
     If (feature 1 in {1.0})
      If (feature 4 <= 29.14)
       If (feature 4 <= 26.585)
        If (feature 5 <= 19.9)
         If (feature 3 in {0.0})
          If (feature 5 <= 17.75)
           If (feature 6 <= 7.985)
            If (feature 6 <= 7.905)
             If (feature 5 <= 17.35)
              If (feature 4 <= 19.59)
               Predict: 0.0
              Else (feature 4 > 19.59)
               If (feature 6 <= 7.41)
                If (feature 7 in {0.0})
                 If (feature 4 <= 20.665)
                  Predict: 1.0
                 Else (feature 4 > 20.665)
                  Predict: 0.0
                Else (feature 7 not in {0.0})
                 Predict: 0.0
               Else (feature 6 > 7.41)
                If (feature 7 in {0.0})
                 Predict: 0.0
                Else

In [7]:
# Persist both trained models, each to its own output directory.
# NOTE(review): Spark normally refuses to write into an existing output
# directory, so re-running this cell may fail until "runs" and "wickets"
# are removed -- confirm.
for fitted_model, output_dir in ((model1, "runs"), (model2, "wickets")):
    fitted_model.save(sc, output_dir)

In [8]:
# Load the decision-tree model stored under "runs" (the path model1 was saved to).
Model_runs = DecisionTreeModel.load(sc, "runs")