In [1]:
from pyspark.sql import SparkSession

In [2]:
spark =SparkSession.builder.appName('Tree Methods').getOrCreate()

In [3]:
data = spark.read.csv('College.csv',inferSchema=True,header=True)
cols = data.columns

In [4]:
from pyspark.ml import Pipeline

In [5]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [6]:
data.take(1)[0].asDict()

{'School': 'Abilene Christian University',
 'Private': 'Yes',
 'Apps': 1660,
 'Accept': 1232,
 'Enroll': 721,
 'Top10perc': 23,
 'Top25perc': 52,
 'F_Undergrad': 2885,
 'P_Undergrad': 537,
 'Outstate': 7440,
 'Room_Board': 3300,
 'Books': 450,
 'Personal': 2200,
 'PhD': 70,
 'Terminal': 78,
 'S_F_Ratio': 18.1,
 'perc_alumni': 12,
 'Expend': 7041,
 'Grad_Rate': 60}

In [7]:
data.na.drop().show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [8]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [9]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler,StringIndexer

In [10]:
stages=[]

In [11]:
indexer = StringIndexer(inputCol='Private',outputCol='label')
stages+=[indexer]

In [12]:
assembler_input=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']
assembler = VectorAssembler(inputCols=assembler_input,outputCol='features')
stages+=[assembler]

In [13]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(data)
data = pipelineModel.transform(data)
selectedcols = ["label", "features"] + cols
data = data.select(selectedcols)


In [14]:
data.take(1)[0].asDict()

{'label': 0.0,
 'features': DenseVector([1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0]),
 'School': 'Abilene Christian University',
 'Private': 'Yes',
 'Apps': 1660,
 'Accept': 1232,
 'Enroll': 721,
 'Top10perc': 23,
 'Top25perc': 52,
 'F_Undergrad': 2885,
 'P_Undergrad': 537,
 'Outstate': 7440,
 'Room_Board': 3300,
 'Books': 450,
 'Personal': 2200,
 'PhD': 70,
 'Terminal': 78,
 'S_F_Ratio': 18.1,
 'perc_alumni': 12,
 'Expend': 7041,
 'Grad_Rate': 60}

In [15]:
dataset = data.select('label','features')

In [16]:
train_data,test_data= dataset.randomSplit([0.7,0.3])

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,GBTClassifier

In [18]:
classifier_DT = DecisionTreeClassifier()
classifier_RF = RandomForestClassifier(numTrees=200)
classifier_GBT= GBTClassifier()

stagesml = [classifier_DT,classifier_RF,classifier_GBT]

In [19]:
dt_model = classifier_DT.fit(train_data)
rf_model = classifier_RF.fit(train_data)
gbt_model =classifier_GBT.fit(train_data)
stagesml+=[dt_model,rf_model,gbt_model]

In [20]:
pred_dt= dt_model.transform(test_data)
pred_rf=dt_model.transform(test_data)
pred_gbt=gbt_model.transform(test_data)
stagesml+=[pred_dt,pred_rf,pred_gbt]

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [22]:
eval = MulticlassClassificationEvaluator(metricName='accuracy')
stagesml+=[eval]

In [23]:
accu_dt=eval.evaluate(pred_dt)
accu_rf=eval.evaluate(pred_rf)
accu_gbt=eval.evaluate(pred_gbt)

In [24]:
stagesml = [accu_dt,accu_rf,accu_gbt]

In [25]:
print("Here are the results!")
print('-'*80)
print('A single decision tree had an accuracy of: {0:2.2f}%'.format(accu_dt*100))
print('-'*80)
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(accu_rf*100))
print('-'*80)
print('A ensemble using GBT had an accuracy of: {0:2.2f}%'.format(accu_gbt*100))

Here are the results!
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 92.49%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 92.49%
--------------------------------------------------------------------------------
A ensemble using GBT had an accuracy of: 92.96%


DataFrame[label: double, features: vector]