In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('ComparisonBwTrees').getOrCreate()

In [3]:
df = spark.read.csv('College.csv', header = True, inferSchema = True)

In [4]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [5]:
df.describe().show()

+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+
|summary|              School|Private|              Apps|            Accept|          Enroll|         Top10perc|         Top25perc|      F_Undergrad|      P_Undergrad|          Outstate|        Room_Board|             Books|          Personal|               PhD|          Terminal|         S_F_Ratio|       perc_alumni|          Expend|         Grad_Rate|
+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

In [6]:
# we see that only School and Private are string
# other vectors are integers and have the variance retained

In [7]:
cols = df.columns

In [8]:
ls_feature = cols[2:]

In [9]:
df.groupBy('Private').count().show()

+-------+-----+
|Private|count|
+-------+-----+
|     No|  212|
|    Yes|  565|
+-------+-----+



Machine Learning Starts

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer

from pyspark.ml import Pipeline

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

In [11]:
indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
assembler = VectorAssembler(inputCols = ls_feature, outputCol = 'features')
scalar = StandardScaler(inputCol = 'features' , outputCol = 'scaledFeature')

In [12]:
# Single Tree
dtc = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='scaledFeature')

# Random Forest
rfc = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='scaledFeature')

# Gradient Boosted Tree
gbt = GBTClassifier(labelCol='PrivateIndex',featuresCol='scaledFeature')

In [13]:
# pipeline setup

pipeline_dtc = Pipeline(stages = [indexer, assembler, scalar, dtc])
pipeline_rfc = Pipeline(stages = [indexer, assembler, scalar, rfc])
pipeline_gbt = Pipeline(stages = [indexer, assembler, scalar, gbt])

In [14]:
# train, test data split

train, test = df.randomSplit([0.7,0.3])

In [15]:
dtc_model = pipeline_dtc.fit(train)
rfc_model = pipeline_rfc.fit(train)
gbt_model = pipeline_gbt.fit(train)

In [16]:
dtc_predictions = dtc_model.transform(test)
rfc_predictions = rfc_model.transform(test)
gbt_predictions = gbt_model.transform(test)

Model Evaluation
- Accuracy
- AUC

In [17]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [18]:
# Accuracy
evaluator_acc = MulticlassClassificationEvaluator(labelCol="PrivateIndex", predictionCol="prediction", metricName="accuracy")
# AUC
evaluator_auc = BinaryClassificationEvaluator(labelCol="PrivateIndex", rawPredictionCol="prediction")

In [19]:
ls_model = [dtc_predictions, rfc_predictions, gbt_predictions]
ls_algo = ['Single Decission Tree', 'Random Forest', 'Gradient Boost']
for i in range(3):
    print (ls_algo[i]+' Accuracy:',round(evaluator_acc.evaluate(ls_model[i]),2),
           'and AUC:',round(evaluator_auc.evaluate(ls_model[i]),2))
    print ('-'*80)

Single Decission Tree Accuracy: 0.93 and AUC: 0.9
--------------------------------------------------------------------------------
Random Forest Accuracy: 0.95 and AUC: 0.9
--------------------------------------------------------------------------------
Gradient Boost Accuracy: 0.93 and AUC: 0.91
--------------------------------------------------------------------------------
