In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('treecode').getOrCreate()

In [4]:
df = spark.read.csv('College.csv',inferSchema=True,header=True)

In [5]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [6]:
df.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [7]:
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [8]:
for item in df.head():
    print(item)

Abilene Christian University
Yes
1660
1232
721
23
52
2885
537
7440
3300
450
2200
70
78
18.1
12
7041
60


In [9]:
# format data in two columns
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
assembler = VectorAssembler(
inputCols=['Apps',
         'Accept',
         'Enroll',
         'Top10perc',
         'Top25perc',
         'F_Undergrad',
         'P_Undergrad',
         'Outstate',
         'Room_Board',
         'Books',
         'Personal',
         'PhD',
         'Terminal',
         'S_F_Ratio',
         'perc_alumni',
         'Expend',
         'Grad_Rate'],
outputCol='features')

In [11]:
output = assembler.transform(df)

In [13]:
from pyspark.ml.feature import StringIndexer

In [16]:
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')
output_fixed = indexer.fit(output).transform(output)

In [18]:
final_data = output_fixed.select('features','PrivateIndex')

In [19]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [20]:
# The Classifier
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,GBTClassifier

In [25]:
dtc = DecisionTreeClassifier(featuresCol='features',labelCol='PrivateIndex')
rfc = RandomForestClassifier(featuresCol='features',labelCol='PrivateIndex')
gbt = GBTClassifier(featuresCol='features',labelCol='PrivateIndex')

In [26]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [27]:
# evaluation
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

In [28]:
# evaluation matrix
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [31]:
# select prediction and true label to compute error
mul_acc_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='PrivateIndex', metricName='accuracy')

In [54]:
dtc_acc = acc_eval.evaluate(dtc_predictions)
rfc_acc = acc_eval.evaluate(rfc_predictions)
gbt_acc = acc_eval.evaluate(gbt_predictions)

In [55]:
print('DTC:',dtc_acc)

DTC: 0.946058091286307


In [56]:
print('RFC',rfc_acc)

RFC 0.9543568464730291


In [57]:
print('GBT',gbt_acc)

GBT 0.946058091286307


In [58]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [59]:
bi_acc_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='PrivateIndex', metricName='areaUnderROC')

In [60]:
bi_dtc_acc = bi_acc_eval.evaluate(dtc_predictions)
bi_rfc_acc = bi_acc_eval.evaluate(rfc_predictions)
bi_gbt_acc = bi_acc_eval.evaluate(gbt_predictions)

In [61]:
print('Binary DTC:',bi_dtc_acc)

Binary DTC: 0.9591719632768362


In [62]:
print('Binary RFC:',bi_rfc_acc)

Binary RFC: 0.9855667372881356


In [63]:
print('Binary GBT:',bi_gbt_acc)

Binary GBT: 0.9712217514124291


In [64]:
gbt_bi_acc_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='PrivateIndex', metricName='areaUnderROC')

In [65]:
print(gbt_bi_acc_eval.evaluate(gbt_predictions))

0.928363347457627
