In [72]:
from pyspark.sql import SparkSession

In [73]:
spark = SparkSession.builder.appName('ClsTree').getOrCreate()

In [74]:
import os

In [75]:
college_data_file = os.path.join(os.path.curdir, 'data', 'College.csv')

In [76]:
college_data = spark.read.csv(college_data_file,
              inferSchema=True,
              header=True)

In [77]:
college_data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [78]:
college_data.count()

777

In [79]:
college_data.head().asDict()

{'School': 'Abilene Christian University',
 'Private': 'Yes',
 'Apps': 1660,
 'Accept': 1232,
 'Enroll': 721,
 'Top10perc': 23,
 'Top25perc': 52,
 'F_Undergrad': 2885,
 'P_Undergrad': 537,
 'Outstate': 7440,
 'Room_Board': 3300,
 'Books': 450,
 'Personal': 2200,
 'PhD': 70,
 'Terminal': 78,
 'S_F_Ratio': 18.1,
 'perc_alumni': 12,
 'Expend': 7041,
 'Grad_Rate': 60}

In [80]:
college_data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [81]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml import pipeline

In [82]:
assembler = VectorAssembler(inputCols=[
    'Apps',
     'Accept',
     'Enroll',
     'Top10perc',
     'Top25perc',
     'F_Undergrad',
     'P_Undergrad',
     'Outstate',
     'Room_Board',
     'Books',
     'Personal',
     'PhD',
     'Terminal',
     'S_F_Ratio',
     'perc_alumni',
     'Expend',
     'Grad_Rate'], outputCol='features')

In [83]:
indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [84]:
data = indexer.fit(college_data).transform(college_data)

In [85]:
data.head().asDict()

{'School': 'Abilene Christian University',
 'Private': 'Yes',
 'Apps': 1660,
 'Accept': 1232,
 'Enroll': 721,
 'Top10perc': 23,
 'Top25perc': 52,
 'F_Undergrad': 2885,
 'P_Undergrad': 537,
 'Outstate': 7440,
 'Room_Board': 3300,
 'Books': 450,
 'Personal': 2200,
 'PhD': 70,
 'Terminal': 78,
 'S_F_Ratio': 18.1,
 'perc_alumni': 12,
 'Expend': 7041,
 'Grad_Rate': 60,
 'PrivateIndex': 0.0}

In [86]:
transformed_data = assembler.transform(data)

In [87]:
transformed_data.head().asDict()

{'School': 'Abilene Christian University',
 'Private': 'Yes',
 'Apps': 1660,
 'Accept': 1232,
 'Enroll': 721,
 'Top10perc': 23,
 'Top25perc': 52,
 'F_Undergrad': 2885,
 'P_Undergrad': 537,
 'Outstate': 7440,
 'Room_Board': 3300,
 'Books': 450,
 'Personal': 2200,
 'PhD': 70,
 'Terminal': 78,
 'S_F_Ratio': 18.1,
 'perc_alumni': 12,
 'Expend': 7041,
 'Grad_Rate': 60,
 'PrivateIndex': 0.0,
 'features': DenseVector([1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0])}

In [88]:
final_data = transformed_data.select(['features', 'PrivateIndex'])

In [89]:
final_data.head()

Row(features=DenseVector([1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0]), PrivateIndex=0.0)

In [90]:
training_data, testing_data = final_data.randomSplit([0.70, 0.3])

In [91]:
from pyspark.ml.classification import (DecisionTreeClassifier,
                                      RandomForestClassifier,
                                      GBTClassifier)

In [92]:
d_classifier = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rf_classifier = RandomForestClassifier(numTrees=40,
                                      labelCol='PrivateIndex', featuresCol='features')
gb_classifier = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

In [93]:
d_model = d_classifier.fit(final_data)
rf_model = rf_classifier.fit(final_data)
gb_model = gb_classifier.fit(final_data)

In [94]:
d_preds = d_model.transform(testing_data)
rf_preds = rf_model.transform(testing_data)
gb_preds = gb_model.transform(testing_data)

In [95]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [96]:
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'PrivateIndex')

In [97]:
print('Decision Tree performance')
print(my_binary_eval.evaluate(d_preds))

Decision Tree performance
0.9858315677966102


In [98]:
print('Random Forest performance')
print(my_binary_eval.evaluate(rf_preds))

Random Forest performance
0.9939088983050848


In [99]:
my_binary_eval_gb = BinaryClassificationEvaluator(labelCol = 'PrivateIndex',
                                                 rawPredictionCol='prediction')

In [100]:
print('Gradient Boosting performance')
print(my_binary_eval_gb.evaluate(gb_preds))

Gradient Boosting performance
0.9971751412429379


In [102]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [103]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                            metricName='accuracy')

In [104]:
print(acc_eval.evaluate(rf_preds))

0.966804979253112
