## Creating Spark Session

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Tree").getOrCreate()

## Reading the input file

In [2]:
data = spark.read.csv("College.csv", inferSchema= True, header = True)

## Target is Private

In [3]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [4]:
data.describe().show()

+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+
|summary|              School|Private|              Apps|            Accept|          Enroll|         Top10perc|         Top25perc|      F_Undergrad|      P_Undergrad|          Outstate|        Room_Board|             Books|          Personal|               PhD|          Terminal|         S_F_Ratio|       perc_alumni|          Expend|         Grad_Rate|
+-------+--------------------+-------+------------------+------------------+----------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

In [5]:
data.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [6]:
data.head()

Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)

In [7]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

## Checking for Null in the features

In [43]:
from pyspark.sql.functions import *

data.select([count(when(isnan(x)|col(x).isNull(), x)).alias(x) for x in data.columns]).show()

+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|     0|      0|   0|     0|     0|        0|        0|          0|          0|       0|         0|    0|       0|  0|       0|        0|          0|     0|        0|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+



## Converting all the features in the form of array for Pyspark ML algorithim

In [8]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad', 
                                         'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD', 
                                         'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'],
                             outputCol = "features")

In [9]:
output = assembler.transform(data)

## Converting Target (Private -- Categorical) to numeric

In [10]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol = "Private", outputCol = "PrivateIndex")

In [11]:
output_fixed = indexer.fit(output).transform(output)

In [12]:
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [13]:
final_data = output_fixed.select(["features", "PrivateIndex"])

final_data.show()

+--------------------+------------+
|            features|PrivateIndex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[587.0,479.0,158....|         0.0|
|[353.0,340.0,103....|         0.0|
|[1899.0,1720.0,48...|         0.0|
|[1038.0,839.0,227...|         0.0|
|[582.0,498.0,172....|         0.0|
|[1732.0,1425.0,47...|         0.0|
|[2652.0,1900.0,48...|         0.0|
|[1179.0,780.0,290...|         0.0|
|[1267.0,1080.0,38...|         0.0|
|[494.0,313.0,157....|         0.0|
|[1420.0,1093.0,22...|         0.0|
|[4302.0,992.0,418...|         0.0|
|[1216.0,908.0,423...|         0.0|
|[1130.0,704.0,322...|         0.0|
|[3540.0,2001.0,10...|         1.0|
+--------------------+------------+
only showing top 20 rows



## Splitting data into train, test

In [14]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

## Creating Model

In [15]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(featuresCol = "features", labelCol = "PrivateIndex")
gbt = GBTClassifier(featuresCol = "features", labelCol = "PrivateIndex")
rfc = RandomForestClassifier(featuresCol = "features", labelCol = "PrivateIndex")

## Training the models

In [16]:
dtc_model = dtc.fit(train_data)
gbt_model = gbt.fit(train_data)
rfc_model = rfc.fit(train_data)

## Prediction

In [26]:
dtc_preds = dtc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)

In [23]:
dtc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [27]:
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



## Model Evaluation

## AUC

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

my_binary_eval = BinaryClassificationEvaluator(rawPredictionCol = "rawPrediction", labelCol = "PrivateIndex")

In [28]:
print("AUC DTC: ", my_binary_eval.evaluate(dtc_preds))
print("AUC GBT: ", my_binary_eval.evaluate(gbt_preds))
print("AUC RFC: ", my_binary_eval.evaluate(rfc_preds))

AUC DTC:  0.9413937058445729
AUC GBT:  0.9742025262256475
AUC RFC:  0.979233568828944


## Calculating Accuracy Score

In [36]:
accuracy_eval = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "PrivateIndex", 
                                                  metricName= "accuracy")

In [37]:
print("Accuracy Score DTC: ", accuracy_eval.evaluate(dtc_preds))
print("Accuracy Score GBT: ", accuracy_eval.evaluate(gbt_preds))
print("Accuracy Score RFC: ", accuracy_eval.evaluate(rfc_preds))

Accuracy Score DTC:  0.8810572687224669
Accuracy Score GBT:  0.9030837004405287
Accuracy Score RFC:  0.9383259911894273


## Calculating F1 Score

In [39]:
F1_eval = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "PrivateIndex", 
                                                  metricName= "f1")

In [40]:
print("F1 Score DTC: ", F1_eval.evaluate(dtc_preds))
print("F1 Score GBT: ", F1_eval.evaluate(gbt_preds))
print("F1 Score RFC: ", F1_eval.evaluate(rfc_preds))

F1 Score DTC:  0.8860723859512873
F1 Score GBT:  0.9066010557721607
F1 Score RFC:  0.9383259911894273


## Classification Report for Random Forest

In [29]:
from sklearn.metrics import classification_report

print(classification_report(rfc_preds.select('PrivateIndex').toPandas(), rfc_preds.select('prediction').toPandas()))

             precision    recall  f1-score   support

        0.0       0.96      0.96      0.96       173
        1.0       0.87      0.87      0.87        54

avg / total       0.94      0.94      0.94       227



## Confusion Matrix for Random Forest

In [42]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(rfc_preds.select('PrivateIndex').toPandas(), rfc_preds.select('prediction').toPandas()))

[[166   7]
 [  7  47]]
