In [1]:
import findspark
findspark.init('/home/shashank/spark-2.3.2-bin-hadoop2.7')

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('code').getOrCreate()

In [4]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [5]:
data = spark.read.csv('College.csv', inferSchema=True, header=True)

In [6]:
data.createOrReplaceTempView('data')

In [8]:
spark.sql("FROM data SELECT *").show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [9]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [10]:
from pyspark.ml.feature import VectorAssembler

In [12]:
assembler = VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol='features')

In [13]:
output = assembler.transform(data)

In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [16]:
private_indexer = StringIndexer(inputCol='Private', outputCol='PrivateIndex')

In [17]:
output_fixed = private_indexer.fit(output).transform(output)

In [19]:
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [20]:
output_fixed.createOrReplaceTempView('output_fixed')

In [21]:
final_data = spark.sql("FROM output_fixed SELECT features, PrivateIndex as label")

In [22]:
final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1660.0,1232.0,72...|  0.0|
|[2186.0,1924.0,51...|  0.0|
|[1428.0,1097.0,33...|  0.0|
|[417.0,349.0,137....|  0.0|
|[193.0,146.0,55.0...|  0.0|
|[587.0,479.0,158....|  0.0|
|[353.0,340.0,103....|  0.0|
|[1899.0,1720.0,48...|  0.0|
|[1038.0,839.0,227...|  0.0|
|[582.0,498.0,172....|  0.0|
|[1732.0,1425.0,47...|  0.0|
|[2652.0,1900.0,48...|  0.0|
|[1179.0,780.0,290...|  0.0|
|[1267.0,1080.0,38...|  0.0|
|[494.0,313.0,157....|  0.0|
|[1420.0,1093.0,22...|  0.0|
|[4302.0,992.0,418...|  0.0|
|[1216.0,908.0,423...|  0.0|
|[1130.0,704.0,322...|  0.0|
|[3540.0,2001.0,10...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [23]:
train, test = final_data.randomSplit([0.7,0.3])

In [50]:
dtc = DecisionTreeClassifier(maxBins=50, maxDepth=3, minInfoGain=0.1)
dtc2 = DecisionTreeClassifier() #default
rfc = RandomForestClassifier(maxBins=50, maxDepth=20, minInfoGain=0.1, numTrees=200)
rfc2 = RandomForestClassifier() #default
gbc = GBTClassifier(maxBins=50, maxDepth=20, minInfoGain=0.1, maxIter=200)
gbc2 = GBTClassifier()


In [49]:
dtc2_model = dtc2.fit(train)
rfc2_model = rfc2.fit(train)
gbc2_model = gbc2.fit(train)

In [51]:
dtc_model = dtc.fit(train)
rfc_model = rfc.fit(train)
gbc_model = gbc.fit(train)

In [52]:
dtc_pred = dtc_model.transform(test)
rfc_pred = rfc_model.transform(test)
gbc_pred = gbc_model.transform(test)
dtc2_pred = dtc2_model.transform(test)
rfc2_pred = rfc2_model.transform(test)
gbc2_pred = gbc2_model.transform(test)

In [53]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

In [54]:
acc_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
auc_eval = BinaryClassificationEvaluator()

In [55]:
print("Reparameterized DTC has a prediction accuracy of {}% and AUC of {}".format(acc_eval.evaluate(dtc_pred)*100, auc_eval.evaluate(dtc_pred)))
print("Default DTC has a prediction accuracy of {}% and AUC of {}".format(acc_eval.evaluate(dtc2_pred)*100, auc_eval.evaluate(dtc2_pred)))
print("Reparameterized RFC has a prediction accuracy of {}% and AUC of {}".format(acc_eval.evaluate(rfc_pred)*100, auc_eval.evaluate(rfc_pred)))
print("Default RFC has a prediction accuracy of {}% and AUC of {}".format(acc_eval.evaluate(rfc2_pred)*100, auc_eval.evaluate(rfc2_pred)))
print("Reparameterized GBC has a prediction accuracy of {}% and AUC of {}".format(acc_eval.evaluate(gbc_pred)*100, auc_eval.evaluate(gbc_pred)))
print("Default GBC has a prediction accuracy of {}% and AUC of {}".format(acc_eval.evaluate(gbc2_pred)*100, auc_eval.evaluate(gbc2_pred)))

Reparameterized DTC has a prediction accuracy of 92.0% and AUC of 0.8505291005291005
Default DTC has a prediction accuracy of 91.11111111111111% and AUC of 0.9242112482853224
Reparameterized RFC has a prediction accuracy of 92.0% and AUC of 0.9824612972761123
Default RFC has a prediction accuracy of 94.66666666666667% and AUC of 0.9743288261806784
Reparameterized GBC has a prediction accuracy of 92.0% and AUC of 0.9603174603174602
Default GBC has a prediction accuracy of 90.66666666666666% and AUC of 0.9610523221634335
