In [7]:
import findspark

In [8]:
findspark.init('/home/pushya/spark-2.1.0-bin-hadoop2.7')

In [9]:
import pyspark

In [10]:
from pyspark.sql import SparkSession

In [11]:
spark = SparkSession.builder.appName('mytree').getOrCreate()

In [14]:
data=spark.read.csv('College.csv',inferSchema='true',header='true')

In [16]:
data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [22]:
data.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

In [19]:
from pyspark.ml.feature import VectorAssembler

In [23]:
assembler= VectorAssembler(inputCols=['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], 
                           
                           outputCol='features')

In [24]:
output = assembler.transform(data)

In [25]:
from pyspark.ml.feature import StringIndexer

In [28]:
abc = StringIndexer(inputCol='Private', outputCol= 'Privateindex')

In [31]:
full_data = abc.fit(output).transform(output)

In [32]:
full_data.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- Privateindex: double (nullable = true)



In [33]:
final_data = full_data.select(
    'features',
    'Privateindex'

)

In [34]:
final_data.show()

+--------------------+------------+
|            features|Privateindex|
+--------------------+------------+
|[1660.0,1232.0,72...|         0.0|
|[2186.0,1924.0,51...|         0.0|
|[1428.0,1097.0,33...|         0.0|
|[417.0,349.0,137....|         0.0|
|[193.0,146.0,55.0...|         0.0|
|[587.0,479.0,158....|         0.0|
|[353.0,340.0,103....|         0.0|
|[1899.0,1720.0,48...|         0.0|
|[1038.0,839.0,227...|         0.0|
|[582.0,498.0,172....|         0.0|
|[1732.0,1425.0,47...|         0.0|
|[2652.0,1900.0,48...|         0.0|
|[1179.0,780.0,290...|         0.0|
|[1267.0,1080.0,38...|         0.0|
|[494.0,313.0,157....|         0.0|
|[1420.0,1093.0,22...|         0.0|
|[4302.0,992.0,418...|         0.0|
|[1216.0,908.0,423...|         0.0|
|[1130.0,704.0,322...|         0.0|
|[3540.0,2001.0,10...|         1.0|
+--------------------+------------+
only showing top 20 rows



In [42]:
from pyspark.ml.classification import (RandomForestClassifier,DecisionTreeClassifier,GBTClassifier)

In [43]:
from pyspark.ml.regression import (RandomForestRegressor,DecisionTreeRegressor,GBTRegressor)

In [40]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [46]:
rfc=RandomForestClassifier(labelCol='Privateindex')
dtc=DecisionTreeClassifier(labelCol='Privateindex')
gbt=GBTClassifier(labelCol='Privateindex')

In [47]:
rfr=RandomForestRegressor(labelCol='Privateindex')
dtr=DecisionTreeRegressor(labelCol='Privateindex')
gbr=GBTRegressor(labelCol='Privateindex')

In [48]:
rfc_fit=rfc.fit(train_data)
dtc_fit=dtc.fit(train_data)
gbt_fit=gbt.fit(train_data)

In [58]:
rfc_final=rfc_fit.transform(test_data)
dtc_final=dtc_fit.transform(test_data)
gbt_final=gbt_fit.transform(test_data)

In [50]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [60]:
gbt_final.show()

+--------------------+------------+----------+
|            features|Privateindex|prediction|
+--------------------+------------+----------+
|[150.0,130.0,88.0...|         0.0|       0.0|
|[152.0,128.0,75.0...|         0.0|       0.0|
|[213.0,155.0,75.0...|         0.0|       0.0|
|[222.0,185.0,91.0...|         0.0|       0.0|
|[244.0,198.0,82.0...|         0.0|       0.0|
|[283.0,201.0,97.0...|         0.0|       0.0|
|[291.0,245.0,126....|         0.0|       0.0|
|[313.0,228.0,137....|         0.0|       0.0|
|[321.0,318.0,172....|         0.0|       0.0|
|[325.0,284.0,95.0...|         0.0|       0.0|
|[342.0,254.0,126....|         0.0|       0.0|
|[367.0,274.0,158....|         0.0|       0.0|
|[369.0,312.0,90.0...|         0.0|       0.0|
|[372.0,362.0,181....|         0.0|       0.0|
|[380.0,237.0,104....|         0.0|       0.0|
|[392.0,351.0,155....|         0.0|       0.0|
|[404.0,400.0,169....|         0.0|       0.0|
|[427.0,385.0,143....|         0.0|       0.0|
|[434.0,321.0

In [51]:
my_binary_eval=BinaryClassificationEvaluator(labelCol='Privateindex')

In [62]:
print('rfc:')
my_binary_eval.evaluate(rfc_final)

rfc:


0.9889373794783315

In [55]:
print('dtc:')
my_binary_eval.evaluate(dtc_final)

dtc:


0.941033187861565

In [63]:
my_binary_eval_2=BinaryClassificationEvaluator(labelCol='Privateindex',rawPredictionCol='Prediction')

In [64]:
print('dtc:')
my_binary_eval.evaluate(dtc_final)

dtc:


0.941033187861565

In [65]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [68]:
acc_eval=MulticlassClassificationEvaluator(labelCol='Privateindex',
                                           metricName='accuracy')

In [69]:
acc_eval.evaluate(rfc_final)

0.9557522123893806