In [0]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session (reuses an existing one if present),
# registered under the application name 'tree_project'.
spark = (
    SparkSession.builder
    .appName('tree_project')
    .getOrCreate()
)

In [0]:
# Load the 'dog_food_csv' table into a DataFrame.
table_reader = spark.read
df = table_reader.table('dog_food_csv')

# Preview the first five rows.
df.show(n=5)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
+---+---+----+---+-------+
only showing top 5 rows



In [0]:
# Goal: determine which of the features A, B, C, D is most responsible for Spoiled = 1.

In [0]:
# Print the column names, types and nullability of the loaded table so the
# feature columns (A-D) and the label column (Spoiled) can be checked.
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [0]:
from pyspark.ml.feature import VectorAssembler
# List the column names; A-D are used as model inputs and Spoiled as the
# label in the cells that follow.
df.columns

Out[7]: ['A', 'B', 'C', 'D', 'Spoiled']

In [0]:
# Pack the four input columns into a single vector column named 'features',
# which is the column the classifiers below are configured to read.
assembler = VectorAssembler(outputCol='features', inputCols=['A', 'B', 'C', 'D'])

# Append the 'features' column to the original DataFrame.
output = assembler.transform(df)

In [0]:
# Keep only what the models need: the assembled feature vector and the label.
final_data = output.select(
    'features',
    'Spoiled',
)

# Preview the first five rows of the modelling dataset.
final_data.show(n=5)

+------------------+-------+
|          features|Spoiled|
+------------------+-------+
|[4.0,2.0,12.0,3.0]|    1.0|
|[5.0,6.0,12.0,7.0]|    1.0|
|[6.0,2.0,13.0,6.0]|    1.0|
|[4.0,2.0,12.0,1.0]|    1.0|
|[4.0,2.0,12.0,3.0]|    1.0|
+------------------+-------+
only showing top 5 rows



In [0]:
# 70/30 train/test split.
# A fixed seed is passed so that the split (and therefore every metric and
# feature-importance value computed below) does not change on each re-run.
# NOTE(review): Spark's split is only repeatable if the underlying DataFrame
# has a stable partitioning/row order -- confirm for this table.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier

# Three tree-based classifiers, all reading the assembled 'features' vector
# and predicting the 'Spoiled' label; every other setting is left at its default.

# Single decision tree.
dtc = DecisionTreeClassifier(featuresCol='features', labelCol='Spoiled')

# Random forest.
rfc = RandomForestClassifier(featuresCol='features', labelCol='Spoiled')

# Gradient-boosted trees.
gbt = GBTClassifier(featuresCol='features', labelCol='Spoiled')

In [0]:
# Fit each estimator on the training split, in the order DTC -> RFC -> GBT.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [0]:
# Score the held-out test split with each fitted model (DTC, RFC, GBT).
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Evaluator comparing each model's predictions against the 'Spoiled' label.
# No metric is specified, so the evaluator's default is used
# (areaUnderROC according to the PySpark documentation).
my_binary_eval = BinaryClassificationEvaluator(labelCol='Spoiled')

# Report one score per model, in the order DTC, RFC, GBT.
for _label, _preds in (
    ('DTC Results', dtc_preds),
    ('RFC Results', rfc_preds),
    ('GBT Results', gbt_preds),
):
    print(_label, my_binary_eval.evaluate(_preds))

DTC Results 0.9435010482180293
RFC Results 0.9851153039832286
GBT Results 0.950524109014675


In [0]:
# Print each fitted model's feature-importance vector. Indices 0-3 follow the
# order of the assembler's input columns: A, B, C, D.
for _title, _fitted in (
    ('DTC Feature Importance: ', dtc_model),
    ('RFC Feature Importance: ', rfc_model),
    ('GBT Feature Importance: ', gbt_model),
):
    print(_title, _fitted.featureImportances)

DTC Feature Importance:  (4,[0,1,2,3],[0.029188739547305283,0.0035622436571635862,0.9486197519226692,0.018629264872861897])
RFC Feature Importance:  (4,[0,1,2,3],[0.02419877617752555,0.027540406398270652,0.9157824057604714,0.0324784116637324])
GBT Feature Importance:  (4,[0,1,2,3],[0.025577115397607108,0.006504023307322651,0.9486197519226691,0.019299109372401094])
