In [1]:
import os

import findspark

# Point findspark at the Spark installation. SPARK_HOME from the environment
# takes precedence so the notebook is not tied to a single machine; the
# original local path is kept as the fallback when the variable is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/shashank/spark-2.3.2-bin-hadoop2.7'))

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Obtain the notebook's SparkSession under the application name 'dog_food'.
spark = (
    SparkSession.builder
    .appName('dog_food')
    .getOrCreate()
)

In [4]:
# Read the dog food CSV: the first row holds the column names and the column
# types are inferred from the values.
data = spark.read.csv(
    'dog_food.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the inferred schema to confirm the column names and types.
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [8]:
# Register the DataFrame as the temporary view 'data' so it can be queried
# through spark.sql.
data.createOrReplaceTempView('data')

In [10]:
# Average of each chemical column (A-D), grouped by the Spoiled flag.
avg_by_spoiled_query = "FROM data SELECT Spoiled, AVG(A), AVG(B) , AVG(C), AVG(D) GROUP BY Spoiled"
avg_by_spoiled = spark.sql(avg_by_spoiled_query)
avg_by_spoiled.show()

+-------+-----------------+-----------------+------------------+------------------+
|Spoiled|           avg(A)|           avg(B)|            avg(C)|            avg(D)|
+-------+-----------------+-----------------+------------------+------------------+
|    0.0|5.422857142857143|             5.66|  8.01142857142857|5.6085714285714285|
|    1.0|5.814285714285714|5.114285714285714|11.914285714285715| 5.507142857142857|
+-------+-----------------+-----------------+------------------+------------------+



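Just looking at the group averages, we can see a big difference in Chemical C between spoiled batches (about 11.9) and good batches (about 8.0), while the averages for A, B and D are nearly the same in both groups.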

In [11]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [12]:
# List the column names to pick the feature columns and the label column.
data.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [13]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# Combine the four chemical columns into one vector column named 'features'.
feature_cols = ['A', 'B', 'C', 'D']
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [15]:
# Apply the assembler to add the 'features' vector column alongside the
# original columns.
output = assembler.transform(data)
# Display the first row as a quick sanity check of the assembled vector.
output.head()

Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0, features=DenseVector([4.0, 2.0, 12.0, 3.0]))

In [16]:
# Register the assembled DataFrame as the temporary view 'output' so it can
# be queried through spark.sql.
output.createOrReplaceTempView('output')

In [17]:
# Keep only the feature vector and the target, renaming Spoiled to 'label'.
final_data_query = "FROM output SELECT features, Spoiled AS label"
final_data = spark.sql(final_data_query)

In [18]:
# Preview the (features, label) frame that the classifiers are fit on.
final_data.show()

+-------------------+-----+
|           features|label|
+-------------------+-----+
| [4.0,2.0,12.0,3.0]|  1.0|
| [5.0,6.0,12.0,7.0]|  1.0|
| [6.0,2.0,13.0,6.0]|  1.0|
| [4.0,2.0,12.0,1.0]|  1.0|
| [4.0,2.0,12.0,3.0]|  1.0|
|[10.0,3.0,13.0,9.0]|  1.0|
| [8.0,5.0,14.0,5.0]|  1.0|
| [5.0,8.0,12.0,8.0]|  1.0|
| [6.0,5.0,12.0,9.0]|  1.0|
| [3.0,3.0,12.0,1.0]|  1.0|
| [9.0,8.0,11.0,3.0]|  1.0|
|[1.0,10.0,12.0,3.0]|  1.0|
|[1.0,5.0,13.0,10.0]|  1.0|
|[2.0,10.0,12.0,6.0]|  1.0|
|[1.0,10.0,11.0,4.0]|  1.0|
| [5.0,3.0,12.0,2.0]|  1.0|
| [4.0,9.0,11.0,8.0]|  1.0|
| [5.0,1.0,11.0,1.0]|  1.0|
|[4.0,9.0,12.0,10.0]|  1.0|
| [5.0,8.0,10.0,9.0]|  1.0|
+-------------------+-----+
only showing top 20 rows



In [43]:
# First pass: fit every classifier with default hyperparameters on the full
# (features, label) frame to see which feature drives the prediction.
# The tree-based models take a seed; it is pinned explicitly so the fitted
# models (and their feature importances) do not depend on the library's
# default seed and can be reproduced after a kernel restart.
SEED = 42

lr_model = LogisticRegression()
dtc_model = DecisionTreeClassifier(seed=SEED)
rfc_model = RandomForestClassifier(seed=SEED)
gbc_model = GBTClassifier(seed=SEED)

lr_fit = lr_model.fit(final_data)
dtc_fit = dtc_model.fit(final_data)
rfc_fit = rfc_model.fit(final_data)
gbc_fit = gbc_model.fit(final_data)

In [47]:
# Random forest feature importances; positions follow the assembler's input
# order (0=A, 1=B, 2=C, 3=D).
rfc_fit.featureImportances
# In the recorded output, index 2 (chemical C) carries by far the largest importance.

SparseVector(4, {0: 0.0135, 1: 0.0231, 2: 0.9406, 3: 0.0228})

In [48]:
# Decision tree feature importances; positions follow the assembler's input
# order (0=A, 1=B, 2=C, 3=D).
dtc_fit.featureImportances
# In the recorded output, index 2 (chemical C) carries by far the largest importance.

SparseVector(4, {0: 0.0026, 1: 0.0089, 2: 0.9686, 3: 0.0199})

In [49]:
# Gradient-boosted trees feature importances; positions follow the assembler's
# input order (0=A, 1=B, 2=C, 3=D).
gbc_fit.featureImportances
# In the recorded output, index 2 (chemical C) again has the largest importance.

SparseVector(4, {0: 0.0434, 1: 0.0539, 2: 0.7923, 3: 0.1104})

In [51]:
# Logistic regression coefficients, one per feature in the assembler's input
# order (0=A, 1=B, 2=C, 3=D).
lr_fit.coefficients
# In the recorded output, the coefficient on index 2 (chemical C) is the only
# large, positive one.
# NOTE(review): no feature scaling is applied in this notebook, so coefficient
# magnitudes are only a rough guide to importance - confirm before relying on them.

DenseVector([-0.1349, -0.1067, 4.3961, -0.213])