In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# from user_definition import *
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

In [6]:
# Create (or reuse) the SparkContext & SparkSession.
# getOrCreate() keeps this cell safe to re-run: a bare SparkContext() raises
# an error when a context is already active in the kernel.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [11]:
# Relative path of the input CSV; the next cell loads it with ss.read.csv.
data_path = 'shardonnay697/shardonnay697.csv'

In [23]:
# Load the CSV: the header row supplies the column names and the column
# types are inferred from the data.
df = (
    ss.read
    .options(header=True, inferSchema=True)
    .csv(data_path)
)
# Preview a single row to check how the columns were parsed.
df.show(1)

+---+--------+-------+--------+------------+---------+-----------------+----------------------+----------+-----------------+---------+------------+--------+-------------+----------+-----------+
|_c0|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|product_id|add_to_cart_order|reordered|product_name|aisle_id|department_id|department|      aisle|
+---+--------+-------+--------+------------+---------+-----------------+----------------------+----------+-----------------+---------+------------+--------+-------------+----------+-----------+
|  0| 2539329|      1|   prior|           1|        2|                8|                  null|     196.0|              1.0|      0.0|        Soda|    77.0|          7.0| beverages|soft drinks|
+---+--------+-------+--------+------------+---------+-----------------+----------------------+----------+-----------------+---------+------------+--------+-------------+----------+-----------+
only showing top 1 row



In [1]:
# Converting strings to numeric values.
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each column named in `cols` with its StringIndexer encoding.

    For every column name, a StringIndexer is fitted on the DataFrame built
    so far, the encoded values are written to a temporary "<name>-num"
    column, the original column is dropped, and the temporary column is
    renamed back to the original name.

    Args:
        df: input DataFrame; it is not modified, a new DataFrame is returned.
        cols: iterable of column names to encode.

    Returns:
        The DataFrame with every listed column replaced by its indexed
        version, or `df` itself when `cols` is empty.
    """
    indexed = df
    for name in cols:
        encoded_name = name + "-num"
        # Fit on the current state so columns encoded earlier are kept.
        model = StringIndexer(inputCol=name, outputCol=encoded_name).fit(indexed)
        indexed = (
            model.transform(indexed)
            .drop(name)
            .withColumnRenamed(encoded_name, name)
        )
    return indexed

In [None]:
# Encode the 'ordered_true' column as numeric indices (it is renamed to 'label' in the next cell).
# NOTE(review): 'ordered_true' is not among the columns in the saved df.show(1)
# output above — confirm the column exists in the CSV being loaded.
df = indexStringColumns(df, ['ordered_true'])

In [19]:
# Expose the target column as 'label', then split on eval_set:
# rows marked "prior" form the training set, rows marked "train" the test set.
train_df = df.withColumnRenamed('ordered_true', 'label').where('eval_set == "prior"')
test_df = df.withColumnRenamed('ordered_true', 'label').where('eval_set == "train"')
# Column drop left disabled, as in the original cell:
# .drop('_c0','eval_set', 'product_name', 'department', 'aisle')

In [26]:
# Inspect the column names and types of the training split.
# NOTE(review): the saved output below lists neither 'label' nor 'eval_set',
# so it appears to come from an earlier state of train_df — re-run to refresh.
train_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: integer (nullable = true)
 |-- order_hour_of_day: integer (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: double (nullable = true)
 |-- add_to_cart_order: double (nullable = true)
 |-- reordered: double (nullable = true)
 |-- aisle_id: double (nullable = true)
 |-- department_id: double (nullable = true)



In [27]:
# Number of rows in the training split.
train_df.count()

894

In [28]:
# train_df.cache()
# test_df.cache()

In [None]:
# Create feature vector and label column.
# Features are picked by name and type rather than by position: every
# non-string column other than 'label'. VectorAssembler cannot assemble
# string columns (e.g. eval_set, product_name), and slicing columns[0:-1]
# only works while the label happens to be the last column.
feature_cols = [name for name, dtype in train_df.dtypes
                if name != "label" and dtype != "string"]
# NOTE(review): days_since_prior_order contains nulls (see the df.show output);
# presumably these need imputing or an explicit handleInvalid setting — confirm.

# One assembler is shared so train and test get the same feature layout.
va = VectorAssembler(outputCol="features", inputCols=feature_cols)
carttrain = va.transform(train_df).select("features", "label")
cartvalid = va.transform(test_df).select("features", "label")

In [None]:
# Preview the first five assembled (features, label) rows.
carttrain.show(5)

In [None]:
# Cache the assembled training and validation DataFrames.
# Both are released again by the unpersist() calls near the end of the notebook.
carttrain.cache()
cartvalid.cache()

In [None]:
# Create a RandomForestClassifier and build a model using the training dataset.
# maxDepth=30 lets each tree grow deep; seed is fixed so that re-running the
# notebook trains the same forest and reports the same score.
rf = RandomForestClassifier(maxDepth=30, seed=42)
rfmodel = rf.fit(carttrain)

In [None]:
# Apply the fitted model to the validation data.
# The next cell calculates the F1 score from the 'label' and 'prediction' columns of this result.
rfpredicts = rfmodel.transform(cartvalid)
rfpredicts.show() # preview of the prediction DataFrame

In [None]:
# Score the predictions with the F1 metric, comparing 'prediction' to 'label'.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_score = evaluator.evaluate(rfpredicts)
print('F1 = %.4f' % f1_score)

In [None]:
# Unpersist the datasets that were cached before training.
carttrain.unpersist()
cartvalid.unpersist()

In [None]:
# Stop SparkContext & SparkSession; run this last, nothing above works afterwards.
# NOTE(review): ss.stop() presumably also stops the underlying context, which
# would make sc.stop() redundant — confirm before simplifying.
sc.stop()
ss.stop()