In [1]:
# Bootstrap: findspark locates the local Spark installation and makes the
# pyspark package importable; it has to run before any pyspark import below.
import findspark
findspark.init()

In [3]:
import os
import numpy as np
import pandas as pd
from pyspark.sql.types import *

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Reuse an already-running session if there is one, otherwise start a new
# session registered under the application name "featureselector".
spark = (
    SparkSession.builder
    .appName("featureselector")
    .getOrCreate()
)

In [6]:
# Load the raw dataset. The first row supplies the column names and Spark
# infers each column's type from the file contents.
df = spark.read.csv(
    "newd/dl_data.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# Sanity check: show each column's name and the type chosen by inferSchema.
df.printSchema()

root
 |-- Visit_Number_Bucket: string (nullable = true)
 |-- Page_Views_Normalized: double (nullable = true)
 |-- Orders_Normalized: integer (nullable = true)
 |-- Internal_Search_Successful_Normalized: double (nullable = true)
 |-- Internal_Search_Null_Normalized: double (nullable = true)
 |-- Email_Signup_Normalized: double (nullable = true)
 |-- Total_Seconds_Spent_Normalized: double (nullable = true)
 |-- Store_Locator_Search_Normalized: double (nullable = true)
 |-- Mapped_Last_Touch_Channel: string (nullable = true)
 |-- Mapped_Mobile_Device_Type: string (nullable = true)
 |-- Mapped_Browser_Type: string (nullable = true)
 |-- Mapped_Entry_Pages: string (nullable = true)
 |-- Mapped_Site_Section: string (nullable = true)
 |-- Mapped_Promo_Code: string (nullable = true)
 |-- Maped_Product_Name: string (nullable = true)
 |-- Mapped_Search_Term: string (nullable = true)
 |-- Mapped_Product_Collection: string (nullable = true)



In [8]:
# The order indicator is the prediction target; rename it to "label", the
# column name the classifier and evaluator cells below refer to.
df = df.withColumnRenamed(existing="Orders_Normalized", new="label")

In [9]:
# Display the column names to confirm the target now appears as 'label'.
df.columns

['Visit_Number_Bucket',
 'Page_Views_Normalized',
 'label',
 'Internal_Search_Successful_Normalized',
 'Internal_Search_Null_Normalized',
 'Email_Signup_Normalized',
 'Total_Seconds_Spent_Normalized',
 'Store_Locator_Search_Normalized',
 'Mapped_Last_Touch_Channel',
 'Mapped_Mobile_Device_Type',
 'Mapped_Browser_Type',
 'Mapped_Entry_Pages',
 'Mapped_Site_Section',
 'Mapped_Promo_Code',
 'Maped_Product_Name',
 'Mapped_Search_Term',
 'Mapped_Product_Collection']

In [10]:
from pyspark.ml.feature import OneHotEncoder,VectorAssembler,StringIndexer

In [11]:
from pyspark.ml import Pipeline

In [12]:
from pyspark.sql.functions import udf, StringType
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier

### Split into Train, Validation, and Test Sets

In [13]:
# Random 70% / 20% / 10% partition of the rows; the fixed seed is passed so
# the same split is requested on every run.
train, validation, test = df.randomSplit(weights=[0.7, 0.2, 0.1], seed=1234)

### Data Preprocessing

In [15]:
# Partition the input columns by their inferred Spark dtype.
# String columns are categorical and get a StringIndexer each.
cat_columns = [name for name, dtype in df.dtypes if dtype.startswith('string')]

# Double columns are used as numeric features as-is. The target is excluded
# explicitly: it is only kept out today because it was inferred as an integer,
# and a differently-typed label must never leak into the feature vector.
num_columns = [
    name for name, dtype in df.dtypes
    if dtype.startswith('double') and name != 'label'
]

# One indexer per categorical column, writing to "<column>_index".
# (The previous template produced "<column>_index)" with a stray parenthesis.)
# handleInvalid='keep' sends categories that were not seen while fitting to an
# extra index instead of raising: the indexers are fitted on the train split
# only but are also applied to the validation and test splits.
indexers = [
    StringIndexer(
        inputCol=column,
        outputCol='{0}_index'.format(column),
        handleInvalid='keep',
    )
    for column in cat_columns
]

In [16]:
# Feature vector layout: the indexed categorical columns first, followed by
# the numeric columns. The category indices are assembled directly as numbers
# (no one-hot encoding is applied).
indexed_columns = [indexer.getOutputCol() for indexer in indexers]
featured = VectorAssembler(
    inputCols=indexed_columns + num_columns,
    outputCol="features",
)

### Model Building

In [20]:
# Layer sizes from input to output. The input width is the number of columns
# fed to the assembler, followed by hidden layers of 4 and 2 units and a
# 2-unit output layer.
# NOTE(review): a 2-unit output presumes the label only takes the values 0 and 1 - confirm.
layers = [len(featured.getInputCols())] + [4, 2, 2]


In [21]:
# Multilayer perceptron over the assembled "features" vector, trained against
# "label"; the seed is fixed so weight initialisation is repeatable.
classifier = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=123,
)

In [22]:
# Chain the stages in execution order: every string indexer, then the vector
# assembler, then the classifier. Fitting uses the training split only.
pipeline = Pipeline().setStages(indexers + [featured, classifier])
model = pipeline.fit(train)

In [23]:
# Run the fitted pipeline over each split; the three results are unpacked in
# the same train / validation / test order as the inputs.
train_output_df, validation_output_df, test_output_df = (
    model.transform(split) for split in (train, validation, test)
)

In [24]:
# Keep only the two columns the evaluation step needs from each scored split.
train_predictionAndLabels, validation_predictionAndLabels, test_predictionAndLabels = (
    scored.select('prediction', 'label')
    for scored in (train_output_df, validation_output_df, test_output_df)
)

### Model Evaluation

In [25]:
# Report every metric on every split. The evaluator is built with only the
# metric name, so it relies on its default column names lining up with the
# 'prediction' and 'label' columns selected above.
metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']
named_predictions = [
    ('Train', train_predictionAndLabels),
    ('Validation', validation_predictionAndLabels),
    ('Test', test_predictionAndLabels),
]
for metric in metrics:
    evaluator = MulticlassClassificationEvaluator(metricName=metric)
    for split_name, predictions in named_predictions:
        score = evaluator.evaluate(predictions)
        print(split_name + ' ' + metric + ' = ' + str(score))

Train weightedPrecision = 0.9730591604209184
Validation weightedPrecision = 0.9757287562606041
Test weightedPrecision = 0.974310573777863
Train weightedRecall = 0.9723427015462414
Validation weightedRecall = 0.9751811841925339
Test weightedRecall = 0.9738114423851733
Train accuracy = 0.9723427015462414
Validation accuracy = 0.9751811841925339
Test accuracy = 0.9738114423851733
