In [1]:
# Setting the environment variables

In [2]:
import os
import sys
# Point PySpark at the Python interpreter, JVM and Spark installation to use.
os.environ.update({
    "PYSPARK_PYTHON": "/usr/bin/python3",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    "PYSPARK_DRIVER_PYTHON_OPTS": "notebook --no-browser",
    "JAVA_HOME": "/usr/java/jdk1.8.0_161/jre",
    "SPARK_HOME": "/home/ec2-user/spark-2.4.4-bin-hadoop2.7",
})
# PYLIB is derived from SPARK_HOME, so it is set after the update above.
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Prepend Spark's bundled Python archives to the import path:
# pyspark.zip ends up first, immediately followed by the py4j archive.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

# Ecommerce Churn Assignment

The aim of the assignment is to build a model that predicts whether a person purchases an item after it has been added to the cart. Since this is a classification problem, you are expected to apply all three models covered so far, select the most robust one, and provide a solution that predicts churn in the most suitable manner.

For this assignment, you are provided the data associated with an e-commerce company for the month of October 2019. Your task is to first analyse the data, and then perform multiple steps towards the model building process.

The broad tasks are:
- Data Exploration
- Feature Engineering
- Model Selection
- Model Inference

### Data description

The dataset stores the information of a customer session on the e-commerce platform. It records the activity and the associated parameters with it.

- **event_time**: Date and time when user accesses the platform
- **event_type**: Action performed by the customer
            - View
            - Cart
            - Purchase
            - Remove from cart
- **product_id**: Unique number to identify the product in the event
- **category_id**: Unique number to identify the category of the product
- **category_code**: Stores primary and secondary categories of the product
- **brand**: Brand associated with the product
- **price**: Price of the product
- **user_id**: Unique ID for a customer
- **user_session**: Session ID for a user


### Initialising the SparkSession

The dataset provided is 5 GB in size, so you are expected to increase the driver memory beyond its default value. You can refer to notebook 1 for the steps involved here.

In [3]:
# Spark environment
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session, requesting 14 GB of driver memory
MAX_MEMORY = "14G"

spark = (
    SparkSession.builder
    .appName("demo")
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

# Display the session summary as the cell output
spark

In [5]:
# Read spark.driver.memory back from the active SparkContext's configuration
# to confirm the value requested when the session was built.
spark.sparkContext.getConf().get('spark.driver.memory')

'14G'

In [6]:
# Loading the clean data: read the Parquet dataset 'LR_Transform_Output.parquet'
# into the DataFrame `df` that the rest of the notebook works on.
# NOTE(review): presumably written by the logistic-regression transformation
# notebook — confirm the file is present before running.

df = spark.read.parquet('LR_Transform_Output.parquet')

In [7]:
# Show the first row to check the loaded columns, including `label` and the
# assembled `features` vector.
df.head()

Row(brand='samsung', price=283.62, event_day=3, sub_categ='smartphone', user_session_activity=2, user_product_count=2, sub_categ_user_count=3, prod_avg_spend=270.4633333333333, user_sess_count=2, label=0, brand_ind=0.0, brand_enc=SparseVector(20, {0: 1.0}), sub_categ_ind=0.0, sub_categ_enc=SparseVector(38, {0: 1.0}), event_day_ind=1.0, event_day_enc=SparseVector(7, {1: 1.0}), features=SparseVector(71, {0: 1.0, 20: 1.0, 59: 1.0, 65: 283.62, 66: 2.0, 67: 3.0, 68: 270.4633, 69: 2.0, 70: 2.0}))

<hr>

## Task 3: Model Selection
3 models for classification:	
- Logistic Regression
- Decision Tree
- Random Forest

### Model 2: Decision Trees

In [8]:
# Additional steps for Decision Trees, if any
from pyspark.ml.classification import DecisionTreeClassifier

In [9]:
# Decision-tree estimator that reads its inputs from the `features` column and
# its target from the `label` column of the loaded DataFrame.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

#### Feature Transformation (Code will be same; check for the columns)

In [10]:
# Check if only the required columns are present to build the model
# If not, drop the redundant columns


In [11]:
# Categorising the attributes into its type - Continuous and Categorical


In [12]:
# Feature transformation for categorical features


In [13]:
# Vector assembler to combine all the features


In [14]:
# Pipeline for the tasks


In [15]:
# Transforming the dataframe df


In [16]:
# Schema of the transformed df


In [17]:
# Checking the elements of the transformed df - Top 20 rows


In [18]:
# Storing the transformed df in S3 bucket to prevent repetition of steps again


#### Train-test split

In [19]:
# Splitting the data into train and test (remember you are expected to compare the models later).
# randomSplit is given weights 0.7 / 0.3 and a fixed seed (100).

traindata, testdata = df.randomSplit([0.7,0.3], seed=100)

In [20]:
# Number of rows in the training split

traindata.count()

548387

In [21]:
# Number of rows in the test split
testdata.count()

235974

#### Model Fitting

In [22]:
# Building the model with hyperparameter tuning
# Create ParamGrid for Cross Validation

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import numpy as np

# Candidate hyperparameters for the grid search: 3 evenly spaced tree depths
# between 10 and 20, and 4 evenly spaced bin counts between 20 and 70, each
# truncated to an integer.
depth_candidates = list(map(int, np.linspace(10, 20, 3)))
bin_candidates = list(map(int, np.linspace(20, 70, 4)))

dtparamGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, depth_candidates)
    .addGrid(dt.maxBins, bin_candidates)
    .build()
)

In [23]:
# Run cross-validation steps

# Score every candidate by area under the ROC curve. The metric is stated
# explicitly so the selection criterion is visible in the notebook.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")

# 3-fold cross-validation over the parameter grid. Rows are assigned to folds
# at random, so a fixed seed (the same value used for the train/test split)
# is needed for reruns to evaluate the candidates on the same folds.
dtcv = CrossValidator(estimator = dt,
                      estimatorParamMaps = dtparamGrid,
                      evaluator = evaluator,
                      numFolds = 3,
                      seed = 100)

In [24]:
# Fitting the cross-validator on the training split (traindata), not on the
# full DataFrame; the test split is kept aside for evaluation.

dtcvModel = dtcv.fit(traindata)

In [25]:
# Best model from the results of cross-validation:
# show the maxBins value it was trained with (reported as "current").

dtcvModel.bestModel.explainParam('maxBins')

'maxBins: Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature. (default: 32, current: 70)'

In [26]:
# Show the maxDepth value of the best model (reported as "current").
dtcvModel.bestModel.explainParam('maxDepth')

'maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 20)'

#### Model Analysis

Required Steps:
- Predict on test data (using the model fitted on the training data)
- Performance analysis
    - Appropriate Metric with reasoning

In [27]:
# Apply the cross-validated model to both splits so that training and test
# performance can be compared.
predictions_train = dtcvModel.transform(traindata)
predictions_test = dtcvModel.transform(testdata)

In [28]:
# Evaluate both prediction sets with the evaluator used during
# cross-validation, then report the two scores.
auc_train = evaluator.evaluate(predictions_train)
auc_test = evaluator.evaluate(predictions_test)

print('Area under ROC for training set:', auc_train)
print('Area under ROC for test set:', auc_test)

Area under ROC for training set: 0.7220827759649531
Area under ROC for test set: 0.6875748587259931


In [29]:
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics
import pyspark.sql.functions as F

# Build (prediction, label) pairs for MulticlassMetrics, with both columns cast
# to double. No ordering is applied: the confusion matrix only counts pairs,
# so sorting the test predictions first would add a full shuffle for nothing.
preds_and_labels = predictions_test.select(
    F.col('prediction').cast('double'),
    F.col('label').cast('double'),
)

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Confusion matrix: rows are actual labels, columns are predicted labels,
# both in ascending label order.
print(metrics.confusionMatrix().toArray())

[[ 50287.  39492.]
 [ 21528. 124667.]]


In [30]:
# Accuracy taken from the computed metrics instead of counts copied by hand
# from an earlier output, so it stays correct when the notebook is rerun.
print("Accuracy = ", metrics.accuracy)

Accuracy =  0.7414121894785018


In [31]:
# Precision for the positive class (label 1.0), taken from the computed
# metrics instead of hand-copied confusion-matrix counts.
print("Precision = ", metrics.precision(1.0))

Precision =  0.7594283590908814


In [32]:
# Recall for the positive class (label 1.0), taken from the computed metrics
# instead of hand-copied confusion-matrix counts.
print("Recall = ", metrics.recall(1.0))

Recall =  0.8527446219090941


In [33]:
# F1 score (beta = 1) for the positive class (label 1.0), taken from the
# computed metrics instead of hand-copied confusion-matrix counts.
print("F-Score = ", metrics.fMeasure(1.0))

F-Score =  0.8033858110415847


#### Summary of the best Decision Tree model