In [1]:
# Setting the environment variables

In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"]="notebook --no-browser"
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# Ecommerce Churn Assignment

The aim of the assignment is to build a model that predicts whether a person purchases an item after it has been added to the cart or not. Being a classification problem, you are expected to use your understanding of all the three models covered till now. You must select the most robust model and provide a solution that predicts the churn in the most suitable manner. 

For this assignment, you are provided the data associated with an e-commerce company for the month of October 2019. Your task is to first analyse the data, and then perform multiple steps towards the model building process.

The broad tasks are:
- Data Exploration
- Feature Engineering
- Model Selection
- Model Inference

### Data description

The dataset stores the information of a customer session on the e-commerce platform. It records the activity and the associated parameters with it.

- **event_time**: Date and time when user accesses the platform
- **event_type**: Action performed by the customer
            - View
            - Cart
            - Purchase
            - Remove from cart
- **product_id**: Unique number to identify the product in the event
- **category_id**: Unique number to identify the category of the product
- **category_code**: Stores primary and secondary categories of the product
- **brand**: Brand associated with the product
- **price**: Price of the product
- **user_id**: Unique ID for a customer
- **user_session**: Session ID for a user


### Initialising the SparkSession

The dataset provided is 5 GBs in size. Therefore, it is expected that you increase the driver memory to a greater number. You can refer to notebook 1 for the steps involved here.

In [3]:
# Spark environment
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [5]:
# initialising the session with 14 GB driver memory
MAX_MEMORY = "14G"

spark = SparkSession \
    .builder \
    .appName("demo") \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

spark

In [6]:
spark.sparkContext.getConf().get('spark.driver.memory')

'14G'

In [7]:
# Loading the clean data

df = spark.read.parquet('LR_Transform_Output.parquet')

## Task 3: Model Selection
3 models for classification:	
- Logistic Regression
- Decision Tree
- Random Forest

### Model 3: Random Forest

In [12]:
# Additional steps for Decision Trees, if any

from pyspark.ml.classification import RandomForestClassifier

In [13]:
rf = RandomForestClassifier(featuresCol='features',labelCol='label')

#### Feature Transformation (Code will be same; check for the columns)

In [None]:
# Check if only the required columns are present to build the model
# If not, drop the redundant columns


In [None]:
# Categorising the attributes into its type - Continuous and Categorical


In [None]:
# Feature transformation for categorical features


In [None]:
# Vector assembler to combine all the features


In [None]:
# Pipeline for the tasks


In [None]:
# Transforming the dataframe df


In [None]:
# Schema of the transformed df


In [None]:
# Checking the elements of the transformed df - Top 20 rows


In [None]:
# Storing the transformed df in S3 bucket to prevent repetition of steps again


#### Train-test split

In [9]:
# Splitting the data into train and test (Remember you are expected to compare the model later)

traindata, testdata = df.randomSplit([0.7,0.3], seed=100)

In [10]:
# Number of rows in train and test data

traindata.count()

548387

In [11]:
testdata.count()

235974

#### Model Fitting

In [14]:
# Building the model with hyperparameter tuning
# Create ParamGrid for Cross Validation

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import numpy as np

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [int(x) for x in np.linspace(start = 10, stop = 50, num = 3)]) \
    .addGrid(rf.maxDepth, [int(x) for x in np.linspace(start = 10, stop = 20, num = 3)]) \
    .build()

In [15]:
# Run cross-validation steps
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="label")

rfcv = CrossValidator(estimator = rf,
                      estimatorParamMaps = paramGrid,
                      evaluator = evaluator,
                      numFolds = 3)

In [16]:
# Fitting the models on transformed df

Model = rfcv.fit(traindata)

In [17]:
# Best model from the results of cross-validation

Model.bestModel.explainParam('maxDepth')

'maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5, current: 20)'

In [18]:
Model.bestModel.explainParam('numTrees')

'numTrees: Number of trees to train (>= 1) (default: 20, current: 50)'

#### Model Analysis

Required Steps:
- Fit on test data
- Performance analysis
    - Appropriate Metric with reasoning

In [19]:
predictions_train = Model.transform(traindata)
predictions_test = Model.transform(testdata)

In [20]:
print('Area under ROC for training set:', evaluator.evaluate(predictions_train))
print('Area under ROC for test set:', evaluator.evaluate(predictions_test))

Area under ROC for training set: 0.8017536776104406
Area under ROC for test set: 0.7749477617683679


In [44]:
from pyspark.sql.types import FloatType
from pyspark.mllib.evaluation import MulticlassMetrics
import pyspark.sql.functions as F

preds_and_labels=predictions_test.select(['prediction','label']).withColumn('label',F.col('label').\
                                                                            cast(FloatType())).orderBy('prediction')

preds_and_labels=preds_and_labels.select(['prediction','label'])
metrics=MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

[[ 33719.  56060.]
 [ 11196. 134999.]]


In [22]:
print("Accuracy = ",(56060+134999)/(56060+33719+11196+134999))

Accuracy =  0.8096612338647479


In [23]:
print("Precision = ",(134999)/(134999+33719))

Precision =  0.8001458054268069


In [25]:
print("Recall = ",(134999)/(134999+11196))

Recall =  0.9234173535346626


In [26]:
print("F-Score = ",\
      2 * ((134999)/(134999+33719)) * ((134999)/(134999+33719)) / ((134999)/(134999+33719) + (134999)/(134999+33719)))

F-Score =  0.8001458054268069


#### Summary of the best Random Forest model

## Task 4: Model Inference

- Feature Importance
- Model Inference
- Feature exploration

In [27]:
Model.bestModel

RandomForestClassificationModel (uid=RandomForestClassifier_9258fcea336e) with 50 trees

In [36]:
Model.bestModel.featureImportances

SparseVector(71, {0: 0.011, 1: 0.0068, 2: 0.0074, 3: 0.0109, 4: 0.0024, 5: 0.0016, 6: 0.0016, 7: 0.0008, 8: 0.0012, 9: 0.0007, 10: 0.0013, 11: 0.0007, 12: 0.0012, 13: 0.0008, 14: 0.0013, 15: 0.0009, 16: 0.0009, 17: 0.0007, 18: 0.0003, 19: 0.0, 20: 0.019, 21: 0.0058, 22: 0.0039, 23: 0.0027, 24: 0.0027, 25: 0.0022, 26: 0.0015, 27: 0.002, 28: 0.0012, 29: 0.0016, 30: 0.0019, 31: 0.001, 32: 0.0012, 33: 0.0014, 34: 0.0008, 35: 0.0005, 36: 0.001, 37: 0.0005, 38: 0.0004, 39: 0.0007, 40: 0.0004, 41: 0.0004, 42: 0.0002, 43: 0.0003, 44: 0.0003, 45: 0.0002, 46: 0.0002, 47: 0.0002, 48: 0.0001, 49: 0.0001, 50: 0.0001, 51: 0.0002, 52: 0.0, 53: 0.0, 54: 0.0, 55: 0.0, 56: 0.0, 57: 0.0, 58: 0.0047, 59: 0.0045, 60: 0.0048, 61: 0.005, 62: 0.0047, 63: 0.0045, 64: 0.0043, 65: 0.0396, 66: 0.4931, 67: 0.1157, 68: 0.0389, 69: 0.0589, 70: 0.114})

In [41]:
import pandas as pd

def ExtractFeatures(features,dataset,Col):
    list_temp=[]
    
    for i in dataset.schema[Col].metadata["ml_attr"]["attrs"]:
        list_temp=list_temp+dataset.schema[Col].metadata["ml_attr"]["attrs"][i]
        
    finalList=pd.DataFrame(list_temp)
    finalList['score']=finalList['idx'].apply(lambda x: features[x])
    
    return(finalList.sort_values('score',ascending=False))

In [42]:
ExtractFeatures(Model.bestModel.featureImportances,predictions_test,"features").head(10)

Unnamed: 0,idx,name,score
1,66,user_product_count,0.493058
2,67,sub_categ_user_count,0.115676
5,70,user_session_activity,0.114018
4,69,user_sess_count,0.058881
0,65,price,0.039573
3,68,prod_avg_spend,0.038888
26,20,sub_categ_enc_smartphone,0.018989
6,0,brand_enc_samsung,0.010989
9,3,brand_enc_xiaomi,0.010917
8,2,brand_enc_others,0.007435


In [43]:
ExtractFeatures(Model.bestModel.featureImportances,predictions_train,"features").head(10)

Unnamed: 0,idx,name,score
1,66,user_product_count,0.493058
2,67,sub_categ_user_count,0.115676
5,70,user_session_activity,0.114018
4,69,user_sess_count,0.058881
0,65,price,0.039573
3,68,prod_avg_spend,0.038888
26,20,sub_categ_enc_smartphone,0.018989
6,0,brand_enc_samsung,0.010989
9,3,brand_enc_xiaomi,0.010917
8,2,brand_enc_others,0.007435
