In [1]:
# Setting the environment variables
import os
import sys
import pandas as pd
import numpy as np
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"]="/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"]="notebook --no-browser"
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# Ecommerce Churn Assignment

The aim of the assignment is to build a model that predicts whether a person purchases an item after it has been added to the cart or not. Being a classification problem, you are expected to use your understanding of all the three models covered till now. You must select the most robust model and provide a solution that predicts the churn in the most suitable manner. 

For this assignment, you are provided the data associated with an e-commerce company for the month of October 2019. Your task is to first analyse the data, and then perform multiple steps towards the model building process.

The broad tasks are:
- Data Exploration
- Feature Engineering
- Model Selection
- Model Inference

### Data description

The dataset stores the information of a customer session on the e-commerce platform. It records the activity and the associated parameters with it.

- **event_time**: Date and time when user accesses the platform
- **event_type**: Action performed by the customer
            - View
            - Cart
            - Purchase
            - Remove from cart
- **product_id**: Unique number to identify the product in the event
- **category_id**: Unique number to identify the category of the product
- **category_code**: Stores primary and secondary categories of the product
- **brand**: Brand associated with the product
- **price**: Price of the product
- **user_id**: Unique ID for a customer
- **user_session**: Session ID for a user


### Initialising the SparkSession

The dataset provided is 5 GBs in size. Therefore, it is expected that you increase the driver memory to a greater number. You can refer to notebook 1 for the steps involved here.

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct ,col, avg, mean, isnan, when, count, col, to_timestamp
from pyspark.sql.types import IntegerType
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
from datetime import timedelta
import pandas as pd
import numpy as np
import pyspark.sql.functions as f

In [3]:
# initialising the session with 14 GB driver memory
MAX_MEMORY = "14G"

spark = SparkSession \
    .builder \
    .appName("demo") \
    .config("spark.driver.memory", MAX_MEMORY) \
    .getOrCreate()

spark

In [4]:
# Loading the clean data
df_clean = spark.read.csv('/home/ec2-user/df_updated2', header= False, inferSchema= True)

In [5]:
df_clean.show(5)

+----+---+---+-----------+-----------+---+---+---+----+---+----+------+----+----+
| _c0|_c1|_c2|        _c3|        _c4|_c5|_c6|_c7| _c8|_c9|_c10|  _c11|_c12|_c13|
+----+---+---+-----------+-----------+---+---+---+----+---+----+------+----+----+
|view|540|  6|electronics| smartphone| 35|  1| 35|null| 35| 3.0|huawei|   0|   0|
|view|114|  5| appliances|environment|  1|  1|  1|null|  4| 0.0|others|   0|   0|
|view| 39|  3|electronics|     clocks|  2|  2|  2|null|  2| 3.0| casio|   0|   0|
|view|167|  6|electronics|      audio|  2|  2|  2|null| 14| 2.0|others|   0|   0|
|view|161|  6|electronics|      audio| 12|  1|  1|null|121| 2.0| apple|   0|   0|
+----+---+---+-----------+-----------+---+---+---+----+---+----+------+----+----+
only showing top 5 rows



In [6]:
#renaming the column names
df_clean = df_clean.select(col("_c0").alias("event_type"), col("_c1").alias("price"), col("_c2").alias("day_of_week"), col("_c3").alias("category1"), col("_c4").alias("category2"), col("_c5").alias("activity_countval"), col("_c6").alias("product_view_counts"), col("_c7").alias("category2_view_counts"), col("_c8").alias("average_shopping_expense"), col("_c9").alias("session_counts"), col("_c10").alias("binnedhour"), col("_c11").alias("brand_new"), col("_c12").alias("is_purchased"), col("_c13").alias("label"))

In [7]:
df_clean.show(5)

+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+
|event_type|price|day_of_week|  category1|  category2|activity_countval|product_view_counts|category2_view_counts|average_shopping_expense|session_counts|binnedhour|brand_new|is_purchased|label|
+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+
|      view|  540|          6|electronics| smartphone|               35|                  1|                   35|                    null|            35|       3.0|   huawei|           0|    0|
|      view|  114|          5| appliances|environment|                1|                  1|                    1|                    null|             4|       0.0|   others|           0|    0|
|      view|   39|       

In [8]:
df_clean.count()

15874983

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

In [10]:
#import required libraries
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from pyspark.mllib.util import MLUtils

In [11]:
df_clean.show(5)

+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+
|event_type|price|day_of_week|  category1|  category2|activity_countval|product_view_counts|category2_view_counts|average_shopping_expense|session_counts|binnedhour|brand_new|is_purchased|label|
+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+
|      view|  540|          6|electronics| smartphone|               35|                  1|                   35|                    null|            35|       3.0|   huawei|           0|    0|
|      view|  114|          5| appliances|environment|                1|                  1|                    1|                    null|             4|       0.0|   others|           0|    0|
|      view|   39|       

## Task 3: Model Selection
3 models for classification:	
- Logistic Regression
- Decision Tree
- Random Forest

### Model 3: Random Forest

In [12]:
# Additional steps for Decision Trees, if any


In [13]:
df_clean=df_clean.withColumn("is_purchased",df_clean["is_purchased"].cast(IntegerType()))

In [14]:
df_clean=df_clean.drop('user_session')

In [15]:
df_clean.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- category1: string (nullable = true)
 |-- category2: string (nullable = true)
 |-- activity_countval: integer (nullable = true)
 |-- product_view_counts: integer (nullable = true)
 |-- category2_view_counts: integer (nullable = true)
 |-- average_shopping_expense: double (nullable = true)
 |-- session_counts: integer (nullable = true)
 |-- binnedhour: double (nullable = true)
 |-- brand_new: string (nullable = true)
 |-- is_purchased: integer (nullable = true)
 |-- label: integer (nullable = true)



#### Feature Transformation (Code will be same; check for the columns)

In [16]:
# Check if only the required columns are present to build the model
# If not, drop the redundant columns
df_clean.count()

15874983

In [17]:
# Categorising the attributes into its type - Continuous and Categorical
all_categorical_features = ['day_of_week', 'category1', 'category2', 'binnedhour', 'brand_new']
all_continuous_features = ['price', 'activity_countval', 'product_view_counts', 'category2_view_counts', 'session_counts', 'average_shopping_expense']

In [18]:
# Importing the libraries for data transormation
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [19]:
stages=[]

In [20]:
# Building a function for encoding all the categorical variables
for categoricalCol in all_categorical_features:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index').setHandleInvalid("keep")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

In [21]:
# Vector assembler to combine all the features
assemblerInputs = [c + "classVec" for c in all_categorical_features] + all_continuous_features
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features").setHandleInvalid("keep")
stages += [assembler]

In [22]:
# Pipeline for the tasks
# Loading all the steps in a pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)

In [23]:
# Transforming the dataframe df
# Fitting the steps on the dataFrame
pipelineModel = pipeline.fit(df_clean)
df_clean = pipelineModel.transform(df_clean)

In [24]:
# Schema of the transformed df
df_clean.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- category1: string (nullable = true)
 |-- category2: string (nullable = true)
 |-- activity_countval: integer (nullable = true)
 |-- product_view_counts: integer (nullable = true)
 |-- category2_view_counts: integer (nullable = true)
 |-- average_shopping_expense: double (nullable = true)
 |-- session_counts: integer (nullable = true)
 |-- binnedhour: double (nullable = true)
 |-- brand_new: string (nullable = true)
 |-- is_purchased: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- day_of_weekIndex: double (nullable = false)
 |-- day_of_weekclassVec: vector (nullable = true)
 |-- category1Index: double (nullable = false)
 |-- category1classVec: vector (nullable = true)
 |-- category2Index: double (nullable = false)
 |-- category2classVec: vector (nullable = true)
 |-- binnedhourIndex: double (nullable = false)
 |-- binnedhourclassVec:

In [25]:
# Checking the elements of the transformed df - Top 20 rows
df_clean.show(20)

+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+----------------+-------------------+--------------+-----------------+--------------+-----------------+---------------+------------------+--------------+-----------------+--------------------+
|event_type|price|day_of_week|  category1|  category2|activity_countval|product_view_counts|category2_view_counts|average_shopping_expense|session_counts|binnedhour|brand_new|is_purchased|label|day_of_weekIndex|day_of_weekclassVec|category1Index|category1classVec|category2Index|category2classVec|binnedhourIndex|binnedhourclassVec|brand_newIndex|brand_newclassVec|            features|
+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+----------------+---------------

In [26]:
df_clean.count()

15874983

In [27]:
# Checking the elements of the transformed df - Top 20 rows
df_clean.show(20)

+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+----------------+-------------------+--------------+-----------------+--------------+-----------------+---------------+------------------+--------------+-----------------+--------------------+
|event_type|price|day_of_week|  category1|  category2|activity_countval|product_view_counts|category2_view_counts|average_shopping_expense|session_counts|binnedhour|brand_new|is_purchased|label|day_of_weekIndex|day_of_weekclassVec|category1Index|category1classVec|category2Index|category2classVec|binnedhourIndex|binnedhourclassVec|brand_newIndex|brand_newclassVec|            features|
+----------+-----+-----------+-----------+-----------+-----------------+-------------------+---------------------+------------------------+--------------+----------+---------+------------+-----+----------------+---------------

#### Train-test split

In [28]:
# Splitting the data into train and test (Remember you are expected to compare the model later)
traindata, testdata = df_clean.randomSplit([0.7,0.3], seed=100)

In [29]:
# Number of rows in train and test data
traindata.count()

11113821

In [30]:
testdata.count()

4761162

#### Model Fitting

In [31]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [32]:
# Building the model with hyperparameter tuning
# Create ParamGrid for Cross Validation
rf = RandomForestClassifier("label", "features")

In [33]:
# Run cross-validation steps


In [35]:
# Fitting the models on transformed df
rf_model=rf.fit(traindata)

TypeError: fit() missing 1 required positional argument: 'y'

In [36]:
def get_dt_graph(dt):
    dot_data = StringIO()
    export_graphviz(dt_classifier, out_file=dot_data, filled=True,rounded=True,
                    feature_names=X.columns, 
                    class_names=['is_purchased', "No_purchase"])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph 

In [37]:
# Best model from the results of cross-validation
rf_estimators_

rf_estimators_[0]
gph = get_dt_graph(sample_tree)
Image[gph.create_png()]

NameError: name 'rf_estimators_' is not defined

In [38]:
#OOB Score
rf.oob_score_

AttributeError: 'RandomForestClassifier' object has no attribute 'oob_score_'

In [39]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs= -1)

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
# Create the parameter grid based on results of random search
params = {
    'max_depth': [1, 2, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'max_features': [2,3,4],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [42]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")

In [43]:
%%time
grid_search.fit(X,y)

NameError: name 'X' is not defined

In [None]:
rf_best_est = grid_search.best_estimator_

In [None]:
rf_best_est

In [None]:
rf_best_est.fit(X_train, y_train)

In [None]:
sample_tree = rf_best_est.estimators_[0]

In [None]:
evaluate_model(rf_best_est)

In [None]:
graphh = get_dt_graph(sample_tree)
Image(graphh.create_png())

In [None]:
graphh = get_dt_graph(rf_best_est.estimators_[0])
Image(graphh.create_png(), height=600, width=600)

In [None]:
graphh = get_dt_graph(rf_best.estimators_[10])    
Image(graphh.create_png(), height=600, width=600)

#### Model Analysis

Required Steps:
- Fit on test data
- Performance analysis
    - Appropriate Metric with reasoning

#### Summary of the best Random Forest model