In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Author: Zhao Feng
# Date: 02 Feb 2021

Instructions

You are the growth analyst for an e-commerce site. The company CEO isn't very happy with the volume of sales and, especially, of sales coming from new users. Therefore, she asked you to investigate whether there is something wrong in the conversion funnel or, in general, if you could suggest how conversion rate can be improved.

Deliverable

A short presentation of your funnel analysis, key findings  & recommendations that you will present to your CEO of how the conversion rate can be improved.



# Explore data

First, I get an overview of the data I'm going to work with. By visualizing the data, I can get a sense of how the variables are associated.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the user table and the four page tables (listed in funnel order).

user = pd.read_csv('../input/ecommerce-website-funnel-analysis/user_table.csv')

homepage = pd.read_csv('../input/ecommerce-website-funnel-analysis/home_page_table.csv')
search = pd.read_csv('../input/ecommerce-website-funnel-analysis/search_page_table.csv')
payment = pd.read_csv('../input/ecommerce-website-funnel-analysis/payment_page_table.csv')
confirmation = pd.read_csv('../input/ecommerce-website-funnel-analysis/payment_confirmation_table.csv')

In [None]:
# Check the datasets: print a summary (columns, dtypes, non-null counts) of the homepage table

homepage.info()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the payment page table
payment.info()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the payment confirmation table
confirmation.info()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the search page table
search.info()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the user table
user.info()

# Prepare data
By inspecting the data in the five tables, I find that users drop off at every step, from the homepage to the payment page. The output of `.info()` is not easy to read, so I need to prepare the data in a more understandable form.

In [None]:
# Check if all tables have the same size.
# A notebook cell only displays its last expression, so the counts are collected
# into one Series to make every table's size visible side by side.

table_sizes = pd.Series(
    {
        'homepage': homepage['user_id'].count(),
        'search': search['user_id'].count(),
        'payment': payment['user_id'].count(),
        'confirmation': confirmation['user_id'].count(),
        'user': user['user_id'].count(),
    },
    name='user_id_count',
)
table_sizes

In [None]:
# Build 'drop_by_step': one row per funnel step with the number of user_ids
# recorded in that step's page table.

funnel_tables = {
    'Homepage': homepage,
    'Search': search,
    'Payment': payment,
    'Confirmation': confirmation,
}
drop_by_step = pd.DataFrame(
    [[step, table['user_id'].count()] for step, table in funnel_tables.items()],
    columns=['Step', 'Count'],
)
drop_by_step

In [None]:
# Visualize the overall funnel.
# reference: https://plotly.com/python/funnel-charts/

from plotly import graph_objects as go

# Take the step labels and counts from drop_by_step instead of re-typing the
# numbers, so the chart always matches the loaded data.
fig = go.Figure(go.Funnel(
    y = drop_by_step['Step'].tolist(),
    x = drop_by_step['Count'].tolist(),
    textposition = "outside",
    textinfo = "value+percent initial"))

fig.show()

In [None]:
# Give each table's 'page' column a step-specific name so the columns stay
# distinguishable after the tables are merged.

homepage = homepage.rename({'page': 'Homepage'}, axis='columns')
search = search.rename({'page': 'Search'}, axis='columns')
payment = payment.rename({'page': 'Payment'}, axis='columns')
confirmation = confirmation.rename({'page': 'Confirmation'}, axis='columns')

In [None]:
# Outer-join the user table with each page table on user_id, one step at a time.

flow = user
for page_table in (homepage, search, payment, confirmation):
    flow = flow.merge(page_table, how='outer', on='user_id')

flow.head(5)

In [None]:
# Check the merged table: the non-null count of each step column shows how many
# rows have a value recorded for that step.

flow.info()

In [None]:
# Check the number of unique user_ids.
# nunique must be *called*; without the parentheses `users` is the bound method
# itself rather than the count.

users = flow['user_id'].nunique()
users

## Discover the correlations between gender and sales, and device and sales.

In [None]:
# Convert the categorical columns into indicator (dummy) variables with get_dummies().

categorical_columns = ['sex', 'device', 'Homepage', 'Search', 'Payment', 'Confirmation']
flow2 = pd.get_dummies(flow, columns=categorical_columns, dummy_na=False)
flow2.head()

In [None]:
# Shorten the generated indicator column names to plain labels.
indicator_labels = {
    'Homepage_home_page': 'Homepage',
    'Search_search_page': 'Search',
    'Payment_payment_page': 'Payment',
    'Confirmation_payment_confirmation_page': 'Confirmation',
    'sex_Female': 'Female',
    'sex_Male': 'Male',
    'device_Desktop': 'Desktop',
    'device_Mobile': 'Mobile',
}
flow2 = flow2.rename(columns=indicator_labels)
flow2.head(5)

In [None]:
# Correlate only the numeric/indicator columns. `date` is not numeric, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
flow2.select_dtypes(include=['number', 'bool']).corr()

### Gender vs. Sales

In [None]:
# Observe the gender distribution of users. Female and male users are about even,
# so the products are equally popular with both.

gender = flow.groupby('sex', as_index=False)['user_id'].count()
gender.head()

In [None]:
# Count male and female users at each stage, then pivot so that each sex is a
# row and the page value is a column.

gender_one = flow.groupby(['sex', 'Homepage'], as_index=False)['user_id'].count()
gender_one_pivot = gender_one.pivot(
    index='sex',
    columns='Homepage',
    values='user_id',
)
gender_one_pivot

In [None]:
# Users per sex with a recorded Search step.
gender_two = flow.groupby(['sex', 'Search'], as_index=False)['user_id'].count()
gender_two_pivot = gender_two.pivot(
    index='sex',
    columns='Search',
    values='user_id',
)
gender_two_pivot

In [None]:
# Users per sex with a recorded Payment step.
gender_three = flow.groupby(['sex', 'Payment'], as_index=False)['user_id'].count()
gender_three_pivot = gender_three.pivot(
    index='sex',
    columns='Payment',
    values='user_id',
)
gender_three_pivot

In [None]:
# Users per sex with a recorded Confirmation step.
gender_four = flow.groupby(['sex', 'Confirmation'], as_index=False)['user_id'].count()
gender_four_pivot = gender_four.pivot(
    index='sex',
    columns='Confirmation',
    values='user_id',
)
gender_four_pivot

In [None]:
# Combine the per-stage pivots with .merge() into one table, 'gender_drop',
# showing the gender distribution at every stage.

gender_drop = gender_one_pivot
for stage_pivot in (gender_two_pivot, gender_three_pivot, gender_four_pivot):
    gender_drop = gender_drop.merge(stage_pivot, how='outer', on='sex')

gender_drop

In [None]:
# Use graph_objects to plot gender_drop as one funnel trace per sex.

from plotly import graph_objects as go

stages = ["Homepage", "Search", "Payment", "Confirmation"]

fig = go.Figure()

# The counts are read from gender_drop (its columns follow the funnel order in
# which the stage pivots were merged) instead of being typed in by hand.
# Both traces use the same percentage base so they can be compared directly.
for sex in ['Female', 'Male']:
    fig.add_trace(go.Funnel(
        name = sex,
        orientation = "h",
        y = stages,
        x = gender_drop.loc[sex].tolist(),
        textposition = "inside",
        textinfo = "value+percent initial"))

fig.show()

### Device vs. Sales

In [None]:
# Observe which devices users shop on. Users tend to shop on their desktops.

tool = flow.groupby('device', as_index=False)['user_id'].count()
tool.head()

In [None]:
# Repeat the same approach per device to build the pieces of device_drop.

device_one = flow.groupby(['device', 'Homepage'], as_index=False)['user_id'].count()
device_one_pivot = device_one.pivot(
    index='device',
    columns='Homepage',
    values='user_id',
)
device_one_pivot

In [None]:
# Users per device with a recorded Search step.
device_two = flow.groupby(['device', 'Search'], as_index=False)['user_id'].count()
device_two_pivot = device_two.pivot(
    index='device',
    columns='Search',
    values='user_id',
)
device_two_pivot

In [None]:
# Users per device with a recorded Payment step.
device_three = flow.groupby(['device', 'Payment'], as_index=False)['user_id'].count()
device_three_pivot = device_three.pivot(
    index='device',
    columns='Payment',
    values='user_id',
)
device_three_pivot

In [None]:
# Users per device with a recorded Confirmation step.
device_four = flow.groupby(['device', 'Confirmation'], as_index=False)['user_id'].count()
device_four_pivot = device_four.pivot(
    index='device',
    columns='Confirmation',
    values='user_id',
)
device_four_pivot

In [None]:
# Combine the per-stage device pivots into one table, 'device_drop'.
device_drop = device_one_pivot
for stage_pivot in (device_two_pivot, device_three_pivot, device_four_pivot):
    device_drop = device_drop.merge(stage_pivot, how='outer', on='device')

device_drop

In [None]:
# Plot the funnel per device, one trace per device.

from plotly import graph_objects as go

stages = ["Homepage", "Search", "Payment", "Confirmation"]

fig = go.Figure()

# The counts are read from device_drop (its columns follow the funnel order in
# which the stage pivots were merged) instead of being typed in by hand.
# Both traces use the same percentage base so they can be compared directly.
for device in ['Desktop', 'Mobile']:
    fig.add_trace(go.Funnel(
        name = device,
        orientation = "h",
        y = stages,
        x = device_drop.loc[device].tolist(),
        textposition = "inside",
        textinfo = "value+percent initial"))

fig.show()

The graph shows that mobile users are more likely to pay than desktop users.

### Discover Gender&Device vs. Sales

In [None]:
# Observe which devices are used by each gender.

flow_device = flow.groupby(['sex', 'device'], as_index=False)['user_id'].count()
flow_device_pivot = flow_device.pivot(
    index='sex',
    columns='device',
    values='user_id',
)
flow_device_pivot

In [None]:
# Extract the gender & device vs. sales counts from the table 'flow', stage by stage.

drop_one = flow.groupby(['sex', 'device', 'Homepage'], as_index=False)['user_id'].count()
drop_one

In [None]:
# Users per sex and device with a recorded Search step.
drop_two = flow.groupby(['sex', 'device', 'Search'], as_index=False)['user_id'].count()
drop_two

In [None]:
# Users per sex and device with a recorded Payment step.
drop_three = flow.groupby(['sex', 'device', 'Payment'], as_index=False)['user_id'].count()
drop_three

In [None]:
# Users per sex and device with a recorded Confirmation step.
drop_four = flow.groupby(['sex', 'device', 'Confirmation'], as_index=False)['user_id'].count()
drop_four

In [None]:
# Create a new dataframe 'device_gender_sales': one row per funnel stage and one
# column per device/sex segment.
# The counts are computed from 'flow' rather than typed in by hand, so the table
# cannot drift from the data. count() ignores missing values, so each cell is the
# number of users in the segment with a value recorded for that stage.

stage_columns = ['Homepage', 'Search', 'Payment', 'Confirmation']
segment_counts = flow.groupby(['device', 'sex'])[stage_columns].count()

# Flatten the (device, sex) index into labels such as 'Desktop Male'.
segment_counts.index = [f'{device} {sex}' for device, sex in segment_counts.index]

device_gender_sales = segment_counts.T.rename_axis('Stage').reset_index()
device_gender_sales = device_gender_sales[
    ['Stage', 'Desktop Male', 'Mobile Male', 'Desktop Female', 'Mobile Female']
]
device_gender_sales.T

In [None]:
# Plot one funnel series per device/sex segment, with the stage on the y-axis.

import plotly.express as px

segment_columns = ['Desktop Male', 'Mobile Male', 'Desktop Female', 'Mobile Female']
fig = px.funnel(device_gender_sales, x=segment_columns, y='Stage')
fig.show()

### Discover churn vs time

In [None]:
# Convert the 'date' column to datetime.
# pd.to_datetime is applied to the whole column at once; calling it per element
# through .apply() yields the same values but is far slower on a large table.

flow['date'] = pd.to_datetime(flow['date'])
flow.head()

In [None]:
# Extract the month number from each date via the .dt accessor.
flow['month'] = flow['date'].dt.month
flow

In [None]:
# Change month to its abbreviated name by using calendar and lambda.
# The name is derived from 'date' rather than from the existing 'month' column,
# so the cell can be re-run safely: once 'month' holds names, looking them up in
# calendar.month_abbr again would fail.

import calendar

flow['month'] = flow['date'].dt.month.apply(lambda x: calendar.month_abbr[x])
flow.head()

In [None]:
# Observe how churn occurs in each individual month.

month_home_dist = flow.groupby(['month', 'Homepage'], as_index=False)['user_id'].count()
month_home_pivot = month_home_dist.pivot(
    index='Homepage',
    columns='month',
    values='user_id',
)
month_home_pivot

In [None]:
# Users per month with a recorded Search step.
month_search_dist = flow.groupby(['month', 'Search'], as_index=False)['user_id'].count()
month_search_pivot = month_search_dist.pivot(
    index='Search',
    columns='month',
    values='user_id',
)
month_search_pivot

In [None]:
# Users per month with a recorded Payment step.
month_payment_dist = flow.groupby(['month', 'Payment'], as_index=False)['user_id'].count()
month_payment_pivot = month_payment_dist.pivot(
    index='Payment',
    columns='month',
    values='user_id',
)
month_payment_pivot

In [None]:
# Users per month with a recorded Confirmation step.
month_confirmation_dist = flow.groupby(['month', 'Confirmation'], as_index=False)['user_id'].count()
month_confirmation_pivot = month_confirmation_dist.pivot(
    index='Confirmation',
    columns='month',
    values='user_id',
)
month_confirmation_pivot

In [None]:
# Stack the per-stage monthly pivots to get an overall view of churn vs. time.

monthly_stage_pivots = [
    month_home_pivot,
    month_search_pivot,
    month_payment_pivot,
    month_confirmation_pivot,
]
month_drop_dist = pd.concat(monthly_stage_pivots, axis='index')
month_drop_dist.T

In [None]:
# Observe how churn occurs per device in each individual month.

month_device_home = flow.groupby(['month', 'device', 'Homepage'], as_index=False)['user_id'].count()
month_device_home

In [None]:
# Users per month and device with a recorded Search step.
month_device_search = flow.groupby(['month', 'device', 'Search'], as_index=False)['user_id'].count()
month_device_search

In [None]:
# Users per month and device with a recorded Payment step.
month_device_payment = flow.groupby(['month', 'device', 'Payment'], as_index=False)['user_id'].count()
month_device_payment

In [None]:
# Users per month and device with a recorded Confirmation step.
month_device_confirmation = flow.groupby(['month', 'device', 'Confirmation'], as_index=False)['user_id'].count()
month_device_confirmation

I had difficulty plotting the datasets in an ideal way, so I copied them into Google Sheets and let the app generate the graphs.

My biggest obstacle was managing the data types, and the learning curve for plotly is quite steep because the documentation is hard to read.
