In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Business Data Science - Online shoppers intentions

## 0. Before we start (to be modified)
> We will introduce the case, our goal and our research methodology in this part

### Case Introduction (to be modified by yue)

.....

### Case Objective (to be modified by sunidhi)
- #### Data Visualization Part
to see what influences people's purchasing intention
- #### Machine Learning Part
to see how to predict/improve future visitors' purchasing intention based on bounce rates etc.

>    Assumptions for ML part
>    - Persona of the "premium traffic" (do a segmentation to all visitors)
> 
>    Who are the high-quality customers (the returning customers who buy a lot)? And who are the potential high-quality customers (the new customers with similar characteristics to current high-quality customers)?
>    
>    - Instant suggesting mechanism
>
>    find the boundary of exit/bounce rate of purchase/exit after training the data, then create a mechanism to track and respond to future customers' web activities (e.g. suggest more attractive webpages for them if they are about to leave)

## 1. Dataset Information


### Data Source

xxxxx

### Attributes
> Among all 18 attributes, attributes 1-9 describe webpage-related info (page type, page duration, likelihood of stay/exit); attributes 10-18 describe the conditions under which the consumer placed the order or not.

1.	Administrative:	Number of pages visited by the user for user account management related activities.
2.	Administrative Duration:	Time spent on Admin pages by the user.
3.	Informational:	Number of pages visited by the user about the website
4.	Informational Duration:	Time spent on Informational pages by the user
5.	Product Related:	Number of product related pages visited by the user
6.	Product Related Duration:	Time spent on Product related pages by the user.
7.	Bounce Rates:	Average bounce rate of the pages visited by the user. A page's bounce rate is the percentage of visitors who enter the site on that page and then leave without visiting any other page of the site.
8.	Exit Rates:	Average exit rate of the pages visited by the user. It is the percentage of people who left your site from that page.
9.	Page Values:	Average page value of the pages visited by the user. It is the average value for a page that a user visited before landing on the goal page or completing an Ecommerce transaction (or both). This value is intended to give you an idea of which page in your site contributed more to your site's revenue.
10.	Special Day:	Closeness of the visiting day to a special event like Mother's Day or festivals like Christmas.
11.	Month:	Visiting month during the whole year.
12.	Operating Systems:	Operating Systems of the visitor(no precise info, thus represented numerically).
13.	Browser:	Browser of the visitor (no precise info, thus represented numerically).
14.	Region:	Geographic region from which the session has been started by the visitor (no precise info, thus represented numerically).
15.	Traffic Type:	Traffic source through which user has entered the website (no precise info, thus represented numerically).
16.	Visitor Type:	Defines the nature of the user (3 types; the 'other' type includes, but is not limited to, people whose IP is not recognized).
17.	Weekend:	Whether the visit took place on a weekend.
18.	Revenue:	Whether an order was placed or not; it refers to the website's revenue rather than the customer's.

> ***Note 1:***
>
> As "Page Values" follows specific calculating methods in Google Analytics, we put it here for you to understand this feature better:
> 
> 1. Function: 
>
>    Page Value = (Ecommerce Revenue + Total Goal Value)/ Number of Unique Pageviews for Given Page
> 
> 2. Example: 
>
>   1) Supposing your purchase path on this website go through: Page A, B, C, B, D (Goal Page), E (Receipt Page). What we want to calculate is the page value of page B, which you have visited 2 times (but only considered as 1 unique page view).
>
>   2) Ecommerce Revenue: How much a consumer pays -> we assume 100 here (value from ecommerce transaction on Page E)
> 
>   3) Total Goal Value: Goal value is assigned to Goal Page D by the website at the beginning, we assume 10 here; and total goal value is goal value * unique pageviews of Page B, which is 1 * 10 = 10 here
>   
>  4) Page Value of Page B (each click) = (100+ 1 * 10) / 1 = 110
>
> 3. Application in this case
> 
> We don't have page value for an exact page, instead, we only have average page value of all visited pages by customer.
>
> ***Note 2:***
>
> To avoid misunderstanding, the bounce rate and exit rate don't mean 1 customer's bounce/exit rate. Each page's bounce/exit rate is calculated based on all historical visits of past visitors; and the bounce/exit rate we see here in each line refers to the average rate of all pages 1 customer visited.
>
> ***Note 3:***
>
> Regarding "Month", we don't have data from January and April, but we assume it's a complete dataset for 1 whole year sales.

## 2. Data Cleaning

### Importing Libraries

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Importing data

In [None]:
# Read the online-shoppers CSV from the Kaggle input directory into `df`,
# then echo the frame so the notebook shows a preview of what was loaded.
df = pd.read_csv(
    filepath_or_buffer="../input/online-shoppers-intention/online_shoppers_intention.csv"
)
df

### Check data

In [None]:
# Preview the first 10 rows of the loaded frame
df.head(n=10)

In [None]:
#check the types
# Show the dtype pandas inferred for every column of df.
df.dtypes

In [None]:
#check the nulls
# Count missing values per column before cleaning, so the check is actually
# visible in the cell output.
null_counts = df.isnull().sum()

# DataFrame.dropna() returns a new frame and leaves the original untouched;
# the previous bare call discarded that result, so no row was ever removed.
# Assign it back so the following cells work on complete rows only.
df = df.dropna()

null_counts

In [None]:
#change attribute types to simplify later analysis
# DataFrame.replace() returns a new frame; the previous bare calls threw the
# result away (the first one was not even displayed), so df was never encoded.
# The mappings are scoped to their own column so that no other column can be
# touched by accident, and the result is assigned back to df.

#transfer Visitor Type "returning""new""other" to 1,2,3
visitor_type_codes = {'Returning_Visitor': 1, 'New_Visitor': 2, 'Other': 3}

#transfer Month abb. to number 1-12
month_numbers = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'June': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# Values that are not keys of the mappings are left as they are, so re-running
# this cell on an already encoded frame is a no-op.
df = df.replace({'VisitorType': visitor_type_codes, 'Month': month_numbers})
df

## 3. Data Visualization

### Analysis Structure

> We aim to improve merchandising volume and efficiency of our website.

#### _**Correlation Matrix**_

- We could have a general view on relation of all attributes.

#### _**PART 1 : To improve current merchandising volume**_

- With main attribute 'Revenue', we measure general sales performance by 6 dimensions (month, special day, weekend, region, visitor type, operating systems) to see in what condition orders are more likely to be placed; in other words, we are looking for next revenue growth point.


#### _**PART 2 : To improve merchandising efficiency (from marketing side and operational side)**_

- Marketing side: to improve traffic volume

    With main attribute "TrafficType", we measure our marketing performance by 6 dimensions (revenue, month, special day, weekend, region, visitor type) to see which is our main traffic source and how we could target and attract more potential customers with limited marketing cost.

- Operational side: to improve page conversion rate

    With main attribute "PageValues", we measure the page conversion rate by 3 dimensions (revenue, exit rate, bounce rate) to see if we have set proper page value of goal pages and guide people to more effective and efficient purchase paths.


### Correlation Matrix

***Conclusion:***
- Transaction result ("Revenue") is highly related to page values; it also shows a positive relation with page types and page durations, and a negative relation with bounce rates and exit rates, thus in the Machine Learning part we will see how to segment customers and predict their purchasing behavior based on these attributes.
- The relation between the other attributes and transaction result ("Revenue") is not apparent, and needs to be analyzed by different dimensions later on.

In [None]:
#correlation matrix
# Correlation is only defined for numeric data. Recent pandas versions raise
# when DataFrame.corr() meets string columns instead of silently skipping
# them, so restrict the frame to numeric and boolean columns explicitly
# (this form also works on older pandas versions).
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(20,10))
heatmap = sns.heatmap(numeric_df.corr(),cmap=sns.diverging_palette(20, 220, n=200),center = 0,annot=True)
# Tilt the column labels slightly so long attribute names do not overlap.
heatmap.set_xticklabels(heatmap.get_xticklabels(),rotation=15,horizontalalignment='right');

### PART 1 : To improve current merchandising volume
***Conclusion:***

- Generally speaking, high transaction volume took place:

    1) in March, May, November and December
    
    2) on weekdays
    
    3) right on special days
    
    4) among returning visitors
    
    5) in region 1, 3, 4
    
    6) through operating system 2
    
    
- As for visitor type:
  
  1) returning visitors always occupy largest proportion, other visitors could be ignored (only little impact)
  
  2) monthly distribution of visitor type is close to that of revenue; this is because in each type, sales performance follows a similar sales ratio (order volume / total visitor volume)
  
***Strategy:***

- Determine different operational budget and promotion strategies for off season and peak seasons: 

  1) sales in November counts for almost 50% of whole year's sales, should maintain; 2) visitor volume in May is largest, mainly are returning visitors, should improve purchase intention; 3) sales in February is nearly 0, should analyze precisely 
  
- Have a strategy for each operating system :

  1) Operating system #2 seems to be the main one used, but #1 and #3 are still very important. Having a different strategy is especially essential because at least one of these 3 main OS is mobile. If the site doesn't have a good mobile version, that's a good point where to start !

- Have a program to make money with non buying and returning visitors :

  1) Returning visitors not buying anything are the main group on the site, so they are also the main cost. Finding a way to make money with them could be very lucrative ! Amazon prime is certainly the best example for such a program

- Organizing "special week ends" :

  1) Because the bulk of the revenue is generated during the week, it is certainly possible to incentivise customers to come back at the weekend; giving an additional reason to buy at this time is crucial because customers have time to compare with other sites



In [None]:
#sales performance by 6 dimensions
# 3x3 grid: the first two rows split each dimension by Revenue, the last row
# splits three of the dimensions by VisitorType.
fig, ax = plt.subplots(3,3,figsize = (20,10))
plt.tight_layout(pad= 3)

# Rows whose SpecialDay value is non-zero; the name df2 is kept because the
# traffic-type cell further down reads it.
df2 = df[df['SpecialDay']!=0]

# (grid position, source frame, x column, panel title)
revenue_panels = [
    ((0,0), df,  'VisitorType',      'Sales ratio per Visitor Type'),
    ((0,1), df,  'Weekend',          'Sales ratio on Weekdays(Weekends)'),
    ((0,2), df2, 'SpecialDay',       'Sales ratio around Special Days'),
    ((1,0), df,  'Month',            'Sales ratio per Month'),
    ((1,1), df,  'Region',           'Sales ratio per Region'),
    ((1,2), df,  'OperatingSystems', 'Sales ratio per Operating System'),
]
for (row, col), frame, column, title in revenue_panels:
    panel = sns.countplot(data = frame, x = column, hue = 'Revenue',
                          ax = ax[row,col], palette = "rocket_r")
    panel.set_title(title)

#visitor type by 3 dimensions
visitor_panels = [
    ('Month',            'Visitor Type per Month'),
    ('Region',           'Visitor Type per Region'),
    ('OperatingSystems', 'Visitor Type per Operating System'),
]
for col, (column, title) in enumerate(visitor_panels):
    panel = sns.countplot(data = df, x = column, hue = 'VisitorType',
                          ax = ax[2,col], palette = "viridis")
    panel.set_title(title)

### PART 2 : To improve merchandising efficiency

- ### Marketing side: to improve traffic volume

***Conclusion and Strategy :***

* First, it is obvious that traffic types 1 to 4 are doing all the heavy lifting. We cannot explain exactly why here because we don't have the necessary data, but if there is some investment in advertising that needs to be made, it would be on the platform corresponding to these traffic types

* It is also clear that traffic type depends a lot on the time dimension: some months show much stronger use of some types, and the same is true for weekends and special days. Again, this raises more questions than it answers: we need more data!

* When looking at returning vs new visitors by traffic type, we cannot help noticing that new visitors mainly use traffic type #2. So, even if returning visitors generate most of the revenue, they once were new visitors: traffic type #2 is where the most investment is needed!

In [None]:
#marketing performance by 6 dimensions
# One 2x2 figure for the compact breakdowns, plus two wide single-panel
# figures for the breakdowns with many hue levels (Month, Region).
fig, ax1 = plt.subplots(2,2,figsize = (15,5))
plt.tight_layout(pad= 3)
fig, ax2 = plt.subplots(1,figsize = (20,5))
fig, ax3 = plt.subplots(1,figsize = (20,5))

# Build the special-day subset here instead of relying on `df2` left over from
# the sales-performance cell: `df2` is rebound to a different subset
# (PageValues != 0) in a later cell, so re-running this cell out of order
# would silently plot the wrong rows.
special_day_sessions = df[df['SpecialDay']!=0]

sns.countplot(data = df, x ='TrafficType',hue = 'Revenue',ax =ax1[0,0],
              palette="rocket_r").set_title('Sales ratio per Traffic Type')
sns.countplot(data = df, x ='TrafficType',hue = 'VisitorType',ax =ax1[0,1],
             palette="rocket_r").set_title('Traffic volume by Visitor Type per Traffic Type')
sns.countplot(data = df, x ='TrafficType',hue = 'Weekend',ax =ax1[1,0],
              palette="rocket_r").set_title('Traffic volume around Weekends per Traffic Type')
sns.countplot(data = special_day_sessions, x ='TrafficType',hue = 'SpecialDay',ax =ax1[1,1],
             palette="rocket_r").set_title('Traffic volume around Special Day per Traffic Type')
sns.countplot(data = df, x ='TrafficType',hue = 'Month',ax =ax2,
             palette="rocket_r").set_title('Traffic volume by Month per Traffic Type')
sns.countplot(data = df, x ='TrafficType',hue = 'Region',ax =ax3,
             palette="rocket_r").set_title('Traffic volume by Region per Traffic Type');

- ### Operational side: to improve page conversion rate

***Conclusion:***

- **Is page value we set before reasonable ?** 
   
  Basically Bounce Rate and Exit Rate decline when Page Value increases, which is in line with common sense: customers who finally buy things are less likely to bounce between pages (with a clearer purchase path), and of course don't exit the website before payment. This group of people mainly has an average page value between [0,100].
  
  However, there is a wide range of customers who didn't buy anything (Revenue = False) and yet show an average page value between [0,75]. This means their page value consists of only goal page values (no transaction value), i.e. some of the ideal purchase paths and goal pages we set before are not accurate, thus customers who clicked these pages didn't purchase as we expected.
  
- **In which range Bounce/Exit Rates could facilitate purchase behavior?**
  
  We could easily see a positive linear relationship between Exit Rate and Bounce Rate, i.e. customers who visit pages of higher average bounce rates are more likely to exit the website.
  
  To be precise, for customers who finally placed the order, most of them visited pages of bounce rate and exit rate between [0,0.075]
  
- **How does number of pages visited impact purchase behavior? And in which range duration on page could facilitate purchase behavior?**

  Generally speaking, both product related and administrative pages (or page duration) have a range in which they could improve purchase intention; while  informational pages have little impact.
  
  For customers who placed the order, the approximate range of number of product related pages they visited is [15,45], the median around 30, meanwhile, the approximate range of duration on this type of page is [500,1500]s, the median around 900s; when it comes to administrative pages, the four figures are [0,5],2.5, [0,100]s,40s.
  
  However, considering that some customers who didn't buy anything might also have visited a similar number of pages and stayed for a similar duration, we finally target an even smaller group of people -- those who seem almost certain to purchase: they mainly visited approximately [30,45] product related pages and have a duration time between [1000,1500]s. In fact, this group of people will help us design new purchasing paths to guide customers to visit a proper number of pages and stay a reasonable time on our website.



***Strategy:***
- We should re-design the purchase paths through: 

    1) assigning new page values, 

    2) naming new goal pages, 

    3) improving UI to guide customer to high value pages etc. 
> As we lack more precise information on specific webpages, we don't give our assumptions here.

In [None]:
#check if page value we set before reasonable
# Only rows with a non-zero PageValues are plotted.
df2 = df[df["PageValues"]!=0]
fig, ax = plt.subplots(2,2, figsize=(20,10))

# One grid column per rate:
# (rate column, scatter palette, scatter title, regression colour, zoomed y-range)
rate_panels = [
    ('BounceRates', "viridis",
     'Is Page Value aligned with Bounce Rates and transaction result?',
     "#38818c", (0,0.02)),
    ('ExitRates', "rocket_r",
     'Is Page Value aligned with Exit Rates and transaction result?',
     "#75556c", (0,0.04)),
]

# Top row: raw scatter of each rate against page value, coloured by Revenue.
for col, (rate, palette, title, _, _) in enumerate(rate_panels):
    scatter = sns.scatterplot(data = df2, x = 'PageValues', y = rate, hue = 'Revenue',
                              ax = ax[0,col], palette = palette)
    scatter.set_title(title)

#more detailed range (page values [0,100])
# Bottom row: binned robust regression, zoomed in on page values 0-100.
for col, (rate, _, _, line_color, y_range) in enumerate(rate_panels):
    fit = sns.regplot(x = "PageValues", y = rate, data = df2,
                      truncate = True, robust = True, x_bins = 100,
                      color = line_color, ax = ax[1,col])
    fit.set(ylim = y_range, xlim = (0,100))

In [None]:
#In which range Bounce/Exit Rates could facilitate purchase behavior?
fig, ax = plt.subplots(2,2, figsize=(20,5))

# Top-left panel: every session, coloured by whether it ended in a purchase.
overview = sns.scatterplot(data = df, x = 'ExitRates', y = 'BounceRates', hue = 'Revenue',
                           ax = ax[0,0], palette = "viridis")
overview.set_title('Is Exit Rate aligned with Bounce Rate and transaction result?')

#more detailed range([0,0.075],[0,0.03],[0,0.015] )
# The remaining panels repeat the same binned robust regression; each one is
# zoomed to a smaller square window [0, upper] x [0, upper].
zoom_panels = [((0,1), 0.075), ((1,0), 0.03), ((1,1), 0.015)]
for (row, col), upper in zoom_panels:
    fit = sns.regplot(data = df, x = "ExitRates", y = "BounceRates",
                      truncate = True, robust = True, x_bins = 100,
                      color = "#38818c", ax = ax[row,col])
    fit.set(ylim = (0,upper), xlim = (0,upper))

In [None]:
#How does number of pages visited impact purchase behavior?
fig, ax = plt.subplots(4,3, figsize=(15,10))
plt.tight_layout(pad= 3)

# One grid column per page type:
# (count column, palette, label used in titles, page-count cap, duration cap)
page_types = [
    ("ProductRelated", "rocket_r", "Product Related", 100, 3500),
    ("Administrative", "viridis",  "Administrative",  15,  300),
    ("Informational",  "mako",     "Informational",   5,   300),
]

for col, (count_col, palette, label, count_cap, duration_cap) in enumerate(page_types):
    duration_col = count_col + "_Duration"
    count_title = 'Distribution of ' + label + ' Page Numbers on Revenue'
    duration_title = 'Distribution of ' + label + ' Page Duration on Revenue'

    # Row 0: page value against number of pages visited, coloured by Revenue.
    sns.scatterplot(data = df, x = "PageValues", y = count_col, hue = 'Revenue',
                    ax = ax[0,col], palette = palette
                    ).set_title('Is page value aligned with ' + label + ' Pages?')

    #more detailed range([0,100],[0,15],[0,5] )
    # Row 1: page-count distribution per Revenue value, restricted to rows
    # strictly below the cap.
    sns.boxplot(data = df[df[count_col] < count_cap], x = "Revenue", y = count_col,
                ax = ax[1,col], palette = palette).set_title(count_title)

    #In which range duration on page could facilitate purchase behavior?
    # Row 2: duration distribution per Revenue value over all rows.
    sns.boxplot(data = df, x = "Revenue", y = duration_col,
                ax = ax[2,col], palette = palette).set_title(duration_title)

    #more detailed range (duration [0,3500], [0,300], [0,300])
    # Row 3: same as row 2, restricted to rows strictly below the duration cap.
    sns.boxplot(data = df[df[duration_col] < duration_cap], x = "Revenue", y = duration_col,
                ax = ax[3,col], palette = palette).set_title(duration_title)

# BELOW ARE ALL ABANDONED MODULES

In [None]:
#abandoned versions

# by xiyan [MODIFIED AS ABOVE]
'''
# 3.2 Model and Indicators (to be modified,not alighed with visualization yet)
In this case, we aim to improve merchandising efficiency and total sales volume.
In order to quantify the demand, we build a Purchase Intention Index Model made up by following indicators:

1. Customer_conversion_rate (CCR) = Order_volume / Uique_visitor_volume
2. Page_conversion_rate (PCR) = Order_volume / Visited_Page_Value
3. Advertising_conversion_rate (ACR) = Order_volume / Traffic_volume (by Traffic type)

With CCR and order volume, we measure sales performance by month, special day, weekend, region, visitor type to see when/where/how more orders are likely to be placed;

With PCR, we check if page value assigned to different page types aligned with final order volume, and see how to attract customer to pages of higher quality;

With ACR, we suggest better advertising channels by month, special day, browser,operating system and region, to reduce cost of attracting customers.

**Obviously, we need to maximize each indicator value.**

> Note that:
> 1. Visiting pages is considered to be a cost, because it consumes resources.
> 2. As we don't have exact amount of revenues for each order, we only use order volume to measure transaction result.
> 3. Definition of variables in the functions above:
>    
>    1) Order_volume= Count of lines where value = "True" in "Revenue" column
>    
>    2) Non_order_volume = Count of lines where value = "False" in "Revenue" column
>     
>    3) Unique_visitor_volume = Order_volume + Non_order_volume
>     
>    4) Visited_Page_volume = sum of values in "PageValues" column
>     
>    5) Traffic_volume = count of lines where value = [1,2,...20] in "TrafficType" column
>    
> 4. Exception Management
>    
>    If Visited_Page_volume = 0, we replace the 0 by a 1 in the Page_conversion_rate (PCR) indicator because by default at least a page was visited if revenue was generated
'''
# by vincent-Model Description [MODIFIED AS ABOVE]
'''
First, visiting pages is considered to be a cost, because it consumes resources.
Second, because we don't have the precise amount we will simply count Revenues

1) The model is made of 2 parts :
    Gross Merchandise Volume = # Unique Visitors * Conversion rate
    Gross Margin Indicator= # Revenues / # Pages Visited
    
    Then we want to maximize the Gross Profit Indicator defined by :
        Gross Profit Indicator = Gross Merchandise Volume * Gross Margin Indicator

2) Precise defintion of every element of the model
    # Revenues = Count of the # of lines of the "Revenue" column where Revenue = True
    # non Revenues = Count of the # of lines of the "Revenue" column where Revenue = Fakse
    # Unique Visitors = # Revenues + # non Revenues
    Conversion Rate = # Revenues / # Unique Visitors
    # Pages Visited = Sum of the Page Values column

It should be noted that here we necessarily have :
    Gross Merchandise Volume = # Revenues
    
3) Exception Management
    If # Pages Visited = 0, we replace the 0 by a 1 in the Gross Margin Indicator because by default at least a page was visited if revenue was generated

4) Abstract
    We want to maximize :
        Gross Profit Indicator = (# Revenues)^2 / # Pages Visited
'''

# Check the correlations
'''
List of columns :
    Administrative             float64
    Administrative_Duration    float64
    Informational              float64
    Informational_Duration     float64
    ProductRelated             float64
    ProductRelated_Duration    float64
    BounceRates                float64
    ExitRates                  float64
    PageValues                 float64
    SpecialDay                 float64
    Month                       object
    OperatingSystems             int64
    Browser                      int64
    Region                       int64
    TrafficType                  int64
    VisitorType                 object
    Weekend                       bool
    Revenue                       bool

We can divide these columns in 3 categories :
    1) Time dimension
        Month
        SpecialDay
        Weekend
    2) Webpage dynamic
        Administrative
        Administrative_Duration
        Informational
        Informational_Duration
        ProductRelated
        ProductRelated_Duration
        BounceRates
        ExitRates
        PageValues
    3) Socio-geographical
        OperatingSystems
        Browser
        Region
        TrafficType
        VisitorType

Note that the non numerical columns are automatically excluded from the analysis, so only the 2nd category will be analyzed in detail here
'''
''' Webpage dynamic'''
'''
DATA_CORR_WD = DATA[['Administrative','Administrative_Duration','Informational','Informational_Duration','ProductRelated','ProductRelated_Duration','BounceRates','ExitRates','PageValues']]
correlation_wd = DATA_CORR_WD.corr()
Values_corr_wd = False # True to show values in the heatmap, False to hide them
hm = sns.heatmap(correlation_wd, annot=Values_corr_wd,cmap=sns.diverging_palette(20, 220, n=200),center = 0,square=True)
hm.set_xticklabels(hm.get_xticklabels(),rotation=45,horizontalalignment='right');

# Graphs : x axis = chosen dimension ; y axis = Gross Profit Indicator from model above
'''
'''
#Create a function to simplify and re use of the model easily
def GPI (data):
    REV = data[['Revenue']].value_counts().loc[True].item()
    PV = data[['PageValues']].sum(axis=0).item()
    if PV == 0:
        return 1
    else:
        return REV*REV/PV

Graphs for every axis (line if ordered naturally, histogram if logical order but not as clear, ordered vertical histogram else)
#transfer 'True' or 'False' values to 1 or 0
df = pd.DataFrame(DATA)
df['Weekend']=df['Weekend'].astype('int')
df['Revenue']=df['Revenue'].astype('int')
''';

In [None]:
#abandoned version

#Month impact on customer conversion rate
# Calendar order of the month labels, spelled as in the data-cleaning cell
# of this notebook ('June' is written out).
month_order = ['Jan','Feb','Mar','Apr','May','June','Jul','Aug','Sep','Oct','Nov','Dec']

gp=df.groupby('Month')
Order_volume = gp['Revenue'].sum()            # per-month sum of the Revenue column
Uique_visitor_volume = gp['Revenue'].count()  # per-month number of rows with a Revenue value
# The division aligns both series on the Month index, so sorting either
# operand beforehand has no effect on the result. The former
# `sort_index(False)` call also passed its argument positionally, which recent
# pandas versions reject with a TypeError. The ordering is applied once, to
# the final result, below.
CCR = Order_volume/Uique_visitor_volume
CCR = pd.DataFrame (CCR)
CCR.columns = ['Customer_conversion_rate']

# Put the months in calendar order so the line plot reads left to right
# through the year, whether Month holds numbers or text labels.
if pd.api.types.is_numeric_dtype(CCR.index):
    CCR = CCR.sort_index()
else:
    known_months = [m for m in month_order if m in CCR.index]
    # Labels missing from month_order are kept at the end rather than dropped.
    other_labels = [m for m in CCR.index if m not in month_order]
    CCR = CCR.reindex(known_months + other_labels)

#graph
fig, (ax1) = plt.subplots(1, figsize=(17,7))
CCR.plot(ax=ax1)

ax1.set_title("Month impact on customer conversion rate", size=13)
ax1.set_ylabel("Cutomer_conversion_rate", size=13)
ax1.set_xlabel("Month", size=13);