# A/B Testing for Shoefly.com

- Name: Stefanus Bernard Melkisedek
- Codecademy: [@DatenMeister](https://www.codecademy.com/profiles/DatenMeister)
- Course: Python Pandas for Data Science
- Sub-course: Aggregates in Pandas


## Project Description

The online shoe store has two different versions of an ad, which it has placed in emails as well as in banner ads on Facebook, Twitter, and Google. The store wants to know how the two ads are performing on each of the different platforms on each day of the week.


## Project Goals

- Analyze the data using aggregate measures
- Compare the effectiveness of two different versions of an ad
- Recommend which ad the company should use for future marketing campaigns


## Data Wrangling


In [None]:
# import necessary library
import pandas as pd

In [None]:
# Read the ad-click records from the CSV file into a dataframe
df_ad_clicks = pd.read_csv("./data/ad_clicks.csv")

# Preview the first 10 records of the dataset
df_ad_clicks.head(n=10)

## Exploratory Data Analysis


### 1. Which ad platform is getting the most views?


In [None]:
# Count the views per ad platform: group by 'utm_source' and count the
# non-null 'user_id' values in each group
df_grouped = df_ad_clicks.groupby("utm_source").user_id.count()

# Turn the grouped result back into a regular two-column dataframe
df_reset = df_grouped.reset_index()

# Order the platforms from most to fewest views
df_sorted = df_reset.sort_values(by="user_id", ascending=False)

# Keep only the top platform and give the columns descriptive names.
# rename() returns a new dataframe here instead of renaming in place, so the
# slice produced by head() is never mutated.
df_most_view = df_sorted.head(1).rename(
    columns={"utm_source": "ad_platform", "user_id": "count"}
)

# Display the dataframe
df_most_view

### 2. What is the percentage of people who clicked on ads from each platform?


In [None]:
# Flag each row with whether the ad was clicked: 'is_click' is True when
# 'ad_click_timestamp' holds a value and False when it is NaN.
df_ad_clicks["is_click"] = df_ad_clicks.ad_click_timestamp.notna()

# First 10 rows where the ad was not clicked. The boolean column is negated
# and used directly as the mask instead of being compared against False.
df_not_clicks = df_ad_clicks[~df_ad_clicks["is_click"]].head(10)

# First 10 rows where the ad was clicked. The boolean column itself is the mask.
df_clicks = df_ad_clicks[df_ad_clicks["is_click"]].head(10)

In [None]:
# Preview the first 5 rows of 'df_ad_clicks', which now includes 'is_click'
df_ad_clicks.head(n=5)

In [None]:
# Tally the views per ad source, split by click outcome:
# group on 'utm_source' and 'is_click', count the 'user_id' values in each
# group, then move the group keys back into regular columns.
click_percent = df_ad_clicks.groupby(
    by=["utm_source", "is_click"]
)["user_id"].count().reset_index()

# Preview the first 5 rows of the counts
click_percent.head()

In [None]:
# Reshape the per-source counts in 'click_percent' into one row per platform.
# - 'utm_source' values become the index (restored as a column afterwards).
# - 'is_click' values (False / True) become the columns.
# - 'user_id' counts fill the cells.
# The column-axis name is removed, and the columns are renamed:
# 'utm_source' -> 'platform', False -> 'false', True -> 'true'.
clicks_pivot = (
    click_percent
    .pivot(index="utm_source", columns="is_click", values="user_id")
    # pivot() leaves NaN where a platform has no rows for one click outcome;
    # treat that as a count of zero so later arithmetic does not produce NaN.
    .fillna(0)
    .reset_index()
    .rename_axis(None, axis=1)
    .rename(columns={"utm_source": "platform", False: "false", True: "true"})
)

# Display the first 5 rows of 'clicks_pivot' to get a quick overview of the data.
clicks_pivot.head()

In [None]:
# Percentage of views that resulted in a click, per platform:
# clicked views ('true') divided by all views ('true' + 'false'),
# scaled to a percentage and rounded to 2 decimal places.
# The result is stored in the new 'percent_clicked' column.
clicks_pivot["percent_clicked"] = (
    clicks_pivot["true"] / (clicks_pivot["true"] + clicks_pivot["false"]) * 100
).round(2)

# Show the full table with the new column
clicks_pivot

### 3. Which ad has the greater percentage of users who clicked?

In [None]:
# Number of users shown each ad version: count the 'user_id' values per
# 'experimental_group' and label the count column 'user_count'
group_usr_count = (
    df_ad_clicks.groupby("experimental_group")["user_id"]
    .count()
    .reset_index()
    .rename(columns={"user_id": "user_count"})
)

group_usr_count.head()

In [None]:
# Count the users in each ad group, split by click outcome:
# group on 'experimental_group' and 'is_click', count the 'user_id' values,
# then move the group keys back into regular columns.
click_user_group = df_ad_clicks.groupby(
    by=["experimental_group", "is_click"]
)["user_id"].count().reset_index()

click_user_group

In [None]:
# Reshape 'click_user_group' into one row per ad group:
# 'experimental_group' values form the index, 'is_click' values form the
# columns, and the 'user_id' counts fill the cells.
pivot_user_group = click_user_group.pivot(
    index="experimental_group", columns="is_click", values="user_id"
)

# Restore 'experimental_group' as a column and drop the column-axis name
pivot_user_group = pivot_user_group.reset_index().rename_axis(None, axis="columns")

# Show the reshaped table
pivot_user_group

In [None]:
# Preview the first 5 rows of 'df_ad_clicks' before splitting it by ad group
df_ad_clicks.head(n=5)

In [None]:
# Keep only the rows whose 'experimental_group' is 'A' (users shown ad A)
a_clicks = df_ad_clicks.loc[df_ad_clicks["experimental_group"] == "A"]
a_clicks.head()

In [None]:
# Keep only the rows whose 'experimental_group' is 'B' (users shown ad B)
b_clicks = df_ad_clicks.loc[df_ad_clicks["experimental_group"] == "B"]
b_clicks.head()

In [None]:
# Count ad A users per day, split by click outcome. These counts are the
# input for the daily click percentage of ad A.
usr_pct_a = a_clicks.groupby(
    by=["day", "is_click"]
)["user_id"].count().reset_index()

# Show the per-day counts for ad A
usr_pct_a

In [None]:
# Reshape the per-day counts for ad A into one row per day.
# - 'day' values become the index (restored as a column afterwards).
# - 'is_click' values (False / True) become the columns.
# - 'user_id' counts fill the cells.
# The column-axis name is removed, and False / True are renamed to
# 'false' / 'true'.
usr_pct_a_pivot = (
    usr_pct_a
    .pivot(index='day', columns='is_click', values='user_id')
    # pivot() leaves NaN where a day has no rows for one click outcome;
    # treat that as a count of zero so later arithmetic does not produce NaN.
    .fillna(0)
    .reset_index()
    .rename_axis(None, axis=1)
    .rename(columns={False: 'false', True: 'true'})
)

# Display the 'usr_pct_a_pivot' dataframe.
usr_pct_a_pivot

In [None]:
# Daily click percentage for ad A:
# clicked views ('true') divided by all views ('true' + 'false'),
# scaled to a percentage and rounded to 2 decimal places.
# The result is stored in the new 'click_percentage' column.
usr_pct_a_pivot["click_percentage"] = (
    usr_pct_a_pivot["true"]
    / (usr_pct_a_pivot["true"] + usr_pct_a_pivot["false"])
    * 100
).round(2)

# Show the table with the new column
usr_pct_a_pivot

In [None]:
# Count ad B users per day, split by click outcome. These counts are the
# input for the daily click percentage of ad B.
usr_pct_b = b_clicks.groupby(
    by=["day", "is_click"]
)["user_id"].count().reset_index()

# Show the per-day counts for ad B
usr_pct_b

In [None]:
# Reshape the per-day counts for ad B into one row per day.
# - 'day' values become the index (restored as a column afterwards).
# - 'is_click' values (False / True) become the columns.
# - 'user_id' counts fill the cells.
# The column-axis name is removed, and False / True are renamed to
# 'false' / 'true'.
usr_pct_b_pivot = (
    usr_pct_b
    .pivot(index='day', columns='is_click', values='user_id')
    # pivot() leaves NaN where a day has no rows for one click outcome;
    # treat that as a count of zero so later arithmetic does not produce NaN.
    .fillna(0)
    .reset_index()
    .rename_axis(None, axis=1)
    .rename(columns={False: 'false', True: 'true'})
)

# Display the 'usr_pct_b_pivot' dataframe.
usr_pct_b_pivot

In [None]:
# Daily click percentage for ad B:
# clicked views ('true') divided by all views ('true' + 'false'),
# scaled to a percentage and rounded to 2 decimal places.
# The result is stored in the new 'click_percentage' column.
usr_pct_b_pivot["click_percentage"] = (
    usr_pct_b_pivot["true"]
    / (usr_pct_b_pivot["true"] + usr_pct_b_pivot["false"])
    * 100
).round(2)

# Show the table with the new column
usr_pct_b_pivot

In [None]:
# Compare the results for ad A and ad B:
# first show the daily click percentages for ad A
usr_pct_a_pivot

In [None]:
# Show the daily click percentages for ad B, for comparison with ad A
usr_pct_b_pivot

## Conclusion

After comparing the two ads, it is recommended that the company use **"ad A"** for future marketing campaigns because it **has a higher daily click-through rate** than "ad B".