In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input lazily and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  **Testing for ShoeFly.com**

Our favorite online shoe store, ShoeFly.com, is performing an A/B test. They have two different versions of an ad, which they have placed in emails, as well as in banner ads on Facebook, Twitter, and Google. They want to know how the two ads are performing on each of the different platforms on each day of the week. Help them analyze the data using aggregate measures. The following questions will be answered:
1. How many views came from each utm_source?
1. Was there a difference in click rates for each source?
1. Were approximately the same number of people shown both ads?
1. Did the click rate change by day of the week, as the product manager for the A/B test suspects?
1. Do you recommend that your company use Ad A or Ad B?


In [None]:
# Location of the ShoeFly ad-click CSV, relative to the notebook
my_filepath = "../input/ad-clicks/ad_clicks.csv"

# Load the CSV into the ad_clicks DataFrame used by the ShoeFly analysis
ad_clicks = pd.read_csv(filepath_or_buffer=my_filepath)

In [None]:
# Show the first five rows to see which columns we are working with
ad_clicks.head(n=5)

In [None]:
# Number of views (i.e. rows of the table) that came from each utm_source.
# count() tallies non-null entries per group; user_id is the column counted here.
# reset_index() turns the grouped Series back into a DataFrame for display.
(
    ad_clicks
    .groupby('utm_source')['user_id']
    .count()
    .reset_index()
)

In [None]:
# Flag each row with whether the ad was clicked. A missing (NaN)
# ad_click_timestamp means the user never clicked, so is_click is True
# exactly where a timestamp is present. No rows are dropped.
ad_clicks['is_click'] = ad_clicks['ad_click_timestamp'].notnull()

# To get the share of users who clicked per utm_source, first count the
# user_ids in every (utm_source, is_click) combination.
clicks_by_source = (
    ad_clicks
    .groupby(['utm_source', 'is_click'])['user_id']
    .count()
    .reset_index()
)

clicks_by_source

In [None]:
# Reshape the counts: one row per utm_source, one column per is_click value
# (True / False), with the user_id counts as the cell values.
clicks_pivot = clicks_by_source.pivot(
    index='utm_source',
    columns='is_click',
    values='user_id',
).reset_index()

# Click rate per source, as a percentage: clicked / (clicked + not clicked) * 100,
# stored in a new column of clicks_pivot.
clicks_pivot['percent_clicked'] = (
    clicks_pivot[True]
    / (clicks_pivot[True] + clicks_pivot[False])
    * 100
)

clicks_pivot

We found that click rates differed by source. Facebook ads had the highest click rate at 35.71%, closely followed by
Google ads at 35.15%. Twitter ads had the lowest click rate.

In [None]:
# Analyzing the A/B test. The experimental_group column records whether the
# user was shown Ad A or Ad B. Count clicks and non-clicks per ad, then
# reshape so each ad is one row with one column per is_click value.
# The chain is wrapped in parentheses instead of backslash continuations:
# the original ended with a dangling "\" that only parsed because a blank
# line happened to follow it.
clicks_exp_pivot = (
    ad_clicks
    .groupby(['experimental_group', 'is_click'])['user_id']
    .count()
    .reset_index()
    .pivot(index='experimental_group',
           columns='is_click',
           values='user_id')
    .reset_index()
)

# Click rate per ad, as a percentage of all users shown that ad
clicks_exp_pivot['percent_clicked'] = (
    clicks_exp_pivot[True]
    / (clicks_exp_pivot[True] + clicks_exp_pivot[False])
    * 100
)

clicks_exp_pivot

A user who was shown Ad A was more likely to click the ad (37.48%) than a user who was shown Ad B.

In [None]:
# Did click rates vary by day of the week, and should the company use
# Ad A or Ad B? Start with the users who were shown Ad A.
a_clicks = ad_clicks.loc[ad_clicks['experimental_group'] == 'A']

# Count Ad A users per (is_click, day), then reshape to one row per day
# with a column for each is_click value.
a_clicks_pivot = (
    a_clicks
    .groupby(['is_click', 'day'])['user_id']
    .count()
    .reset_index()
    .pivot(index='day', columns='is_click', values='user_id')
    .reset_index()
)

# Daily click rate for Ad A, as a percentage
a_clicks_pivot['percent_clicked'] = (
    a_clicks_pivot[True]
    / (a_clicks_pivot[True] + a_clicks_pivot[False])
    * 100
)

a_clicks_pivot

In [None]:
# Same day-of-week breakdown for the users who were shown Ad B.
b_clicks = ad_clicks.loc[ad_clicks['experimental_group'] == 'B']

# Count Ad B users per (is_click, day), then reshape to one row per day
# with a column for each is_click value.
b_clicks_pivot = (
    b_clicks
    .groupby(['is_click', 'day'])['user_id']
    .count()
    .reset_index()
    .pivot(index='day', columns='is_click', values='user_id')
    .reset_index()
)

# Daily click rate for Ad B, as a percentage
b_clicks_pivot['percent_clicked'] = (
    b_clicks_pivot[True]
    / (b_clicks_pivot[True] + b_clicks_pivot[False])
    * 100
)

b_clicks_pivot

On most days of the week, Ad A outperformed Ad B; Tuesday was the only day on which Ad B had the higher click rate.

To summarize the results, we learned that Ad A outperformed Ad B overall and on every day of the week except Tuesday. We also learned that ads on Facebook were clicked at a higher rate than those on the other platforms. Therefore, I would recommend that the company run Ad A instead of Ad B and allocate more resources to running the ad on Facebook and Google than on the other platforms to maximize user traffic to ShoeFly.com.


# **Analyzing Farmburg’s A/B test**

Brian ran an A/B test with three different groups: A, B, and C. He has provided us with a CSV file of his results named clicks.csv. It has the following columns:
user_id: a unique id for each visitor to the FarmBurg site
group: either 'A', 'B', or 'C' depending on which group the visitor was assigned to
is_purchase: either 'Yes' if the visitor made a purchase or 'No' if they did not.

In [None]:
# Location of Brian's A/B test results, relative to the notebook
my_filepath_1 = "../input/clicks/clicks.csv"

# Load the results into the abdata DataFrame
abdata = pd.read_csv(filepath_or_buffer=my_filepath_1)

# Show the first five rows to see which columns we are working with
abdata.head(n=5)

Note that we have two categorical variables: group and is_purchase. We are interested in whether visitors are more likely to make a purchase if they are in any one group compared to the others. Because we want to know if there is an association between two categorical variables, we’ll start by using a Chi-Square test to address our question.

In order to run a Chi-Square test, we first need to create a contingency table of the variables group and is_purchase. Use pd.crosstab() to create this table and name the result Xtab, then print it out. Which group appears to have the highest number of purchases? Answer: Group A with 316 purchases.

In [None]:
# Contingency table of group (rows) against is_purchase (columns), displayed below
Xtab = pd.crosstab(index=abdata['group'], columns=abdata['is_purchase'])
Xtab

To conduct the Chi-Square Test, import chi2_contingency from scipy.stats.

Then, use the function chi2_contingency with the data in Xtab to calculate the p-value. Remember that of the four values returned by chi2_contingency, the p-value is the second value.

Save the p-value to a variable named pval and print the result. Using a significance threshold of 0.05, is there a significant difference in the purchase rate for groups A, B, and C? Answer: the p-value is less than 0.05 and we can conclude that there is a significant difference in the purchase rate for groups A, B, and C.


In [None]:
# Bring in SciPy's chi-square test for contingency tables
from scipy.stats import chi2_contingency

# Run the test on the group x is_purchase table; the second returned value is the p-value
chi2, pval, dof, expected = chi2_contingency(observed=Xtab)

# Show the p-value
print(pval)

In [None]:
# Determine if the p-value is significant by comparing it with the 0.05
# threshold stated in the text, instead of hard-coding the answer; the flag
# now stays correct if the data (and therefore pval) changes.
SIGNIFICANCE_THRESHOLD = 0.05
is_significant = bool(pval < SIGNIFICANCE_THRESHOLD)

Our day is a little less busy than expected, so we decide to ask Brian about his test.

Us: Hey Brian! What was that test you were running anyway?

Brian: We are trying to get users to purchase a small FarmBurg upgrade package. It’s called a microtransaction. We’re not sure how much to charge for it, so we tested three different price points: \\$0.99 (group A), \\$1.99 (group B), and \\$4.99 (group C). It looks like significantly more people bought the upgrade package for \\$0.99, so I guess that’s what we’ll charge.

Us: Oh no! We should have asked you this before we did that Chi-Square test. That wasn’t the right test at all. It’s true that more people wanted to purchase the upgrade at \\$0.99; you probably expected that. What we really want to know is whether each price point allows us to make enough money that we can exceed some target goal. Brian, how much do you think it cost to build this feature?

Brian: Hmm. I guess that we need to generate a minimum of $1000 in revenue per week in order to justify this project.

Us: We have some work to do!

In order to justify this feature, we will need to calculate the necessary purchase rate for each price point. Let’s start by calculating the number of visitors to the site this week.

It turns out that Brian ran his original test over the course of a week, so the number of visitors in abdata is equal to the number of visitors in a typical week. Calculate the number of visitors in the data and save the value in a variable named num_visits. Make sure to print the value.

In [None]:
# One row of abdata per visitor, so the row count is the number of visits
num_visits = abdata.shape[0]

# Display the number of visits
num_visits

Now that we know how many visitors we generally get each week (num_visits), we need to calculate the number of visitors who would need to purchase the upgrade package at each price point (\\$0.99, \\$1.99, \\$4.99) in order to generate Brian’s minimum revenue target of $1,000 per week.

To start, calculate the number of sales that would be needed to reach \\$1,000 dollars of revenue at a price point of $0.99. Save the result as num_sales_needed_099 and print it out.

Now that we know how many sales we need at a \\$0.99 price point, calculate the proportion of weekly visitors who would need to make a purchase in order to meet that goal. Remember that the number of weekly visitors is saved as num_visits. Save the result as p_sales_needed_099 and print it out.

Print out the proportions. Note that for higher price points, you’ll need to sell fewer upgrade packages in order to meet your minimum revenue target — so the proportions should decrease as the price points increase.

In [None]:
# Weekly revenue, in dollars, that the upgrade must bring in
revenue_target = 1000

# Number of sales needed to hit the target at each price point
num_sales_needed_099 = revenue_target / 0.99
num_sales_needed_199 = revenue_target / 1.99
num_sales_needed_499 = revenue_target / 4.99

# Share of the weekly visitors who would have to buy at each price point
p_sales_needed_099 = num_sales_needed_099 / num_visits
p_sales_needed_199 = num_sales_needed_199 / num_visits
p_sales_needed_499 = num_sales_needed_499 / num_visits

# Print the required purchase rates for 0.99, 1.99 and 4.99, in that order
print(p_sales_needed_099)
print(p_sales_needed_199)
print(p_sales_needed_499)


Now let’s return to Brian’s question. To start, we want to know if the percent of Group A (the \\$0.99 price point) that purchased an upgrade package is significantly greater than p_sales_needed_099 (the percent of visitors who need to buy an upgrade package at \\$0.99 in order to make our minimum revenue target of \\$1,000).

To answer this question, we want to focus on just the visitors in group A. Then, we want to compare the number of purchases in that group to p_sales_needed_099.

Since we have a single sample of categorical data and want to compare it to a hypothetical population value, a binomial test is appropriate. In order to run a binomial test for group A, we need to know two pieces of information:

The number of visitors in group A (the number of visitors who were offered the \\$0.99 price point)
The number of visitors in Group A who made a purchase
Calculate these two numbers and save them as samp_size_099 and sales_099, respectively. Note that you can use the contingency table that you printed earlier to get these numbers OR you can use Python syntax.

In [None]:
# Boolean mask marking the visitors who made a purchase
purchased = abdata['is_purchase'] == 'Yes'

# 0.99 price point (group A): visitors in the group and purchases among them
in_group_a = abdata['group'] == 'A'
samp_size_099 = in_group_a.sum()
sales_099 = (in_group_a & purchased).sum()
print(samp_size_099)
print(sales_099)

# 1.99 price point (group B): visitors in the group and purchases among them
in_group_b = abdata['group'] == 'B'
samp_size_199 = in_group_b.sum()
sales_199 = (in_group_b & purchased).sum()
print(samp_size_199)
print(sales_199)

# 4.99 price point (group C): visitors in the group and purchases among them
in_group_c = abdata['group'] == 'C'
samp_size_499 = in_group_c.sum()
sales_499 = (in_group_c & purchased).sum()
print(samp_size_499)
print(sales_499)


For Group A (\\$0.99 price point), perform a binomial test using binom_test() to see if the observed purchase rate is significantly greater than p_sales_needed_099. Remember that there are four inputs to binom_test():
* x will be the number of purchases for Group A
* n will be the total number of visitors assigned group A
* p will be the target percent of purchases for the \\$0.99 price point

* alternative will indicate the alternative hypothesis for this test; in this case, we want to know if the observed purchase rate is significantly 'greater' than the purchase rate that results in the minimum revenue target.

Save the results to pvalueA, and print its value. Note that you’ll first need to import the binom_test() function from scipy.stats using the following line of code:


In [None]:
# Import the binomial test.
# scipy.stats.binom_test was deprecated in SciPy 1.10 and removed in SciPy 1.12
# in favour of scipy.stats.binomtest, so importing it directly fails on current
# SciPy. Prefer binomtest and expose it under the old binom_test name so the
# rest of the notebook keeps working; fall back to the legacy function only
# when binomtest is unavailable (SciPy older than 1.7).
try:
    from scipy.stats import binomtest
except ImportError:
    from scipy.stats import binom_test
else:
    def binom_test(x, n=None, p=0.5, alternative='two-sided'):
        """Return the p-value of a binomial test, like the legacy scipy function.

        x is the number of successes, n the number of trials, p the
        hypothesised success probability and alternative one of
        'two-sided', 'greater' or 'less'. Unlike the legacy function,
        n is required here; that is how every call in this notebook uses it.
        """
        return binomtest(int(x), n=int(n), p=p, alternative=alternative).pvalue

# Calculate the p-value for Group A: is the observed purchase rate
# significantly greater than the rate needed at the 0.99 price point?
pvalueA = binom_test(sales_099, n=samp_size_099, p=p_sales_needed_099, alternative='greater')

# Print the p-value for Group A
print(pvalueA)

# Calculate the p-value for Group B (1.99 price point)
pvalueB = binom_test(sales_199, n=samp_size_199, p=p_sales_needed_199, alternative='greater')

# Print the p-value for Group B
print(pvalueB)

# Calculate the p-value for Group C (4.99 price point)
pvalueC = binom_test(sales_499, n=samp_size_499, p=p_sales_needed_499, alternative='greater')

# Print the p-value for Group C
print(pvalueC)

In [None]:
# Wrap the one-sided ('greater') binomial test in a lambda, i.e. an anonymous
# function, bound to the name x
x = lambda sales, samp_size, p_sales_needed: binom_test(
    sales,
    n=samp_size,
    p=p_sales_needed,
    alternative='greater',
)

# NOTE(review): these literals look like Group A's sales, sample size and
# required purchase rate — confirm against the values printed earlier
group_a_inputs = (316, 1666, 0.20210104243717691)
print(x(*group_a_inputs))

Based on the three p-values you calculated for the binomial tests in each group and a significance threshold of 0.05, were there any groups where the purchase rate was significantly higher than the target? Based on this information, what price should Brian charge for the upgrade package?

pvalueC is the only p-value below the threshold of 0.05. Therefore, the C group is the only group where we would conclude that the purchase rate is significantly higher than the target needed to reach \\$1000 revenue per week. Therefore, Brian should charge \\$4.99 for the upgrade.

In [None]:
# Price point chosen from the binomial tests, kept as a string
final_answer = "4.99"

# Show the chosen price
print(final_answer)

In this project, we performed a Chi-Square test to determine whether two categorical variables (i.e., group and is_purchase) are associated. We were interested in determining whether visitors are more likely to make a purchase if they are in any one group compared to the others. Given a p-value less than 0.05, we can conclude that there is a significant difference in the purchase rate for groups A, B, and C.





# **Test for MuscleHub**

In [None]:
# Set up the libraries used in the MuscleHub analysis
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting (bar charts, pie chart, showing figures)
import seaborn as sns # statistical visualizations (count plot)

%matplotlib inline

## **Project Goals** ##
You’ve been hired as the newest member of the data analytics team at MuscleHub, a fancy gym, and your first assignment is to run an A/B test!
Currently, when a MuscleHub visitor purchases a membership, they follow the following steps:
1. Take a fitness test with a personal trainer.
1. Fill out an application for the gym.
1. Send in their payment for their first month’s membership.

Janet, the manager of MuscleHub, thinks that the fitness test intimidates some prospective members, so she has set up an A/B test.
Visitors are randomly assigned to one of two groups:
* **Group A** is still asked to take a fitness test with a personal trainer.
* **Group B** skips the fitness test and proceeds directly to the application.

Janet’s hypothesis is that visitors assigned to Group B will be more likely to eventually purchase a membership to MuscleHub than visitors assigned to Group A. The null and alternate hypotheses are therefore as follows:
* **Null Hypothesis** = There will be no difference between the visitors in Group A that purchase membership and the visitors in Group B that purchase membership.
* **Alternate Hypothesis** = There will be more visitors in Group B that will purchase membership than visitors in Group A that will purchase membership.

The significance threshold we will use to decide whether to reject or fail to reject the null hypothesis will be:
* 𝛼 = 0.05

You will help her analyze the data and create a presentation with your knowledge of conducting A/B testing with Python.

Janet of MuscleHub has a SQLite database, which contains several tables that will be helpful to you in this investigation. You have already created a csv file for each table.

Import the four csv files as pandas DataFrames and examine them.
Create the following four pandas DataFrames:
* *visits* from the **visits.csv** file, which contains information about potential gym customers who have visited MuscleHub.
* *fitness_tests* from the **fitness_tests.csv** file, which contains information about potential customers in “Group A”, who were given a fitness test.
* *applications* from the **applications.csv** file, which contains information about any potential customers (both “Group A” and “Group B”) who filled out an application. Not everyone in the **visits.csv** file will have filled out an application.
* *purchases* from the **purchases.csv** file, which contains information about customers who purchased a membership to MuscleHub.


In [None]:
# Load the four MuscleHub tables, one DataFrame per CSV file
visits = pd.read_csv(filepath_or_buffer="../input/musclehub-abtest/visits.csv")
fitness_tests = pd.read_csv(filepath_or_buffer="../input/musclehub-abtest/fitness_tests.csv")
applications = pd.read_csv(filepath_or_buffer="../input/musclehub-abtest/applications.csv")
purchases = pd.read_csv(filepath_or_buffer="../input/musclehub-abtest/purchases.csv")

In [None]:
# Show the first five rows of the applications table
applications.head(n=5)

In [None]:
# Show the first five rows of the fitness_tests table
fitness_tests.head(n=5)

In [None]:
# Show the first five rows of the purchases table
purchases.head(n=5)

In [None]:
# Show the first five rows of the visits table
visits.head(n=5)

It would be helpful to have a single DataFrame with all of this data.

Create a DataFrame containing all of this data. Keep in mind that not all visits in visits.csv occurred during the A/B test. You’ll only want to pull data where visit_date is on or after 7-1-17.

You’ll want to perform a series of left joins using the pandas .merge() method to combine the four DataFrames. You’ll need to perform the joins on the first_name, last_name, and email variables.
You’ll need the following columns:
* visits.first_name
* visits.last_name
* visits.gender
* visits.email
* visits.visit_date
* fitness_tests.fitness_test_date
* applications.application_date
* purchases.purchase_date

Your result should have 5004 rows.

In [None]:
# Keep only the visits made during the A/B test (on or after 7-1-17).
# The comparison is done on parsed dates, not on the raw strings: as text,
# '10-1-17' sorts before '7-1-17', so a string comparison is only correct
# while every month in the data has a single digit.
# NOTE(review): assumes visit_date is written month-day-two-digit-year, as in
# the '7-1-17' cutoff used by the original code — confirm against visits.csv.
visit_dates = pd.to_datetime(visits['visit_date'], format='%m-%d-%y')
visits = visits[visit_dates >= pd.Timestamp(2017, 7, 1)]
# View shape of the filtered DataFrame
visits.shape

In [None]:
# Columns that identify the same person across the four tables
join_keys = ['first_name', 'last_name', 'email', 'gender']

# Left-join fitness tests, applications and purchases onto the visits,
# so every visit row is kept whether or not a later step happened
df = (
    visits
    .merge(fitness_tests, on=join_keys, how='left')
    .merge(applications, on=join_keys, how='left')
    .merge(purchases, on=join_keys, how='left')
)

# Show the first five rows of the combined DataFrame
df.head()

In [None]:
# Display the (rows, columns) shape of the merged DataFrame.
# The task description expects 5004 rows; the author reports that the result matches.
df.shape

### Visualize the Groups ###
With the combined DataFrame compiled, we can begin the analysis.

Make at least one visualization that depicts the distribution of potential customers that were given a fitness test and those that were not:
- Create a bar plot of the test group variable's distribution.
- Create a pie chart using `plt.pie()`.



To conduct the A/B test you need to determine which customers were given a fitness test. Use your variable containing fitness test dates to create a new variable with values of A if the fitness test date variable is not None, and B if the fitness test date variable is None.

In [None]:
# Assign each visitor to a test group: 'A' when a fitness test date is
# present, 'B' when it is missing
df['ab_test_group'] = (
    df['fitness_test_date']
    .notnull()
    .map({True: 'A', False: 'B'})
)

Now we'll do a quick sanity check to ensure that Janet split her visitors such that about half are in A and half are in B.

In [None]:
# Number of visitors in each test group
df.ab_test_group.value_counts(normalize=False)

In [None]:
# Share of visitors in each test group (fractions that sum to 1)
df.ab_test_group.value_counts(normalize=True)

In [None]:
# Bar plot of how many visitors fall into each test group
sns.countplot(data=df, x="ab_test_group")

In [None]:
# Create a pie chart of test group.
# value_counts() orders the groups by frequency, not alphabetically, so the
# wedge labels are taken from its index instead of a hard-coded ['A', 'B'];
# a fixed list would mislabel the wedges whenever B is the larger group.
group_counts = df['ab_test_group'].value_counts()
plt.pie(group_counts, labels=group_counts.index, autopct='%0.2f%%')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

### Determine the count of applications ###
Recall that the sign-up process for MuscleHub has several steps:

1. Take a fitness test with a personal trainer (only Group A).
2. Fill out an application for the gym.
3. Send in their payment for their first month's membership.

Determine the percentage of people in each group who complete Step 2, filling out an application.

In [None]:
# Label each visitor by whether an application date is present
df['is_application'] = (
    df['application_date']
    .notnull()
    .map({True: 'Application', False: 'No Application'})
)

Now we'll group by is_application and ab_test_group to count how many people from Group A and Group B either do or don't pick up an application.

We'll save this result as a new pandas DataFrame called app_counts.

In [None]:
# Count visitors for every (test group, application status) combination
app_counts = (
    df
    .groupby(['ab_test_group', 'is_application'])['first_name']
    .count()
    .reset_index()
)

Next, we'll calculate the percent of people in each group who complete an application. It's going to be much easier to do this if we pivot the new DataFrame such that:

* The index is ab_test_group
* The columns are is_application

After pivoting, we'll save it to the variable app_pivot.

In [None]:
# Reshape the counts: one row per test group, one column per application status
app_pivot = app_counts.pivot(
    index='ab_test_group',
    columns='is_application',
    values='first_name',
).reset_index()

# Display the pivoted table
app_pivot

Next, we'll define a new column called Total, which is the sum of Application and No Application.

In [None]:
# Visitors per group: those who applied plus those who did not
app_pivot['Total'] = app_pivot['Application'] + app_pivot['No Application']

Finally, we'll calculate another column called Percent with Application, which is equal to Application divided by Total.

In [None]:
# Fraction of each group's visitors who turned in an application
app_pivot['Percent with Application'] = app_pivot['Application'] / app_pivot['Total']
app_pivot

It looks like more people from Group B turned in an application. Why might that be?

We need to know if this difference is statistically significant.

### Calculate the statistical significance of applications ###
Having calculated the difference in who turned in an application between groups, determine if this difference is statistically significant.
Choose a hypothesis test, import it from scipy and perform it. Be sure to note the p-value. Is this result significant?

In [None]:
# Import hypothesis test module
from scipy.stats import chi2_contingency

# Calculate the p-value.
# The 2x2 table (rows: groups A and B; columns: applied / did not apply) is
# read from app_pivot instead of being typed in by hand, so it cannot drift
# out of sync with the data. With the data described in this notebook it
# equals [[250, 2254], [325, 2175]].
contingency = app_pivot[['Application', 'No Application']].values
chi2_contingency(contingency)

A p-value of 0.00096 relative to a significance threshold of 0.05 indicates that there is a statistically significant difference between the two groups.

### Determine the count of memberships from applications ###
Of those who picked up an application, how many purchased a membership?

Determine how many potential customers purchased a membership out of those that picked up an application.

In [None]:
# Label each visitor by whether a purchase date is present
df['is_member'] = (
    df['purchase_date']
    .notnull()
    .map({True: 'Member', False: 'Not Member'})
)

In [None]:
# Keep only the visitors who turned in an application
just_apps = df.loc[df['is_application'] == 'Application']

In [None]:
# Count applicants for every (test group, membership status) combination
member_count = (
    just_apps
    .groupby(['ab_test_group', 'is_member'])['first_name']
    .count()
    .reset_index()
)

# Reshape: one row per test group, one column per membership status
member_pivot = member_count.pivot(
    index='ab_test_group',
    columns='is_member',
    values='first_name',
).reset_index()

# Applicants per group: members plus non-members
member_pivot['Total'] = member_pivot['Member'] + member_pivot['Not Member']

# Fraction of each group's applicants who purchased a membership
member_pivot['Percent Purchase'] = member_pivot['Member'] / member_pivot['Total']
member_pivot

It looks like people who took the fitness test were more likely to purchase a membership if they picked up an application. Why might that be?

### Calculate the statistical significance of memberships ###

Calculate if the difference between the following groups is statistically significant:

* The customers that picked up an application and took a fitness test.
* The customers that did not take a fitness test and picked up an application.

In [None]:
# Calculate the p-value.
# The 2x2 table (rows: groups A and B; columns: member / not member, among
# applicants) is read from member_pivot instead of being typed in by hand,
# so it cannot drift out of sync with the data. With the data described in
# this notebook it equals [[200, 50], [250, 75]].
contingency = member_pivot[['Member', 'Not Member']].values
chi2_contingency(contingency)

A p-value of 0.432 relative to a significance threshold of 0.05 does not reflect a statistically significant difference between the two groups, and would lead us to fail to reject the null hypothesis.

### Determine the count of all memberships ###

Previously, you looked at what percentage of people who picked up applications purchased memberships.
Now, determine what percentage of ALL visitors purchased memberships.

In [None]:
# Count ALL visitors for every (test group, membership status) combination
final_member_count = (
    df
    .groupby(['ab_test_group', 'is_member'])['first_name']
    .count()
    .reset_index()
)

# Reshape: one row per test group, one column per membership status
final_member_pivot = final_member_count.pivot(
    index='ab_test_group',
    columns='is_member',
    values='first_name',
).reset_index()

# Visitors per group: members plus non-members
final_member_pivot['Total'] = (
    final_member_pivot['Member'] + final_member_pivot['Not Member']
)

# Fraction of each group's visitors who purchased a membership
final_member_pivot['Percent Purchase'] = (
    final_member_pivot['Member'] / final_member_pivot['Total']
)
final_member_pivot

Previously, when we only considered people who had already picked up an application, we saw that there was no significant difference in membership between Group A and Group B.

Now, when we consider all people who visit MuscleHub, we see that there might be a significant difference in memberships between Group A and Group B.

### Calculate the statistical significance between groups ###

Determine if there is a significant difference in memberships between Group A and Group B.

In [None]:
# Calculate the p-value.
# The 2x2 table (rows: groups A and B; columns: member / not member, among
# all visitors) is read from final_member_pivot instead of being typed in by
# hand, so it cannot drift out of sync with the data. With the data described
# in this notebook it equals [[200, 2304], [250, 2250]].
contingency = final_member_pivot[['Member', 'Not Member']].values
chi2_contingency(contingency)

A p-value of 0.0147 relative to a significance threshold of 0.05 indicates that there is a statistically significant difference between the two groups. This informs us that we should not reject Janet's hypothesis that visitors assigned to Group B will be more likely to eventually purchase a membership to MuscleHub than visitors assigned to Group A.

However, it is important to note that among those customers who filled out an application, those who completed a fitness test (Group A) purchased at a higher rate than those who did not (Group B), although that difference was not statistically significant (p = 0.432).

### Visualize the results ###

Create visualizations for Janet that show the difference between Group A (people who were given the fitness test) and Group B (people who were not given the fitness test) at each stage of the process:

* Percent of visitors who apply.
* Percent of applicants who purchase a membership.
* Percent of visitors who purchase a membership.

In [None]:
# Percent of Visitors who Apply
# One bar per test group, in app_pivot row order
bar_positions = range(len(app_pivot))
fig, ax = plt.subplots()
ax.bar(bar_positions, app_pivot['Percent with Application'].values)
ax.set_xticks(bar_positions)
ax.set_xticklabels(['Fitness Test', 'No Fitness Test'])
# Values are fractions, so label the ticks as percentages
ax.set_yticks([0, 0.05, 0.10, 0.15, 0.20])
ax.set_yticklabels(['0%', '5%', '10%', '15%', '20%'])
plt.show()
# plt.savefig('percent_visitors_apply.png')

In [None]:
# Percent of Applicants who Purchase
# One bar per test group, in member_pivot row order
bar_positions = range(len(member_pivot))
fig, ax = plt.subplots()
ax.bar(bar_positions, member_pivot['Percent Purchase'].values)
ax.set_xticks(bar_positions)
ax.set_xticklabels(['Fitness Test', 'No Fitness Test'])
# Values are fractions, so label the ticks as percentages
ax.set_yticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
ax.set_yticklabels(['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%'])
plt.show()
# plt.savefig('percent_apply_purchase.png')

In [None]:
# Percent of Visitors who Purchase
# This cell used to be an exact copy of the applicants chart. It now plots
# final_member_pivot (all visitors), the third chart requested for Janet.
ax = plt.subplot()
plt.bar(range(len(final_member_pivot)),
       final_member_pivot['Percent Purchase'].values)
ax.set_xticks(range(len(final_member_pivot)))
ax.set_xticklabels(['Fitness Test', 'No Fitness Test'])
# Same 0-20% scale as the "visitors who apply" chart; adjust the ticks if a
# group's purchase rate ever exceeds 20%
ax.set_yticks([0, 0.05, 0.10, 0.15, 0.20])
ax.set_yticklabels(['0%', '5%', '10%', '15%', '20%'])
ax.set_title('Percent of Visitors who Purchase a Membership')
plt.show()
# plt.savefig('percent_visitors_purchase.png')

#### Congratulations!! ####

Congratulations, your work has been a valuable contribution for Janet to determine MuscleHub's membership process. Janet asks if you would also help her share your work with a presentation.

### Challenge Assignment ###
Develop a presentation that demonstrates your findings to Janet. She has decided that your recommendation will determine if potential customers should take a fitness test as a component of their application. Your presentation should include the following:

* A title slide
* A description of what happened in this A/B test
* A summary of your dataset and any information you think would be helpful background
* The results of the three hypothesis tests that you ran, including an explanation of the type of test that you used and why it was appropriate
* A summary of the qualitative data
* A recommendation for MuscleHub

Create a wordcloud visualization that Janet can use to create an ad for the MuscleHub Gym with the data in interviews.txt.

In [None]:
# Import modules
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator # for creating wordclouds
from collections import Counter  # for counting objects
from matplotlib.pyplot import figure # to create a figure in matplotlib

In [None]:
# Open and read the interviews.txt file.
# The with-block closes the file handle as soon as the text has been read;
# the original left it open for the rest of the session.
with open(r"../input/musclehub-abtest/interviews.txt", encoding='utf8') as interviews:
    txtContent = interviews.read()
print ("The Content of text file is : ", txtContent)

The Content of text file is :  I always wanted to work out like all of the shredded people on the fitness accounts I see on Instagram, but I never really knew how to start. MuscleHub’s introductory fitness test was super helpful for me! After taking the fitness test, I had to sign up and keep coming back so that I could impress my trainer Rachel with how much I was improving!

- Cora, 23, Hoboken



When I walked into MuscleHub I wasn’t accosted by any personal trainers trying to sell me some mumbo jumbo, which I really appreciated. Down at LiftCity they had me doing burpees 30 seconds after I walked in the door and I was like “woah guys slow your roll, this is TOOOO much for Jesse!” I still ended up not signing up for a membership because the weight machines had all those sweat stains on them and you know, no thanks.

- Jesse, 35, Gowanes



I took the MuscleHub fitness test because my coworker Laura recommended it. Regretted it.

- Sonny "Dad Bod", 26, Brooklyn



I saw an ad for MuscleHub on BookFace and thought I'd check it out! The people there were suuuuuper friendly and the whole sign-up process took a matter of minutes. I tried to sign up for LiftCity last year, but the fitness test was way too intense. This is my first gym membership EVER, and MuscleHub made me feel welcome.

- Shirley, 22, Williamsburg

In [None]:
# Print the number of words in the interview text.
# len(txtContent) counts characters, which contradicts the message below;
# splitting on whitespace counts words instead.
word_count = len(txtContent.split())
print('There are {} words in the total interviews.txt file.'.format(word_count))

In [None]:
# Build a 2500x1250 word cloud from the interview text
cloud_builder = WordCloud(width=2500, height=1250)
wordcloud = cloud_builder.generate(txtContent)

# Draw the cloud on a 20x16 white figure with the axes hidden
plt.figure(num=None, figsize=(20, 16), facecolor='w', edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# plt.savefig('response_data/responses_wordcloud.png')