# Load packages and data

In [1]:
import pandas as pd

In [85]:
# relative locations of the five CrazyEgg element-list exports (one per homepage version)
(
    interact_path,
    connect_path,
    learn_path,
    help_path,
    services_path,
) = (
    './CrazyEgg_data/Homepage Version 1 - Interact, 5-29-2013/Element list Homepage Version 1 - Interact, 5-29-2013.csv',
    './CrazyEgg_data/Homepage Version 2 - Connect, 5-29-2013/Element list Homepage Version 2 - Connect, 5-29-2013.csv',
    './CrazyEgg_data/Homepage Version 3 - Learn, 5-29-2013/Element list Homepage Version 3 - Learn, 5-29-2013.csv',
    './CrazyEgg_data/Homepage Version 4 - Help, 5-29-2013/Element list Homepage Version 4 - Help, 5-29-2013.csv',
    './CrazyEgg_data/Homepage Version 5 - Services, 5-29-2013/Element list Homepage Version 5 - Services, 5-29-2013.csv',
)

# read every export into its own raw DataFrame, in the same order as the paths above
interact_raw, connect_raw, learn_raw, help_raw, services_raw = (
    pd.read_csv(csv_path)
    for csv_path in (interact_path, connect_path, learn_path, help_path, services_path)
)

In [88]:
# preview the first five rows of one raw export to inspect its layout
services_raw.head(5)

Unnamed: 0,Element ID,Tag name,Name,No. clicks,Visible?,Snapshot information
0,69,a,FIND,397,True,Homepage Version 5 - Services • http://www...
1,61,input,s.q,323,True,created 5-29-2013 • 20 days 4 hours 59 min...
2,67,a,lib.montana.edu/find/,106,True,
3,62,button,Search,85,True,
4,98,a,Hours,81,True,


In [95]:
# last column of the second row, split on single spaces; the 4th-from-last
# token is the number that clean_df later uses as the total visit count
snapshot_tokens = services_raw.iloc[1, -1].split(' ')
int(snapshot_tokens[-4])

2064

In [100]:
def clean_df(original_df):
    '''Extract the category-button rows from a raw export and compute their CTR.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Raw element list. Must contain the columns ``Name`` and
        ``No. clicks``. The last column of the second row must hold a string
        whose fourth-from-last space-separated token is the visit total.

    Returns
    -------
    pandas.DataFrame
        A new frame (``original_df`` is left untouched) with the columns
        ``name``, ``clicks``, ``visits`` and ``ctr`` (= clicks / visits),
        one row per element whose ``Name`` is one of the category terms.
        The original row labels are preserved.

    Raises
    ------
    ValueError
        If the token expected to hold the visit total is not an integer.
    '''
    # define list of category terms (matched exactly against the Name column)
    term_list = ['INTERACT', 'CONNECT', 'LEARN', 'HELP', 'SERVICES']

    # parse total amount of website visits: 4th-from-last space-separated
    # token of the last column in the second row
    visits = int(original_df.iloc[1, -1].split(' ')[-4])

    is_category = original_df['Name'].isin(term_list)

    # Build the result as a chain of copy-returning operations. Adding columns
    # directly to the .loc selection (or renaming it in place) modifies a
    # slice of original_df and triggers SettingWithCopyWarning.
    return (
        original_df.loc[is_category, ['Name', 'No. clicks']]
        .rename(columns={'Name': 'name', 'No. clicks': 'clicks'})
        .assign(
            visits=visits,
            ctr=lambda d: d['clicks'] / visits,
        )
    )

# Combine tables into one

In [97]:
# run every raw export through clean_df, keeping one cleaned frame per version
# NOTE(review): the name `help` shadows Python's builtin help() for the rest of
# the session; it is kept here because later cells may refer to it.
interact, connect, learn, help, services = (
    clean_df(raw_df)
    for raw_df in (interact_raw, connect_raw, learn_raw, help_raw, services_raw)
)

# cleaned frames in version order (1 to 5), ready to be concatenated
df_list = [interact, connect, learn, help, services]

In [103]:
# stack the per-version frames, rank them by CTR (best first) and derive the
# lower-cased label plus the number of visits that did not produce a click
df = (
    pd.concat(df_list)
    .sort_values('ctr', ascending=False)
    .reset_index(drop=True)
    .assign(
        name=lambda d: d['name'].str.lower(),
        no_clicks=lambda d: d['visits'] - d['clicks'],
    )
)
df

Unnamed: 0,name,clicks,visits,ctr,no_clicks
0,services,45,2064,0.021802,2019
1,connect,53,2742,0.019329,2689
2,help,38,3180,0.01195,3142
3,learn,21,2747,0.007645,2726
4,interact,42,10283,0.004084,10241


“Interact” and “Learn” are the worst performers, while “Services” and “Connect” perform much better.

# Chi-square test

* Null Hypothesis: All 5 versions of the button have the same click-through rate; the observed differences are due to chance.
* Alternative Hypothesis: The observed differences are not due to chance: at least one version received so many more (or far fewer) clicks than the others that this can hardly be explained by chance alone (i.e. it has a better/worse CTR, a better/worse performance).

In [114]:
# set significance level alpha = 10% (equivalently, a 90% confidence level)
alpha = 0.1

# create contingency table: one column per button version, rows = clicks / no_clicks.
# Index by name *before* transposing so that only the counts are transposed;
# transposing the text label column together with the counts would turn the
# whole table into object dtype instead of keeping the counts numeric.
conti = df.set_index('name')[['clicks', 'no_clicks']].transpose()
conti

name,services,connect,help,learn,interact
clicks,45,53,38,21,42
no_clicks,2019,2689,3142,2726,10241


In [115]:
from scipy import stats

# chi-square test of independence on the clicks / no_clicks table
chi2_result = stats.chi2_contingency(conti)
chisq, pvalue, dof, expected = chi2_result

# first line: test statistic, p-value, degrees of freedom; second: expected counts
for printed_values in ((chisq, pvalue, dof), (expected,)):
    print(*printed_values)

96.7432353798328 4.852334301093838e-20 4
[[   19.5439665     25.96393224    30.11134374    26.01127712
     97.3694804 ]
 [ 2044.4560335   2716.03606776  3149.88865626  2720.98872288
  10185.6305196 ]]


In [116]:
# reject the null hypothesis only when the p-value falls below alpha
print('Reject Null.' if pvalue < alpha else 'Do not reject Null.')

Reject Null.
