# AB Testing Setup

## Importing Libraries & Data


In [36]:
# Importing Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style = "whitegrid",
        color_codes = True,
        font_scale = 1.5)
import re

The next part assumes one has access to anonymized, confidential participant data from the BEES summer bridge program. If you do not have access, please import data in the format specified in the README and replace import statements to be compatible with your dataset. If you are just interested in the methodology, click the relevant sections below this.

In [37]:
# Importing Data
# Mounts Google Drive (Colab only) and reads one pre- and one post-survey workbook per year (2020-2023).
from google.colab import drive
drive.mount('/content/drive')
pre_2020 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2020 Pre Survey.xlsx')
pre_2021 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2021 Pre Survey.xlsx')
pre_2022 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2022 Pre Survey.xlsx')
# The 2022 pre-survey is spread over two workbooks. Before appending the second one, its row
# labelled 0 and its columns Q1 and Q20 are dropped.
# NOTE(review): presumably row 0 repeats the question-text header and Q1/Q20 have no counterpart
# in the first workbook - confirm against the raw files.
pre_2022_new = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2022-BEES-Pre-Survey-for-Shreya.xlsx')
pre_2022 = pd.concat([pre_2022, pre_2022_new.drop(index=(0), columns=["Q1", "Q20"])]).reset_index()
# reset_index() above stores the old row labels in a column named "index"; discard it so only
# the fresh 0..n-1 index remains.
pre_2022=pre_2022.drop("index", axis=1)
pre_2023 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2023 Pre Survey.xlsx')
post_2020 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2020 Post Survey.xlsx')
post_2021 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2021 Post Survey.xlsx')
post_2022 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2022 Post Survey.xlsx')
post_2023 = pd.read_excel('/content/drive/My Drive/BEES Research Data/Data for Studies/2023 Post Survey.xlsx')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Cleaning Data

In [3]:
# Group the raw survey frames into lists so later cleaning steps can loop over them.
# The lists hold references to the frame objects, so in-place edits made through a list
# are visible through the original variable names as well.
pres = [
    pre_2020,
    pre_2021,
    pre_2022,
    pre_2023,
]
posts = [
    post_2020,
    post_2021,
    post_2022,
    post_2023,
]
# Frames that get an extra column removed in the next cell.
unstandardized_column_names = [
    pre_2020,
    post_2020,
    post_2021,
    post_2022,
]
# Every survey frame.
responses = [
    pre_2020, pre_2021, pre_2022,
    post_2020, post_2021, post_2022,
    pre_2023, post_2023,
]

In [4]:
# Dropping Intended Majors and other Unnecessary Columns
# Each listed frame loses whatever column currently sits at position 1 (the second column).
for frame in unstandardized_column_names:
  second_column = frame.columns[[1]]
  frame.drop(columns=second_column, inplace=True)

# post_2020 gets a second pass: after the loop a different column occupies position 1,
# and that one is removed too.
# NOTE: this cell mutates in place, so re-running it drops further columns.
second_column_post_2020 = post_2020.columns[[1]]
post_2020.drop(columns=second_column_post_2020, inplace=True)

Some dataframes aren't consistent with the majority, so the cell below handles such a case. Feel free to edit this as needed.

In [5]:
# Making column names uniform across all Dataframes:
# the 2020 frames take over the 2021 headers (assignment is positional, so the
# column counts must already match).
post_2020.columns = post_2021.columns
pre_2020.columns = pre_2021.columns

# Removing the item whose question is
# "If you are reading this item, please mark "3-Slightly Disagree"".
# It is named ScienceID_5 in the pre surveys and ScienceID2_5 in the post surveys.
for survey_frames, attention_check_column in ((pres, "ScienceID_5"), (posts, "ScienceID2_5")):
  for survey_frame in survey_frames:
    survey_frame.drop(columns=attention_check_column, inplace=True)

In [6]:
# Removing Columns with Qualitative Nominal Responses
# These Q-columns are dropped from every post-survey frame (in place).
pre_post_column_mismatches = [
    'Q24', 'Q26', 'Q28', 'Q30',
    'Q34', 'Q38',
    'Q42', 'Q44', 'Q46', 'Q48',
    'Q50',
]
for post_frame in posts:
  post_frame.drop(columns=pre_post_column_mismatches, inplace=True)

In [7]:
# Storing Questions for future reference
# Row 0 of the raw 2022 exports is read as the question wording for each column. The regex
# removes a leading "<text> - " prefix (optionally preceded by a line followed by a blank
# line); [1:] skips the first column.
# `iloc[0, :]` returns a Series, which has no `columns`; assigning to `.columns` only attached
# a stray attribute and never labelled the data. `rename` sets the Series name instead.
_QUESTION_PREFIX_PATTERN = r'.*\n\n.* - |.* - '

pre_survey_questions = (
    pre_2022.iloc[0, :]
    .str.replace(_QUESTION_PREFIX_PATTERN, '', regex=True)[1:]
    .rename('Question_Text')
)

post_survey_questions = (
    post_2022.iloc[0, :]
    .str.replace(_QUESTION_PREFIX_PATTERN, '', regex=True)[1:]
    .rename('Question_Text')
)

In [8]:
# Making all the Columns Numeric and Resetting Index to ResponseId
def _to_numeric_or_keep(column):
  """Convert one column to a numeric dtype, or return it unchanged if it cannot be parsed.

  Explicit replacement for the deprecated ``pd.to_numeric(..., errors='ignore')``.
  """
  try:
    return pd.to_numeric(column)
  except (ValueError, TypeError):
    return column


def _clean_survey(raw, first_data_row=1, strip_labels=True, strict=True):
  """Turn a raw survey export into a numeric frame indexed by ResponseId.

  Parameters
  ----------
  raw : DataFrame
      Raw export that still contains its leading non-response row(s).
  first_data_row : int
      Number of leading rows to skip before the responses start.
  strip_labels : bool
      When True, remove everything from the first "-" onward in string cells,
      so that e.g. "3-Slightly Disagree" becomes "3".
  strict : bool
      When True, a column that cannot be parsed as numeric raises. When False,
      such a column is kept as it is.
  """
  cleaned = raw[first_data_row:].set_index('ResponseId')
  if strip_labels:
    cleaned = cleaned.replace(to_replace ='-.*', value = '', regex = True)
  return cleaned.apply(pd.to_numeric if strict else _to_numeric_or_keep)


# The 2020 exports are converted without the label-stripping step.
pre_2020 = _clean_survey(pre_2020, strip_labels=False)
post_2020 = _clean_survey(post_2020, strip_labels=False, strict=False)

pre_2021 = _clean_survey(pre_2021)
# NOTE(review): post_2021 skips two leading rows where every other frame skips one -
# confirm that this export really has an extra non-response row.
post_2021 = _clean_survey(post_2021, first_data_row=2, strict=False)

pre_2022 = _clean_survey(pre_2022)
post_2022 = _clean_survey(post_2022, strict=False)

pre_2023 = _clean_survey(pre_2023)
post_2023 = _clean_survey(post_2023, strict=False)

Observation:


> Some sentiments are negatively worded:
*   MATH_1 = Mathematics is one of my best subjects (positive)
*   MATH_2 = I often need help in mathematics  (NEGATIVE)

We need to standardize the scale for "positivity", so going forward, **Positive Sentiments ---> Higher Scores**

Switching orders for negative sentiments this way:
1.   1-->6, 2-->5, 3-->4, 4-->3, 5-->2, 6-->1
2.   1-->4, 2-->3, 3-->2, 4-->1


In [9]:
# Handling Negatively Worded Columns
# Reverse-code the negatively worded items so that a higher score always means a more
# positive sentiment. Items on the 6-option scale and on the 4-option scale are listed
# separately, once for the pre-survey column names and once for the post-survey ones.
pre_negative_6_options = [
    "MathSelfConcept_2", "MathSelfConcept_4", "MathSelfConcept_6", "MathSelfConcept_8", "MathSelfConcept_10",
    "ProgramSelfConcept_2", "ProgramSelfConcept_4", "ProgramSelfConcept_6", "ProgramSelfConcept_8", "ProgramSelfConcept_10",
    "Mindset_1", "Mindset_2", "Mindset_3",
]
pre_negative_4_options = [
    "Concealment_1", "Concealment_2", "Concealment_3", "Concealment_4",
    "Concealment_5", "Concealment_6", "Concealment_7", "Concealment_8",
]
post_negative_6_options = [
    "MathSelfConcept2_2", "MathSelfConcept2_4", "MathSelfConcept2_6", "MathSelfConcept2_8", "MathSelfConcept2_10",
    "ProgramSelfConcept2_2", "ProgramSelfConcept2_4", "ProgramSelfConcept2_6", "ProgramSelfConcept2_8", "ProgramSelfConcept2_10",
    "Mindset2_1", "Mindset2_2", "Mindset2_3",
]
post_negative_4_options = [
    "Concealment2_1", "Concealment2_2", "Concealment2_3", "Concealment2_4",
    "Concealment2_5", "Concealment2_6", "Concealment2_7", "Concealment2_8",
]

# Old score -> reversed score.
replacement_6_option = {1:6, 2:5, 3:4, 4:3, 5:2, 6:1}
replacement_4_option = {1:4, 2:3, 3:2, 4:1}

# (frames to edit, columns to reverse, mapping to apply)
# NOTE: the replacement happens in place, so running this cell twice flips the scores back.
_reverse_coding_plan = (
    ((pre_2020, pre_2021, pre_2022, pre_2023), pre_negative_6_options, replacement_6_option),
    ((pre_2020, pre_2021, pre_2022, pre_2023), pre_negative_4_options, replacement_4_option),
    ((post_2020, post_2021, post_2022, post_2023), post_negative_6_options, replacement_6_option),
    ((post_2020, post_2021, post_2022, post_2023), post_negative_4_options, replacement_4_option),
)
for survey_frames, negative_columns, score_mapping in _reverse_coding_plan:
  for negative_column in negative_columns:
    for survey_frame in survey_frames:
      survey_frame.replace({negative_column: score_mapping}, inplace=True)

In [10]:
# Function to select a category of survey questions like Math, Programming, etc.
def select_category(df, cat) :
  """Return the sub-frame of `df` whose column names contain the substring `cat`.

  Column order is preserved; if no column matches, the result has no columns.
  """
  matching_columns = []
  for column_name in df.columns:
    if cat in column_name:
      matching_columns.append(column_name)
  return df[matching_columns]

In [12]:
# making sure only posts' rows exist in pres
def _keep_paired_responses(pre, post):
  """Restrict a pre/post pair to the ResponseIds that occur in both frames.

  Returns a tuple (merged, pre_subset, post_subset), where `merged` is the
  inner merge of the two frames on ResponseId.
  """
  merged = pd.merge(pre, post, on='ResponseId', how='inner')
  paired_ids = list(merged.reset_index()["ResponseId"])
  return merged, pre.loc[paired_ids], post.loc[paired_ids]


common_rows_df_2023, pre_2023, post_2023 = _keep_paired_responses(pre_2023, post_2023)
common_rows_df_2022, pre_2022, post_2022 = _keep_paired_responses(pre_2022, post_2022)
common_rows_df_2021, pre_2021, post_2021 = _keep_paired_responses(pre_2021, post_2021)
common_rows_df_2020, pre_2020, post_2020 = _keep_paired_responses(pre_2020, post_2020)

In [13]:
# Category Labels Dictionary
# Key: the column-name fragment used to pick out a category's questions.
# Value: the human-readable label used in plot titles.
cat_labels = {
    'Math': 'Math Self Concept',
    'Program': 'Program Self Concept',
    'ScienceMotivation': 'Science Motivation',
    'ScienceID': 'Science Self Identity',  # spelling fixed (was 'Indentity')
    'HelpSeeking': 'Help Seeking',
    'Concealment': 'Concealment',
    'Mindset': 'Mindset',
    'PeerCommunity': 'Peer Community',
    'CampusBelong': 'Campus Belong',
    'Resources': 'Resources',
}

## EDA

### Plotting Mean Scores per Category for each Year

In [21]:
# Yearly Distributions Plotting Function
def cat_yearly_plot(cat):
  """Plot, for each year 2020-2023, overlaid histograms of pre vs. post category scores.

  For every respondent the score is the mean across all questions whose column
  name contains `cat`. The four years are drawn on a 2x2 grid that shares the y-axis.

  Parameters
  ----------
  cat : str
      A key of `cat_labels` (e.g. 'Math'); it selects the columns and supplies
      the figure title.
  """
  surveys_by_year = {
      "2020": (pre_2020, post_2020),
      "2021": (pre_2021, post_2021),
      "2022": (pre_2022, post_2022),
      "2023": (pre_2023, post_2023),
  }

  fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharey=True)

  print(cat)

  for ax, (year, (pre_df, post_df)) in zip(axes.flat, surveys_by_year.items()):
    pre_cat_mean = pd.DataFrame({'pre' : np.mean(select_category(pre_df, cat), axis=1)})
    post_cat_mean = pd.DataFrame({'post' : np.mean(select_category(post_df, cat), axis=1)})

    sns.histplot(data=pre_cat_mean, x='pre', label='pre', ax=ax, binwidth=0.35, alpha=0.6)
    sns.histplot(data=post_cat_mean, x='post', label='post', ax=ax, binwidth=0.35, alpha=0.6)
    ax.set_xlim(1, 6)
    ax.set_xlabel(year)
    ax.legend(prop={'size': 12})

  # The title has to be set before plt.show(); previously it was added afterwards and
  # therefore never appeared on the rendered figure.
  fig.suptitle('Distribution of Mean of ' + cat_labels[cat] + ' Survey Questions', fontsize=16)

  # Adjust layout to prevent overlapping, leaving a strip at the top for the title
  fig.tight_layout(rect=(0, 0, 1, 0.96))

  # Show the plots
  plt.show()

Uncomment the line for the distribution you would like to see.

In [23]:
# cat_yearly_plot('Math')
# cat_yearly_plot('Program')
# cat_yearly_plot('ScienceMotivation')
# cat_yearly_plot('ScienceID')
# cat_yearly_plot('HelpSeeking')
# cat_yearly_plot('Concealment')
# cat_yearly_plot('Mindset')
# cat_yearly_plot('PeerCommunity')
# cat_yearly_plot('CampusBelong')
# cat_yearly_plot('Resources')

### Plotting Mean Scores per Category for Online vs Offline

In [24]:
# Online VS In-person Plotting Function
def cat_covid_plot(cat):
  """Plot pre vs. post density histograms of category scores, online vs. in-person.

  Each respondent's score is the mean over the columns whose name contains `cat`.
  The 2020 and 2021 cohorts are pooled in the left panel, the 2022 and 2023
  cohorts in the right panel. `cat` must be a key of `cat_labels`.
  """
  def _pooled_row_means(frames, column_name):
    # One single-column frame of per-respondent means per cohort, stacked together.
    per_cohort = [
        pd.DataFrame({column_name : np.mean(select_category(frame, cat), axis=1)})
        for frame in frames
    ]
    return pd.concat(per_cohort, ignore_index=True)

  panels = (
      ("2020 + 2021 (Online)",
       _pooled_row_means((pre_2020, pre_2021), 'pre'),
       _pooled_row_means((post_2020, post_2021), 'post')),
      ("2022 + 2023 (In-person)",
       _pooled_row_means((pre_2022, pre_2023), 'pre'),
       _pooled_row_means((post_2022, post_2023), 'post')),
  )

  fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)

  print(cat)
  for ax, (panel_label, pre_means, post_means) in zip(axes, panels):
    sns.histplot(data=pre_means, x='pre', label='pre', ax=ax, binwidth=0.35, stat='density')
    sns.histplot(data=post_means, x='post', label='post', ax=ax, binwidth=0.35, stat='density')
    ax.set_xlim(1, 6)
    ax.set_xlabel(panel_label)
    ax.legend(prop={'size': 12})

  fig.suptitle('Density Distribution of Mean of ' + cat_labels[cat] + ' Survey Questions', fontsize=16)
  plt.show()

Uncomment as needed

In [26]:
# cat_covid_plot('Math')
# cat_covid_plot('Program')
# cat_covid_plot('ScienceMotivation')
# cat_covid_plot('ScienceID')
# cat_covid_plot('HelpSeeking')
# cat_covid_plot('Concealment')
# cat_covid_plot('Mindset')
# cat_covid_plot('PeerCommunity')
# cat_covid_plot('CampusBelong')
# cat_covid_plot('Resources')

### Mean Comparisons per Question (Pre vs Post, Online vs Offline)

In [27]:
# Creating Relevant DFs
# Pool the 2020 + 2021 cohorts ("online") and the 2022 + 2023 cohorts ("offline"),
# separately for pre and post surveys. Row labels are discarded when stacking.
online_pre_df, online_post_df, offline_pre_df, offline_post_df = (
    pd.concat(cohort_pair, ignore_index=True)
    for cohort_pair in (
        [pre_2020, pre_2021],
        [post_2020, post_2021],
        [pre_2022, pre_2023],
        [post_2022, post_2023],
    )
)

# Mean of every column (one value per question) for each pooled frame.
online_pre_per_question_mean = online_pre_df.mean(axis=0)
online_post_per_question_mean = online_post_df.mean(axis=0)
offline_pre_per_question_mean = offline_pre_df.mean(axis=0)
offline_post_per_question_mean = offline_post_df.mean(axis=0)

In [28]:
def category_per_question_mean_comparison(Category):
  """Scatter the per-question means of one category for the four pooled groups.

  The groups are online pre, online post, offline pre and offline post. Questions
  are the entries of each per-question-mean Series whose label starts with
  `Category`. All four groups are drawn against the online-pre question labels,
  so the groups must contain the same number of matching questions in the same
  order (matplotlib raises otherwise).

  Parameters
  ----------
  Category : str
      Prefix of the question column names, e.g. 'Math'.
  """
  def _category_means(per_question_mean):
    # Entries whose question label starts with the requested category prefix.
    return per_question_mean.loc[per_question_mean.index.str.startswith(Category)]

  online_pre_Category_questions_mean = _category_means(online_pre_per_question_mean)
  groups = (
      (online_pre_Category_questions_mean, "online pre means"),
      (_category_means(online_post_per_question_mean), "online post means"),
      (_category_means(offline_pre_per_question_mean), "offline pre means"),
      (_category_means(offline_post_per_question_mean), "offline post means"),
  )

  # Shared x positions: post-survey labels differ from pre-survey ones, so every
  # group is plotted against the online-pre labels.
  question_labels = online_pre_Category_questions_mean.index

  fig, ax = plt.subplots()
  for group_means, group_label in groups:
    ax.scatter(question_labels, group_means, label=group_label)
  ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
  ax.set_title(f"{Category} Online vs Offline Per Question Mean Comparisons")
  ax.tick_params(axis='x', labelrotation=90)

Run this for output

In [29]:
# for category in cat_labels:
#   category_per_question_mean_comparison(category)

## A/B Testing

Null hypothesis - Mean score of the pre survey response means is the same as the mean score of the post survey response means

Alternate hypothesis - Mean score of the pre survey response means is different from the mean score of the post survey response means

Significance level - 5% (i.e., a 95% confidence level)

### Online

In [30]:
def one_sim_mean_diff(df):
  """Return the difference in group means for one shuffled labelling.

  `df` is grouped by its "Shuffled Labels" column and the numeric columns are
  averaged per group. The result is the first group's mean minus the second
  group's mean, taken from the first numeric column. groupby orders the groups
  by label, so with the labels "Post" and "Pre" this is Post minus Pre.
  """
  group_means = df.groupby("Shuffled Labels").mean(numeric_only=True)
  first_numeric_column = group_means.iloc[:, 0]
  first_group_mean = first_numeric_column.iloc[0]
  second_group_mean = first_numeric_column.iloc[1]
  return first_group_mean - second_group_mean
def online_ab_test_post_greater_pre(repetitions=1000):
  """Permutation test of post vs. pre means for every question (online cohorts).

  For each category in `cat_labels` and each matching question column, this:
    1. computes the observed difference (post mean - pre mean),
    2. shuffles the Pre/Post labels `repetitions` times and records the
       simulated difference from `one_sim_mean_diff` each time,
    3. prints the proportion of simulated differences more extreme than the
       observed one, in the direction of the observed sign,
    4. plots the simulated distribution with the observed value marked in red.

  Pre and post columns are paired by position within the category, so both
  frames must list a category's questions in the same order.

  Shuffling uses NumPy's global random state; call `np.random.seed(...)` first
  for reproducible output.

  Parameters
  ----------
  repetitions : int, default 1000
      Number of label shuffles per question. Must be at least 1.
  """
  if repetitions < 1:
    raise ValueError("repetitions must be at least 1")

  for cat in cat_labels:
    print ("Category: ", cat)
    # Select the category's columns once instead of once per question.
    pre_questions = online_pre_df.filter(like=cat)
    post_questions = online_post_df.filter(like=cat)
    for q_num in range(pre_questions.shape[1]):
      test_pre_score_column = pre_questions.iloc[:,q_num]
      test_post_score_column = post_questions.iloc[:,q_num]

      #Step 2: Observed Difference Calc
      online_observed_mean_diff = test_post_score_column.mean()-test_pre_score_column.mean()
      print("online_observed_mean_diff :", online_observed_mean_diff)

      # Scores and their true labels never change between shuffles, so build them once.
      true_labels = np.append(np.full(len(test_pre_score_column), "Pre"),
                              np.full(len(test_post_score_column), "Post"))
      question_df = pd.DataFrame({'Score': np.append(test_pre_score_column, test_post_score_column),
                                  'Labels': true_labels})

      #Step 3: 1 simulated difference function (one_sim_mean_diff)
      #Step 4: Many simulated mean differences
      differences = np.empty(repetitions)
      for i in range(repetitions):
        question_df["Shuffled Labels"] = np.random.permutation(true_labels)
        # one_sim_mean_diff returns "Post" mean - "Pre" mean, the same sign
        # convention as the observed difference above.
        differences[i] = one_sim_mean_diff(question_df)

      if online_observed_mean_diff < 0:
        print ("Proportion of Differences LESSER than Observed Difference")
        print (cat ,"Question", q_num, ":", np.count_nonzero(differences<online_observed_mean_diff)/repetitions)
      else:
        print ("Proportion of Differences GREATER than Observed Difference")
        print (cat ,"Question", q_num, ":", np.count_nonzero(differences>online_observed_mean_diff)/repetitions)
      sns.histplot(x=differences)
      plt.axvline(online_observed_mean_diff, color="red")
      plt.show()

Run the next cell to visualize results

In [32]:
# online_ab_test_post_greater_pre()

### Offline

In [33]:
def offline_ab_test_post_greater_pre(repetitions=1000):
  """Permutation test of post vs. pre means for every question (offline cohorts).

  For each category in `cat_labels` and each matching question column, this:
    1. computes the observed difference (post mean - pre mean),
    2. shuffles the Pre/Post labels `repetitions` times and records the
       simulated difference from `one_sim_mean_diff` each time,
    3. prints the proportion of simulated differences more extreme than the
       observed one, in the direction of the observed sign,
    4. plots the simulated distribution with the observed value marked in red.

  Pre and post columns are paired by position within the category, so both
  frames must list a category's questions in the same order.

  Shuffling uses NumPy's global random state; call `np.random.seed(...)` first
  for reproducible output.

  Parameters
  ----------
  repetitions : int, default 1000
      Number of label shuffles per question. Must be at least 1.
  """
  if repetitions < 1:
    raise ValueError("repetitions must be at least 1")

  for cat in cat_labels:
    print ("Category: ", cat)
    # Select the category's columns once instead of once per question.
    pre_questions = offline_pre_df.filter(like=cat)
    post_questions = offline_post_df.filter(like=cat)
    for q_num in range(pre_questions.shape[1]):
      test_pre_score_column = pre_questions.iloc[:,q_num]
      test_post_score_column = post_questions.iloc[:,q_num]

      #Step 2: Observed Difference Calc
      offline_observed_mean_diff = test_post_score_column.mean()-test_pre_score_column.mean()
      # Label fixed: this previously printed "online_observed_mean_diff" (copy-paste slip).
      print("offline_observed_mean_diff :", offline_observed_mean_diff)

      # Scores and their true labels never change between shuffles, so build them once.
      true_labels = np.append(np.full(len(test_pre_score_column), "Pre"),
                              np.full(len(test_post_score_column), "Post"))
      question_df = pd.DataFrame({'Score': np.append(test_pre_score_column, test_post_score_column),
                                  'Labels': true_labels})

      #Step 3: 1 simulated difference function (one_sim_mean_diff)
      #Step 4: Many simulated mean differences
      differences = np.empty(repetitions)
      for i in range(repetitions):
        question_df["Shuffled Labels"] = np.random.permutation(true_labels)
        # one_sim_mean_diff returns "Post" mean - "Pre" mean, the same sign
        # convention as the observed difference above.
        differences[i] = one_sim_mean_diff(question_df)

      if offline_observed_mean_diff < 0:
        print ("Proportion of Differences LESSER than Observed Difference")
        print (cat ,"Question", q_num, ":", np.count_nonzero(differences<offline_observed_mean_diff)/repetitions)
      else:
        print ("Proportion of Differences GREATER than Observed Difference")
        print (cat ,"Question", q_num, ":", np.count_nonzero(differences>offline_observed_mean_diff)/repetitions)
      sns.histplot(x=differences)
      plt.axvline(offline_observed_mean_diff, color="red")
      plt.show()

Run the next cell to visualize results

In [35]:
# offline_ab_test_post_greater_pre()