In [1]:
from pymongo import MongoClient
from pymongo.collection import Collection
from teaching_tools.ab_test.reset import Reset
from pymongo import MongoClient
import math
import numpy as np
import pandas as pd
from country_converter import CountryConverter
import plotly.express as px
import random
from statsmodels.stats.power import GofChisquarePower
import scipy
from teaching_tools.ab_test.experiment import Experiment
from statsmodels.stats.contingency_tables import Table2x2

In [2]:
# Create `client`: a connection to the MongoDB server at localhost:27017
client = MongoClient(host='localhost', port=27017)

# Create `db`: handle on the `"wqu-abtest"` database
db = client['wqu-abtest']

# Assign `"mscfe-applicants"` collection to `mscfe_app`
mscfe_app = db['mscfe-applicants']

In [None]:
# Let the database count the documents for each distinct `countryISO2` value
result = mscfe_app.aggregate(
    [{'$group': {'_id': '$countryISO2', 'count': {'$count': {}}}}]
)

# Turn the cursor into a DataFrame, give the grouping key a readable name,
# and order the rows from the smallest count to the largest
df_nationality = pd.DataFrame(result)
df_nationality = df_nationality.rename(columns={'_id': 'country_iso2'})
df_nationality = df_nationality.sort_values('count')

print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Instantiate `CountryConverter`
cc = CountryConverter()

# Create `"country_name"` column from the values in `"country_iso2"`
df_nationality['country_name'] = cc.convert(
    df_nationality['country_iso2'], to='name_short'
)

# Create `"country_iso3"` column
# NOTE(review): the ISO3 codes are derived from the freshly converted names,
# not from `"country_iso2"` directly — converting straight from the ISO2 codes
# may be more robust; confirm before changing.
df_nationality['country_iso3'] = cc.convert(
    df_nationality['country_name'], to='ISO3'
)

# This cell adds columns to `df_nationality` in place, so the shape grows by two columns
print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Create `build_nat_choropleth` function
def build_nat_choropleth():
    """Draw a world map shaded by the number of applicants per country.

    Reads the notebook-level `df_nationality` frame: its `"country_iso3"`
    column places each country on the map and its `"count"` column drives
    the colour on a sequential red scale.

    Returns
    -------
    The figure object produced by `px.choropleth`.
    """
    return px.choropleth(
        data_frame=df_nationality,
        locations='country_iso3',
        color='count',
        color_continuous_scale=px.colors.sequential.Reds,
        projection='natural earth',
        title='MScFE Applicants: Nationalities',
    )

# Don't delete the code below 👇
# Build the map once, save it as a 700x500 image, then render it in the notebook.
# NOTE(review): presumably `write_image` needs the `images/` directory to exist
# and a static-image export backend to be installed — confirm in the target environment.
nat_fig = build_nat_choropleth()
nat_fig.write_image("images/7-5-4.png", scale=1, height=500, width=700)

nat_fig.show()

In [None]:
class MongoRepository:
    """Repository class for interacting with MongoDB database.

    Parameters
    ----------
    client : `pymongo.MongoClient`, optional
        By default, `MongoClient(host='localhost', port=27017)`, created
        when the repository is instantiated.
    db : str
        By default, `'wqu-abtest'`.
    collection : str
        By default, `'mscfe-applicants'`.

    Attributes
    ----------
    collection : pymongo.collection.Collection
        All data will be extracted from and loaded to this collection.
    """

    def __init__(
        self,
        client=None,
        db='wqu-abtest',
        collection='mscfe-applicants'
    ):
        # Build the default client here rather than in the signature: a
        # `MongoClient(...)` default would be evaluated once, when the class
        # is defined, and then shared by every repository instance.
        if client is None:
            client = MongoClient(host='localhost', port=27017)

        self.collection = client[db][collection]

    def find_by_date(self, date_string):
        """Find applicants created on one day who have not completed the quiz.

        Parameters
        ----------
        date_string : str
            Day to search, formatted as `'%Y-%m-%d'` (e.g. `'2022-05-01'`).

        Returns
        -------
        list
            Every document whose `createdAt` falls on that day (start
            inclusive, next day exclusive) and whose `admissionsQuiz` is
            `'incomplete'`.
        """
        # Half-open window [start, start + 1 day) covering the requested date
        start = pd.to_datetime(date_string, format='%Y-%m-%d')
        end = start + pd.DateOffset(days=1)

        query = {
            'createdAt': {'$gte': start, '$lt': end},
            'admissionsQuiz': 'incomplete',
        }

        return list(self.collection.find(query))

    def update_applicants(self, observations_assigned):
        """Write each document's current fields back to the collection.

        Parameters
        ----------
        observations_assigned : iterable of dict
            Documents to store; each must contain the `_id` it is matched on.

        Returns
        -------
        dict
            `{'n': <documents matched>, 'nModified': <documents modified>}`,
            summed over all individual updates.
        """
        n = 0
        n_modified = 0

        for doc in observations_assigned:
            # NOTE(review): `doc` still contains `_id`, so `$set` re-writes it
            # with the value it already has — confirm the server accepts that.
            result = self.collection.update_one(
                filter={'_id': doc['_id']},
                update={'$set': doc}
            )

            n += result.matched_count
            n_modified += result.modified_count

        return {'n': n, 'nModified': n_modified}

    def assign_to_groups(self, date_string):
        """Randomly split one day's no-quiz applicants into two groups.

        The documents returned by `find_by_date` are shuffled with a fixed
        seed; the first half is labelled `'no email (control)'`, the rest
        `'email (treatment)'` (the treatment group gets the extra document
        when the count is odd). All are flagged `inExperiment = True` and
        saved back to the collection.

        Parameters
        ----------
        date_string : str
            Day to draw applicants from, formatted as `'%Y-%m-%d'`.

        Returns
        -------
        dict
            The summary returned by `update_applicants`.
        """
        observations = self.find_by_date(date_string)

        # A private generator seeded with 42 yields the same order as
        # `random.seed(42); random.shuffle(...)` but leaves the process-wide
        # random state untouched for any other code that relies on it.
        random.Random(42).shuffle(observations)

        # Position of the first treatment-group document
        idx = len(observations) // 2

        for position, doc in enumerate(observations):
            doc['inExperiment'] = True
            if position < idx:
                doc['group'] = 'no email (control)'
            else:
                doc['group'] = 'email (treatment)'

        return self.update_applicants(observations)

    def find_exp_observations(self):
        """Return every document flagged as part of the experiment.

        Returns
        -------
        list
            Documents in the collection whose `inExperiment` field is `True`.
        """
        return list(self.collection.find({'inExperiment': True}))

In [None]:
# Repository with all-default arguments: local server, `'wqu-abtest'` database,
# `'mscfe-applicants'` collection
repo = MongoRepository()
print("repo type:", type(repo))
repo

In [None]:
# Power analysis for a chi-square test: effect size, significance level and
# power are given, so `solve_power` solves for the one argument left
# unspecified (presumably the number of observations — see statsmodels docs).
chi_square_power = GofChisquarePower()
# Round up: a fractional observation is not possible
group_size = math.ceil(
    chi_square_power.solve_power(effect_size=0.5, alpha=0.05, power=0.8)
)

print("Group size:", group_size)
# Two groups (control and treatment) of `group_size` each
print("Total # of applicants needed:", group_size * 2)

In [None]:
# Keep only applicants with an incomplete quiz, then count them per calendar
# day of `createdAt`
result = mscfe_app.aggregate(
    [
        {'$match': {'admissionsQuiz': 'incomplete'}},
        {
            '$group': {
                '_id': {'$dateTrunc': {'date': '$createdAt', 'unit': 'day'}},
                'count': {'$sum': 1},
            }
        },
    ]
)

# Date-indexed, chronologically sorted result; `squeeze` collapses the
# remaining single column
no_quiz_mscfe = (
    pd.DataFrame(result)
    .rename(columns={'_id': 'date', 'count': 'new_users'})
    .set_index('date')
    .sort_index()
    .squeeze()
)

print("no_quiz type:", type(no_quiz_mscfe))
print("no_quiz shape:", no_quiz_mscfe.shape)
no_quiz_mscfe

In [None]:
# Average and spread of the per-day counts held in `no_quiz_mscfe`
mean = no_quiz_mscfe.mean()
std = no_quiz_mscfe.std()
print("no_quiz mean:", mean)
print("no_quiz std:", std)

In [None]:
# Length of the experiment, in days
exp_days = 7
# Mean and standard deviation of the total over `exp_days` days: the mean
# scales with the number of days, the standard deviation with its square root.
# NOTE(review): this assumes the daily counts are independent and share the
# same distribution — confirm that is acceptable for this data.
sum_mean = mean * exp_days
sum_std = std * np.sqrt(exp_days)
print("Mean of sum:", sum_mean)
print("Std of sum:", sum_std)

In [None]:
# Probability, under a normal model with the mean/std of the `exp_days`-day
# total, that the number of no-quiz applicants stays at or below the number
# the experiment needs (two groups of `group_size`).
prob_65_or_fewer = scipy.stats.norm.cdf(
    group_size * 2,
    loc=sum_mean,
    scale=sum_std
)
# Complement: probability of reaching the required number
prob_65_or_greater = 1 - prob_65_or_fewer

# The label is built from the threshold actually passed to `cdf`, so the
# printed number cannot drift from the computed one if `group_size` changes.
print(
    f"Probability of getting {group_size * 2}+ no_quiz in {exp_days} days:",
    round(prob_65_or_greater, 3),
)

In [None]:
# Set up the course-provided experiment helper for the same database and collection.
# NOTE(review): the keyword is named `repo` but receives the raw `client`
# (a `MongoClient`), not the `repo` object created earlier — confirm this is
# what `Experiment` expects.
exp = Experiment(repo=client, db="wqu-abtest", collection="mscfe-applicants")
# Presumably clears the state left by any previous run — TODO confirm in `teaching_tools`
exp.reset_experiment()
# Run for `exp_days` days with group assignment switched on
result = exp.run_experiment(days=exp_days, assignment=True)
print("result type:", type(result))
result

In [None]:
# Pull every document flagged `inExperiment` and tabulate it
result = repo.find_exp_observations()
# `dropna()` discards any row that has a missing value in any column
df = pd.DataFrame(result).dropna()

print("df type:", type(df))
print("df shape:", df.shape)
df.head()

In [None]:
# Contingency table of raw counts: one row per experiment group, one column
# per admissions-quiz status
data = pd.crosstab(df['group'], df['admissionsQuiz'], normalize=False)

print("data type:", type(data))
print("data shape:", data.shape)
data

In [None]:
# Create `build_contingency_bar` function
def build_contingency_bar():
    """Plot the contingency table as a side-by-side bar chart.

    Reads the notebook-level `data` table and draws its columns as grouped
    bars, then labels the x-axis `Group` and the y-axis `Frequency [count]`.

    Returns
    -------
    The figure object produced by `px.bar`, with the axis titles applied.
    """
    # `update_layout` hands back the figure it was called on, so the labelled
    # chart can be returned in a single expression
    return px.bar(
        data_frame=data,
        barmode='group',
        title='MScFE: Admissions Quiz Completion by Group',
    ).update_layout(
        xaxis_title='Group',
        yaxis_title='Frequency [count]',
    )

# Don't delete the code below 👇
# Build the bar chart once, save it as a 700x500 image, then render it in the notebook.
# NOTE(review): presumably `write_image` needs the `images/` directory to exist
# and a static-image export backend to be installed — confirm in the target environment.
cb_fig = build_contingency_bar()
cb_fig.write_image("images/7-5-16.png", scale=1, height=500, width=700)

cb_fig.show()

In [None]:
# Wrap the raw counts of `data` in a statsmodels 2x2 contingency table.
# NOTE(review): assumes `data` has exactly two rows and two columns — confirm
# against the crosstab shape printed earlier.
contingency_table = Table2x2(data.values)

print("contingency_table type:", type(contingency_table))
# Show the counts as they were passed in
contingency_table.table_orig

In [None]:
# Test whether the table's rows and columns (group vs. quiz status) are associated
chi_square_test = contingency_table.test_nominal_association()

print("chi_square_test type:", type(chi_square_test))
# Printing the result object shows its summary
print(chi_square_test)

In [None]:
# Odds ratio of the 2x2 table, as a measure of the size of the effect
odds_ratio = contingency_table.oddsratio
print("Odds ratio:", odds_ratio)