In [1]:
import random

import pandas as pd
from pymongo import MongoClient

In [2]:
client = MongoClient(host="localhost",port=27017)
db = client["abtest-db"]
ds_app = db["ds-applicants"]
print("client:", type(client))
print("ds_app:", type(ds_app))

client: <class 'pymongo.synchronous.mongo_client.MongoClient'>
ds_app: <class 'pymongo.synchronous.collection.Collection'>


Extract: Developing the Hypothesis

In [4]:
# How many applicants complete admissions quiz?
result = ds_app.aggregate(
    [
        {
            "$group":{
                "_id": "$admissionsQuiz",
                "count": { "$count": {}}
            }
        }
    ]
)

for r in result: 
    if r["_id"] == 'complete':
        complete = r["count"]
    else :
        incomplete = r["count"]
    print(r)

print("Completed quiz:", complete)
print("Did not complete quiz:", incomplete)

{'_id': 'complete', 'count': 3717}
{'_id': 'incomplete', 'count': 1308}
Completed quiz: 3717
Did not complete quiz: 1308


In [5]:

prop_incomplete = incomplete/(complete+incomplete)
print(
    "Proportion of users who don't complete admissions quiz:", round(prop_incomplete, 2)
)

Proportion of users who don't complete admissions quiz: 0.26


In [6]:
null_hypothesis = """
There is no relationship between receiveing an email and completing the admissions quiz.
Sending an email to 'no-quiz applicants' does not increase the rate of completion."""

alternate_hypothesis = """
There is a relationship between receiving an email and completing the admissions quiz.
Sending an email to 'no-quiz applicants' does increase the rate of completion.
"""

print("Null Hypothesis:", null_hypothesis)
print("Alternate Hypothesis:", alternate_hypothesis)

Null Hypothesis: 
There is no relationship between receiveing an email and completing the admissions quiz.
Sending an email to 'no-quiz applicants' does not increase the rate of completion.
Alternate Hypothesis: 
There is a relationship between receiving an email and completing the admissions quiz.
Sending an email to 'no-quiz applicants' does increase the rate of completion.



In [16]:
def find_by_date(collection, date_string):
    """Find records in a PyMongo Collection created on a given date.

    Parameters
    ----------
    collection : pymongo.collection.Collection
        Collection in which to search for documents.
    date_string : str
        Date to query. Format must be '%Y-%m-%d', e.g. '2022-06-28'.

    Returns
    -------
    observations : list
        Result of query. List of documents (dictionaries).
    """
    # Convert `date_string` to datetime object
    start = pd.to_datetime(date_string, format="%Y-%m-%d")
    end = start + pd.DateOffset(days=1)

    # Try both string and datetime query to handle both cases
    start_str = start.strftime("%Y-%m-%dT%H:%M:%S")
    end_str = end.strftime("%Y-%m-%dT%H:%M:%S")

    # Build OR query to match both possible types
    query = {
        "$and": [
            { "admissionsQuiz": "incomplete" },
            {
                "$or": [
                    { "createdAt": { "$gte": start, "$lt": end } },        # if createdAt is stored as datetime
                    { "createdAt": { "$gte": start_str, "$lt": end_str } } # if createdAt is stored as string
                ]
            }
        ]
    }

    result = collection.find(query)
    observations = list(result)

    return observations


In [19]:
find_by_date(collection=ds_app,date_string="2022-05-04")[0]

{'_id': '6525d787953844722c838415',
 'createdAt': '2022-05-05 18:39:40',
 'firstName': 'Curtis',
 'lastName': 'Rogers',
 'email': 'curtis.rogers14@microsift.com',
 'birthday': '1987-12-04 00:00:00',
 'gender': 'male',
 'highestDegreeEarned': 'Some College (1-3 years)',
 'countryISO2': 'NG',
 'admissionsQuiz': 'incomplete'}

In [20]:
observations = find_by_date(collection=ds_app, date_string="2022-05-02")

print("observations type:", type(observations))
print("observations len:", len(observations))
observations[0]

observations type: <class 'list'>
observations len: 43


{'_id': '6525d787953844722c838459',
 'createdAt': '2022-05-03 20:28:29',
 'firstName': 'Thomas',
 'lastName': 'Follmer',
 'email': 'thomas.follmer56@gmall.com',
 'birthday': '1978-05-12 00:00:00',
 'gender': 'male',
 'highestDegreeEarned': 'High School or Baccalaureate',
 'countryISO2': 'NG',
 'admissionsQuiz': 'incomplete'}

In [21]:
def assign_to_groups(observations):
    """Randomly assigns observations to control and treatment groups.

    Parameters
    ----------
    observations : list or pymongo.cursor.Cursor
        List of users to assign to groups.

    Returns
    -------
    observations : list
        List of documents from `observations` with two additional keys:
        `inExperiment` and `group`.
    """
    # Shuffle `observations`
    random.seed(42)
    random.shuffle(observations)
    # Get index position of item at observations halfway point
    idx = len(observations)//2

    # Assign first half of observations to control group
    for doc in observations[:idx]:
        doc["inExperiment"] = True
        doc["group"] = "no email (control)"

    # Assign second half of observations to treatment group      
    for doc in observations[idx:]:
        doc["inExperiment"] = True
        doc["group"] = "email (treatment)"
    

    return observations


observations_assigned = assign_to_groups(observations)

print("observations_assigned type:", type(observations_assigned))
print("observations_assigned len:", len(observations_assigned))
observations_assigned[0]

observations_assigned type: <class 'list'>
observations_assigned len: 43


{'_id': '6525d787953844722c8387ad',
 'createdAt': '2022-05-03 23:54:20',
 'firstName': 'Russell',
 'lastName': 'Sell',
 'email': 'russell.sell5@microsift.com',
 'birthday': '1986-12-04 00:00:00',
 'gender': 'male',
 'highestDegreeEarned': 'High School or Baccalaureate',
 'countryISO2': 'NP',
 'admissionsQuiz': 'incomplete',
 'inExperiment': True,
 'group': 'no email (control)'}

In [22]:
def export_treatment_emails(observations_assigned, directory="."):
    """Creates CSV file with email addresses of observations in treatment group.

    CSV file name will include today's date, e.g. `'2022-06-28_ab-test.csv'`,
    and a `'tag'` column where every row will be 'ab-test'.

    Parameters
    ----------
    observations_assigned : list
        Observations with group assignment.
    directory : str, default='.'
        Location for saved CSV file.

    Returns
    -------
    None
    """
    # Put `observations_assigned` docs into DataFrame
    df = pd.DataFrame(observations_assigned)


    # Add `"tag"` column
    df["tag"] = "ab-test"


    # Create mask for treatment group only
    mask = df["group"] == "email (treatment)"


    # Create filename with date
    date_string = pd.Timestamp.now().strftime(format="%Y-%m-%d")
    filename = directory + "/" + date_string + "_ab-test.csv"

    # Save DataFrame to directory (email and tag only)
    df[mask][["email","tag"]].to_csv(filename, index=False)


export_treatment_emails(observations_assigned=observations_assigned)

In [23]:
updated_applicant = observations_assigned[0]
applicant_id = updated_applicant["_id"]
print("applicant type:", type(updated_applicant))
print(updated_applicant)
print()
print("applicant_id type:", type(applicant_id))
print(applicant_id)

applicant type: <class 'dict'>
{'_id': '6525d787953844722c8387ad', 'createdAt': '2022-05-03 23:54:20', 'firstName': 'Russell', 'lastName': 'Sell', 'email': 'russell.sell5@microsift.com', 'birthday': '1986-12-04 00:00:00', 'gender': 'male', 'highestDegreeEarned': 'High School or Baccalaureate', 'countryISO2': 'NP', 'admissionsQuiz': 'incomplete', 'inExperiment': True, 'group': 'no email (control)'}

applicant_id type: <class 'str'>
6525d787953844722c8387ad


In [24]:
# Find original record for `applicant_id`
ds_app.find_one({"_id": applicant_id })

{'_id': '6525d787953844722c8387ad',
 'createdAt': '2022-05-03 23:54:20',
 'firstName': 'Russell',
 'lastName': 'Sell',
 'email': 'russell.sell5@microsift.com',
 'birthday': '1986-12-04 00:00:00',
 'gender': 'male',
 'highestDegreeEarned': 'High School or Baccalaureate',
 'countryISO2': 'NP',
 'admissionsQuiz': 'incomplete'}

In [25]:
result = ds_app.update_one(
    filter={ "_id": applicant_id},
    update={"$set": updated_applicant}
)
print("result type:", type(result))

result type: <class 'pymongo.results.UpdateResult'>


In [27]:
# Access methods and attributes using `dir`
type(result)
dir(result)

# Access `raw_result` attribute
result.raw_result
#result.modified_count
#result.matched_count

{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [30]:
def update_applicants(collection, observations_assigned):
    """Update applicant documents in collection.

    Parameters
    ----------
    collection : pymongo.collection.Collection
        Collection in which documents will be updated.

    observations_assigned : list
        Documents that will be used to update collection

    Returns
    -------
    transaction_result : dict
        Status of update operation, including number of documents
        and number of documents modified.
    """
    n =0
    n_modified=0
    
    for doc in observations_assigned: 
        result = collection.update_one(
            filter={"_id": doc["_id"]},
            update={"$set": doc}
        )
        n+= result.matched_count
        n_modified+= result.modified_count
    transaction_result = {"n": n, "nModified": n_modified}
    
    return transaction_result

In [31]:
result = update_applicants(ds_app, observations_assigned)
print("result type:", type(result))
result

result type: <class 'dict'>


{'n': 43, 'nModified': 0}

In [59]:
class MongoRepository:
    """Repository class for interacting with MongoDB database.

    Parameters
    ----------
    client : `pymongo.MongoClient`
        By default, `MongoClient(host='localhost', port=27017)`.
    db : str
        By default, `'abtest-db'`.
    collection : str
        By default, `'ds-applicants'`.

    Attributes
    ----------
    collection : pymongo.collection.Collection
        All data will be extracted from and loaded to this collection.
    """

    def __init__(
        self,
        client=MongoClient(host="localhost",port=27017),
        db="abtest-db",
        collection="ds-applicants"
    ):
        self.collection = client[db][collection]

    def find_by_date(self, date_string):
        """Find records in a PyMongo Collection created on a given date.
    
        Parameters
        ----------
        collection : pymongo.collection.Collection
            Collection in which to search for documents.
        date_string : str
            Date to query. Format must be '%Y-%m-%d', e.g. '2022-06-28'.
    
        Returns
        -------
        observations : list
            Result of query. List of documents (dictionaries).
        """
        # Convert `date_string` to datetime object
        start = pd.to_datetime(date_string, format="%Y-%m-%d")
        end = start + pd.DateOffset(days=1)
    
        # Try both string and datetime query to handle both cases
        start_str = start.strftime("%Y-%m-%dT%H:%M:%S")
        end_str = end.strftime("%Y-%m-%dT%H:%M:%S")
    
        # Build OR query to match both possible types
        query = {
            "$and": [
                { "admissionsQuiz": "incomplete" },
                {
                    "$or": [
                        { "createdAt": { "$gte": start, "$lt": end } },        # if createdAt is stored as datetime
                        { "createdAt": { "$gte": start_str, "$lt": end_str } } # if createdAt is stored as string
                    ]
                }
            ]
        }
    
        result = self.collection.find(query)
        observations = list(result)
    
        return observations

    def update_applicants(self, observations_assigned):
        """Update applicant documents in collection.
    
        Parameters
        ----------
        collection : pymongo.collection.Collection
            Collection in which documents will be updated.
    
        observations_assigned : list
            Documents that will be used to update collection
    
        Returns
        -------
        transaction_result : dict
            Status of update operation, including number of documents
            and number of documents modified.
        """
        n =0
        n_modified=0
        
        for doc in observations_assigned: 
            result = self.collection.update_one(
                filter={"_id": doc["_id"]},
                update={"$set": doc}
            )
            n+= result.matched_count
            n_modified+= result.modified_count
        transaction_result = {"n": n, "nModified": n_modified}
        
        return transaction_result

    def assign_to_groups(self,date_string):
        """Randomly assigns observations to control and treatment groups.

        Parameters
        ----------
        observations : list or pymongo.cursor.Cursor
            List of users to assign to groups.

        Returns
        -------
        observations : list
            List of documents from `observations` with two additional keys:
            `inExperiment` and `group`.
        """
        observations = self.find_by_date(date_string)

        # Shuffle `observations`
        random.seed(42)
        random.shuffle(observations)
        # Get index position of item at observations halfway point
        idx = len(observations)//2

        # Assign first half of observations to control group
        for doc in observations[:idx]:
            doc["inExperiment"] = True
            doc["group"] = "no email (control)"

        # Assign second half of observations to treatment group      
        for doc in observations[idx:]:
            doc["inExperiment"] = True
            doc["group"] = "email (treatment)"
        
        result = self.update_applicants(observations);

        return result

In [60]:
repo = MongoRepository()
print("repo type:", type(repo))
repo

repo type: <class '__main__.MongoRepository'>


<__main__.MongoRepository at 0x13445ffd0>

In [61]:
c_test = repo.collection
print("c_test type:", type(c_test))
c_test

c_test type: <class 'pymongo.synchronous.collection.Collection'>


Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'abtest-db'), 'ds-applicants')

In [62]:
may_15_users = repo.find_by_date(date_string="2022-05-04")
print("may_15_users type", type(may_15_users))
print("may_15_users len", len(may_15_users))
may_15_users[:3]

may_15_users type <class 'list'>
may_15_users len 47


[{'_id': '6525d787953844722c838415',
  'createdAt': '2022-05-05 18:39:40',
  'firstName': 'Curtis',
  'lastName': 'Rogers',
  'email': 'curtis.rogers14@microsift.com',
  'birthday': '1987-12-04 00:00:00',
  'gender': 'male',
  'highestDegreeEarned': 'Some College (1-3 years)',
  'countryISO2': 'NG',
  'admissionsQuiz': 'incomplete'},
 {'_id': '6525d787953844722c838453',
  'createdAt': '2022-05-05 11:43:18',
  'firstName': 'Gilbert',
  'lastName': 'Gomez',
  'email': 'gilbert.gomez49@microsift.com',
  'birthday': '1994-02-07 00:00:00',
  'gender': 'male',
  'highestDegreeEarned': 'Some College (1-3 years)',
  'countryISO2': 'PK',
  'admissionsQuiz': 'incomplete'},
 {'_id': '6525d787953844722c83848e',
  'createdAt': '2022-05-05 12:11:58',
  'firstName': 'Tommy',
  'lastName': 'Kohler',
  'email': 'tommy.kohler32@microsift.com',
  'birthday': '1999-03-04 00:00:00',
  'gender': 'male',
  'highestDegreeEarned': "Master's degree",
  'countryISO2': 'LK',
  'admissionsQuiz': 'incomplete'}]

In [63]:
result = repo.update_applicants(observations_assigned)
print("result type:", type(result))
result

result type: <class 'dict'>


{'n': 43, 'nModified': 0}

In [64]:
result = repo.assign_to_groups(date_string="2022-05-15")
print("result type:", type(result))
result

result type: <class 'dict'>


{'n': 38, 'nModified': 38}

In [65]:
repo_test = MongoRepository()
repo_test.assign_to_groups("2022-05-16")

{'n': 42, 'nModified': 42}