In [1]:
import pymongo
from pathlib import Path
import os


db_import_dir = Path('/Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized')


## Step 1: load mongobd data from bson

There are three data files:

- fitbit.bson
- sema.bson
- surveys.bson

we load these into the local mongodb using mongorestore.

the id/user_id entry in the mongo records refers to the subject id



In [2]:
def get_collection_type_counts(db, collection_name, sample_size=3):
    # Get distinct types in a collection and sample documents for each type
    collection = db[collection_name]
    distinct_types = collection.distinct("type")
    type_counts = {}
    for dtype in distinct_types:
        count = collection.count_documents({"type": dtype})
        sample_docs = list(collection.find({"type": dtype}).limit(sample_size))
        type_counts[dtype] = count
        
    return type_counts

def get_collection_size(db, collection_name):
    # Get the numner of documents in a collection

    return db[collection_name].count_documents({})

In [3]:
# Connect to MongoDB
def get_mongo_client(host='localhost', port=27017):
    try:
        client = pymongo.MongoClient(f"mongodb://{host}:{port}/")
    except pymongo.errors.ConnectionError as e:
        raise Exception(f"Error connecting to MongoDB - have you set it up yet?: {e}")
    return client

client = get_mongo_client()

# load the database and import data if necessary
db = client['lifesnaps']
collection_lengths = {
    'fitbit': 71284346,
    'sema': 15380,
    'surveys': 935
}

# in general we will need to overwrite to get the full dataset to begin with
overwrite = True

for collection_name, expected_length in collection_lengths.items():
    actual_length = get_collection_size(db, collection_name)
    # use ge since we will removing some objects below
    if actual_length >= expected_length and not overwrite:
        print(f"Collection '{collection_name}' already loaded with {actual_length} documents.")
    else:
        # import the data from the BSON file
        print(f"Collection '{collection_name}' has {actual_length} documents, expected {expected_length}. Importing data...")
        import_file = db_import_dir / f"{collection_name}.bson"
        if not import_file.exists():
            raise FileNotFoundError(f"Import file {import_file} does not exist.")
        print(f"Importing data into collection '{collection_name}' from {import_file}...")
        command = f"mongorestore --host {client.address[0]} --port {client.address[1]} --db lifesnaps --collection {collection_name} --drop {import_file}"
        print(f"Running command: {command}")
        os.system(command)
        
        actual_length = get_collection_size(db, collection_name)
        assert actual_length >= expected_length, f"After import, collection '{collection_name}' has {actual_length} documents, expected {expected_length}."
        print(f"Successfully imported collection '{collection_name}' with {actual_length} documents.")
 

Collection 'fitbit' has 0 documents, expected 71284346. Importing data...
Importing data into collection 'fitbit' from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/fitbit.bson...
Running command: mongorestore --host localhost --port 27017 --db lifesnaps --collection fitbit --drop /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/fitbit.bson


2025-12-16T20:53:00.855-0800	checking for collection data in /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/fitbit.bson
2025-12-16T20:53:00.856-0800	reading metadata for lifesnaps.fitbit from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/fitbit.metadata.json
2025-12-16T20:53:00.889-0800	restoring lifesnaps.fitbit from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/fitbit.bson
2025-12-16T20:53:03.854-0800	[........................]  lifesnaps.fitbit  246MB/9.02GB  (2.7%)
2025-12-16T20:53:06.854-0800	[#.......................]  lifesnaps.fitbit  511MB/9.02GB  (5.5%)
2025-12-16T20:53:09.854-0800	[#.......................]  lifesnaps.fitbit  758MB/9.02GB  (8.2%)
2025-12-16T20:53:12.854-0800	[##......................]  lifesnaps.fitbit  968MB/9.02GB  (10.5%)
2025-12-16T20:53:15.854-0800	[###.....................]  lifesnaps.fitbit  1.19GB/9.02GB  (13.2%)
2025-12-16T20:53:18.854-0800	[###...........

Successfully imported collection 'fitbit' with 71284346 documents.
Collection 'sema' has 0 documents, expected 15380. Importing data...
Importing data into collection 'sema' from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/sema.bson...
Running command: mongorestore --host localhost --port 27017 --db lifesnaps --collection sema --drop /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/sema.bson
Successfully imported collection 'sema' with 15380 documents.
Collection 'surveys' has 0 documents, expected 935. Importing data...
Importing data into collection 'surveys' from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/surveys.bson...
Running command: mongorestore --host localhost --port 27017 --db lifesnaps --collection surveys --drop /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/surveys.bson
Successfully imported collection 'surveys' with 935 documents.


2025-12-16T20:59:31.410-0800	checking for collection data in /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/sema.bson
2025-12-16T20:59:31.411-0800	reading metadata for lifesnaps.sema from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/sema.metadata.json
2025-12-16T20:59:31.442-0800	restoring lifesnaps.sema from /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/sema.bson
2025-12-16T20:59:31.492-0800	finished restoring lifesnaps.sema (15380 documents, 0 failures)
2025-12-16T20:59:31.492-0800	no indexes to restore for collection lifesnaps.sema
2025-12-16T20:59:31.492-0800	15380 document(s) restored successfully. 0 document(s) failed to restore.
2025-12-16T20:59:31.538-0800	checking for collection data in /Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized/surveys.bson
2025-12-16T20:59:31.538-0800	reading metadata for lifesnaps.surveys from /Users/poldrack/data_unsynced/LifeS

### Step 2: remove unnecessary entries from fitbit database

The fitbit store is huge and we don't need many of the entries, so let's remove them.



First pull Profile records into a separate object store since they are a different kind of data

In [4]:
# create a new table containing all documents from fitbit with type 'Profile'
profile_collection = db['fitbit_profile']
profile_collection.drop()  # drop existing collection if it exists
fitbit_collection = db['fitbit']
profiles = list(fitbit_collection.find({"type": "Profile"}))
if len(profiles) > 0:
    profile_collection.insert_many(profiles)
    print(f"Created 'fitbit_profile' collection with {get_collection_size(db, 'fitbit_profile')} documents.")



Created 'fitbit_profile' collection with 69 documents.


Remove unwanted fitbit data types

In [5]:
fitbit_types_to_keep = [
    "heart_rate",
    "sleep",
    "steps",
    "lightly_active_minutes",
    "moderately_active_minutes",
    "very_active_minutes",
    "sedentary_minutes",
    "calories",
]

# remove unwanted fitbit data
fitbit_collection = db['fitbit']
deletion_result = fitbit_collection.delete_many({"type": {"$nin": fitbit_types_to_keep}})
print(f"Removed {deletion_result.deleted_count} unwanted documents from 'fitbit' collection.")
print(f"Remaining documents in 'fitbit' collection: {get_collection_size(db, 'fitbit')}")
print("Final document counts in 'fitbit' collection after cleanup:")
final_type_counts = get_collection_type_counts(db, 'fitbit')
for dtype, count in final_type_counts.items():
    print(f"Type: {dtype}, Count: {count}")

Removed 9845042 unwanted documents from 'fitbit' collection.
Remaining documents in 'fitbit' collection: 61439304
Final document counts in 'fitbit' collection after cleanup:
Type: calories, Count: 9675782
Type: heart_rate, Count: 48720040
Type: lightly_active_minutes, Count: 7203
Type: moderately_active_minutes, Count: 7203
Type: sedentary_minutes, Count: 7203
Type: sleep, Count: 4141
Type: steps, Count: 3010529
Type: very_active_minutes, Count: 7203


## Harmonize the documents and combine into a single database

We want to be able to treat each of the different data types similarly, but currently some of them have their value in a different location than the `value` field.

In [6]:
# for sema collection, create a 'type' field based on data['SURVEY_NAME']

sema_collection = db['sema']
# first remove documents that None for data.SURVEY_NAME
deletion_result = sema_collection.delete_many({"data.SURVEY_NAME": None})
print(f"Removed {deletion_result.deleted_count} documents with null 'data.SURVEY_NAME' from 'sema' collection.")
print(f"Remaining documents in 'sema' collection: {get_collection_size(db,'sema')}")
# now update documents to add 'type'
update_result = sema_collection.update_many(
    {"type": {"$exists": False}},
    [{"$set": {"type": "$data.SURVEY_NAME"}}]
)
print(f"Updated {update_result.modified_count} documents in 'sema' collection to add 'type' field.")
print(f"Document counts in 'sema' collection after adding 'type' field:")
sema_type_counts = get_collection_type_counts(db, 'sema')
for dtype, count in sema_type_counts.items():
    print(f"Type: {dtype}, Count: {count}") 

Removed 2 documents with null 'data.SURVEY_NAME' from 'sema' collection.
Remaining documents in 'sema' collection: 15378
Updated 15378 documents in 'sema' collection to add 'type' field.
Document counts in 'sema' collection after adding 'type' field:
Type: Context and Mood Survey, Count: 11526
Type: Step Goal Survey, Count: 3852


In [7]:
# combine the sema collection into fitbit

sema_collection = db['sema']
# drop the "Step Goal Survey" from sema
sema_collection.delete_many({"type": "Step Goal Survey"})

fitbit_collection = db['fitbit']
fitbit_collection.insert_many(sema_collection.find())
print(f"Combined 'sema' collection into 'fitbit'. New 'fitbit' collection size: {get_collection_size(db, 'fitbit')} documents.")

Combined 'sema' collection into 'fitbit'. New 'fitbit' collection size: 61450830 documents.


In [8]:
# some already are called "value": calories, active/sedentary minutes
value_variable = {
    'heart_rate': 'value.bpm',
    'sleep': 'minutesAsleep',
    'lightly_active_minutes': 'value',
    'moderately_active_minutes': 'value',
    'very_active_minutes': 'value',
    'sedentary_minutes': 'value',
    'calories': 'value',
    'steps': 'value',
    "Context and Mood Survey": 'MOOD'
}
date_variable = {
    'sleep': 'endTime',
    'calories': 'dateTime',
    'steps': 'dateTime',
    "Context and Mood Survey": 'COMPLETED_TS'
}
# for each object that has type matching one of the keys in value_variable,
# move data[value_variable] into 'value' field at root level of object

fitbit_collection = db['fitbit']

for doc_type, value_field in value_variable.items():
    # Update documents of this type to move the value from data[value_field] to root level 'value'
    update_result = fitbit_collection.update_many(
        {
            "type": doc_type,
            f"data.{value_field}": {"$exists": True}
        },
        [
            {"$set": {
                "value": f"$data.{value_field}",
                "value_origin": value_field
            }}
        ]
    )
    # fix date field if applicable
    if doc_type in date_variable:
        date_field = date_variable[doc_type]
        date_update_result = fitbit_collection.update_many(
            {
                "type": doc_type,
                f"data.{date_field}": {"$exists": True}
            },
            [
                {"$set": {
                    "date": f"$data.{date_field}",
                    "date_origin": date_field
                }}
            ]
        )
        print(f"Updated {date_update_result.modified_count} documents of type '{doc_type}' to add 'date' field from 'data.{date_field}'.")
    print(f"Updated {update_result.modified_count} documents of type '{doc_type}' to add 'value' field from 'data.{value_field}'.")

print("\nVerifying updates:")
for doc_type in value_variable.keys():
    count_with_value = fitbit_collection.count_documents({
        "type": doc_type,
        "value": {"$exists": True}
    })
    total_count = fitbit_collection.count_documents({"type": doc_type})
    print(f"Type '{doc_type}': {count_with_value}/{total_count} documents now have 'value' field.")


Updated 48720040 documents of type 'heart_rate' to add 'value' field from 'data.value.bpm'.
Updated 4141 documents of type 'sleep' to add 'date' field from 'data.endTime'.
Updated 4141 documents of type 'sleep' to add 'value' field from 'data.minutesAsleep'.
Updated 7203 documents of type 'lightly_active_minutes' to add 'value' field from 'data.value'.
Updated 7203 documents of type 'moderately_active_minutes' to add 'value' field from 'data.value'.
Updated 7203 documents of type 'very_active_minutes' to add 'value' field from 'data.value'.
Updated 7203 documents of type 'sedentary_minutes' to add 'value' field from 'data.value'.
Updated 9675782 documents of type 'calories' to add 'date' field from 'data.dateTime'.
Updated 9675782 documents of type 'calories' to add 'value' field from 'data.value'.
Updated 3010529 documents of type 'steps' to add 'date' field from 'data.dateTime'.
Updated 3010529 documents of type 'steps' to add 'value' field from 'data.value'.
Updated 11526 documents 

Harmonize by adding type field to sema database

rename id to user_id for fitbit collection to harmonize with others

In [9]:
# rename "id" field to "user_id" in fitbit collection

# skip if all entities already have "user_id"
if fitbit_collection.count_documents({"user_id": {"$exists": False}}) > 0:

    rename_result = fitbit_collection.update_many(
        {},
        {"$rename": {"id": "user_id"}}
    )
    print(f"Renamed 'id' field to 'user_id' in {rename_result.modified_count} documents in 'fitbit' collection.")
else:
    print("Field 'id' already renamed to 'user_id' in 'fitbit' collection; skipping rename.")   

Renamed 'id' field to 'user_id' in 61439304 documents in 'fitbit' collection.


Doublecheck that every document has a fields for type, value, user_id, and date.

In [10]:
field_to_check_for = ['user_id', 'type', 'value', 'date']
# check that all documents in fitbit collection have these fields
missing_field_counts = {}
for field in field_to_check_for:
    count_missing = fitbit_collection.count_documents({field: {"$exists": False}})
    missing_field_counts[field] = count_missing
print("Missing field counts in 'fitbit' collection:")
for field, count in missing_field_counts.items():
    print(f"Field '{field}': {count} documents missing this field.") 

Missing field counts in 'fitbit' collection:
Field 'user_id': 0 documents missing this field.
Field 'type': 0 documents missing this field.
Field 'value': 0 documents missing this field.
Field 'date': 48748852 documents missing this field.


### Combine all three stores into a single store


In [11]:
# delete intermediate collections if desired
#fitbit_collection.drop()
#sema_collection.drop()
#print("Deleted intermediate collections 'fitbit' and 'sema'.")



In [12]:
# get total size of lifesnaps_data collection in gigabytes

total_size_gb = get_collection_size(db, 'lifesnaps_data') / 1e7
print(f"Total size of 'lifesnaps_data' collection: {total_size_gb:.02f} GB")

Total size of 'lifesnaps_data' collection: 0.00 GB
