In [8]:
import os
import subprocess
from pathlib import Path

import pymongo


# Directory containing the BSON dump files to import. Set the
# LIFESNAPS_DB_IMPORT_DIR environment variable to point at a different
# location; otherwise the original hardcoded path is used.
db_import_dir = Path(
    os.environ.get(
        'LIFESNAPS_DB_IMPORT_DIR',
        '/Users/poldrack/data_unsynced/LifeSnaps/rais_anonymized/mongo_rais_anonymized',
    )
)


## Step 1: load MongoDB data from BSON

There are three data files:

- fitbit.bson
- sema.bson
- surveys.bson

We load these into the local MongoDB instance using `mongorestore`.

The `id`/`user_id` entry in the MongoDB records refers to the subject ID.



In [None]:
# Connect to MongoDB
def get_mongo_client(host='localhost', port=27017, server_selection_timeout_ms=5000):
    """Return a MongoClient that has been verified to reach the server.

    Parameters
    ----------
    host : str
        Hostname of the MongoDB server.
    port : int
        Port of the MongoDB server.
    server_selection_timeout_ms : int
        How long (in milliseconds) to wait for a server before giving up.

    Returns
    -------
    pymongo.MongoClient
        A client on which a ``ping`` command has already succeeded.

    Raises
    ------
    Exception
        If the client cannot be created or the server does not answer.
    """
    client = None
    try:
        client = pymongo.MongoClient(
            f"mongodb://{host}:{port}/",
            serverSelectionTimeoutMS=server_selection_timeout_ms,
        )
        # MongoClient connects lazily, so constructing it never reports an
        # unreachable server; force a round trip to surface the problem here.
        client.admin.command('ping')
    except pymongo.errors.PyMongoError as e:
        # Release the client's background resources before reporting failure.
        if client is not None:
            client.close()
        raise Exception(f"Error connecting to MongoDB - have you set it up yet?: {e}") from e
    return client

client = get_mongo_client()

# load the database and import data if necessary
db = client['lifesnaps']

# Number of documents in each collection right after restoring its BSON dump.
collection_lengths = {
    'fitbit': 71284346,
    'sema': 15380,
    'surveys': 935
}

# Number of documents left once this notebook has pruned a collection
# (fitbit: 71284346 restored - 9844973 removed in Step 2). A collection holding
# exactly this many documents is already loaded and pruned, so re-running the
# notebook must not drop and re-import the full dump.
pruned_collection_lengths = {
    'fitbit': 61439373,
}

# Set to True to force a drop + re-import of every collection.
overwrite = False

for collection_name, expected_length in collection_lengths.items():
    collection = db[collection_name]
    actual_length = collection.count_documents({})
    already_loaded = (
        actual_length >= expected_length
        or actual_length == pruned_collection_lengths.get(collection_name)
    )
    if already_loaded and not overwrite:
        print(f"Collection '{collection_name}' already loaded with {actual_length} documents.")
        continue

    # import the data from the BSON file
    print(f"Collection '{collection_name}' has {actual_length} documents, expected {expected_length}. Importing data...")
    import_file = db_import_dir / f"{collection_name}.bson"
    if not import_file.exists():
        raise FileNotFoundError(f"Import file {import_file} does not exist.")
    print(f"Importing data into collection '{collection_name}' from {import_file}...")

    # Pass the arguments as a list (no shell) so paths containing spaces or
    # shell metacharacters are handed to mongorestore unchanged.
    host, port = client.address
    command = [
        "mongorestore",
        "--host", str(host),
        "--port", str(port),
        "--db", db.name,
        "--collection", collection_name,
        "--drop",
        str(import_file),
    ]
    print(f"Running command: {' '.join(command)}")
    # check=True raises CalledProcessError when mongorestore exits non-zero,
    # instead of silently ignoring the failure.
    subprocess.run(command, check=True)

    actual_length = collection.count_documents({})
    assert actual_length >= expected_length, f"After import, collection '{collection_name}' has {actual_length} documents, expected {expected_length}."
    print(f"Successfully imported collection '{collection_name}' with {actual_length} documents.")
 

Collection 'fitbit' already loaded with 71284346 documents.
Collection 'sema' already loaded with 15380 documents.
Collection 'surveys' already loaded with 935 documents.


### Step 2: remove unnecessary entries from the fitbit collection

The fitbit store is huge and we don't need many of the entries, so let's remove them.



In [11]:
# Fitbit document types to retain; documents of any other type are deleted.
_activity_minutes_types = [
    "lightly_active_minutes",
    "moderately_active_minutes",
    "very_active_minutes",
    "sedentary_minutes",
]
fitbit_types_to_keep = (
    ["Profile", "heart_rate", "sleep", "steps"]
    + _activity_minutes_types
    + ["calories"]
)

# Delete every fitbit document whose "type" is not in the keep list.
# Note: $nin also matches documents that have no "type" field at all.
fitbit_collection = db['fitbit']
unwanted_filter = {"type": {"$nin": fitbit_types_to_keep}}
deletion_result = fitbit_collection.delete_many(unwanted_filter)
print(f"Removed {deletion_result.deleted_count} unwanted documents from 'fitbit' collection.")

Removed 9844973 unwanted documents from 'fitbit' collection.


In [13]:
def get_collection_type_counts(db, collection_name, sample_size=3):
    """Count the documents of each distinct ``type`` value in a collection.

    Parameters
    ----------
    db
        Database handle; ``db[collection_name]`` must yield the collection.
    collection_name : str
        Name of the collection to summarize.
    sample_size : int, optional
        Unused. Retained only so existing calls that pass it keep working;
        no sample documents are fetched because none were ever returned.

    Returns
    -------
    dict
        Maps each distinct ``type`` value to the number of documents that
        have it. Empty when the collection has no ``type`` values.
    """
    collection = db[collection_name]
    return {
        dtype: collection.count_documents({"type": dtype})
        for dtype in collection.distinct("type")
    }

# Tabulate how many documents of each type the 'fitbit' collection holds;
# as the cell's last expression, the resulting dict is shown as its output.
get_collection_type_counts(db, 'fitbit')


{'Profile': 69,
 'calories': 9675782,
 'heart_rate': 48720040,
 'lightly_active_minutes': 7203,
 'moderately_active_minutes': 7203,
 'sedentary_minutes': 7203,
 'sleep': 4141,
 'steps': 3010529,
 'very_active_minutes': 7203}