# Ingestion into MongoDB

This notebook establishes the structure of models and how to properly ingest fields into a MongoDB database.

In [1]:
# Import system packages.
import sys, os
import urllib.parse
import datetime
import json

# Import data processing tools
import pandas as pd

# Import MongoDB related packages.
from pymongo import MongoClient
from dotenv import dotenv_values
from bson.objectid import ObjectId

# Load the environment variables.
ENV_PATH = "./../.env"
config = { 
    **dotenv_values(ENV_PATH + ".config"),
    **dotenv_values(ENV_PATH + ".secrets"),
    **os.environ,
}

config['LOOKUP_SHEETS'] = config['LOOKUP_SHEETS'].split(',')

In [2]:
# Setup the Database class.

"""
Reference:
https://medium.com/analytics-vidhya/setting-up-a-database-class-mongodb-for-interactions-in-python-6a16417cd58e
"""

class Database:
    
    CLIENT = None
    DATABASE = None
    
    @staticmethod
    def get_connection_string(options):
        hostname = urllib.parse.quote_plus(options['DB_HOST'])
        username = urllib.parse.quote_plus(options['DB_USER'])
        password = urllib.parse.quote_plus(options['DB_PASS'])
        port = int(options['DB_PORT'])
        return "mongodb://%s:%s@%s:%d/?authSource=admin" % (username, password, hostname, port)
    
    @classmethod
    def initialize(cls, options):
        uri = Database.get_connection_string(options)
        cls.CLIENT = MongoClient(uri)
        if cls.CLIENT:
            print("OK")
        return cls.CLIENT
        
    @classmethod
    def use(cls, database):
        cls.DATABASE = cls.CLIENT[database]
        if cls.DATABASE:
            print("OK")
        return cls.DATABASE
        
    @classmethod
    def insert_many(cls, collection, data):
        if cls.DATABASE:
            print("Inserting data into %s collection." % (collection,))
            return cls.DATABASE[collection].insert_many(data)
        else:
            print("No database currently loaded.")
            
    @classmethod
    def insert_one(cls, collection, record):
        if cls.DATABASE:
            print("Inserting record into %s collection." % (collection,))
            return cls.DATABASE[collection].insert_one(record)
        else:
            print("No database currently loaded.")
        
    @classmethod
    def find(cls, collection, query):
        if cls.DATABASE:
            print("Querying any %s collection: '%s'" % (collection, query))
            return cls.DATABASE[collection].find(query)
        else:
            print("No database currently loaded.")
    
    @classmethod
    def find_one(cls, collection, query):
        if cls.DATABASE:
            print("Querying one from %s collection: '%s'" % (collection, query))
            return cls.DATABASE[collection].find_one(query)
        else:
            print("No database currently loaded.")

In [3]:
# Initialize and connect to the MongoDB client and show client details.
Database.initialize(config)

OK


MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin')

In [4]:
# Use specific database and show details.
Database.use("glossary")

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'glossary')

## Glossary Database

This section prepares the glossary database defined by the data inside `edu.iste.disabilityresourcesearchengine.xlsx`.

The `glossary` database has the following collections:

- `disability_category`
- `service_category`
- `states`
- `zipcodes`

In [5]:
# Prepare the data model for the disability and service categories.
class Category:
    def __init__(self, category, description):
        self.category = category
        self.description = description
        
    def data(self):
        data = {
            'cat': self.category,
            'desc': self.description
        }
        return data

category = Category("Ambulatory", "In the American Community Survey, individuals five or more years old who responded 'yes' when asked if they had serious difficulty walking or climbing stairs.")
category.data()

{'cat': 'Ambulatory',
 'desc': "In the American Community Survey, individuals five or more years old who responded 'yes' when asked if they had serious difficulty walking or climbing stairs."}

In [25]:
# Prepare the data model for the states.
class State:
    def __init__(self, state, abbreviation, code, population):
        self.state = state
        self.abbreviation = abbreviation
        self.code = code
        self.population = population
        
    def data(self):
        data = {
            'state': self.state,
            'abbr': self.abbreviation,
            'code': self.code,
            'pop': self.population      
        }
        return data

state = State("New York", "N.Y.", "NY", 19299981)
state.data()

{'state': 'New York', 'abbr': 'N.Y.', 'code': 'NY', 'pop': 19299981}

In [7]:
# Prepare the data model for the zipcodes.
class Zipcode:
    def __init__(self, zipcode, city, county, population):
        self.zipcode = zipcode
        self.city = city
        self.county = county
        self.population = population
        
    def data(self):
        data = {
            'zip': self.zipcode,
            'city': self.city,
            'county': self.county,
            'pop': self.population            
        }
        return data

zipcode = Zipcode(612, "Arecibo", "Arecibo", 59369)
zipcode.data()

{'zip': 612, 'city': 'Arecibo', 'county': 'Arecibo', 'pop': 59369}

In [8]:
# Preview pandas.DataFrame information.
def preview(df):
    if df is not None:
        print(df.head())
        print(df.tail())
        print(df.info())

In [9]:
# Load in dataset for the lookup table.
def load_sheet(filename, sheet, **settings):
    df = None
    with open(filename, "rb") as excel:
        df = pd.read_excel(excel, sheet_name=sheet, **settings)
    return df

# Dictionary comprehension, with keys as the worksheets.
filename = "./../data/" + config['LOOKUP_TABLE']
sheets = [
    'disability_category',
    'service_category',
    'states',
    'zipcodes',
]

In [10]:
# Prepare glossary.
glossary = {
    'disabilities': load_sheet(filename, sheets[0], index_col=None, usecols=[0,1]),
    'services': load_sheet(filename, sheets[1], index_col=None, usecols=[0,1]),
    'states': load_sheet(filename, sheets[2], index_col=None, usecols=[0,1,2,3]),
    'zipcodes': load_sheet(filename, sheets[3], index_col=None, usecols=[0,1,2,3]),
}   

In [11]:
glossary['disabilities'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   disability_category  7 non-null      object
 1   definition           7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [12]:
glossary['services'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     7 non-null      object
 1   description  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [13]:
glossary['states'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   State       51 non-null     object
 1   Abbrev      51 non-null     object
 2   Code        51 non-null     object
 3   Population  51 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ KB


In [14]:
glossary['zipcodes'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Zipcode     500 non-null    int64 
 1   City        500 non-null    object
 2   County      500 non-null    object
 3   Population  500 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 15.8+ KB


## Ingest to the Database

Loaded data can be ingested into the MongoDB instance.

In [15]:
# Convert DataFrame into the appropriate format for insertion.
"""
Reference:
https://www.datasciencelearner.com/insert-pandas-dataframe-into-mongodb/
"""
def to_records(df, columns=None):
    # Set df_view.
    df_view = df

    # If columns are present, rename them before export.
    if columns:
        df_view.columns = columns

    # Convert dataframe into a dictionary
    return df_view.to_dict("records")

### Categories

In [21]:
Database.use("glossary")

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'glossary')

In [28]:
disability_categories = to_records(glossary['disabilities'], columns=['cat', 'desc'])
glossary['disabilities']

Unnamed: 0,cat,desc
0,Ambulatory,"In the American Community Survey, individuals ..."
1,Hearing,"In the ACS, individuals who indicated ""yes"" wh..."
2,Cognitive,"In the American Community Survey, individuals ..."
3,Vision,"In the ACS, individuals who indicated ""yes"" wh..."
4,Independent Living,"In the American Community Survey, individuals..."
5,Self-care,"In the American Community Survey, individuals ..."
6,Other,All-encompassing category for conditions of im...


In [23]:
service_categories = to_records(glossary['services'], columns=['cat', 'desc'])
glossary['services']

Unnamed: 0,cat,desc
0,Communication Services,"Communication assistance, such as through sign..."
1,Advocacy Services,Services that provide assistance in crisis sit...
2,Job Development and Placement,Assistance in obtaining employment related ser...
3,Information and Referral,"Answers questions about assistance, directs cl..."
4,Counseling,Intervention in crisis situations; helps clien...
5,Independent Living Skill Instruction,Assists clients in acquiring skills to live in...
6,Community Education,Increases public awareness and understanding o...


In [19]:
# Insert the categories.
Database.insert_many(sheets[0], disability_categories).inserted_ids

OK
Inserting data into disability_category collection.


[ObjectId('6109c67d105a90f29504fff6'),
 ObjectId('6109c67d105a90f29504fff7'),
 ObjectId('6109c67d105a90f29504fff8'),
 ObjectId('6109c67d105a90f29504fff9'),
 ObjectId('6109c67d105a90f29504fffa'),
 ObjectId('6109c67d105a90f29504fffb'),
 ObjectId('6109c67d105a90f29504fffc')]

In [20]:
Database.insert_many(sheets[1], service_categories).inserted_ids

Inserting data into service_category collection.


[ObjectId('6109c67f105a90f29504fffd'),
 ObjectId('6109c67f105a90f29504fffe'),
 ObjectId('6109c67f105a90f29504ffff'),
 ObjectId('6109c67f105a90f295050000'),
 ObjectId('6109c67f105a90f295050001'),
 ObjectId('6109c67f105a90f295050002'),
 ObjectId('6109c67f105a90f295050003')]

### Regions

In [24]:
Database.use("region")

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'region')

In [29]:
states = to_records(glossary['states'], columns=['state', 'abbr', 'code', 'pop'])
glossary['states'].head()

Unnamed: 0,state,abbr,code,pop
0,Alabama,Ala.,AL,4934193
1,Alaska,Alaska,AK,724357
2,Arizona,Ariz.,AZ,7520103
3,Arkansas,Ark.,AR,3033946
4,California,Calif.,CA,39613493


In [30]:
zipcodes = to_records(glossary['zipcodes'], columns=['zip', 'city', 'county', 'pop'])
glossary['zipcodes'].head()

Unnamed: 0,zip,city,county,pop
0,612,Arecibo,Arecibo,59369
1,725,Caguas,Caguas,80537
2,926,San Juan,San Juan,93330
3,949,Toa Baja,Toa Baja,72943
4,953,Toa Alta,Toa Alta,71174


In [32]:
# Insert the regions.
Database.insert_many(sheets[2], states).inserted_ids

Inserting data into states collection.


[ObjectId('6109c788105a90f2950501f8'),
 ObjectId('6109c788105a90f2950501f9'),
 ObjectId('6109c788105a90f2950501fa'),
 ObjectId('6109c788105a90f2950501fb'),
 ObjectId('6109c788105a90f2950501fc'),
 ObjectId('6109c788105a90f2950501fd'),
 ObjectId('6109c788105a90f2950501fe'),
 ObjectId('6109c788105a90f2950501ff'),
 ObjectId('6109c788105a90f295050200'),
 ObjectId('6109c788105a90f295050201'),
 ObjectId('6109c788105a90f295050202'),
 ObjectId('6109c788105a90f295050203'),
 ObjectId('6109c788105a90f295050204'),
 ObjectId('6109c788105a90f295050205'),
 ObjectId('6109c788105a90f295050206'),
 ObjectId('6109c788105a90f295050207'),
 ObjectId('6109c788105a90f295050208'),
 ObjectId('6109c788105a90f295050209'),
 ObjectId('6109c788105a90f29505020a'),
 ObjectId('6109c788105a90f29505020b'),
 ObjectId('6109c788105a90f29505020c'),
 ObjectId('6109c788105a90f29505020d'),
 ObjectId('6109c788105a90f29505020e'),
 ObjectId('6109c788105a90f29505020f'),
 ObjectId('6109c788105a90f295050210'),
 ObjectId('6109c788105a90

In [31]:
Database.insert_many(sheets[3], zipcodes).inserted_ids

Inserting data into zipcodes collection.


[ObjectId('6109c786105a90f295050004'),
 ObjectId('6109c786105a90f295050005'),
 ObjectId('6109c786105a90f295050006'),
 ObjectId('6109c786105a90f295050007'),
 ObjectId('6109c786105a90f295050008'),
 ObjectId('6109c786105a90f295050009'),
 ObjectId('6109c786105a90f29505000a'),
 ObjectId('6109c786105a90f29505000b'),
 ObjectId('6109c786105a90f29505000c'),
 ObjectId('6109c786105a90f29505000d'),
 ObjectId('6109c786105a90f29505000e'),
 ObjectId('6109c786105a90f29505000f'),
 ObjectId('6109c786105a90f295050010'),
 ObjectId('6109c786105a90f295050011'),
 ObjectId('6109c786105a90f295050012'),
 ObjectId('6109c786105a90f295050013'),
 ObjectId('6109c786105a90f295050014'),
 ObjectId('6109c786105a90f295050015'),
 ObjectId('6109c786105a90f295050016'),
 ObjectId('6109c786105a90f295050017'),
 ObjectId('6109c786105a90f295050018'),
 ObjectId('6109c786105a90f295050019'),
 ObjectId('6109c786105a90f29505001a'),
 ObjectId('6109c786105a90f29505001b'),
 ObjectId('6109c786105a90f29505001c'),
 ObjectId('6109c786105a90