# Ingestion into MongoDB

This notebook establishes the structure of models and how to properly ingest fields into a MongoDB database.

In [1]:
# Import system packages.
import sys, os
import urllib.parse
import datetime
import json

# Import data processing tools
import pandas as pd

# Import MongoDB related packages.
from pymongo import MongoClient
from dotenv import dotenv_values
from bson.objectid import ObjectId

# Load the environment variables.
ENV_PATH = "./../.env"
config = { 
    **dotenv_values(ENV_PATH + ".config"),
    **dotenv_values(ENV_PATH + ".secrets"),
    **os.environ,
}

config['LOOKUP_SHEETS'] = config['LOOKUP_SHEETS'].split(',')

In [21]:
# Setup the Database class.

"""
Reference:
https://medium.com/analytics-vidhya/setting-up-a-database-class-mongodb-for-interactions-in-python-6a16417cd58e
"""

class Database:
    
    CLIENT = None
    DATABASE = None
    
    @staticmethod
    def get_connection_string(options):
        hostname = urllib.parse.quote_plus(options['DB_HOST'])
        username = urllib.parse.quote_plus(options['DB_USER'])
        password = urllib.parse.quote_plus(options['DB_PASS'])
        port = int(options['DB_PORT'])
        return "mongodb://%s:%s@%s:%d/?authSource=admin" % (username, password, hostname, port)
    
    @classmethod
    def initialize(cls, options):
        uri = Database.get_connection_string(options)
        cls.CLIENT = MongoClient(uri)
        if cls.CLIENT:
            print("OK")
        return cls.CLIENT
        
    @classmethod
    def use(cls, database):
        cls.DATABASE = cls.CLIENT[database]
        if cls.DATABASE:
            print("OK")
        return cls.DATABASE
        
    @classmethod
    def insert(cls, collection, data):
        if cls.DATABASE:
            cls.DATABASE[collection].insert(data)
        else:
            print("No database currently loaded.")
        
    @classmethod
    def find(cls, collection, query):
        if cls.DATABASE:
            return cls.DATABASE[collection].find(query)
        else:
            print("No database currently loaded.")
    
    @classmethod
    def find_one(cls, collection, query):
        if cls.DATABASE:
            return cls.DATABASE[collection].find_one(query)
        else:
            print("No database currently loaded.")

In [22]:
# Initialize and connect to the MongoDB client and show client details.
Database.initialize(config)

OK


MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin')

In [23]:
# Use specific database and show details.
Database.use("glossary")

OK


Database(MongoClient(host=['mongodb.thelionsredmane.stream:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'glossary')

## Glossary Database

This section prepares the glossary database defined by the data inside `edu.iste.disabilityresourcesearchengine.xlsx`.

The `glossary` database has the following collections:

- `disability_category`
- `service_category`
- `states`
- `zipcodes`

In [4]:
# Prepare the data model for the disability and service categories.
class Category:
    def __init__(self, category, description):
        self.category = category
        self.description = description
        
    def data(self):
        data = {
            'cat': self.category,
            'desc': self.description
        }
        return data
    
category = Category("Ambulatory", "In the American Community Survey, individuals five or more years old who responded 'yes' when asked if they had serious difficulty walking or climbing stairs.")
category.data()

{'cat': 'Ambulatory',
 'desc': "In the American Community Survey, individuals five or more years old who responded 'yes' when asked if they had serious difficulty walking or climbing stairs."}

In [5]:
# Prepare the data model for the states.
class State:
    def __init__(self, state, abbreviation, code, population):
        self.state = state
        self.abbreviation = abbreviation
        self.population = population
        
    def data(self):
        data = {
            'state': self.state,
            'abbr': self.abbreviation,
            'pop': self.population      
        }
        return data

state = State("New York", "N.Y.", "NY", 19299981)
state.data()

{'state': 'New York', 'abbr': 'N.Y.', 'pop': 19299981}

In [6]:
# Prepare the data model for the zipcodes.
class Zipcode:
    def __init__(self, zipcode, city, county, population):
        self.zipcode = zipcode
        self.city = city
        self.county = county
        self.population = population
        
    def data(self):
        data = {
            'zip': self.zipcode,
            'city': self.city,
            'county': self.county,
            'pop': self.population            
        }
        return data

zipcode = Zipcode(612, "Arecibo", "Arecibo", 59369)
zipcode.data()

{'zip': 612, 'city': 'Arecibo', 'county': 'Arecibo', 'pop': 59369}

In [7]:
# Preview pandas.DataFrame information.
def preview(df):
    if df is not None:
        print(df.head())
        print(df.tail())
        print(df.info())

In [8]:
# Load in dataset for the lookup table.
def load_sheet(filename, sheet, **settings):
    df = None
    with open(filename, "rb") as excel:
        df = pd.read_excel(excel, sheet_name=sheet, **settings)
    return df

# Dictionary comprehension, with keys as the worksheets.
filename = "./../data/" + config['LOOKUP_TABLE']
sheets = [
    'disability_category',
    'service_category',
    'states',
    'zipcodes',
]

In [9]:
# Prepare glossary.
glossary = {
    'disabilities': load_sheet(filename, sheets[0], index_col=None, usecols=[0,1]),
    'services': load_sheet(filename, sheets[1], index_col=None, usecols=[0,1]),
    'states': load_sheet(filename, sheets[2], index_col=None, usecols=[0,1,2,3]),
    'zipcodes': load_sheet(filename, sheets[3], index_col=None, usecols=[0,1,2,3]),
}   

In [10]:
glossary['disabilities'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   disability_category  7 non-null      object
 1   definition           7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [11]:
glossary['services'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     7 non-null      object
 1   description  7 non-null      object
dtypes: object(2)
memory usage: 240.0+ bytes


In [12]:
glossary['states'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   State       51 non-null     object
 1   Abbrev      51 non-null     object
 2   Code        51 non-null     object
 3   Population  51 non-null     int64 
dtypes: int64(1), object(3)
memory usage: 1.7+ KB


In [13]:
glossary['zipcodes'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Zipcode     500 non-null    int64 
 1   City        500 non-null    object
 2   County      500 non-null    object
 3   Population  500 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 15.8+ KB


## Ingest to the Database

Loaded data can be ingested into the MongoDB instance.

In [25]:
# Export collections to the glossary database.
"""
Reference:
https://www.datasciencelearner.com/insert-pandas-dataframe-into-mongodb/
"""

# 

'\nReference:\nhttps://www.datasciencelearner.com/insert-pandas-dataframe-into-mongodb/\n'