In [1]:
import pandas as pd
from IPython.display import display


In [2]:
'''
   Running the Django script and running Jupyter use different working
   directories in this project, so the working directory is normalised
   here to make sure that the data path is correct for both.
'''
import os
# Remember where the process was started; the last path component tells the
# two launch modes apart (the notebook is started from the 'scripts' folder).
cwd = os.getcwd()
curr_dir = cwd.rsplit(os.path.sep, 1)[-1]
IS_NOTEBOOK = (curr_dir == 'scripts')

# When started from 'scripts', step up one level so relative paths resolve
# from the same directory in both launch modes.
if IS_NOTEBOOK:
    os.chdir('../')

NEW_CWD = os.getcwd()
DATA_PATH = os.path.join(NEW_CWD, 'data', 'all-states-history.csv')

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _value in (
    ('cwd before checking: ', cwd),
    ('DATA_PATH: ', DATA_PATH),
    ('cwd now: ', NEW_CWD),
    ('IS_NOTEBOOK: ', IS_NOTEBOOK),
):
    print(_label, _value)

cwd before checking:  f:\Projects\geo-covid-backend\scripts
DATA_PATH:  f:\Projects\geo-covid-backend\data\all-states-history.csv
cwd now:  f:\Projects\geo-covid-backend
IS_NOTEBOOK:  True


In [3]:
import logging
import sys

# Send every log record (DEBUG and above) from every logger to stdout so it
# shows up in the cell output.
root = logging.getLogger()
root.setLevel(logging.DEBUG)

# Re-running this cell used to stack one more handler on the root logger, so
# each record was printed once per run. Handlers installed by this cell are
# tagged with a marker attribute and removed before a fresh one is added.
for _stale_handler in [h for h in root.handlers if getattr(h, 'notebook_stdout_handler', False)]:
    root.removeHandler(_stale_handler)

handler = logging.StreamHandler(sys.stdout)
handler.notebook_stdout_handler = True  # marker used by the clean-up loop above
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)

In [4]:
''' Activate django env if is in notebook '''

# Django is only bootstrapped by hand when running as a notebook.
# NOTE(review): presumably the script entry point is launched through
# manage.py, which configures Django on its own - confirm.
if IS_NOTEBOOK:
    # setdefault leaves an already exported DJANGO_SETTINGS_MODULE untouched.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'geo_covid.settings')
    import django
    # Must come after the settings module is named above; the model imports
    # further down the file rely on this call having run.
    django.setup()

2022-08-31 15:54:47,302 - environ.environ - DEBUG - Read environment variables from: F:\Projects\geo-covid-backend\.env
2022-08-31 15:54:47,305 - environ.environ - DEBUG - get 'SECRET_KEY' casted as 'None' with default '<NoValue>'
2022-08-31 15:54:47,305 - environ.environ - DEBUG - get 'DEBUG' casted as 'None' with default '<NoValue>'


In [5]:
'''
    Meta functions
'''
import subprocess

def decode_output(str):
    """Decode raw subprocess output into text.

    UTF-8 is tried first. Bytes that are not valid UTF-8 are decoded as
    latin-1 instead, which maps every byte value and therefore cannot fail.

    NOTE: the parameter shadows the built-in ``str``; the name is kept so
    existing keyword callers keep working.
    """
    try:
        return str.decode('utf-8')
    except UnicodeDecodeError:
        return str.decode('latin-1')


def run_bash_cmd(command: str):
    """Run ``command`` as a child process and print what it wrote.

    The command line is split on whitespace (no shell, no quoting support),
    so individual arguments must not contain spaces.

    Both stdout and stderr are captured. Previously only stdout was piped,
    which meant ``error`` was always ``None`` and the ``Err:`` branch could
    never fire.

    The exit status is not inspected; a failing command does not raise.
    """
    print("Exec: ", command)
    completed = subprocess.run(
        command.split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if completed.stdout:
        print('Out: ', decode_output(completed.stdout))
    if completed.stderr:
        print('Err: ', decode_output(completed.stderr))

In [6]:
'''
    Helper functions
'''
import shutil
import re

# Zero-width match in front of every capital letter, except one that sits at
# the very start of the string.
CASE_CONVERT_PATTERN = re.compile(r'(?<!^)(?=[A-Z])')


def camel_case_to_snake_case(string):
    """Convert a camelCase / PascalCase identifier to snake_case.

    An underscore is inserted before each capital letter that is not the
    first character, then the whole result is lower-cased.
    """
    with_separators = re.sub(CASE_CONVERT_PATTERN, '_', string)
    return with_separators.lower()

def clear_migrations():
    """Delete the ``us_covid_api`` migrations folder (best effort).

    The path is relative to the current working directory. Any ordinary
    failure (for example the folder not existing) is reported on stdout and
    otherwise ignored, so the caller keeps going.

    Only ``Exception`` is caught: the previous ``BaseException`` handler also
    swallowed ``KeyboardInterrupt`` and ``SystemExit``, which made the step
    impossible to interrupt cleanly.
    """
    try:
        shutil.rmtree("./us_covid_api/migrations/")
    except Exception as e:
        print('Fail to clear migration: ', e)

def clean_cache():
    """Run the cache and bytecode clean-up commands, in order.

    Each command line is handed to ``run_bash_cmd``; the last one runs
    ``pyclean`` on ``NEW_CWD``.
    """
    cleanup_commands = (
        'python manage.py clear_cache',
        'python manage.py clean_pyc',
        f'pyclean {NEW_CWD}',
    )
    for command_line in cleanup_commands:
        run_bash_cmd(command_line)

def drop_database(host='localhost', port=27017, name='geo-covid'):
    """Drop a MongoDB database.

    Args:
        host: MongoDB host name. Defaults to the previously hard-coded
            ``'localhost'``.
        port: MongoDB port. Defaults to the previously hard-coded ``27017``.
        name: Database to drop. Defaults to the previously hard-coded
            ``'geo-covid'``.

    The client is used as a context manager so its connections are closed
    when the call finishes; the earlier version never closed the client.
    """
    # Imported lazily so pymongo is only needed when this step actually runs.
    from pymongo import MongoClient

    with MongoClient(host, port) as client:
        client.drop_database(name)

def clean_and_remigrate_db():
    """Flush the database, regenerate the ``us_covid_api`` migrations and apply them.

    The three ``manage.py`` commands are run one after another through
    ``run_bash_cmd``.
    """
    for manage_args in (
        "flush --noinput",
        "makemigrations us_covid_api",
        "migrate",
    ):
        run_bash_cmd("python manage.py " + manage_args)

def generate_drf_spec_schema():
    """Write the drf_spectacular schema to ``schema.yml`` via ``manage.py spectacular``."""
    spectacular_command = "python manage.py spectacular --file schema.yml"
    run_bash_cmd(spectacular_command)

def print_settings():
    """Run ``manage.py print_settings`` and print whatever it outputs."""
    settings_command = "python manage.py print_settings"
    run_bash_cmd(settings_command)

In [7]:
'''
    Important functions relating to script main functionality
'''
from tkinter import N
import numpy as np
from math import isnan
from us_covid_api.models import Report, State
from django.core.exceptions import ObjectDoesNotExist
from django.utils.timezone import make_aware

def load_reports(report_df: pd.DataFrame):
    """Save every row of ``report_df`` as a ``Report``.

    For each row:
      * the ``state`` column is looked up as ``State.initials``; a missing
        state is created on the fly, using the initials as its name as well,
      * the ``date`` column is passed through ``make_aware``,
      * float NaN values are replaced with ``None``,
      * the remaining columns are passed to ``Report`` as keyword arguments.

    Changes compared with the previous version:
      * ``State`` objects are cached per initials, so the database is queried
        once per distinct state instead of once per row.
      * The row is no longer modified inside ``DataFrame.apply`` (pandas does
        not support functions that mutate the object they are given); rows
        are iterated explicitly and a fresh dict is built for each.
      * The NaN check uses ``isinstance`` so float subclasses are covered too.

    Args:
        report_df: frame whose columns match the ``Report`` field names,
            including ``state`` and ``date``.
    """
    states_by_initials = {}

    def resolve_state(initials):
        # Fetch (or create) each state once and reuse it for later rows.
        if initials not in states_by_initials:
            try:
                state = State.objects.get(initials=initials)
            except ObjectDoesNotExist:
                # The data only carries the initials, so they double as the name.
                state = State(name=initials, initials=initials)
                state.save()
            states_by_initials[initials] = state
        return states_by_initials[initials]

    def generate_report(report_record: pd.Series):
        report_dict = {
            k: (None if isinstance(v, float) and isnan(v) else v)
            for k, v in report_record.to_dict().items()
        }
        # Replace the raw column values with what the model expects.
        report_dict['state'] = resolve_state(report_record['state'])
        report_dict['date'] = make_aware(report_record['date'])

        # id=None leaves the primary key unset for this new object.
        report_obj = Report(id=None, **report_dict)
        report_obj.save()

    for _, report_record in report_df.iterrows():
        generate_report(report_record)

In [8]:
''' Testing code '''

def sample_code():
    """Manual smoke test: store the first CSV row as a ``Report``.

    A throwaway ``State`` named ``'random'`` is created for the row, so each
    call adds one ``State`` and one ``Report`` to the database.

    The CSV is read from ``DATA_PATH`` (computed at the top of the file for
    exactly this purpose) instead of a second, hard-coded relative path that
    only resolves from one working directory.
    """
    df = pd.read_csv(DATA_PATH, parse_dates=['date'])
    df.columns = [camel_case_to_snake_case(column) for column in df.columns]

    # Work on a copy so the frame itself is left untouched.
    record = df.loc[0].copy()
    state = State(name='random', initials=record['state'])
    state.save()
    record.loc['state'] = state

    # Float NaN values are replaced with None before building the model.
    newdict = {
        k: (None if isinstance(v, float) and isnan(v) else v)
        for k, v in record.to_dict().items()
    }
    print(newdict)
    obj = Report(id=None, **newdict)
    obj.save()
    print('DONE')

In [9]:
'''
    NOTE: this is the main function & entry point of the whole script (most important)
'''

from ast import arg


def run(*args):
    """Entry point: reset the project state, then import the CSV into the database.

    Every preparation step runs by default. Passing the matching flag as a
    positional argument skips that step:

        'no-clear-migrations'  -> clear_migrations
        'no-clean-cache'       -> clean_cache
        'no-drop-db'           -> drop_database
        'no-remigrate-db'      -> clean_and_remigrate_db
        'no-generate-schema'   -> generate_drf_spec_schema
        'no-print-settings'    -> print_settings

    The CSV is read from ``DATA_PATH`` (computed at the top of the file)
    rather than a duplicated relative path, so the import no longer depends
    on the working directory at call time.
    """
    # TODO: add indexes and unique constraints
    print('script args: ', args)

    # Default behavior is run all below tasks (in this order).
    prep_steps = (
        ('no-clear-migrations', clear_migrations),
        ('no-clean-cache', clean_cache),
        ('no-drop-db', drop_database),
        ('no-remigrate-db', clean_and_remigrate_db),
        ('no-generate-schema', generate_drf_spec_schema),
        ('no-print-settings', print_settings),
    )
    for skip_flag, step in prep_steps:
        if skip_flag not in args:
            step()
    print('-----DONE PREP-----------------')

    # Data processing part
    df = pd.read_csv(DATA_PATH, parse_dates=['date'])
    print('Shape: ', df.shape)
    # Column names are converted to snake_case before loading.
    df.columns = [camel_case_to_snake_case(column) for column in df.columns]
    load_reports(df)
    # TODO: process data somehow
    print('Done importing')

In [10]:
# ''' Notebook test run '''
import threading
import time
# Flags forwarded to run(); e.g. ['no-drop-db'] skips dropping the database.
NOTEBOOK_ARGS = []

if IS_NOTEBOOK:
    # NOTE(review): run() is presumably executed on a worker thread because the
    # Django ORM refuses synchronous calls from the notebook's event loop - confirm.
    print("Main    : before creating thread")
    x = threading.Thread(target=run, args = NOTEBOOK_ARGS)
    print("Main    : before running thread")
    x.start()
    # Announce the wait before blocking; this message used to be printed only
    # after join() had already returned.
    print("Main    : wait for the thread to finish")
    x.join()
    print("Main    : all done")

Main    : before creating thread
Main    : before running thread
script args:  ()
Exec:  python manage.py clear_cache
Out:  Cache "default" has been cleared!

Exec:  python manage.py clean_pyc
Exec:  pyclean f:\Projects\geo-covid-backend
Exec:  python manage.py flush --noinput
Exec:  python manage.py makemigrations us_covid_api
Out:  Migrations for 'us_covid_api':
  us_covid_api\migrations\0001_initial.py
    - Create model State
    - Create model Report

Exec:  python manage.py migrate
Out:  Operations to perform:
  Apply all migrations: admin, auth, contenttypes, sessions, us_covid_api
Running migrations:
This version of djongo does not support "NULL, NOT NULL column validation check" fully. Visit https://nesdis.github.io/djongo/support/
  Applying contenttypes.0001_initial...This version of djongo does not support "schema validation using CONSTRAINT" fully. Visit https://nesdis.github.io/djongo/support/
 OK
  Applying auth.0001_initial...This version of djongo does not support "sch

In [11]:
'''
    Testing queries
'''

import timeit
from random import randrange, seed, randint
from datetime import timedelta
import datetime

# Settings
# Number of days stats_day_range_all_states adds to its start day to get the end day.
DAYS_INTERVAL = 5 
# Fixed seed for the `random` module so random_date() / random_state_initials()
# produce the same samples on every fresh run.
seed(0)


# Helper
def random_date(start=datetime.date(2020, 1, 13), end=datetime.date(2021, 3, 7)):
    """
    Return a random point in time between ``start`` (inclusive) and ``end``
    (exclusive).

    A whole number of seconds is drawn from the span and added to ``start``.
    The result has the same type as ``start``: with the default ``date``
    arguments only the whole-day part of the offset takes effect and a
    ``date`` is returned; with ``datetime`` arguments a ``datetime`` is
    returned.
    """
    seconds_per_day = 24 * 60 * 60
    span = end - start
    span_seconds = span.days * seconds_per_day + span.seconds
    offset = timedelta(seconds=randrange(span_seconds))
    return start + offset

def random_state_initials():
    """Return one entry picked at random from the fixed list of state initials below."""
    states = [
        "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
        "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
        "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
        "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
        "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
    ]
    # Uniform index over the whole list.
    picked_index = randrange(len(states))
    return states[picked_index]

def examine_thread_speed(function, args = (), verbose = True):
    """Run ``function`` on a separate thread and return the elapsed wall-clock seconds.

    Args:
        function: callable to time. It receives ``verbose`` as an extra
            trailing positional argument when ``args`` is non-empty, and as
            the ``verbose`` keyword argument otherwise.
        args: positional arguments for ``function``.
        verbose: print progress lines here and forward the flag to ``function``.

    Returns:
        Seconds between starting the thread and the thread having finished.
    """
    if verbose:
        print('--------------------------')
    # NOTE: there is threading overhead involved
    if args:
        worker = threading.Thread(target=function, args=(*args, verbose))
    else:
        worker = threading.Thread(target=function, kwargs={'verbose': verbose})
    if verbose:
        print("Before running thread: ", function.__name__, ", arg: ", args)

    started_at = timeit.default_timer()
    worker.start()
    worker.join()
    elapsed = timeit.default_timer() - started_at

    if verbose:
        print('Done, Execution Time (second): ', elapsed)
    return elapsed


# Tests
def stats_one_day_all_states(date, verbose = True):
    """Query the reports of all states for a single calendar day.

    The filter spans from the first to the last representable time of
    ``date``, both made timezone-aware with ``make_aware``.

    Args:
        date: the day to query.
        verbose: print the date and the number of reports found.
    """
    if verbose:
        print('Date: ', date)
    day_start = make_aware(datetime.datetime.combine(date, datetime.time.min))
    day_end = make_aware(datetime.datetime.combine(date, datetime.time.max))
    reports = Report.objects.filter(date__range=(day_start, day_end))
    if verbose:
        print('Report retrieved: ', reports.count())
    # Printed regardless of `verbose`.
    if not len(reports):
        print('No reports found at given date!')

def stats_day_range_all_states(start_day, verbose = True):
    """Query the reports of all states for a run of consecutive days.

    The range starts at the beginning of ``start_day`` and ends at the last
    representable time of ``start_day + DAYS_INTERVAL`` days, so both end
    days are included.

    Args:
        start_day: first day of the range.
        verbose: print the range and the number of reports found.
    """
    end_day = start_day + datetime.timedelta(days=DAYS_INTERVAL)
    if verbose:
        print('Start day: ', start_day, ', end day: ', end_day)
    range_start = make_aware(datetime.datetime.combine(start_day, datetime.time.min))
    range_end = make_aware(datetime.datetime.combine(end_day, datetime.time.max))
    reports = Report.objects.filter(date__range=(range_start, range_end))
    if verbose:
        print('Report retrieved: ', reports.count())
    # Printed regardless of `verbose`.
    if not len(reports):
        print('No reports found at given date!')
            
def stats_one_state_all_days(state_initials, verbose = True):
    """Query every report of a single state.

    Args:
        state_initials: value matched against ``State.initials``.
        verbose: print the state and the number of reports found.

    Unknown initials are reported on stdout and the function returns early;
    previously ``.first()`` returned ``None`` in that case and the following
    ``state.id`` access raised ``AttributeError``. The "no reports" message
    now refers to the state instead of the copy-pasted "at given date".
    """
    if verbose:
        print('State: ', state_initials)
    state = State.objects.filter(initials=state_initials).first()
    if state is None:
        print('No state found with given initials!')
        return
    reports = Report.objects.filter(state_id=state.id)
    if verbose:
        print('Report retrieved: ', reports.count())
    # Printed regardless of `verbose`.
    if len(reports) == 0:
        print('No reports found for given state!')
# def stats_few_state_all_days(): pass # ! OUT OF SCOPE


# Results
# One verbose, timed run of each query (notebook only).
if IS_NOTEBOOK:
    # Single random day.
    examine_thread_speed(stats_one_day_all_states, args=(random_date(),))

    # Day range starting at another random day.
    start_day = random_date()
    examine_thread_speed(stats_day_range_all_states, args=(start_day,))

    # Full history of one random state.
    examine_thread_speed(stats_one_state_all_days, args=(random_state_initials(),))

--------------------------
Before running thread:  stats_one_day_all_states , arg:  (datetime.date(2020, 11, 7),)
Date:  2020-11-07
Report retrieved:  56
Done, Execution Time (second):  0.1538256999999703
--------------------------
Before running thread:  stats_day_range_all_states , arg:  (datetime.date(2020, 8, 1),)
Start day:  2020-08-01 , end day:  2020-08-06
Report retrieved:  336
Done, Execution Time (second):  0.37233250000002727
--------------------------
Before running thread:  stats_one_state_all_days , arg:  ('NJ',)
State:  NJ
Report retrieved:  392
Done, Execution Time (second):  0.32574030000000675


In [12]:

# Helper
def benchmark(function, args_list, inner_verbose=False):
    """Time ``function`` once per entry of ``args_list`` and show summary statistics.

    Args:
        function: callable handed to ``examine_thread_speed``.
        args_list: iterable of argument tuples, one timed run per tuple.
        inner_verbose: forwarded as ``verbose`` to ``examine_thread_speed``.

    Prints the raw timings (seconds) and displays ``describe()`` of them.
    """
    print('-----------------------------')
    results = [
        examine_thread_speed(function, call_args, verbose=inner_verbose)
        for call_args in args_list
    ]
    print('Run results (second): ', results)
    print('Stats: ')
    summary = pd.DataFrame(results).describe()
    display(summary)

# Settings
# Number of timed runs per benchmark; also the length of both sample lists below.
REPEATS = 20
# Inputs are drawn up front: first all the dates, then all the state initials.
# The same `dates` list is later used for both date-based benchmarks.
dates = [random_date() for _ in range(REPEATS)]
states_initials_list = [random_state_initials() for _ in range(REPEATS)]

# Results
def to_args(list_val):
    """Wrap every value in a one-element tuple, the argument shape ``benchmark`` expects."""
    return [(item,) for item in list_val]


# Benchmark each query over the prepared samples (notebook only).
if IS_NOTEBOOK:
    for _stats_function, _samples in (
        (stats_one_day_all_states, dates),
        (stats_day_range_all_states, dates),
        (stats_one_state_all_days, states_initials_list),
    ):
        benchmark(_stats_function, to_args(_samples))

-----------------------------
Run results (second):  [0.08017819999997755, 0.083844699999986, 0.0755628999999658, 0.07894600000003038, 0.07566020000001572, 0.07448540000001458, 0.07305430000002389, 0.07498730000003206, 0.10452370000001565, 0.08107149999995045, 0.07091400000001613, 0.0811706000000072, 0.0826979999999935, 0.0759539000000018, 0.07407360000001972, 0.0828939000000446, 0.07677400000000034, 0.08485020000000532, 0.07250239999996211, 0.03976539999996476]
Stats: 


Unnamed: 0,0
count,20.0
mean,0.077196
std,0.011387
min,0.039765
25%,0.074382
50%,0.076364
75%,0.081552
max,0.104524


-----------------------------
Run results (second):  [0.2867310000000316, 0.30694090000002916, 0.29867819999998346, 0.3016620999999873, 0.2922633000000019, 0.29977109999998675, 0.2884159999999838, 0.2612823999999705, 0.28861159999996744, 0.296034700000007, 0.28229989999999816, 0.32390380000003915, 0.31927869999998393, 0.29897670000002563, 0.28010239999997566, 0.2932335000000421, 0.2976938999999561, 0.287504599999977, 0.28994400000004816, 0.14419639999999845]
Stats: 


Unnamed: 0,0
count,20.0
mean,0.286876
std,0.036171
min,0.144196
25%,0.287311
50%,0.292748
75%,0.299175
max,0.323904


-----------------------------
Run results (second):  [0.31101050000000896, 0.2962329000000068, 0.29138280000000805, 0.33248170000001664, 0.30951310000000376, 0.3259357999999679, 0.3043194999999628, 0.29835759999997435, 0.3148168000000169, 0.26962689999999156, 0.2727337000000034, 0.26672200000001567, 0.3152616999999509, 0.26488480000000436, 0.2995670999999902, 0.26963250000000016, 0.27992810000000645, 0.2676862000000142, 0.26069540000003144, 0.3028835000000072]
Stats: 


Unnamed: 0,0
count,20.0
mean,0.292684
std,0.022196
min,0.260695
25%,0.269631
50%,0.297295
75%,0.309887
max,0.332482


In [13]:
# Export this notebook as a plain Python script (notebook only).
if IS_NOTEBOOK:
    # Build the path with os.path.join instead of a '.\scripts\...' literal:
    # '\s' and '\l' are invalid escape sequences in a normal string, and the
    # backslash form only works on Windows. On Windows the joined path is the
    # same text as before.
    notebook_path = os.path.join('.', 'scripts', 'load_csv_to_database.ipynb')
    run_bash_cmd(f'jupyter nbconvert --to python {notebook_path}')

Exec:  jupyter nbconvert --to python .\scripts\load_csv_to_database.ipynb
