# README.
This notebook synchronizes the `measurements` table of the main database with a CSV file found in `/whhdata`: the table is cleared and then re-populated from the CSV.

In [10]:
import os
import glob
import pandas as pd
import dbutils
import progressbar

# Locate and load CSV from whhdata.

In [2]:
# Where to get the data: every CSV file directly inside the whhdata folder.
whhdata_path = "/whhdata"
glob_search_path = os.path.join(whhdata_path, "*.csv")
csv_paths = sorted(glob.glob(glob_search_path))

# Fail with an explicit message instead of an opaque IndexError when no CSV is present.
if not csv_paths:
    raise FileNotFoundError("No CSV files found matching {}".format(glob_search_path))

# The first path in lexicographic order is the one that gets loaded.
# NOTE(review): if the file names embed a timestamp, this picks the oldest export —
# confirm that this is intended (the newest would be csv_paths[-1]).
csv_path = csv_paths[0]

# Report the file that is actually used, not the whole candidate list.
print("Using {}".format(csv_path))

Using ['/whhdata/results-measures-20190319-102143.csv']


In [3]:
# Read the selected CSV file into a pandas data-frame.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [21]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,id,personId,qrcode,sex,type,age,height,weight,muac,headCircumference,oedema,latitude,longitude,address,timestamp,deleted,deletedBy,visible,createdBy
0,fb3bfc1a748e12ee_measure_1552552380603_hfPIPRf...,fb3bfc1a748e12ee_mandloi_1552552136224_j441qmO...,,,manual,1371,90.5,10.7,13.5,46.2,False,22.33796,74.953553,,1552890976734,False,,False,cgm15whh@gmail.com
1,fb3bfc1a748e12ee_measure_1552378132366_b3zadc7...,fb3bfc1a748e12ee_jamra_1552378019288_CilAQntVd...,,,manual,1104,85.4,9.75,14.1,43.3,False,22.363275,74.99987,,1552891706555,False,,False,cgm15whh@gmail.com
2,fb3bfc1a748e12ee_measure_1552374894278_oLUZcV3...,fb3bfc1a748e12ee_nigwal_1552374875042_YyE4As6U...,,,manual,617,76.3,8.2,13.5,45.0,False,22.363275,74.99987,,1552891706483,False,,False,cgm15whh@gmail.com
3,fb3bfc1a748e12ee_measure_1552376998993_qhSkTpX...,fb3bfc1a748e12ee_jamra_1552376878360_GZ2RfdJKN...,,,manual,707,74.0,7.3,11.6,44.3,False,22.363275,74.99987,,1552891706532,False,,False,cgm15whh@gmail.com
4,fb3bfc1a748e12ee_measure_1552376299613_pDjxYHV...,fb3bfc1a748e12ee_buawar_1552376170986_vFRpgUMz...,,,manual,608,71.0,7.65,12.6,42.6,False,22.363275,74.99987,,1552891706509,False,,False,cgm15whh@gmail.com


In [22]:
# Collect the column labels of the data-frame as a plain Python list and display them.
columns = df.columns.tolist()
columns

['id',
 'personId',
 'qrcode',
 'sex',
 'type',
 'age',
 'height',
 'weight',
 'muac',
 'headCircumference',
 'oedema',
 'latitude',
 'longitude',
 'address',
 'timestamp',
 'deleted',
 'deletedBy',
 'visible',
 'createdBy']

# Mapping the CSV-columns to DB-columns.

In [6]:
# CSV columns whose name differs in the database (CSV name -> DB name).
renamed_columns = {
    "id": "measurement_id",
    "personId": "person_id",
    "age": "age_days",
    "height": "height_cms",
    # NOTE(review): a "_cms" suffix on weight looks odd — confirm against the DB schema.
    "weight": "weight_cms",
    "muac": "muac_cms",
    "headCircumference": "head_circumference_cms",
    "deletedBy": "deleted_by",
    "createdBy": "created_by",
}

# Start from an identity mapping for every CSV column, then apply the overrides.
columns_mapping = dict(zip(columns, columns))
columns_mapping.update(renamed_columns)
columns_mapping

{'id': 'measurement_id',
 'personId': 'person_id',
 'qrcode': 'qrcode',
 'sex': 'sex',
 'type': 'type',
 'age': 'age_days',
 'height': 'height_cms',
 'weight': 'weight_cms',
 'muac': 'muac_cms',
 'headCircumference': 'head_circumference_cms',
 'oedema': 'oedema',
 'latitude': 'latitude',
 'longitude': 'longitude',
 'address': 'address',
 'timestamp': 'timestamp',
 'deleted': 'deleted',
 'deletedBy': 'deleted_by',
 'visible': 'visible',
 'createdBy': 'created_by'}

# Synchronize the DB.

In [20]:
# Connect to the database.
main_connector = dbutils.connect_to_main_database()
table = "measurements"

# Number of rows before.
rows_number = main_connector.get_number_of_rows(table)
print("Number of rows before: {}".format(rows_number))

# Clear the table before re-inserting everything from the CSV.
# TODO consider updating existing rows instead of clearing and re-inserting.
# NOTE(review): if an insert below fails, the table stays cleared/partially filled —
# wrap this in a transaction if the connector supports one.
main_connector.clear_table(table)

# Number of rows after clear.
rows_number = main_connector.get_number_of_rows(table)
print("Number of rows after clear: {}".format(rows_number))


def _sql_literal(value):
    """Render a value as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a value such as O'Brien cannot
    terminate the literal early and break (or inject into) the statement.

    NOTE(review): values are stringified with str(), exactly as before, so a
    missing value (NaN) is still sent as the literal 'nan' — consider mapping
    missing values to NULL. Parameterized queries would be preferable if the
    connector's execute() supports them.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Insert data in batches.
batch_size = 1000
rows_number_df = len(df.index)

# The column list is identical for every row, so build it once.
keys_string = "(" + ", ".join(str(db_key) for db_key in columns_mapping.values()) + ")"

statements = []
bar = progressbar.ProgressBar(max_value=rows_number_df)
# Use the row position (not the index label) so batching and progress do not
# depend on the data-frame having a default 0..n-1 index.
for position, (_, row) in enumerate(df.iterrows()):
    bar.update(position)

    values_string = "VALUES (" + ", ".join(
        _sql_literal(row[df_key]) for df_key in columns_mapping
    ) + ")"
    statements.append(
        "INSERT INTO {} {}\n{};\n".format(table, keys_string, values_string)
    )

    # Flush when the batch is full or when the last row has been reached, so
    # that the final (possibly single-row) batch is never dropped.
    is_last_row = position == rows_number_df - 1
    if len(statements) >= batch_size or is_last_row:
        main_connector.execute("".join(statements))
        statements = []

bar.finish()

# Number of rows after sync.
rows_number = main_connector.get_number_of_rows(table)
print("Number of rows after: {}".format(rows_number))

Number of rows before: 13739


  2% (348 of 13739) |                    | Elapsed Time: 0:00:00 ETA:   0:00:05

Number of rows after clear: 0


100% (13739 of 13739) |##################| Elapsed Time: 0:00:07 Time:  0:00:07


Number of rows after: 13739
