# Data Deep Dive.

In [None]:
# Copy dbconnection.json from /whhdata into the current working directory.
# NOTE(review): presumably this is the file dbutils.connect_to_main_database() reads — confirm.
!cp /whhdata/dbconnection.json .

In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import dbutils
import pandas as pd
import glob2
import os
import config
from tqdm import tqdm
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from IPython.display import display

## MAE for all models.

In [43]:
# Connect to the main database.
db_connector = dbutils.connect_to_main_database()

# Fetch every distinct value of the `type` column of artifact_quality.
select_sql_statement = "SELECT DISTINCT(type) FROM artifact_quality;"
types = db_connector.execute(select_sql_statement, fetch_all=True)

# Only type names longer than 20 characters are offered as models.
# NOTE(review): presumably the length check separates model identifiers from
# other artifact-quality types — confirm.
models = []
for type_row in types:
    type_name = type_row[0]
    if len(type_name) > 20:
        models.append(type_name)

@interact(models=models, datasets=["training", "nottraining"], bins=[100, 200, 400, 800])
def visualize(models, datasets, bins):
    """Plot a histogram of the stored 'mae' values for one model and dataset.

    Args:
        models: Value matched against the `type` column of artifact_quality.
        datasets: Value matched against the `misc` column of artifact_quality.
        bins: Number of histogram bins, passed through to matplotlib.
    """
    query = "SELECT value FROM artifact_quality WHERE misc = '{}' AND type='{}' AND key='mae';".format(datasets, models)

    # Each fetched row carries the value in its first column.
    mae_values = [row[0] for row in db_connector.execute(query, fetch_all=True)]

    fig, ax = plt.subplots(figsize=(20, 4))
    ax.hist(mae_values, bins=bins)
    ax.set_title("mae for {} results for {}".format(len(mae_values), datasets))
    plt.show()
    # Release the figure so repeated widget updates do not accumulate figures.
    plt.close(fig)


interactive(children=(Dropdown(description='models', options=('20190708-0919_2379-595height',), value='2019070…

## Gold Standard Results.

In [72]:
# Connect to the main database.
db_connector = dbutils.connect_to_main_database()

# Fetch every distinct value of the `type` column of artifact_quality.
select_sql_statement = "SELECT DISTINCT(type) FROM artifact_quality;"
types = db_connector.execute(select_sql_statement, fetch_all=True)

# Only type names longer than 20 characters are offered as models.
type_names = (type_row[0] for type_row in types)
models = [type_name for type_name in type_names if len(type_name) > 20]

# Upper bounds applied to the stored 'mae' value; "all" means no bound.
limits = [0.2, 0.4, 0.6, 1.2, "all"]

@interact(models=models, datasets=["training", "nottraining"])
def visualize(models, datasets):
    """Tabulate how many 'mae' artifacts of a model fall below each limit.

    For every entry of the module-level `limits` list, one row is produced
    with a description, the number of matching artifacts and their share of
    all 'mae' artifacts for the selected model and dataset. The resulting
    DataFrame is displayed.

    Args:
        models: Value matched against the `type` column of artifact_quality.
        datasets: Value matched against the `misc` column of artifact_quality.
    """
    # Shared prefix of every COUNT query, built once and left unterminated so
    # that an optional value filter can be appended before the single ";".
    # NOTE(review): the SQL is assembled with str.format; the values come from
    # the dropdown options above. Switch to bound parameters if
    # db_connector.execute supports them.
    count_sql_prefix = "SELECT COUNT(*) FROM artifact_quality WHERE misc = '{}' AND type='{}' AND key='mae'".format(datasets, models)

    # Get all artifacts.
    all_artifacts_count = db_connector.execute(count_sql_prefix + ";", fetch_one=True)[0]

    rows = []
    for limit in limits:
        if limit == "all":
            # Same query as the total above, so reuse its result.
            description = "All"
            below_limit_count = all_artifacts_count
        else:
            # Get those below limit.
            # NOTE(review): the label multiplies the limit by 10 and calls it
            # mm, so limits are presumably given in cm — confirm.
            description = "MAE < {}mm".format(int(10 * limit))
            select_sql_statement = count_sql_prefix + " AND value <{};".format(limit)
            below_limit_count = db_connector.execute(select_sql_statement, fetch_one=True)[0]

        # Avoid a ZeroDivisionError when the selection has no artifacts at all.
        if all_artifacts_count:
            percent = 100 * below_limit_count / all_artifacts_count
        else:
            percent = 0.0

        rows.append((description, below_limit_count, round(percent, 2)))

    headers = ("Description", "Number of artifacts", "%")
    df = pd.DataFrame(rows, columns=headers)
    display(df)

interactive(children=(Dropdown(description='models', options=('20190708-0919_2379-595height',), value='2019070…

## Available Data in Storage (rgb scans, pcd scans): rgb => 16331, pcd => 5049

In [11]:
# TODO Tristan: This can be optimized a lot with glob, I guess.

# Count the entries of every "rgb" and "pc" folder found at
# <artifacts_path>/<entry>/measurements/<measurement>/.
sum_jpg = 0
sum_pcd = 0

search_path = config.artifacts_path + "/"

# Top-level entries that are excluded from the count.
block = [".DS_Store", "DEMO_TEST_0001", "._data"]
for entry_name in tqdm(os.listdir(search_path)):
    if entry_name in block:
        continue

    # Check the fixed folder name directly instead of listing the whole entry;
    # this also skips plain files, on which os.listdir would raise.
    measurements_path = os.path.join(search_path, entry_name, "measurements")
    if not os.path.isdir(measurements_path):
        continue

    for measurement_name in os.listdir(measurements_path):
        measurement_path = os.path.join(measurements_path, measurement_name)
        if not os.path.isdir(measurement_path):
            continue

        pc_path = os.path.join(measurement_path, "pc")
        if os.path.isdir(pc_path):
            sum_pcd += len(os.listdir(pc_path))

        rgb_path = os.path.join(measurement_path, "rgb")
        if os.path.isdir(rgb_path):
            sum_jpg += len(os.listdir(rgb_path))

print("Number of rgb scans in Storage :  " + str(sum_jpg))
print("Number of pc scans in Storage :  " + str(sum_pcd))



  0%|          | 0/5579 [00:00<?, ?it/s][A[A

 12%|█▏        | 643/5579 [00:00<00:00, 6420.68it/s][A[A

 22%|██▏       | 1220/5579 [00:00<00:00, 6208.21it/s][A[A

 31%|███▏      | 1751/5579 [00:00<00:00, 5906.83it/s][A[A

 41%|████      | 2287/5579 [00:00<00:00, 5728.70it/s][A[A

 51%|█████     | 2837/5579 [00:00<00:00, 5657.63it/s][A[A
 47%|████▋     | 2624/5579 [00:12<00:00, 4817.60it/s][A

 51%|█████     | 2837/5579 [00:12<00:00, 5657.63it/s][A[A

 58%|█████▊    | 3239/5579 [00:12<00:21, 108.30it/s] [A[A

 58%|█████▊    | 3243/5579 [00:12<00:35, 65.25it/s] [A[A

 58%|█████▊    | 3247/5579 [00:12<00:46, 49.71it/s][A[A

KeyboardInterrupt: 