# Setup

In [34]:
#@title ## `pip install`
# Don't forget to restart runtime after installing
%pip install "labelbox[data]" --quiet  # installs all required libraries plus extras required in manipulating annotations (shapely, geojson, numpy, PILLOW, opencv-python, etc.)
%pip install -U kaleido  --quiet # for saving the still figures besides .eps (i.e png, pdf)
%pip install poppler-utils  --quiet   # for exporting to .eps extension
%pip install plotly>=5.13.0    # need ≥5.6 to use ticklabelstep argument, ≥5.8 to use minor ticks. Release history here https://github.com/plotly/plotly.py/releases

In [1]:
#@title ## Base imports
import os
import cmd
import sys
import json
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

import skimage
import skimage.io
import PIL
import PIL.Image
import requests
import urllib

import labelbox

import IPython.display
import matplotlib
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots

# Display versions of python packages
pip_versions = %system pip freeze  # uses colab magic to get list from shell
pip_versions_organized = {
    "standard": [pip_version for pip_version in pip_versions if "==" in pip_version],
    "other": [pip_version for pip_version in pip_versions if "==" not in pip_version]
    }
print(f"Python version: {sys.version} \n")  # display version of python itself (i.e. 3.8.10)
cli = cmd.Cmd()
cli.columnize(pip_versions_organized["standard"], displaywidth=800)
cli.columnize(pip_versions_organized["other"], displaywidth=160)

Python version: 3.9.16 (main, Dec  7 2022, 01:11:51) 
[GCC 9.4.0] 

absl-py==1.4.0                backcall==0.2.0         charset-normalizer==2.0.12  cvxopt==1.3.0           dm-tree==0.1.8            fastdownload==0.0.7        GDAL==3.3.2                       google-cloud-bigquery==3.4.2           greenlet==2.0.2           htmlmin==0.1.12              imutils==0.5.4           jupyter-client==6.1.12        libclang==15.0.6.1          mistune==0.8.4           nbclient==0.7.2                  opencv-python==4.6.0.66           parso==0.8.3         pluggy==0.7.1              psutil==5.9.4              pydot-ng==2.0.0     PySocks==1.7.1                      qdldl==0.1.5.post3              Send2Trash==1.8.0        sphinxcontrib-applehelp==1.0.4        tblib==1.7.0                          terminado==0.17.1     traitlets==5.7.1          Werkzeug==2.2.3          
alabaster==0.7.13             backoff==1.10.0         click==8.1.3                cvxpy==1.3.0            docutils==0.16            

In [3]:
#@title Basic helper functions
colab_ip = %system hostname -I   # uses colab magic to get list from shell
colab_ip = colab_ip[0].strip()   # returns "172.28.0.12"
# Get most possible port names with: !sudo lsof -i -P -n | grep LISTEN
colab_port = 9000                # could use 6000, 8080, or 9000

notebook_filename = filename = requests.get(f"http://{colab_ip}:{colab_port}/api/sessions").json()[0]["name"]

# Avoids scroll-in-the-scroll in the entire Notebook
def resize_colab_cell():
    """Emit the Javascript snippet that raises the output iframe's max height to 10000."""
    resize_script = IPython.display.Javascript(
        "google.colab.output.setIframeHeight(0, true, {maxHeight: 10000})"
    )
    display(resize_script)

# Run the resize before each cell executes
get_ipython().events.register("pre_run_cell", resize_colab_cell)


#@markdown ### func `def get_path_to_save(...):`
def get_path_to_save(plot_props:dict=None, file_prefix="", save_filename:str=None, save_in_subfolder:str=None, extension="jpg", dot=".", create_folder_if_necessary=True):
    """Build the path under ``outputs/<notebook name>/`` where a file should be saved.

    Code created myself (Rahul Yerrabelli)

    The folder is ``outputs/<notebook name>[/<subfolder>...]``, where the notebook name is
    the module-level ``notebook_filename`` up to its first ".". The returned file name is
    ``file_prefix + save_filename + dot + extension``.

    Args:
        plot_props: Currently unused; kept for interface compatibility.
        file_prefix: Text prepended to the file name.
        save_filename: File name without extension; defaults to "unnamed" when None.
        save_in_subfolder: A single subfolder name, or a list/tuple/set/ndarray of nested
            subfolder names appended in iteration order. None adds no subfolder.
        extension: File extension without the leading dot.
        dot: Separator placed between the file name and the extension.
        create_folder_if_necessary: When True, create the folder if it does not exist yet.

    Returns:
        The path (str) of the file inside the (possibly newly created) folder.
    """
    # NOTE(review): this sanitizing table is defined but not applied anywhere in this
    # function yet (presumably intended for names derived from plot_props) — confirm before use.
    replace_characters = {
        "$": "",
        "\\frac":"",
        "\\mathrm":"",
        "\\left(":"(",  "\\right)":")",
        "\\left[":"[",  "\\right]":"]",
        "\\": "",       "/":"-",
        "{": "(",       "}": ")",
        "<":"",         ">":"",
        "?":"",
        "_":"",
        "^":"",
        "*":"",
        "!":"",
        ":":"-",
        "|":"-",
        ".":"_",
    }

    if save_filename is None:
        save_filename = "unnamed"

    save_path = [
                 "outputs",
                f"""{notebook_filename.split(".",1)[0]}""",
                ]
    if save_in_subfolder is not None:
        if isinstance(save_in_subfolder, (list, tuple, set, np.ndarray) ):
            # extend() adds each nested folder name; the previous append(**...) raised
            # TypeError because ** unpacking requires a mapping.
            save_path.extend(save_in_subfolder)
        else:  # should be a string then
            save_path.append(save_in_subfolder)
    save_path = os.path.join(*save_path)

    if create_folder_if_necessary and not os.path.exists(save_path):
        # exist_ok guards against the folder appearing between the check and the creation
        os.makedirs(save_path, exist_ok=True)
    return os.path.join(save_path, file_prefix+save_filename+dot+extension)


## Google drive

In [4]:
#@title ### Prepare for mounting

# Name of the folder under /content/ that Google Drive is mounted to.
mountpoint_folder_name = "drive"  # can be anything, doesn't have to be "drive"
# Location of this project inside the drive's "MyDrive" folder (editable through the Colab form field).
project_path_within_drive = "PythonProjects/SpeculumAnalysis" #@param {type:"string"}
# Full path of the project folder: /content/<mountpoint>/MyDrive/<project path>
project_path_full = os.path.join("/content/",mountpoint_folder_name,
                        "MyDrive",project_path_within_drive)

# Make the project folder the working directory; later cells use relative paths such as "data/..." and "auth/...".
%cd {project_path_full}

<IPython.core.display.Javascript object>

/content/drive/MyDrive/Computer Backups/Rahul Yerrabelli drive/PythonProjects/SpeculumAnalysis


In [5]:
#@title ## Mount google drive
# Mount Google Drive at /content/<mountpoint_folder_name> and switch into the project folder.
# Outside Colab the google.colab import fails; that case is caught below and only the traceback is printed.
try:
    import google.colab.drive
    import os, sys
    # Need to move out of google drive directory if going to remount
    %cd
    # drive.mount documentation can be accessed via: drive.mount?
    #Signature: drive.mount(mountpoint, force_remount=False, timeout_ms=120000, use_metadata_server=False)
    google.colab.drive.mount(os.path.join("/content/",mountpoint_folder_name), force_remount=True)  # mounts to a folder called mountpoint_folder_name

    # Placeholder: adding the project folder to sys.path is currently disabled (the branch does nothing).
    if project_path_full not in sys.path:
        pass
        #sys.path.insert(0,project_path_full)
    # Return to the project folder now that the drive is mounted
    %cd {project_path_full}
    
except ModuleNotFoundError:  # in case not run in Google colab
    import traceback
    traceback.print_exc()

<IPython.core.display.Javascript object>

/root
Mounted at /content/drive
/content/drive/MyDrive/Computer Backups/Rahul Yerrabelli drive/PythonProjects/SpeculumAnalysis


# Data

## Read in the collected/labeled data

### Labelbox

#### Option 1: Read from labelbox

##### Set up labelbox connection
Works with the Labelbox API (https://labelbox.com/), the tool I used to label all the distances on the images.

In [6]:
# Labelbox project and dataset identifiers
PROJECT_ID = "cl2cept1u4ees0zbx6uan5kwa"
DATASET_IDS = {
    "1_1": "cl2cerkwd5gtd0zcahfz98401",  # SpeculumData1__GloveNonsterile
    "1_2": "cl2hu1u8z019a0z823yl5f8gr",  # SpeculumData1__Condom
    "2_1": "cl7183159370n072eammu39e6",  # SpeculumData2_1__MultipleSizeSpecula
    "2_2": None,  # haven't uploaded yet, trials weren't that useful
    "2_3": "cleky2xtu19w3070qezkdbhd9",  # SpeculumData2_3__GloveSterile
}

# The Labelbox API key is stored in a separate file because it is specific to one
# Labelbox account and shouldn't be committed to git. Contact the team
# (i.e. Rahul Yerrabelli) to get access to the data from your own account.
with open("auth/LABELBOX_API_KEY.json", "r") as infile:
    API_KEY = json.load(infile)["API_KEY"]

client = labelbox.Client(api_key=API_KEY)
del API_KEY   # delete sensitive info
project = client.get_project(PROJECT_ID)

# Below code is from labelbox tutorial
# Create a mapping for the colors
def hex_to_rgb(hex_color):
    """Convert a hex color string such as "#ff8000" into a tuple of three ints.

    The first character is skipped; the next three pairs of characters are each
    parsed as a base-16 number.
    """
    channel_starts = (1, 3, 5)
    return tuple(int(hex_color[start:start + 2], 16) for start in channel_starts)
# Map each tool name in the project's ontology to its color converted by hex_to_rgb
ontology = labelbox.OntologyBuilder.from_project(project)
colors = {}
for tool in ontology.tools:
    colors[tool.name] = hex_to_rgb(tool.color)

<IPython.core.display.Javascript object>

##### Download datasets into df

In [7]:
# One row of metadata per dataset short id; the last column keeps the Labelbox dataset object itself
dataset_columns = ["uid","name","description","created_at","updated_at","row_count","dataset"]

datasets = {}
for short_id, DATASET_ID in DATASET_IDS.items():
    if DATASET_ID is not None:
        # Alternative way to get dataset
        # dataset = next(client.get_datasets(where=(labelbox.Dataset.name == "SpeculumDataset2_3")))
        dataset = client.get_dataset(DATASET_ID)
        datasets[short_id] = [
            dataset.uid, dataset.name, dataset.description,
            dataset.created_at, dataset.updated_at, dataset.row_count,
            dataset,
        ]
    else:
        # Dataset without an id: keep a placeholder row so the short id still appears
        datasets[short_id] = [None] * len(dataset_columns)

datasets_df = pd.DataFrame.from_dict(datasets, orient="index", columns=dataset_columns)
# From here on `datasets` is the Series mapping short id -> dataset object (or None)
datasets = datasets_df["dataset"]

<IPython.core.display.Javascript object>

##### Download data rows into df

In [8]:
# Build one DataFrame of data-row metadata per dataset, then stack them into data_df
data_row_columns = ["dataset_short_id","external_id","created_at","updated_at","uid",
                    "width", "height", "mimeType", "contentLength"]

data_dfs = {}
for short_id, dataset in datasets.items():
    if dataset is None:
        continue   # placeholder entry without an uploaded dataset
    data_rows = dataset.data_rows()
    records = []
    for data_row in data_rows:
        media = data_row.media_attributes
        records.append([
            short_id, data_row.external_id, data_row.created_at, data_row.updated_at, data_row.uid,
            media["width"], media["height"], media["mimeType"], media["contentLength"],
        ])
    per_dataset_df = pd.DataFrame(records, columns=data_row_columns)
    data_dfs[short_id] = per_dataset_df.sort_values(by="external_id")

data_df = pd.concat(data_dfs).reset_index(drop=True)  # drop index as we don't need it and key info is in the df itself
data_df

<IPython.core.display.Javascript object>

Unnamed: 0,dataset_short_id,external_id,created_at,updated_at,uid,width,height,mimeType,contentLength
0,1_1,20220423_142023.jpg,2022-04-23 22:03:35+00:00,2022-04-23 22:03:35+00:00,cl2cerrz71mj90zrwfqq43dfp,4032,2268,image/jpeg,308728
1,1_1,20220423_142031.jpg,2022-04-23 22:03:35+00:00,2022-04-23 22:03:35+00:00,cl2cerrz71mjd0zrw1hjxbmrs,4032,2268,image/jpeg,303287
2,1_1,20220423_142049.jpg,2022-04-23 22:03:35+00:00,2022-04-23 22:03:35+00:00,cl2cerrz71mjh0zrwd0wf0ysx,4032,2268,image/jpeg,291925
3,1_1,20220423_142054.jpg,2022-04-23 22:03:35+00:00,2022-04-23 22:03:35+00:00,cl2cerrz71mjl0zrw5yux6dxy,4032,2268,image/jpeg,295793
4,1_1,20220423_142100.jpg,2022-04-23 22:03:35+00:00,2022-04-23 22:03:35+00:00,cl2cerrz61mj50zrw7nawe0ma,4032,2268,image/jpeg,302057
...,...,...,...,...,...,...,...,...,...
447,2_3,20230225_173617.jpg,2023-03-04 00:58:34+00:00,2023-03-04 00:58:34+00:00,clet9aaeu0g9407aoa1kmgus5,2252,4000,image/jpeg,2589623
448,2_3,20230225_173619.jpg,2023-03-04 00:58:34+00:00,2023-03-04 00:58:34+00:00,clet9aaeu0g9807ao1e9kgj9d,2252,4000,image/jpeg,2590057
449,2_3,20230225_173623.jpg,2023-03-04 00:58:34+00:00,2023-03-04 00:58:34+00:00,clet9aaeu0g9c07aoe2ra1s48,2252,4000,image/jpeg,2591756
450,2_3,20230225_173627.jpg,2023-03-04 00:58:34+00:00,2023-03-04 00:58:34+00:00,clet9aaeu0g9g07ao3b8f8uy0,2252,4000,image/jpeg,2590123


##### Attempt fixing datetimes by the external id

In [94]:
import pytz
import datetime
chicago_tz = pytz.timezone("America/Chicago")

# Experiment: try to overwrite a data row's timestamp with the capture time encoded in
# its external id (e.g. "20230225_155110.jpg" -> 2023-02-25 15:51:10 Chicago time).
# Only the first data row is tried (note the `break`).
for data_row in datasets["2_3"].data_rows():
    print(type(data_row.created_at))
    print(data_row.external_id, data_row.created_at, data_row.updated_at, data_row.uid)

    # The first 15 characters of the external id are "YYYYMMDD_HHMMSS"
    naive_dt = datetime.datetime.strptime(data_row.external_id[:15], "%Y%m%d_%H%M%S")
    # pytz zones must be attached with localize(); passing them as tzinfo= to the
    # datetime constructor silently uses the zone's LMT offset instead of CST/CDT.
    dt = chicago_tz.localize(naive_dt)

    # NOTE(review): this call fails with
    # InvalidAttributeError: Field(s) ''update_at'' not valid on DB type 'DataRow'("Field(s) ''update_at'' not valid on DB type 'DataRow'", None)
    # The attribute read above is spelled `updated_at`, so `update_at` may be a typo —
    # confirm that Labelbox lets clients write that field before changing the keyword.
    data_row.update(update_at=dt
            )
    break

<IPython.core.display.Javascript object>

<class 'datetime.datetime'>
20230225_155110.jpg 2023-03-04 17:39:05+00:00 2023-03-04 17:39:05+00:00 cleu90ykxoe9607aoefbc1gsw


InvalidAttributeError: ignored

##### Download labels into df

In [27]:
label_columns = ["Filename","x","y", "xstart","ystart","url", "Label ID",
                 "Ann Ct", "Reviews Ct", "Open Issues", "Skipped",
                 "created_at","updated_at","Label Seconds", "Created By",
                 "Agreement","Benchmark Agreement","Benchmark ID",]


def _label_to_row(label):
    """Flatten one Labelbox label into a list of values ordered like ``label_columns``.

    Only the first annotation is read; its start/end points give the x/y extents.
    """
    box = label.annotations[0].value
    extra = label.extra
    return [
        label.data.external_id,
        box.end.x - box.start.x,
        box.end.y - box.start.y,
        box.start.x,
        box.start.y,
        label.data.url,
        label.uid,
        len(label.annotations), len(extra["Reviews"]),   # Annotations should be exactly 1.
        extra["Has Open Issues"], extra["Skipped"],
        extra["Created At"], extra["Updated At"], extra["Seconds to Label"], extra["Created By"],  # Created By is an email address str
        extra["Agreement"], extra["Benchmark Agreement"], extra["Benchmark ID"],
    ]


# Pull every label of the project into memory and tabulate it
image_labels = project.label_generator().as_list()
labels_df = pd.DataFrame([_label_to_row(label) for label in image_labels],
                         columns=label_columns)

# Second export of the same project as plain records; labels3 holds a shallow copy of each
# record so the top-level keys can be edited without touching labels2
labels2 = project.export_labels(download = True, start="2022-04-01", end="2023-04-01")
labels3 = [record.copy() for record in labels2]

<IPython.core.display.Javascript object>


This method is deprecated and will be removed in a future release. LabelList class will be deprecated.


LabelList is deprecated and will be removed in a future release.



In [29]:
# Keys dropped from every record: special info ie emails, tokens (except the Label_url for now)
keys_to_drop = (
    "Labeled Data",  # url with token in it
    "View Label",    # url
    "Created By",    # has email address
    "Reviews",       # empty list
)

for ind, entry in enumerate(labels3):
    # Simplify "Label" by removing unnecessary variables and making the necessary ones top level.
    # Thus, labels3 will be only 2 layers deep.
    if "Label" in entry:
        first_object = entry["Label"]["objects"][0]
        for key, val in first_object["bbox"].items():
            entry["Label"+"-"+key] = val
        # URL to download mask. Still has token in it
        entry["Label_url"] = first_object["instanceURI"]
        del entry["Label"]

    for key_to_drop in keys_to_drop:
        entry.pop(key_to_drop, None)

<IPython.core.display.Javascript object>

##### Download and save image masks

In [19]:
# Fetch each label's mask image from its URL and store it under the notebook's output folder
for ind, entry in enumerate(labels3):
    # Strip the extension at the last "." (usually .jpg, but this is more robust) and tag the name
    image_stem = entry["External ID"].rsplit(".", maxsplit=1)[0]
    filename = image_stem + "_label"
    filepath = get_path_to_save(save_filename=filename, extension="png")  # the mask will be png, not jpg
    urllib.request.urlretrieve(entry["Label_url"], filepath)

<IPython.core.display.Javascript object>

HTTPError: ignored

In [30]:
# Tabulate the flattened export, indexed by image name. Label_url is dropped before
# saving because that URL has the Labelbox token in it.
labelbox_df = (
    pd.DataFrame.from_dict(labels3)
    .set_index("External ID")
    .drop(columns=["Label_url"])
)

<IPython.core.display.Javascript object>

In [31]:
# Persist labels_df as both csv and pickle (same base path, different extension).
# NOTE(review): labels_df still carries the "url" and "Created By" (email) columns —
# confirm these files are not shared or committed.
save_path = "data/v2/02_intermediate/labels_df"
labels_df.to_csv(f"{save_path}.csv")
labels_df.to_pickle(f"{save_path}.pkl")

<IPython.core.display.Javascript object>

#### Option 2: Read from csv/pkl if already saved there from a previous run

In [32]:
# Reload labels_df from the files written by Option 1. The pickle is used by default;
# the csv written alongside it is the alternative.
#labels_df = pd.read_csv("data/v2/02_intermediate/labels_df.csv", index_col=0)
# Only unpickle files produced by this notebook: loading a pickle can execute arbitrary code.
labels_df = pd.read_pickle("data/v2/02_intermediate/labels_df.pkl")

<IPython.core.display.Javascript object>

### Read trial data from saved excel sheet

In [33]:
def handle_opening_distance(x):
    """Convert one raw "Opening Distance" cell into a number.

    "BROKE" becomes 0, the strings "n/a", "na" and "nan" (any letter case) become NaN,
    and everything else is passed through ``float``.
    """
    if x == "BROKE":
        return 0
    missing_markers = ("n/a", "na", "nan")
    if type(x) is str and x.lower() in missing_markers:
        return np.nan
    return float(x)

# Column dtypes for the trial spreadsheet.
# "Trial" is read as str because it is not really being used as a numeric variable - better for
# plotting as it becomes a discrete variable instead of continuous (i.e. for color legend).
# "Size" is kept as str even for sterile sizes like 7 and 7.5 to be consistent.
# "mmHg" is not forced to an integer dtype (it was commented out of the original mapping).
int_columns = ["Overall Num", "Day Ct", "Day Num", "Day Num Ct",
               "Set Ct", "Trial Ct", "Set Trial Ct",
               "Trial Num",
               "Spec Ang", "Spec Ht"]
str_columns = ["Size", "Trial", "Filename", "Speculum Type"]
excel_dtypes = {col: np.int32 for col in int_columns}
excel_dtypes.update({col: str for col in str_columns})

speculum_df_raw = pd.read_excel(
    "data/01_raw/SpeculumTrialData_v2_3.xlsx",
    index_col=0,
    sheet_name="AllTrialsLongVals",
    dtype=excel_dtypes,
    converters={"Opening Distance": handle_opening_distance},
)

# For compatibility with older versions of the sheet, which used "Vertical"/"Height" in column names
speculum_df_raw.columns = [
    col.replace("Vertical","Opening").replace("Height","Distance")
    for col in speculum_df_raw.columns
]

#key_cols = ["Speculum Type","Spec Ang","Spec Ht","Size","Material","Material Type","Method"]
#speculum_df_raw.drop_duplicates(subset=key_cols).reset_index().drop("index",axis=1).reset_index().rename({"index":"Set"},axis=1)
#set_info = speculum_df_raw[key_cols].drop_duplicates().reset_index().rename({"index":"Set"},axis=1)
#speculum_df_raw_with_set = speculum_df_raw.merge(set_info, how="outer",on=key_cols)

# Rows without a Filename are the failed trials; drop them
speculum_df_notfailed = speculum_df_raw.dropna(subset=["Filename"])



<IPython.core.display.Javascript object>

In [34]:
#@title ### Save intermediate files
# Write both trial tables to the intermediate folder, each as csv and as pickle
path_to_folder = "data/v2/02_intermediate"
for stem, frame in (
    ("speculum_df_raw", speculum_df_raw),
    ("speculum_df_notfailed", speculum_df_notfailed),
):
    frame.to_csv(os.path.join(path_to_folder, f"{stem}.csv"))
    frame.to_pickle(os.path.join(path_to_folder, f"{stem}.pkl"))

<IPython.core.display.Javascript object>

## Data rearranging

### Combine labelbox and excel sheet, calculate relative value

In [35]:
# --- Warn about images that carry more than one label ---
freqs = labels_df["Filename"].value_counts()
freqs = freqs[freqs>1]
if len(freqs) > 0:
    display("Warning: There are images with multiple labels. This is not currently supported, and only the first will be kept. This applies for the following files: " + ", ".join( [f"{filename} ({freq})" for filename, freq in freqs.items()] ))
    display(labels_df[labels_df["Filename"].isin(freqs.index)])

# --- Join the trial sheet with the label measurements on the image file name ---
df_long=pd.merge(left=speculum_df_notfailed, right=labels_df, on="Filename")

# Drop duplicates i.e. cases where there are multiple labels for the same image
# Currently, don't support combining labels
df_long = df_long.drop_duplicates(subset=["Filename"], keep="first")

glove_rows = df_long["Material Type"]=="Glove"
# The glove images got rotated 90 degrees. To fix this and clarify the directions of the opening, renaming the columns from x,y to wd and ht.
df_long.loc[ glove_rows,"wd"] = df_long.loc[ glove_rows].y
df_long.loc[ glove_rows,"ht"] = df_long.loc[ glove_rows].x
df_long.loc[ glove_rows,"wd_start"] = df_long.loc[ glove_rows].ystart
df_long.loc[ glove_rows,"ht_start"] = df_long.loc[ glove_rows].xstart

df_long.loc[~glove_rows,"wd"] = df_long.loc[~glove_rows].x
df_long.loc[~glove_rows,"ht"] = df_long.loc[~glove_rows].y
df_long.loc[~glove_rows,"wd_start"] = df_long.loc[~glove_rows].xstart
df_long.loc[~glove_rows,"ht_start"] = df_long.loc[~glove_rows].ystart
df_long = df_long.drop(columns=["x","y","xstart","ystart"])

# --- Relative change: 1 - value / (value of the same trial at the baseline pressure) ---
# "Trial Ct" was formerly called "Order". "Set Trial Ct" was formerly called "Trial". "Opening Distance" was formerly called "Vertical Height", then "Opening Height"
base_mmHg = 0 # mmHg
baseline_rows = df_long.loc[df_long["mmHg"] == base_mmHg]

# Every trial needs exactly one baseline row; fail with the offending trials listed
# instead of an opaque error from inside the computation.
baseline_counts = df_long["Trial Ct"].map(baseline_rows["Trial Ct"].value_counts()).fillna(0)
bad_trials = sorted(df_long.loc[baseline_counts != 1, "Trial Ct"].unique())
if len(bad_trials) > 0:
    raise ValueError(
        f"Each trial needs exactly one row at {base_mmHg} mmHg to compute relative values; "
        f"not the case for Trial Ct: {bad_trials}"
    )

# Look up each row's baseline through its trial number, then compute all rows at once
baseline = baseline_rows.set_index("Trial Ct")
df_long["wd_rel"] = 1 - df_long["wd"] / df_long["Trial Ct"].map(baseline["wd"])
df_long["ht_rel"] = 1 - df_long["ht"] / df_long["Trial Ct"].map(baseline["ht"])


<IPython.core.display.Javascript object>



Unnamed: 0,Filename,x,y,xstart,ystart,url,Label ID,Ann Ct,Reviews Ct,Open Issues,Skipped,created_at,updated_at,Label Seconds,Created By,Agreement,Benchmark Agreement,Benchmark ID
1,20220423_142023.jpg,261.0,257.0,2470.0,585.0,https://storage.labelbox.com/cl2ceiao35hbj0zah...,cl2cez4xr5ki20zagcvnuf7sk,1,0,0.0,False,2022-04-23T22:10:24.000Z,2023-03-04T18:55:36.000Z,24.533,ryerrabelli@gmail.com,0.504524,-1.0,
252,20220423_142023.jpg,487.0,273.0,2422.0,574.0,https://storage.labelbox.com/cl2ceiao35hbj0zah...,cleubp4vb1n8w072bd8tnbaks,1,0,0.0,False,2023-03-04T18:55:36.000Z,2023-03-04T18:55:36.000Z,330.936,eemman2@illinois.edu,0.504524,-1.0,


### Get wide form

In [36]:
# One row per trial, one column per (measure, mmHg) pair
wide_index_cols = ["Day Ct","Set Ct","Trial Ct","Set Trial Ct","Speculum Type","Size","Brand","Material","Material Type","Method","Hand","Spec Ang","Spec Ht","Opening Distance"]
df_wide = df_long.pivot(
    index=wide_index_cols,
    columns="mmHg",
    values=["wd_rel","ht_rel"],
).reset_index("Opening Distance")   # move "Opening Distance" out of the index into a regular column

# Same table with the two column levels joined by "." into single strings (e.g. "wd_rel.40")
df_wide_flat = df_wide.copy()
df_wide_flat.columns = [
    ".".join(map(str, col)).strip(".")
    for col in df_wide_flat.columns.values
]

<IPython.core.display.Javascript object>


In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)



### Order by the trial and the mmHg within that set (multiindex)

In [37]:
# Index the long table by trial, then by pressure within the trial; preview the first 8 rows
df_multiindex = df_long.set_index(keys=["Trial Ct", "mmHg"])
df_multiindex.head(n=8)

<IPython.core.display.Javascript object>

Unnamed: 0_level_0,Unnamed: 1_level_0,Datetime,Day Ct,Day Num,Day Trial Ct,Set Ct,Set Trial Ct,Speculum Type,Spec Ang,Spec Ht,Size,...,Created By,Agreement,Benchmark Agreement,Benchmark ID,wd,ht,wd_start,ht_start,wd_rel,ht_rel
Trial Ct,mmHg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,2022-04-23 14:20:23.136,1,1,1,1,1,White,5,0,M,...,ryerrabelli@gmail.com,0.504524,-1.0,,257.0,261.0,585.0,2470.0,0.0,0.0
1,40,2022-04-23 14:20:30.912,1,2,1,1,1,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,190.0,287.0,614.0,2470.0,0.2607,-0.099617
1,80,2022-04-23 14:20:49.056,1,3,1,1,1,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,120.0,252.0,655.0,2464.0,0.533074,0.034483
1,120,2022-04-23 14:20:54.240,1,4,1,1,1,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,90.0,250.0,670.0,2463.0,0.649805,0.042146
1,160,2022-04-23 14:21:00.288,1,5,1,1,1,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,65.0,361.0,672.0,2470.0,0.747082,-0.383142
1,200,2022-04-23 14:21:08.064,1,6,1,1,1,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,44.0,140.0,681.0,2468.0,0.828794,0.463602
2,0,2022-04-23 14:33:10.368,1,7,2,1,2,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,277.0,243.0,685.0,2435.0,0.0,0.0
2,40,2022-04-23 14:33:24.192,1,8,2,1,2,White,5,0,M,...,ryerrabelli@gmail.com,-1.0,-1.0,,210.0,221.0,729.0,2433.0,0.241877,0.090535


### Save processed dfs

In [38]:
path_to_folder = "data/v2/03_processed"

# DataFrame method used to write each file extension
writer_for_extension = {"csv": "to_csv", "xlsx": "to_excel", "pkl": "to_pickle"}

# (file stem, frame, extensions to write) - written in this order
frames_to_save = [
    ("combined_df_long", df_long, ("csv", "xlsx", "pkl")),
    ("combined_df_wide", df_wide, ("csv", "xlsx", "pkl")),
    ("combined_df_wide_flat", df_wide_flat, ("csv", "xlsx", "pkl")),
    ("combined_df_multiindex", df_multiindex, ("xlsx", "pkl")),   # assuming a multiindex wouldn't save well to a csv file
]

for stem, frame, extensions in frames_to_save:
    for extension in extensions:
        write = getattr(frame, writer_for_extension[extension])
        write(os.path.join(path_to_folder, stem + "." + extension))

<IPython.core.display.Javascript object>

## Aggregate analysis across trials

In [44]:
# Count the rows of each distinct set configuration recorded on day 4
a = df_long.loc[df_long["Day Ct"] == 4]
set_config_cols = ["Day Ct", "Set Ct", "Speculum Type", "Size", "Brand", "Material", "Material Type", "Method", "Spec Ang", "Spec Ht"]
a[set_config_cols].value_counts()

<IPython.core.display.Javascript object>

Day Ct  Set Ct  Speculum Type  Size   Brand                 Material                         Material Type  Method  Spec Ang  Spec Ht
4       26      Green          M      STRONG-Blue           Nitrile                          Glove          Middle  5         0          24
        27      Green          L      STRONG-Blue           Nitrile                          Glove          Middle  5         0          24
        28      Green          7      SensiCare PI          Polyisoprene                     Glove          Middle  5         0          24
        29      Green          7      Protexis              Latex                            Glove          Middle  5         0          24
        30      Green          7      Protexis PI Blue      Polyisoprene, emollient coating  Glove          Middle  5         0          24
        31      Green          8      SensiCare PI          Polyisoprene                     Glove          Middle  5         0          24
        32      Green     

In [45]:
#@title ### Get aggregate dfs
# Columns that are the same across different trials of the same object (plus the pressure);
# these define the groups.
consistent_cols = ["Day Ct", "Set Ct", "Speculum Type", "Size", "Brand", "Material", "Material Type", "Method", "Spec Ang", "Spec Ht", "mmHg"]
# Measurements summarised within each group
aggregatable_cols = ["wd","ht","wd_rel","ht_rel", "Opening Distance"]
selected_cols = consistent_cols + aggregatable_cols
grouped_trials = df_long[selected_cols].groupby(by=consistent_cols)
#display(grouped_trials.describe())

def sem(x, ddof=1):   # ddof=1 to get sample standard deviation, not the population standard deviation (np's default)
    """Return the standard error of the mean: std(x, ddof) / sqrt(len(x)).

    Args:
        x: Sequence or array of numbers.
        ddof: Delta degrees of freedom passed to ``np.std``.

    Returns:
        The standard error as a float (previously the value was computed but never
        returned, so the function always gave None).
    """
    return np.std(x, ddof=ddof)/np.sqrt(len(x))

def nonnan(x):
    """Return the entries of ``x`` that are not NaN."""
    is_missing = np.isnan(x)
    return x[np.logical_not(is_missing)]

# Summarise every measurement column within each group with several statistics, then turn
# the group keys back into regular columns.
# NOTE(review): np.count_nonzero counts entries that are not zero rather than the number of
# observations, so groups whose values are exactly 0 (e.g. wd_rel at the 0 mmHg baseline)
# would presumably report 0 — confirm whether a plain count was intended.
# NOTE(review): whether the extra `ddof=1` keyword reaches each function in the list
# depends on the pandas version — verify against the version in use.
df_agg_long = grouped_trials.agg([np.mean, scipy.stats.sem, np.std, np.min, np.median, np.max, np.count_nonzero], ddof=1).reset_index()

# Same table with the two column levels joined by "." (e.g. "wd.mean"); the strip removes the
# trailing "." left on columns whose second level is empty.
df_agg_long_flat = df_agg_long.copy()
df_agg_long_flat.columns = [".".join(col).strip(".") for col in df_agg_long.columns.values]
#df_agg_long_flat

<IPython.core.display.Javascript object>


Degrees of freedom <= 0 for slice


invalid value encountered in double_scalars


Degrees of freedom <= 0 for slice


invalid value encountered in double_scalars


Degrees of freedom <= 0 for slice


invalid value encountered in double_scalars


Degrees of freedom <= 0 for slice


invalid value encountered in double_scalars


Degrees of freedom <= 0 for slice


invalid value encountered in true_divide



In [46]:
#@title ### Save aggregate dfs
path_to_folder = "data/v2/04_aggregated"

# The multi-level table is written as csv, xlsx and pickle
agg_long_stem = os.path.join(path_to_folder, "combined_df_agg_long")
df_agg_long.to_csv(agg_long_stem + ".csv")
df_agg_long.to_excel(agg_long_stem + ".xlsx")
df_agg_long.to_pickle(agg_long_stem + ".pkl")

# The flattened table is written as csv and pickle only
agg_long_flat_stem = os.path.join(path_to_folder, "combined_df_agg_long_flat")
df_agg_long_flat.to_csv(agg_long_flat_stem + ".csv")
df_agg_long_flat.to_pickle(agg_long_flat_stem + ".pkl")

<IPython.core.display.Javascript object>