# Detecting unuploaded forms based on the company they keep

Some forms are always completed together. If one is uploaded and the others are not, then we should start looking for the reason why and either locate the form or mark it missing.

In [None]:
# Run parameters. All default to None and are expected to be overridden per run
# (presumably injected by the notebook runner -- TODO confirm).

# Required: name of the form grouping to check; must be a key of `form_groups`.
form_group = None
# Optional: a single event name; when set, only that event is exported and the
# event name is appended to the output file names.
event = None
# Optional: one of "sri", "ucsd", "upmc", "ohsu", "duke"; when set, the export
# is filtered to that site's data access group.
site = None
# Optional: directory for the CSV outputs; nothing is written when None.
output_dir = None
#classify_marked_missing_as_present = False

In [None]:
# Validate the run parameters up front so a bad value fails with a clear message.
VALID_SITES = ("sri", "ucsd", "upmc", "ohsu", "duke")

# Raise explicitly rather than `assert`: assertions are skipped when Python
# runs with -O, which would let an unknown site through silently.
if site is not None and site not in VALID_SITES:
    raise ValueError("Unknown site {!r}; expected one of: {}".format(
        site, ", ".join(VALID_SITES)))

# A site restricts the run to one data access group; without one the run
# covers every site and is labelled "all" in the output file names.
include_dag = site is not None
site_label = site if include_dag else "all"

# `form_group` has no usable default, so the run cannot proceed without it.
if form_group is None:
    raise KeyError("Missing parameter `form_group`!")

In [None]:
# `events` is handed to the export call: a one-element list when a specific
# event was requested, otherwise None.
if event is None:
    events = None
else:
    events = [event]

These are the forms that should co-occur. If they don't, then something has gone wrong.

In [None]:
# Groupings of forms that are expected to be uploaded together, keyed by the
# value accepted for the `form_group` parameter.
form_groups = {
    'sleep': [
        'sleep_study_evening_questionnaire',
        'sleep_study_presleep_questionnaire',
        'sleep_study_morning_questionnaire',
    ],
    'mri': [
        'mr_session_report',
        'mri_report',
    ],
    'deldisc_stroop': [
        'delayed_discounting_1000',
        'delayed_discounting_100',
        'stroop',
    ],
    'deldisc': [
        'delayed_discounting_1000',
        'delayed_discounting_100',
    ],
    'youth_report': [
        'youth_report_1',
        'youth_report_1b',
        'youth_report_2',
    ],
    'youth_report_lssaga': [
        'youth_report_1',
        'youth_report_1b',
        'youth_report_2',
        'limesurvey_ssaga_youth',
    ],
}

# Raise explicitly instead of `assert`: assertions are skipped when Python runs
# with -O, and a bare assertion gives no hint about the accepted values.
if form_group not in form_groups:
    raise KeyError("Unknown form_group {!r}; expected one of: {}".format(
        form_group, ", ".join(sorted(form_groups))))

In [None]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('/sibis-software/python-packages/')
import sibispy
from sibispy import sibislogger as slog

In [None]:
from qa_utils import chunked_form_export, get_items_matching_regex, form_has_content, form_has_content_and_is_not_missing

In [None]:
# Raise the display limits to 500 rows and 500 columns for the frames shown below.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 500)

In [None]:
# Open a SIBIS session; stop the run if it cannot be configured.
session = sibispy.Session()
session_is_configured = session.configure()
if not session_is_configured:
    sys.exit()

# Initialise the SIBIS logger for this QC run and start its timer.
log_title = 'QC: Check that logical groupings of forms are uploaded'
slog.init_log(None, None, log_title, 'check_form_groups', None)
slog.startTimer1()

# Run-specific handles: the 'data_entry' server connection and its
# `def_field`, kept as `primary_key`.
api = session.connect_server('data_entry', True)
primary_key = api.def_field

In [None]:
# Export the project metadata in 'df' format; it is used further down to look
# up which fields (the index) belong to each form via the `form_name` column.
meta = api.export_metadata(format='df')

To be considered as "having content", a form must pass at least one of the following three tests:

1. Is it marked missing? If yes, then it has known content.
2. Is it marked complete? If yes, then it has known content.
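3. Does it have non-NaN answers? If yes, then it has known content.

Note: the check applied in the code below is `form_has_content_and_is_not_missing`, whose name suggests that forms marked missing are *not* counted as having content. Confirm which of the two rules is intended and keep this list in sync with it.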

In future iterations, it may be reasonable to stop considering the completion status: if a form has no content, the record should be marked missing, not just complete.

Currently, the records get exported separately for each form group. Depending on future benchmarking, it might make sense to just get all the records and then scrape out the columns of interest.

In [None]:
# Empty placeholders; both names are reassigned once the group check has run.
results, results_detailed = pd.DataFrame(), pd.DataFrame()

# Resolve the requested group to its list of forms.
group_name = form_group
forms = form_groups[group_name]

# Export the records for just these forms (and the selected events, if any).
export_kwargs = dict(forms=forms, events=events, include_dag=include_dag)
data = chunked_form_export(api, **export_kwargs)

# With a site selected, keep only the rows from that site's data access group.
if include_dag:
    site_mask = data['redcap_data_access_group'] == site
    data = data.loc[site_mask]

In [None]:
# Field names belonging to each form in the group, in the same order as `forms`.
form_group_fields = []
for form in forms:
    belongs_to_form = meta['form_name'] == form
    form_group_fields.append(meta.loc[belongs_to_form].index.tolist())

# Row-wise content check, evaluated once per form on that form's columns.
# (`form_has_content` is the alternative check that was used here previously.)
per_form_checks = [
    data.loc[:, form_fields].apply(form_has_content_and_is_not_missing, axis=1)
    for form_fields in form_group_fields
]

# One column per form, labelled with the form name.
per_form_results = pd.concat(per_form_checks, axis=1)
per_form_results.columns = forms


def _some_but_not_all(row):
    """Return a truthy value when at least one, but not every, entry of `row` is truthy."""
    return row.any() and not row.all()


# A row is a hit when the group is only partially present.
group_results = per_form_results.apply(_some_but_not_all, axis=1)
group_results.name = group_name

In [None]:
# Keep the group flag on its own, and also side by side with the per-form flags
# (group flag first).
results = group_results
detail_columns = [group_results, per_form_results]
results_detailed = pd.concat(detail_columns, axis=1)

## Participant/event combinations where some, but not all, forms in the group are present

In [None]:
# Peek at the first rows of the exported data (already restricted to the
# selected site, if one was given).
data.head()

In [None]:
#results.loc[results.apply(pd.Series.any, axis=1)]

In [None]:
# Write the results to CSV only when an output directory was provided.
if output_dir is not None:
    # Base name encodes the group, the site (or "all") and, when the run was
    # restricted to one event, that event.
    base_name = 'grouping-{}-{}'.format(group_name, site_label)
    if event is not None:
        base_name = '{}-{}'.format(base_name, event)

    # Derive both names from the extension-less base so that the hits file is
    # "<base>-hits.csv" rather than "<base>.csv-hits.csv".
    file_name = base_name + '.csv'
    hit_only_file_name = '{}-hits.csv'.format(base_name)

    # Full table: group flag plus the per-form flags for every row.
    results_detailed.to_csv(os.path.join(output_dir, file_name))
    # Hits only: the rows whose group flag is set.
    (results_detailed
     .loc[results_detailed[group_name]]
     .to_csv(os.path.join(output_dir, hit_only_file_name)))

In [None]:
# Display only the rows flagged for this group, i.e. where the `group_name`
# column of `results_detailed` is set.
results_detailed.loc[results_detailed[group_name]]

In [None]:
# tst = results_detailed.iloc[:3, :]
# tst

In [None]:
# # https://ncanda.sri.com/redcap/redcap_v8.4.0/DataEntry/index.php?pid=20&event_id=78&id=A-00010-F-6&page=youth_report_1
# from string import Template
# TEMPLATE = Template("https://ncanda.sri.com/redcap/redcap_v8.4.0/DataEntry/index.php?pid=20&event_id=78&id={id}&page={form}")
# def linkify_column(col, template=None):
#     # 1-D data point
#     form_name = col.name
#     # n data points
#     study_ids = col.index.get_level_values('study_id')
#     links = 
    
# tst.apply(linkify_column, axis=0)