In [1]:
import pandas as pd
import numpy as np
import os
import sys
import glob
import json

In [2]:
# Put the sibling `src` directory (relative to the working directory) first on
# sys.path so the local `tariff` module can be imported.
sys.path.insert(0, os.path.normpath('../src'))
from tariff import TariffClassifier

In [3]:
def rootj():
    """Return the first existing root of the J drive.

    The candidates are checked in order: ``/home/j``, ``J:/`` and ``J:``.

    Returns:
        The first candidate path that exists on this machine.

    Raises:
        IOError: if none of the candidates exists. Previously the function fell
            through and returned None, which only surfaced later as a TypeError
            inside ``os.path.join``.
    """
    candidates = ['/home/j', 'J:/', 'J:']
    for p in candidates:
        if os.path.exists(p):
            return p
    raise IOError('J drive not found; tried: {}'.format(', '.join(candidates)))
# External locations, all derived from the J drive root returned by rootj().
smartva_repo = os.path.join(rootj(), 'Project', 'VA', 'smartva')
smartva_data_dir = os.path.join(smartva_repo, 'smartva', 'data')
smartva_testing_repo = os.path.join(rootj(), 'Project', 'VA', 'smartva_testing')
nhmrc_data_folder = os.path.join(rootj(), 'LIMITED_USE', 'PROJECT_FOLDERS', 'NHMRC_VA',
                                 'external_va_2015', 'NHMRC', 'Data', 'version21', 'output')
# Fail early, naming the missing location, rather than part-way through the notebook.
for p in [smartva_repo, smartva_data_dir, smartva_testing_repo, nhmrc_data_folder]:
    assert os.path.exists(p), 'Required path does not exist: {}'.format(p)

# Age groups; used as dictionary keys and in data file names throughout the notebook.
ages = ['adult', 'child', 'neonate']

In [4]:
# Make the SmartVA checkout importable (position 1 on sys.path) so that its
# per-age tariff data modules can be loaded.
sys.path.insert(1, smartva_repo)
from smartva.data import adult_tariff_data
from smartva.data import child_tariff_data
from smartva.data import neonate_tariff_data
# Look up the SmartVA tariff data module by age group name.
tariff_data = {
    'adult': adult_tariff_data,
    'child': child_tariff_data,
    'neonate': neonate_tariff_data
}

# Run SmartVA

In [5]:
def save_smartva_tariff_data(outdir):
    """Dump the SmartVA tariff constants of every age group to ``tariff_data.json``.

    The JSON object has one entry per age group ('adult', 'child', 'neonate'),
    each holding CUTOFF_POS, UNIFORM_LIST_POS, MIN_CAUSE_SCORE,
    SPURIOUS_ASSOCIATIONS, RESTRICTIONS and FREQUENCIES. RESTRICTIONS is the
    module's CAUSE_CONDITIONS for adult and child and null for neonate.

    Args:
        outdir: directory that receives ``tariff_data.json``. It is created if
            it does not exist yet.
    """
    # Attributes copied under their own name for every age group.
    copied_names = ['CUTOFF_POS', 'UNIFORM_LIST_POS', 'MIN_CAUSE_SCORE', 'SPURIOUS_ASSOCIATIONS']
    # (age group, source module, whether CAUSE_CONDITIONS is exported as RESTRICTIONS)
    sources = [
        ('adult', adult_tariff_data, True),
        ('child', child_tariff_data, True),
        ('neonate', neonate_tariff_data, False),
    ]

    data = {}
    for age, module, has_restrictions in sources:
        entry = {}
        for name in copied_names:
            entry[name] = getattr(module, name)
        entry['RESTRICTIONS'] = module.CAUSE_CONDITIONS if has_restrictions else None
        entry['FREQUENCIES'] = module.FREQUENCIES
        data[age] = entry

    # An empty outdir means the working directory, which needs no creation.
    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)
    with open(os.path.join(outdir, 'tariff_data.json'), 'w') as f:
        json.dump(data, f)
    

In [6]:
%cd J:/Project/VA/smartva
!git checkout develop
!python app.py --version
%cd-

J:\Project\VA\smartva
M	dev_notes.txt
M	pkg/Manual Data Entry Forms/DataEntryForm.xlsx
M	pkg/build-agent/kill_server.sh
M	pkg/build-agent/run_build.sh
M	pkg/build-agent/run_prep.sh
M	pkg/build-agent/run_tests.sh
M	pkg/build-agent/start_server.sh
M	pkg/hooks/hook-numpy.py
Your branch is up-to-date with 'origin/develop'.


Already on 'develop'


SmartVA, version 1.2.0
C:\Users\josephj7\Desktop\repos\va\hierarchical_causes\test


In [7]:
# Write tariff_data.json into ../data/test1 (relative to the working directory).
save_smartva_tariff_data(os.path.normpath(os.path.join('..', 'data', 'test1')))

# Training
Training the tariff classifier provides four pieces of data which are used in predictions:
1. cause list (as a numpy array)
2. tariff matrix (as a dataframe)
3. training data resampled to a uniform cause distribution
4. cause-specific cutoffs

These will be hard-coded with values to match the SmartVA inputs

##### Original Tariff matrices

In [8]:
# Tariff matrices exactly as shipped in the SmartVA data directory, one CSV per age group.
orig_tariffs = {
    group: pd.read_csv(os.path.join(smartva_data_dir, 'tariffs-{}.csv'.format(group)))
    for group in ages
}

##### Original Validated training data

In [9]:
# Validated training data exactly as shipped in the SmartVA data directory, one CSV per age group.
orig_validated = {
    group: pd.read_csv(os.path.join(smartva_data_dir, 'validated-{}.csv'.format(group)))
    for group in ages
}

##### Hyperparameters
The only parameters which apply to *predicting* are related to thresholds used to decide if an observation has too little information to determine any cause. For the tariff classifier, these are:
1. overall_pct_cutoff
2. cause_pct_cutoff
3. min_cause_score

The first two are used as percentile values (in np.percentile) and should be scaled to 0 to 100.

In [10]:
# One classifier per age group, configured from the SmartVA constants.
# The two position constants are multiplied by 100 to put them on the 0-100 percentile scale.
clfs = {
    group: TariffClassifier(
        min_cause_score=tariff_data[group].MIN_CAUSE_SCORE,
        cause_pct_cutoff=tariff_data[group].CUTOFF_POS * 100,
        overall_pct_cutoff=(100 - tariff_data[group].UNIFORM_LIST_POS * 100)
    )
    for group in ages
}

##### Tariffs
The classifier only stores fully processed tariffs. The steps to process tariffs are:
1. remove insignificant tariffs
2. remove spurious associations
3. only keep the top symptoms by cause
4. round tariffs to the desired precision

Only the first step has been applied to the tariffs shipped with SmartVA; SmartVA's TariffPrep does the remaining steps at run time. Here these steps only need to happen once, and the result can then be used for all predictions.

In [11]:
def process_tariffs(tariffs, precision=0.5, spurious_associations=None, top_n=40):
    """Finish transforming the tariff matrix from SmartVA into a final form.

    Args:
        tariffs: tariff frame as read from the SmartVA CSV, with an ``xs_name``
            column holding labels such as ``cause12``.
        precision: passed to ``TariffClassifier.round_tariffs``.
        spurious_associations: passed to
            ``TariffClassifier.remove_spurious_associations``; that step is
            skipped when this is None or empty.
        top_n: passed to ``TariffClassifier.keep_top_symptoms``.

    Returns:
        The processed tariff frame, indexed by integer cause number (index
        name ``cause``).
    """
    prefix = 'cause'
    tariffs = tariffs.set_index('xs_name')
    # Remove the literal prefix. str.lstrip('cause') strips any run of the
    # characters c/a/u/s/e, which only happened to work because digits follow.
    tariffs.index = pd.Index(
        [int(label[len(prefix):] if label.startswith(prefix) else label)
         for label in tariffs.index],
        dtype=int, name='cause')

    clf = TariffClassifier()
    if spurious_associations:
        tariffs = clf.remove_spurious_associations(tariffs, spurious_associations)
    tariffs = clf.keep_top_symptoms(tariffs, top_n)
    tariffs = clf.round_tariffs(tariffs, precision)

    return tariffs

In [12]:
# Fully processed tariff matrix per age group, using that group's spurious associations.
final_tariffs = {
    group: process_tariffs(raw_tariffs,
                           spurious_associations=tariff_data[group].SPURIOUS_ASSOCIATIONS)
    for group, raw_tariffs in orig_tariffs.items()
}

##### Validated Data with Uniform Cause Distribution
The classifier stores the *scored* training data with a uniform cause distribution and the cause cutoffs. SmartVA uses the symptom binaries with the original distribution. Scoring, resampling to a uniform cause distribution, and calculating cutoffs only need to happen once, and the results can be used for all predictions.

In [13]:
def process_training_data(validated, tariffs, tariff_data):
    """Turn the validated training data into scored, uniformly resampled data.

    Args:
        validated: validated training frame with a ``sid`` column, a ``va46``
            cause column and one column per symptom.
        tariffs: processed tariff matrix (causes as rows, symptoms as columns).
        tariff_data: SmartVA tariff data module providing ``FREQUENCIES``
            (sid -> number of repeats) and ``CUTOFF_POS``.

    Returns:
        Tuple ``(X_uniform, y_uniform, cutoff_scores, cutoff_ranks)``.
    """
    by_sid = validated.set_index('sid')
    cause_by_sid = by_sid.va46
    symptoms = by_sid.loc[:, tariffs.columns].fillna(0)

    scorer = TariffClassifier()
    score_matrix = scorer.score_samples(symptoms.values, tariffs.values)
    scored = pd.DataFrame(score_matrix, index=symptoms.index, columns=tariffs.index)

    # Repeat each sid by its frequency, walking the sids in reverse sorted order.
    # That row order is what makes the cutoffs match SmartVA.
    sids_descending = sorted(tariff_data.FREQUENCIES.items(),
                             key=lambda item: item[0], reverse=True)
    resampled_index = np.repeat(*zip(*sids_descending))
    X_uniform = scored.loc[resampled_index].values
    # va46 is shifted down by one to match Python indices.
    y_uniform = cause_by_sid.loc[resampled_index].values - 1

    cutoff_pct = tariff_data.CUTOFF_POS * 100
    cutoff_scores, cutoff_ranks = scorer.calc_cutoffs(X_uniform, y_uniform, cutoff_pct)
    return X_uniform, y_uniform, cutoff_scores, cutoff_ranks

###### Reverse sorted key silliness
The cause-specific cutoff is the specified percentile of the subset of the unique ranks which are of the given cause. Ranks are unique, but scores are likely not. If two or more observations have the same tariff score for a cause, their order will not be consistent with a simple quicksort. When calculating the cutoffs, the target cause is sorted along with the row. Since the ranks are unique, the sort order of rows within duplicated (tied) tariff scores affects the cutoff. The effect is large enough to drive a handful of predictions.

SmartVA achieves stable sorting by sorting on the row identifier ('sid') within duplicate causes. The new code does not hold on to index labels. (Why would I bother?) The new code calculates cutoffs by reverse sorting tariff scores by cause. To achieve the same results as SmartVA, the rows of the input data must be sorted in reverse order and the sort must use a mergesort algorithm, which preserves the order of rows relative to each other.

In [14]:
# Per age group: (X_uniform, y_uniform, cutoff_scores, cutoff_ranks) from the validated data.
final_validated = {
    group: process_training_data(training, final_tariffs[group], tariff_data[group])
    for group, training in orig_validated.items()
}

##### Training

In [15]:
# "Train" each classifier by assigning its fitted attributes directly from the
# pieces computed above, instead of fitting it on data.
for age, clf in clfs.items():
    tariffs = final_tariffs[age]
    X_uniform, y_uniform, cutoff_scores, cutoff_ranks = final_validated[age]
    
    clf.tariffs_ = tariffs
    # Cause labels come from the tariff index, symptom names from its columns.
    clf.causes_ = tariffs.index.values
    clf.n_causes_ = len(clf.causes_)
    clf.symptoms_ = tariffs.columns.values
    clf.X_uniform_ = X_uniform
    clf.y_uniform_ = y_uniform
    clf.cutoff_scores_ = cutoff_scores
    clf.cutoff_ranks_ = cutoff_ranks

# Regression Testing

In [16]:
def fetch_outputs(directory, age):
    """Load the SmartVA output files of one age group from an output directory.

    Symptoms, tariff scores, tariff ranks and cutoffs are read from the
    ``intermediate-files`` subfolder; predictions and CSMF from ``directory``
    itself. A ``cause46`` column, computed by applying
    ``TariffClassifier.best_ranked`` to each row of the ranks, is added to the
    predictions.

    Returns:
        Tuple ``(symps, scores, cutoffs, ranks, preds, csmf)``, where
        ``cutoffs`` is a dict of int -> int parsed from ``key : value`` lines.
    """
    clf = TariffClassifier()
    intermediate = os.path.join(directory, 'intermediate-files')

    def read_indexed(folder, template):
        # Every CSV uses its first column as the index.
        return pd.read_csv(os.path.join(folder, template.format(age)), index_col=0)

    symps = read_indexed(intermediate, '{}-symptom.csv')
    scores = read_indexed(intermediate, '{}-tariff-scores.csv')
    ranks = read_indexed(intermediate, '{}-tariff-ranks.csv')
    preds = read_indexed(directory, '{}-predictions.csv')
    preds['cause46'] = ranks.apply(clf.best_ranked, axis=1)
    csmf = read_indexed(directory, '{}-csmf.csv')

    cutoffs = {}
    cutoffs_path = os.path.join(intermediate, '{}-cutoffs.txt'.format(age))
    with open(cutoffs_path, 'r') as handle:
        for line in handle:
            cause, cutoff = map(int, line.split(' : '))
            cutoffs[cause] = cutoff
    return symps, scores, cutoffs, ranks, preds, csmf

In [17]:
def process_symptom_file(df, cols):
    """Split a symptom frame into the inputs the classifier needs.

    Returns a tuple of: the ``cols`` columns with missing values set to 0, the
    ``real_age`` column and the ``real_gender`` column.
    """
    symptom_matrix = df.loc[:, cols].fillna(0)
    return symptom_matrix, df.real_age, df.real_gender

In [18]:
# Age group under test and the folder holding its SmartVA outputs.
age = 'adult'
path = os.path.join('..', 'data', 'phmrc_adult')
# Restrictions handed to TariffClassifier.predict in a later cell.
# NOTE(review): the integers look like cause numbers and each (limit, [causes])
# pair like an age bound applied to those causes -- confirm against TariffClassifier.
restrictions = {
    'males_only': [39],
    'females_only': [3, 20, 22, 36, 42, 6, 7],
    'min_age': [
        (15, [3, 20, 22, 36, 42, 6, 7, 9, 17, 27, 30, 39, 43]),
    ],
    'max_age': [
        (49, [3, 20, 22, 36, 42]),
        (75, [1, 2])
    ]
}

In [19]:
# Load the SmartVA outputs for this age group, then split the symptom file into
# classifier inputs, keeping only the columns of the classifier's tariff matrix.
symps, scores, cutoffs, ranks, preds, csmf = fetch_outputs(path, age)
X, real_ages, real_sexes = process_symptom_file(symps, clfs[age].tariffs_.columns)

In [26]:
# Predict with the new classifier, also requesting scores, ranks and the restricted output.
out = clfs[age].predict(X, ages=real_ages, sexes=real_sexes, restrictions=restrictions,
                        return_scores=True, return_restricted=True, return_ranks=True)
# NOTE(review): this unpacking assumes predict returns (predictions, scores, ranks,
# restricted/valid) in exactly this order -- confirm against TariffClassifier.predict.
preds2, scores2, ranks2, valid = out

In [27]:
# Regression checks: the new tariff scores must equal SmartVA's exactly, and the
# classifier's cutoff ranks (labelled by cause) must equal the cutoffs read from SmartVA.
assert (scores.values == scores2).all().all()
assert (pd.Series(cutoffs) == pd.Series(clfs[age].cutoff_ranks_, index=clfs[age].causes_)).all()

In [28]:
# Attach the new predictions as column 'jj' and list the rows where they differ
# from SmartVA's cause46 (a missing cause46 is compared as -99).
preds['jj'] = preds2
diff = preds.loc[pd.to_numeric(preds.cause46.fillna(-99), errors='coerce') != preds.jj.map(float)]
diff

Unnamed: 0_level_0,cause,cause34,age,sex,cause46,jj
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adult3,,Undetermined,62.0,2,,13.0
Adult4,,Undetermined,80.0,2,,6.0
Adult7,,Undetermined,65.0,1,,13.0
Adult12,33.0,Suicide,17.0,1,15,34.0
Adult21,,Undetermined,35.0,2,,36.0
Adult25,16.0,Homicide,18.0,2,5,21.0
Adult34,,Undetermined,37.0,2,,20.0
Adult39,,Undetermined,56.0,2,,6.0
Adult42,,Undetermined,33.0,9,,8.0
Adult44,,Undetermined,48.0,2,,6.0


In [None]:
# Cross-tabulate SmartVA's cause46 (rows, missing coded as -99) against the new
# predictions (columns), then show the row of cases SmartVA left without a cause.
xtab = pd.crosstab(preds.cause46.astype(float).fillna(-99), preds.jj, margins=True)
# list(range(...)) keeps the concatenation valid on Python 3 as well as Python 2.
order = list(range(1, 46)) + [-99, 'All']
# reindex replaces the removed .ix indexer; labels absent from the table become
# NaN and are blanked by fillna('').
xtab.replace({0: ''}).reindex(index=order, columns=order).fillna('').loc[-99]

In [None]:
# Ages of the rows whose new prediction is one of causes 3, 20, 22, 36, 42.
# Select by the labels where isin() is True; the previous `.index` of the mask
# was the full index, so every row was returned.
real_ages.loc[preds.index[preds.jj.isin([3, 20, 22, 36, 42])]]

In [None]:
# SmartVA ranks for the first few rows where the two predictions disagree.
ranks.loc[diff.index].head()

In [None]:
# Label the new classifier's ranks like SmartVA's frame and show the same disagreeing rows.
jj_ranks= pd.DataFrame(ranks2, index=ranks.index, columns=ranks.columns)
jj_ranks.loc[diff.index].head()

In [None]:
# Display the adult classifier.
clfs['adult']

In [None]:
# SmartVA tariff scores for Adult12, one of the disagreeing rows.
scores.loc['Adult12']

In [None]:
# Display the real_age values taken from the symptom file.
real_ages

In [None]:
# New classifier's ranks for Adult12.
jj_ranks.loc['Adult12']

In [None]:
# SmartVA ranks for Adult12, for comparison with the new classifier's ranks.
ranks.loc['Adult12']

In [None]:
# Label the new classifier's scores like SmartVA's frame and preview the first rows.
jj_scores = pd.DataFrame(scores2, index=scores.index, columns=scores.columns)
jj_scores.head()