In [1]:
import pandas as pd
import os
import glob

In [2]:
# Find the J drive under either of the two locations this notebook knows about
if os.path.exists('J:/'):
    rootj = 'J:/'
elif os.path.exists('/home/j'):
    rootj = '/home/j'
else:
    # Raise an ordinary error: a bare BaseException slips past `except Exception`
    raise IOError('No J Drive access')
# `new_dir` holds the files produced inside the repo; `old_dir` holds the
# files on the J drive that they are compared against
new_dir = os.path.join('.', 'working') # in the repo
old_dir = os.path.join(rootj, 'Project', 'VA')
assert os.path.exists(old_dir), 'Missing directory: {}'.format(old_dir)

In [3]:
# Module names substituted into the per-module file name templates below
ages = [
    'Adult',
    'Child',
    'Neonate',
]

# Presymptom Data
The first step is to process the raw survey data from the intermediate files in the `J:/Project/GC13/Verbal Autopsy/VA Data` directory.

In [4]:
# Compare each module's presymptom file on the J drive with the copy in `new_dir`
old_presymp_dir = os.path.join(old_dir, 'Publication_2015', 'Revised Data', 'Presymptom Data')
presymp_filename = 'VA Final - {}.dta'
for age in ages:
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(age))
    filename = presymp_filename.format(age)
    old_df = pd.read_stata(os.path.join(old_presymp_dir, filename), convert_categoricals=False).set_index('sid')
    new_df = pd.read_stata(os.path.join(new_dir, filename), convert_categoricals=False).set_index('sid')

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(age, old_df.shape, new_df.shape)
    index_diff = set(old_df.index).symmetric_difference(new_df.index)
    assert not index_diff, \
        "{}: {} sids are not shared by both versions".format(age, len(index_diff))
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(age, list(column_diff))

    # Align rows *and* columns by label: `==`/`!=` raise on frames whose labels
    # are ordered differently, and the checks above only compare label sets
    new_aligned = new_df.loc[old_df.index, old_df.columns]
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_aligned.fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(age, list(differs[differs].index))

Checking Adult
Checking Child
Checking Neonate


# Text Processing

##### Text Extraction
The first step is to use the R `tm` package to create CSV files with the frequencies of keywords. This analysis exists in the `J:/Project/VA/Publication` directory but not in the `J:/Project/VA/Publication_2015` directory.

Within the `parse_text.R` file, the variables containing text are hard-coded by module (lines 39 to 53). Open responses span multiple columns (e.g. `a7_*`, `c6_*`). This is because Stata, which was used for this analysis, limits the length of string variables. However, the original files have more open response variables than are listed in the code (up to 14 instead of up to 10). Some minor truncation occurs between the files generated by the code in the repo and the old versions. To complicate matters, dictionary keywords (from DICT-5) are replaced in lower case, and at least one observation has a keyword truncated in the middle of the word. Observations should be considered a match if the new version begins with the same text as the old version after converting both to lower case and dropping the last word from the old version.

In [138]:
def mostly_matched(row):
    """Return True when the new text begins with the old text, minus its last word.

    Parameters
    ----------
    row : pandas.Series
        The old text at position 0 and the new text at position 1.

    Returns
    -------
    bool
        True if the new text, lower-cased and with runs of whitespace
        collapsed, starts with the old text after the same normalisation
        and after dropping the old text's last word.

    Missing values (NaN/None) are treated as empty text rather than raising
    AttributeError on `.lower()`.
    """
    def words(value):
        # A missing value is not a string, so it has no `.lower()`
        if pd.isnull(value):
            return []
        # split() with no argument also strips and collapses whitespace
        return value.lower().split()

    # Drop the last word of `old` in case it was truncated in `old`
    # and replaced with a keyword in `new`.
    old = ' '.join(words(row.iloc[0])[:-1])
    return ' '.join(words(row.iloc[1])).startswith(old)

# Compare the free-text csvs on the J drive with the copies under `new_dir`,
# accepting rows that `mostly_matched` treats as equivalent
old_text_dir = os.path.join(old_dir, 'Publication', 'FreeText', 'Words')
for filepath in glob.glob(os.path.join(old_text_dir, '*all_words*.csv')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_csv(filepath)
    old_df = old_df[['sid', 'text']].set_index('sid').sort_index()
    new_df = pd.read_csv(os.path.join(new_dir, 'freetext', f))
    new_df = new_df[['sid', 'text']].set_index('sid').sort_index()

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))
    # Series `==` needs identical row labels; check it here so a mismatch is
    # reported as a readable assertion instead of a ValueError
    assert old_df.index.equals(new_df.index), \
        "{}: sids differ between versions".format(f)

    fuzzy_match = pd.concat([old_df.text, new_df.text], axis=1).apply(mostly_matched, axis=1)
    matched = (old_df.text == new_df.text) | fuzzy_match
    assert matched.all(), \
        "{}: {} rows have text that does not match".format(f, (~matched).sum())

Checking Child_all_words.csv
Checking Adult_all_words_pre_dictionary.csv
Checking Child_all_words_pre_dictionary.csv
Checking Neonate_all_words.csv
Checking Neonate_all_words_pre_dictionary.csv


In [139]:
# Compare the word-frequency csvs on the J drive with the copies under `new_dir`
old_text_dir = os.path.join(old_dir, 'Publication', 'FreeText', 'Words')
for filepath in glob.glob(os.path.join(old_text_dir, '*50freq.csv')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_csv(filepath)
    old_df = old_df.set_index('word_id').sort_index()
    new_df = pd.read_csv(os.path.join(new_dir, 'freetext', f))
    new_df = new_df.set_index('word_id').sort_index()

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))
    # Frame comparison needs identical row labels; check it here so a mismatch
    # is reported as a readable assertion instead of a ValueError
    assert old_df.index.equals(new_df.index), \
        "{}: word_ids differ between versions".format(f)

    # Reorder the new columns to the old order: the check above only compares
    # the column *sets*, while the comparison needs identical ordering
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_df[old_df.columns].fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(f, list(differs[differs].index))

Checking Child_words_all_variables_50freq.csv


AssertionError: 

In [None]:
# (rows, columns) of the J-drive frame left in scope by the last file the loop above read
old_df.shape

In [140]:
# (rows, columns) of the repo-generated frame left in scope by the last file the loop above read
new_df.shape

(2064, 270)

In [141]:
# Columns present in the old frame but absent from the new one
old_df.columns.difference(new_df.columns)

Index([u'word_clock', u'word_ray', u'word_remov'], dtype='object')

In [142]:
# Columns present in the new frame but absent from the old one
new_df.columns.difference(old_df.columns)

Index([u'word_alreadi', u'word_also', u'word_alway', u'word_anyth',
       u'word_around', u'word_ask', u'word_away', u'word_back', u'word_becam',
       u'word_becom', u'word_came', u'word_done', u'word_face', u'word_find',
       u'word_first', u'word_gave', u'word_give', u'word_given', u'word_good',
       u'word_got', u'word_high', u'word_just', u'word_know', u'word_last',
       u'word_later', u'word_like', u'word_mani', u'word_may', u'word_need',
       u'word_next', u'word_noth', u'word_old', u'word_one', u'word_place',
       u'word_point', u'word_problem', u'word_put', u'word_right',
       u'word_said', u'word_saw', u'word_side', u'word_sinc', u'word_small',
       u'word_still', u'word_taken', u'word_thought', u'word_three',
       u'word_took', u'word_turn', u'word_two', u'word_use', u'word_way',
       u'word_well', u'word_went', u'word_whole', u'word_will', u'word_xray',
       u'word_year'],
      dtype='object')

In [143]:
# Restrict the comparison to the columns both versions have, with the new
# rows taken in the old frame's row order
cols = old_df.columns.intersection(new_df.columns)
matches = (
    old_df[cols] == new_df.loc[old_df.index, cols]
).all()
# Display only the shared columns with at least one differing value
matches[~matches]

word_certif       False
word_child        False
word_colleg       False
word_come         False
word_day          False
word_even         False
word_get          False
word_medic        False
word_month        False
word_pneumonia    False
word_sever        False
word_swell        False
word_take         False
dtype: bool

In [144]:
# Old-version values on the rows where `word_pneumonia` differs between versions
old_df.loc[old_df['word_pneumonia'] != new_df['word_pneumonia'], ['word_pneumonia']]

Unnamed: 0_level_0,word_pneumonia
word_id,Unnamed: 1_level_1
B-1144,1
B-1843,1
B-1854,1
B-220,1
B-2344,1
B-2555,1
B-2869,1
B-2986,2
B-3037,2
B-3042,1


In [145]:
# New-version values on the rows where `word_pneumonia` differs between versions
new_df.loc[old_df['word_pneumonia'] != new_df['word_pneumonia'], ['word_pneumonia']]

Unnamed: 0_level_0,word_pneumonia
word_id,Unnamed: 1_level_1
B-1144,0
B-1843,0
B-1854,0
B-220,0
B-2344,0
B-2555,0
B-2869,0
B-2986,1
B-3037,1
B-3042,0


In [146]:
# Old-version values on the rows where `word_swell` differs between versions
old_df.loc[old_df['word_swell'] != new_df['word_swell'], ['word_swell']]

Unnamed: 0_level_0,word_swell
word_id,Unnamed: 1_level_1
U-1222,2


In [147]:
# New-version values on the rows where `word_swell` differs between versions
new_df.loc[old_df['word_swell'] != new_df['word_swell'], ['word_swell']]

Unnamed: 0_level_0,word_swell
word_id,Unnamed: 1_level_1
U-1222,1


##### Text Bootstrap
The word observations are then bootstrapped to determine significance

##### Binarization and Tariff calculation
Text is then converted into symptom variables and tariffs are calculated

In [99]:
# Compare the Stata files in the free-text directory on the J drive with the
# copies under `new_dir`.
# Define the directory here so the cell does not rely on an earlier cell
# having been run; the value is the same one the text-extraction cells use.
old_text_dir = os.path.join(old_dir, 'Publication', 'FreeText', 'Words')
for filepath in glob.glob(os.path.join(old_text_dir, '*.dta')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_stata(filepath)
    new_df = pd.read_stata(os.path.join(new_dir, 'freetext', f))
    new_df = new_df.set_index('sid').sort_index()
    old_df = old_df.set_index('sid').sort_index()

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))
    # Frame comparison needs identical row labels; check it here so a mismatch
    # is reported as a readable assertion instead of a ValueError
    assert old_df.index.equals(new_df.index), \
        "{}: sids differ between versions".format(f)

    # Reorder the new columns to the old order: the check above only compares
    # the column *sets*, while the comparison needs identical ordering
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_df[old_df.columns].fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(f, list(differs[differs].index))

Checking Child_text.dta


AssertionError: 

# Maps

In [5]:
# Compare the Excel workbooks on the J drive with the copies under `new_dir`
old_maps_dir = os.path.join(old_dir, 'Publication_2015', 'Revised Data', 'Code')
for filepath in glob.glob(os.path.join(old_maps_dir, '*.xlsx')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_excel(filepath)
    new_df = pd.read_excel(os.path.join(new_dir, 'maps', f))

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))

    # Reorder the new columns to the old order: the check above only compares
    # the column *sets*, while the comparison needs identical ordering
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_df[old_df.columns].fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(f, list(differs[differs].index))

Checking Master Cause Map.xlsx
Checking Master Codebook.xlsx


In [6]:
# Compare the Stata map files on the J drive with the copies under `new_dir`
old_maps_dir = os.path.join(old_dir, 'Publication_2015', 'Revised Data', 'Maps')
for filepath in glob.glob(os.path.join(old_maps_dir, '*.dta')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_stata(filepath, convert_categoricals=False)
    new_df = pd.read_stata(os.path.join(new_dir, 'maps', f), convert_categoricals=False)

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))

    # Reorder the new columns to the old order: the check above only compares
    # the column *sets*, while the comparison needs identical ordering
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_df[old_df.columns].fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(f, list(differs[differs].index))

Checking Neonate_map.dta
Checking Child_symptoms.dta
Checking Child_map.dta
Checking Neonate_symptoms.dta
Checking Adult_map_indet.dta
Checking Neonate_categories.dta
Checking Child_categories.dta
Checking Adult_categories.dta
Checking Neonate_map_indet.dta
Checking Adult_symptoms.dta
Checking Adult_map_34.dta
Checking Child_map_indet.dta
Checking Adult_map.dta


# Dump Files

In [11]:
# Compare the cutoff files in the dump folder on the J drive with the copies under `new_dir`
old_maps_dir = os.path.join(old_dir, 'Publication_2015', 'Revised Data', 'Dump Folder')
for filepath in glob.glob(os.path.join(old_maps_dir, '*cutoffs*.dta')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_stata(filepath, convert_categoricals=False)
    new_df = pd.read_stata(os.path.join(new_dir, 'dump', f), convert_categoricals=False)

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))

    # Reorder the new columns to the old order: the check above only compares
    # the column *sets*, while the comparison needs identical ordering
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_df[old_df.columns].fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(f, list(differs[differs].index))

Checking neonate_cutoffs.dta
Checking child_cutoffs.dta
Checking adult_cutoffs.dta


In [21]:
# Compare the weight-for-age files in the dump folder on the J drive with the copies under `new_dir`
old_maps_dir = os.path.join(old_dir, 'Publication_2015', 'Revised Data', 'Dump Folder')
for filepath in glob.glob(os.path.join(old_maps_dir, '*weightforage*.dta')):
    # basename copes with either path separator, unlike splitting on os.sep
    f = os.path.basename(filepath)
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(f))
    old_df = pd.read_stata(filepath, convert_categoricals=False)
    new_df = pd.read_stata(os.path.join(new_dir, 'dump', f), convert_categoricals=False)
    # When the file has a `sid` column, compare by sid rather than by position
    if 'sid' in new_df:
        new_df = new_df.set_index('sid').sort_index()
        old_df = old_df.set_index('sid').sort_index()

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(f, old_df.shape, new_df.shape)
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(f, list(column_diff))
    # Frame comparison needs identical row labels; check it here so a mismatch
    # is reported as a readable assertion instead of a ValueError
    assert old_df.index.equals(new_df.index), \
        "{}: row labels differ between versions".format(f)

    # Reorder the new columns to the old order: the check above only compares
    # the column *sets*, while the comparison needs identical ordering
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_df[old_df.columns].fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(f, list(differs[differs].index))

Checking Child_weightforage.dta
Checking Neonate_weightforage.dta


# Symptom Data

In [43]:
# Compare each module's symptom file on the J drive with the copy in `new_dir`
old_presymp_dir = os.path.join(old_dir, 'Publication_2015', 'Revised Data', 'Symptom Data')
symp_filename = '{}Data.dta'
for age in ages:
    # A parenthesised single-argument print behaves the same on Python 2 and 3
    print("Checking {}".format(age))
    filename = symp_filename.format(age)
    old_df = pd.read_stata(os.path.join(old_presymp_dir, filename), convert_categoricals=False).set_index('sid')
    new_df = pd.read_stata(os.path.join(new_dir, filename), convert_categoricals=False).set_index('sid')

    # Every assertion carries a message so a failure says what differed
    assert old_df.shape == new_df.shape, \
        "{}: shape {} != {}".format(age, old_df.shape, new_df.shape)
    index_diff = set(old_df.index).symmetric_difference(new_df.index)
    assert not index_diff, \
        "{}: {} sids are not shared by both versions".format(age, len(index_diff))
    column_diff = set(old_df.columns).symmetric_difference(new_df.columns)
    assert not column_diff, \
        "{}: columns not shared by both versions: {}".format(age, list(column_diff))

    # Align rows *and* columns by label: `==`/`!=` raise on frames whose labels
    # are ordered differently, and the checks above only compare label sets
    new_aligned = new_df.loc[old_df.index, old_df.columns]
    # NOTE(review): fillna(0) makes a missing value equal to a real 0 -- confirm intended
    differs = (old_df.fillna(0) != new_aligned.fillna(0)).any()
    assert not differs.any(), \
        "{}: values differ in columns {}".format(age, list(differs[differs].index))

Checking Adult


AssertionError: 