In [None]:
import os
import pandas as pd
import numpy as np
import random
import datetime
import re
import hashlib

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

import shap
import xgboost as xgb

import matplotlib.pyplot as plt

# CSV of per-session symptom outcomes; default input of preprocess_outcomes.
OUTCOMES_PATH = "../data/symptom_outcomes.csv"
# Root directory walked by extract_audio_filenames to find session recordings.
PATH_TO_AUDIO = "/vol0/psych_audio/raw-audio/"
MAX_ID_LEN = 6  # Some IDs in the pandas dataframe are cast to integers, which gives them nonstandard lengths; string IDs are left-padded with zeros to this length
# Destination of the tab-separated metadata table written with meta_df.to_csv.
OUT_PATH = "../results/scotty_phq9_diffs_with_paths.tsv"

In [None]:
# These columns are coded numerically in the dataset, but when they are
# read in with pandas they initially arrive as strings (the converters
# below treat a blank cell as a missing value).
NUMERIC_COLS = [
    'No_Interest_ses',
    'Feeling_down_ses',
    'Sleep_ses',
    'Tired_ses',
    'Appetite_ses',
    'Bad_self_ses',
    'Concentrating_ses',
    'Move_slow_ses',
    'Dead_ses',
    'Work_ses',
    'Large_amt_ses',
    'OBE_LOC_ses',
    'Vomit_ses',
    'Laxative_ses',
    'Restrict_ses',
    'Shp_wt_judge_ses',
    'Schoolwork_ses',
    'Relationships_ses',
    'Shp_Wt_Bad_self_ses',
    'PHQ9_total_ses',
]


# Simple converter functions that turn raw CSV cells (strings) into
# integers / datetimes. Any empty or whitespace-only cell is treated as
# missing, not just the single-space cell " ".
def STR_TO_INT(val):
    """Convert a raw CSV cell to int; blank or whitespace-only cells become None."""
    stripped = val.strip()
    return int(stripped) if stripped else None


def STR_TO_DATE(val):
    """Convert a raw "MM/DD/YYYY" CSV cell to datetime; blank cells become None."""
    stripped = val.strip()
    return datetime.datetime.strptime(stripped, "%m/%d/%Y") if stripped else None


# Dictionary of converter functions, keyed by column name, in the form
# expected by the `converters` argument of pd.read_csv.
CONVERTERS = {
    'Date_ses': STR_TO_DATE,
    'Date_ses1': STR_TO_DATE,
    'Age_ses1': STR_TO_INT,
    'Weight_ses1': STR_TO_INT,
    'Height_ses1': STR_TO_INT,
    'Gender_ses1': STR_TO_INT,
}

# Register the integer converter for every numeric column and for its
# first-session-specific twin (same name with a trailing "1").
for numeric_col in NUMERIC_COLS:
    CONVERTERS[numeric_col] = STR_TO_INT
    CONVERTERS[numeric_col + "1"] = STR_TO_INT

In [None]:
def make_hash(row):
    """ Builds the SHA-256 hex digest that identifies one session

    The digest is computed over the UTF-8 encoding of the string
    "<ID_number>:<YYYY-MM-DD>", where the two parts come from the row's
    'ID_number' and 'Date_ses' entries. 'Date_ses' must therefore offer a
    `strftime` method (e.g. a datetime or pandas Timestamp).

    Args:
        row: mapping-like object (e.g. a pandas dataframe row) with
            'ID_number' and 'Date_ses' entries

    Returns:
        The 64-character hexadecimal digest (256 bits) as a string
    """
    session_date = row['Date_ses'].strftime("%Y-%m-%d")
    session_key = ':'.join((str(row['ID_number']), session_date))
    return hashlib.sha256(session_key.encode('utf-8')).hexdigest()

In [None]:
def preprocess_outcomes(outcomes_path=OUTCOMES_PATH, converters=CONVERTERS):
    """ Read in and clean raw outcomes data file

    The data file given to us by the authors compiling the original
    study has some extraneous columns and an overall format that is
    not necessarily conducive to the types of analyses we wish to run.
    This function therefore reads that original data file into a
    pandas dataframe and does some minor preprocessing, including:
        (1) Deriving the site, therapist, and patient ID columns from the
            session recording ID
        (2) Collapsing special _ses1 columns (indicating that the session
            is the first for the patient) into the non-first-session columns
            of the same name
        (3) Dropping sessions that have no date (these are not useful to us)
        (4) Sorting the entries by (first) ID_number and (second) session date
        (5) Converting numeric columns to numeric values appropriately
            (done by the `converters` while reading the file)

    Args:
        outcomes_path: path of the CSV file to read
        converters: dict mapping column name -> converter function, passed
            straight to pd.read_csv

    Returns:
        The cleaned dataframe, sorted by ['ID_number', 'Date_ses'], with a
        fresh 0..n-1 index and without the first-session helper columns.
    """
    outcomes_df = pd.read_csv(outcomes_path, converters=converters)

    # Derive ID columns from the recording ID:
    #   Site_ID_number      = ID_number // 10000
    #   Therapist_ID_number = ID_number // 100 (keeps the site digits as a prefix)
    #   Patient_ID_number   = the full ID_number
    # NOTE(review): the original comment described three isolated 2-digit
    # fields (site / therapist / patient); the code keeps the leading digits
    # instead. Confirm which is intended before changing the arithmetic.
    outcomes_df['Site_ID_number'] = outcomes_df['ID_number'].floordiv(10000)
    outcomes_df['Therapist_ID_number'] = outcomes_df['ID_number'].floordiv(100)
    outcomes_df['Patient_ID_number'] = outcomes_df['ID_number']

    # First-session rows only have Date_ses1 set, so copy it into Date_ses.
    # Assign the result explicitly: a chained `inplace=True` fillna on a
    # column is not guaranteed to write back into the dataframe.
    outcomes_df['Date_ses'] = outcomes_df['Date_ses'].fillna(outcomes_df['Date_ses1'])

    has_no_date = outcomes_df['Date_ses'].isnull()
    print("Subject {} has no date. Dropping...".format(outcomes_df.loc[has_no_date, 'ID_number']))
    # .copy() so that the column assignments below act on an independent frame
    outcomes_df = outcomes_df.loc[~has_no_date, :].copy()

    # Fill in the columns that don't end in 1 with information from the first
    # session, using the corresponding column names that do end in 1
    for numeric_col in NUMERIC_COLS:
        outcomes_df[numeric_col] = outcomes_df[numeric_col].fillna(outcomes_df[numeric_col + "1"])

    # Sort by ID first, then by date within each ID
    outcomes_df = outcomes_df.sort_values(['ID_number', 'Date_ses']).reset_index(drop=True)

    # Get rid of the now-redundant first-session columns
    unnecessary_cols = [numeric_col + "1" for numeric_col in NUMERIC_COLS] + ['Date_ses1', 'First_ses', 'is_first_session']
    outcomes_df = outcomes_df.drop(unnecessary_cols, axis=1)

    return outcomes_df

In [None]:
def _parse_session_number(first_token):
    """Return the session number (as a string) that follows the leading letter, e.g. 'S2' -> '2', 'S12' -> '12'."""
    digits = re.match(r"[0-9]+", first_token[1:])
    if digits:
        return digits.group(0)
    return first_token[1]  # Nonstandard token: fall back to the 2nd character


def _single_part_number(part_str, name):
    """Return the one digit contained in part_str; raise ValueError if there is not exactly one."""
    digits = [int(ch) for ch in part_str if ch.isdigit()]
    if len(digits) != 1:
        raise ValueError("Invalid part number: {}".format(name))
    return digits[0]


def _parse_recording_date(filename):
    """Return the M.D.YY / M.D.YYYY date embedded in filename as a pandas Timestamp."""
    date_match = re.search(r"(P[0-9].|\s|_)([0-9]{1,2}\.[0-9]{1,2}\.[0-9]{2,4})", filename)
    if not date_match:
        raise AttributeError("No date found for {}!".format(filename))
    date_str = date_match.group(2)
    year = date_str.split('.')[2]
    # Parse 4-digit years directly; stripping the substring '20' would turn
    # '2020' into an empty string.
    date_format = "%m.%d.%Y" if len(year) == 4 else "%m.%d.%y"
    return pd.Timestamp(datetime.datetime.strptime(date_str, date_format))


def _session_mask(meta_df, subj_id, session_date):
    """Boolean mask of the rows whose ID_number and calendar date match the recording."""
    # Normalise the column so the match works whether Date_ses holds
    # datetime objects, Timestamps or "YYYY-MM-DD" strings.
    dates = pd.to_datetime(meta_df['Date_ses'], errors='coerce')
    return (meta_df['ID_number'] == subj_id) & (dates == session_date)


def _append_placeholder_row(meta_df, subj_id, sess_num, session_date, path):
    """Return meta_df plus one row describing a recording that has no metadata."""
    existing_dates = meta_df['Date_ses'].dropna()
    if len(existing_dates) > 0 and isinstance(existing_dates.iloc[0], str):
        # Keep the representation already used by the column
        session_date = session_date.strftime("%Y-%m-%d")
    new_row = pd.DataFrame([{
        'ID_number': subj_id,
        'Patient_ID_number': subj_id,
        'Num_sess': sess_num,
        'Date_ses': session_date,
        'audio_path': path,
    }])
    for col in new_row.columns:
        if col in meta_df.columns:
            new_row[col] = new_row[col].astype(meta_df.dtypes[col])
    # pd.concat replaces DataFrame.append; ignore_index keeps row labels unique
    return pd.concat([meta_df, new_row], ignore_index=True, sort=False)


def extract_audio_filenames(meta_df, audio_path=PATH_TO_AUDIO):
    """ Adds audio_path and Num_sess columns to metadata file

    Walks through the directory tree given by the audio_path argument and,
    for each file in the directory, extracts the date of the recording,
    the patient ID, and the session number (note that, because there are
    gaps, it may be the case that the serial ordering in the metadata file
    does not correspond to a serial ordering of session numbers - therefore
    this information must be extracted from the original audio filename
    itself). By matching on the patient ID and date, an 'audio_path' and
    'Num_sess' entry are added for each metadata row that corresponds to an
    audio file discovered under 'audio_path'. Two-part recordings are
    stored as "part1;part2" in 'audio_path'. Recordings with no matching
    metadata row get a placeholder row holding only the information parsed
    from the filename.

    Args:
        meta_df: metadata dataframe with at least 'ID_number' and 'Date_ses'
            columns. It is not modified; a copy is annotated and returned.
        audio_path: root directory to walk for audio files

    Returns:
        A new dataframe with 'audio_path' and 'Num_sess' columns filled in.
        If placeholder rows were added, the index is renumbered 0..n-1.

    Raises:
        AssertionError: a file does not have a known audio extension
        ValueError: a filename's part number or subject ID cannot be parsed,
            or a part number other than 1 or 2 is found
        AttributeError: a filename contains no recognisable date
    """
    meta_df = meta_df.copy()
    for col in ('audio_path', 'Num_sess'):
        if col not in meta_df.columns:
            meta_df[col] = None  # object column, so paths can be stored later

    for root, _sub_folders, files in os.walk(audio_path):
        for filename in files:
            filetype = filename.split(".")[-1]
            assert filetype in {"mp3", "MP3", "wav", "WAV", "wma", "WMA"}, filename
            name = filename[:-4]  # Safe: every accepted extension is 3 characters long
            name_split = name.split("_")
            sess_num = _parse_session_number(name_split[0])  # 'S1' is the 1st session
            subj_id = name_split[1]  # The second token should correspond to subj_id
            part_num = -1  # Stays -1 unless the recording is split into parts

            # Handle the case where the last 5 chars are "Part1" or "Part2"
            if name[-5:-1] in ("Part", "part"):
                part_num = _single_part_number(name[-5:], name)

            # Handle the case where the last 3 chars are "pt1" or "pt2"
            if name[-3:-1] in ("Pt", "pt"):
                part_num = _single_part_number(name[-3:], name)

            # Handling abnormal case 1: S2_Part 1_470101_P1_10.20.15.MP3
            try:
                int(subj_id)  # Check that the second token can be cast to an integer
            except ValueError:  # If not, assume the second token is the "part"
                print("\nEncountered nonstandard formatting: ")
                print(filename)
                try:
                    subj_id = name_split[2]
                    print("Skipping second token. Using third token as subject ID: {}\n".format(subj_id))
                    int(subj_id)
                    part_num = _single_part_number(name_split[1], name)
                except (IndexError, ValueError):
                    raise ValueError("Failed to parse nonstandard formatting: {}".format(filename))

            subj_id = int(subj_id)
            session_date = _parse_recording_date(filename)
            path = root + '/' + filename
            is_session = _session_mask(meta_df, subj_id, session_date)

            if not is_session.any():
                print("WARNING: Audiofile {} has no metadata. Adding placeholder row to metadata...".format(filename))
                # Appended immediately so that a later part of the same
                # recording finds this row and is merged into it.
                meta_df = _append_placeholder_row(meta_df, subj_id, sess_num, session_date, path)
                continue

            if part_num == -1:  # Nothing special, just a single audio file
                meta_df.loc[is_session, 'audio_path'] = path
            elif part_num in (1, 2):
                # Keep part 1 before part 2 regardless of the order in which
                # the files are visited.
                merged_paths = []
                for stored_path in meta_df.loc[is_session, 'audio_path']:
                    if pd.isnull(stored_path):
                        merged_paths.append(path)
                    elif part_num == 1:
                        merged_paths.append(path + ";" + stored_path)
                    else:
                        merged_paths.append(stored_path + ";" + path)
                meta_df.loc[is_session, 'audio_path'] = merged_paths
            else:
                raise ValueError("ERROR: Our code can't handle more than 2-part audio files")

            meta_df.loc[is_session, 'Num_sess'] = sess_num

    return meta_df

In [None]:
def calculate_col_diffs_per_sess(diff_df, col_name='PHQ9_total_ses'):
    """Calculates session-to-session differences for a column

        Finds the session-to-session differences for a given column,
        grouping by patient ID number. Thus, if we passed in
        'PHQ9' as the diff column, then the entry under 'PHQ9_diff' for
        each patient's first session will be that patient's second session
        PHQ-9 score minus their first session PHQ-9 score. The last row of
        each patient has no following session, so its difference is NaN.

        Rows are differenced in the order in which they appear within each
        ID, so the caller should sort by ['ID_number', 'Date_ses'] first.

        Args:
            diff_df: pandas dataframe with columns
                'ID_number' corresponding to the patient ID number
                'Date_ses' corresponding the date of the session
                (the input dataframe itself is not modified)
            col_name: The name of the column for which the user
                would like to calculate session-to-session differences.
                Default is 'PHQ9_total_ses'

        Returns:
            A new dataframe whose first two columns are 'ID_number' and
            'Date_ses', with an extra column named col_name + '_diff'.
    """
    # MultiIndex with the ID number as outer level and the date as inner
    # level; reset_index below moves both back to the leading columns.
    diff_df = diff_df.set_index(['ID_number', 'Date_ses'])

    # diff(periods=-1) yields current - next, so negate to get next - current.
    # A single groupby replaces the per-ID loop and also copes with repeated
    # (ID, date) pairs, since nothing is re-aligned on index labels.
    current_minus_next = diff_df.groupby(level='ID_number', sort=False)[col_name].diff(periods=-1)
    diff_df[col_name + '_diff'] = (-current_minus_next).to_numpy()

    return diff_df.reset_index()

In [None]:
# Load and clean the raw outcomes table, using the default OUTCOMES_PATH and CONVERTERS.
meta_df = preprocess_outcomes()

In [None]:
# Attach 'audio_path' and 'Num_sess' to each session by walking the default PATH_TO_AUDIO tree.
meta_df = extract_audio_filenames(meta_df)

In [None]:
# Re-sort by ID and date: extract_audio_filenames can add placeholder rows at the end of the table.
meta_df = meta_df.sort_values(by=['ID_number', 'Date_ses'])

In [None]:
# Add 'PHQ9_total_ses_diff': each session's next-session PHQ-9 total minus its own.
meta_df = calculate_col_diffs_per_sess(meta_df, col_name='PHQ9_total_ses')

In [None]:
# One SHA-256 hex digest per row, derived from ID_number and Date_ses (see make_hash).
meta_df['hash'] = meta_df.apply(make_hash, axis=1)

In [None]:
# Write the table as tab-separated text without the index; floats are written with two decimals.
meta_df.to_csv(OUT_PATH, sep='\t', float_format='%.2f', index=False) 

# Making Train, Dev, and Test splits

In [None]:
# Directory of gold-standard transcripts; extract_test_subj_ids reads the test-set IDs from its filenames.
TEST_TEXT_PATH = "/vol0/psych_audio/gold-transcripts/gold-final"
# Tab-separated metadata table read back for the splits.
# NOTE(review): presumably the same file that was written to OUT_PATH, addressed by absolute path - confirm.
METADATA_PATH = "/vol0/psych_audio/scotty/results/scotty_phq9_diffs_with_paths.tsv"

In [None]:
def extract_test_subj_ids(test_files_path=TEST_TEXT_PATH):
    """Collect the subject IDs that make up the test set.

    Every entry of `test_files_path` is split on underscores and its second
    token is taken as a subject ID. Entries whose first token is ".DS"
    (e.g. ".DS_Store") are ignored.

    Args:
        test_files_path: directory holding the gold-standard transcripts

    Returns:
        The set of test subject IDs (set of strings, each left-padded with
        zeros to at least MAX_ID_LEN characters)
    """
    tokenized_names = (entry.split('_') for entry in os.listdir(test_files_path))
    return {
        tokens[1].rjust(MAX_ID_LEN, '0')  # Left-pad with zeros
        for tokens in tokenized_names
        if tokens[0] != ".DS"
    }

def extract_train_dev_combined_subj_ids(test_subj_ids, metadata_path=METADATA_PATH):
    """Extract the union of train and dev IDs

    Reads the metadata file, collects every distinct 'ID_number' as a
    zero-padded string, and removes the IDs that already belong to the
    test set. Whatever remains is the combined train/dev pool.

    Args:
        test_subj_ids: (set of strings) the set of IDs associated with the test set
        metadata_path: path of the tab-separated metadata file

    Returns:
        A tuple (train_dev_ids, all_metadata_ids), both sets of strings
        left-padded with zeros to at least MAX_ID_LEN characters:
            train_dev_ids: metadata IDs that are not test IDs
            all_metadata_ids: every ID found in the metadata file
    """
    metadata = pd.read_csv(metadata_path, delimiter="\t")
    padded_ids = set()
    for raw_id in np.unique(metadata['ID_number']):
        padded_ids.add(str(raw_id).rjust(MAX_ID_LEN, '0'))  # Left-pad with zeros
    return padded_ids - test_subj_ids, padded_ids

def extract_train_dev_split_subj_ids(train_dev_ids, n_dev=30, seed=42):
    """Extract the (split) set of train and dev IDs

    Randomly divide the IDs in train_dev_ids between the train set and the
    dev set such that there are n_dev IDs in the dev set. The returned sets
    are disjoint, and their union is the original train_dev_ids.

    Args:
        train_dev_ids: collection of IDs to split
        n_dev: number of IDs to place in the dev set
        seed: seed for Python's global `random` generator (re-seeded on
            every call)

    Returns:
        A tuple (train_subj_ids, dev_subj_ids) of sets

    Raises:
        ValueError: if n_dev is larger than the number of IDs
    """
    random.seed(seed)
    # Sample from a sorted list: random.sample no longer accepts sets, and
    # set iteration order is not stable between runs, which would make the
    # split irreproducible even with a fixed seed.
    dev_subj_ids = set(random.sample(sorted(train_dev_ids), n_dev))
    train_subj_ids = set(train_dev_ids) - dev_subj_ids
    return train_subj_ids, dev_subj_ids

In [None]:
# Read the metadata table back from disk.
df = pd.read_csv(METADATA_PATH, delimiter="\t")
# Lookup from each row's 'audio_path' value to its 'hash' value (a later row wins if a path repeats).
path_to_hash_map = {df.loc[i, 'audio_path']:df.loc[i, 'hash'] for i in df.index}

In [None]:
# Test IDs come from the gold transcript filenames; every other metadata ID is train/dev,
# which is then split randomly (default n_dev=30, seed=42).
test_subj_ids = extract_test_subj_ids(TEST_TEXT_PATH)
train_dev_ids, all_metadata_ids = extract_train_dev_combined_subj_ids(test_subj_ids)
train_subj_ids, dev_subj_ids = extract_train_dev_split_subj_ids(train_dev_ids)

In [None]:
# Report the size of each split, then check that the three splits are pairwise disjoint.
print("Extracted {} train ID's".format(len(train_subj_ids)))
print("Extracted {} dev ID's".format(len(dev_subj_ids)))
print("Extracted {} test ID's".format(len(test_subj_ids)))
print("Extracted {} total ID's".format(len(train_subj_ids.union(dev_subj_ids).union(test_subj_ids))))
print("There are {} unique ID's in the metadata table".format(len(all_metadata_ids)))
assert len(train_subj_ids.intersection(dev_subj_ids)) == 0
assert len(train_subj_ids.intersection(test_subj_ids)) == 0
assert len(dev_subj_ids.intersection(test_subj_ids)) == 0

In [None]:
# Convert the zero-padded string IDs to integers so they can be matched against df['ID_number'].
# NOTE(review): assumes 'ID_number' was read from the TSV as a numeric column - confirm.
train_subj_ids = {int(x) for x in train_subj_ids}
dev_subj_ids = {int(x) for x in dev_subj_ids}
test_subj_ids = {int(x) for x in test_subj_ids}

# Boolean membership flag per row for each split.
df['train'] = df['ID_number'].isin(train_subj_ids)
df['dev'] = df['ID_number'].isin(dev_subj_ids)
df['test'] = df['ID_number'].isin(test_subj_ids)

In [None]:
# Filter the already-loaded `df` instead of re-reading METADATA_PATH:
# re-reading the file here would throw away the 'train' / 'dev' / 'test'
# columns that were just added to `df`.
# TODO: Do we want to drop all sessions without PHQ9 total?
# TODO: Do we want to drop all sessions without the session number?
df = (
    df
    .dropna(subset=['PHQ9_total_ses', 'Num_sess'])
    .assign(Num_sess=lambda frame: frame['Num_sess'].astype(int))
)

# Adding text to the dataframe for downstream ML

In [None]:
# Root directory of the transcript files that add_text_to_df walks.
TRAIN_DEV_TEXT_PATH = "/vol0/psych_audio/bootcamp-2018/Data/Transcriptions"

In [None]:
def add_text_to_df(meta_df, text_path=None):
    """Create the text field where the transcript words go.

    Walks the transcript directory and, for each file, parses the session
    number (digits after the first character of the first underscore-
    separated token, e.g. 'S2' or 'S12') and the subject ID (second token,
    or third token when the second is not an integer). The file's
    "Word: <word>" lines are lower-cased, joined with single spaces, and
    stored in the 'text' column of every row whose 'ID_number' and
    'Num_sess' match; the file's path is stored in 'text_path'.

    Args:
        meta_df: dataframe with 'ID_number' and 'Num_sess' columns. It is
            modified IN PLACE (callers may ignore the return value).
        text_path: root directory of the transcripts. Defaults to
            TRAIN_DEV_TEXT_PATH when None.

    Returns:
        The same meta_df object, with 'text' and 'text_path' columns
        (created if missing, so they exist even when no file matched).

    Raises:
        ValueError: a filename's session number or subject ID cannot be parsed
    """
    if text_path is None:
        text_path = TRAIN_DEV_TEXT_PATH

    for col in ('text', 'text_path'):
        if col not in meta_df.columns:
            meta_df[col] = None

    for root, _sub_folders, files in os.walk(text_path):
        for filename in files:
            name = filename[:-4]  # Assumes a 3-character file extension
            name_split = name.split("_")
            sess_token = name_split[0]
            subj_id = name_split[1]  # The second token should correspond to subj_id

            # Session number: all digits following the first character, so
            # that 'S12' is read as 12 rather than 1.
            sess_digits = re.match(r"[0-9]+", sess_token[1:])
            sess_num = int(sess_digits.group(0)) if sess_digits else int(sess_token[1])

            path = root + '/' + filename

            # Handling abnormal case 1: S2_Part 1_470101_P1_10.20.15.MP3
            try:
                int(subj_id)  # Check that the second token can be cast to an integer
            except ValueError:  # If not, assume the second token is the "part"
                print("\nEncountered nonstandard formatting: ")
                print(filename)
                try:
                    subj_id = name_split[2]
                    print("Skipping second token. Using third token as subject ID: {}\n".format(subj_id))
                    int(subj_id)
                    # The skipped token must carry exactly one part digit
                    part_digits = [ch for ch in name_split[1] if ch.isdigit()]
                    if len(part_digits) != 1:
                        raise ValueError("Invalid part number: {}".format(name))
                except (IndexError, ValueError):
                    raise ValueError("Failed to parse nonstandard formatting: {}".format(filename))

            with open(path) as f:
                # The second space-separated field of a "Word:" line is the
                # word; its last character is dropped.
                # NOTE(review): presumably that character is a trailing newline or
                # delimiter - confirm against the transcript format.
                sess_words_list = [
                    fields[1].lower()[:-1]
                    for fields in (line.split(' ') for line in f)
                    if fields[0] == 'Word:'
                ]
            sess_words_str = ' '.join(sess_words_list)

            is_session = (meta_df['ID_number'] == int(subj_id)) & (meta_df['Num_sess'] == sess_num)
            meta_df.loc[is_session, 'text'] = sess_words_str
            meta_df.loc[is_session, 'text_path'] = path

    return meta_df

In [None]:
# add_text_to_df writes the 'text' / 'text_path' values into df in place, so its return value is not used here.
add_text_to_df(df)
# Keep only the sessions for which a transcript was found.
df = df.dropna(subset=['text'])