# Koki Sasagawa
### LHS 712 Project
### 4/16/2018

In [1]:
import os
import xml.etree.ElementTree as ET
import nltk
import re
from nltk.tokenize import RegexpTokenizer
# from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
# from nltk.stem.lancaster import LancasterStemmer
# from nltk.stem.porter import PorterStemmer

In [2]:
path_data = '/Users/koki/Documents/Notes/LHS_712_NLP_Health_Data/n2c2/train/'

met_files = set()
counter1 = 0
 
for file_name in os.listdir(path_data):  # list all files in the current directory
    tree = ET.parse(path_data + file_name)
    if tree.find('.//ALCOHOL-ABUSE').get('met') == 'met': # retrieve tag ALCOHOL-ABUSE and get met attribute
        counter1 += 1
        met_files.add(file_name)
        
not_met_files = set()
counter2 = 0

for file_name in os.listdir(path_data):  # list all files in the current directory
    tree = ET.parse(path_data + file_name)
    if tree.find('.//ALCOHOL-ABUSE').get('met') == 'not met': # retrieve tag ALCOHOL-ABUSE and get met attribute
        counter2 += 1
        not_met_files.add(file_name)

In [3]:
print("%d total files" %(counter1 + counter2))
print("%d are 'met' cases" %counter1)
print("%d are 'not met' cases" %counter2)

202 total files
7 are 'met' cases
195 are 'not met' cases


In [4]:
# Name of met files
print("These files are:")
for i in sorted(met_files):
    print(i)

These files are:
159.xml
176.xml
187.xml
212.xml
258.xml
325.xml
344.xml


In [5]:
# Create list of words related to drinking. Use as key words.
ALCOHOL = ['drink',
           'alcohol',
           'etoh',
           'ethanol']

# Create list of words for defining met cases:
# Some words retrieved from: 
# https://www.merriam-webster.com/thesaurus/alcoholism
# https://www.projectknow.com/research/addiction-glossary-of-terms-and-phrases/
ALCOHOL_MODIFIER = ["abuse",
                    "addiction",
                    "binge",
                    "concern",
                    "dependence",
                    "excessive",
                    "heavy"]

ALCOHOL_MENTAL = ["anxiety",
                  "debauchery",
                  "depression",
                  "dispomania",
                  "dissoluteness",
                  "distraught",
                  "drunkenness",
                  "inebriety",
                  "insobriety",
                  "intemperance",
                  "intoxicate",
                  "bibulousness",]

NEGATION = ["no",
            "without",
            "stop",
            "n't",
            "not",
            "h/o", 
            "never", 
            "none",  
            "nor", 
            "non",
            "rare",
            "previous",
            "prior",
            "history",
            "denies",
            "negative"]

In [6]:
snowball_stemmer = SnowballStemmer("english")
stemmed_alcohol = [snowball_stemmer.stem(word) for word in ALCOHOL]
stemmed_alcohol

['drink', 'alcohol', 'etoh', 'ethanol']

In [7]:
stemmed_alcohol_modifer = [snowball_stemmer.stem(word) for word in ALCOHOL_MODIFIER]
stemmed_alcohol_modifer

['abus', 'addict', 'bing', 'concern', 'depend', 'excess', 'heavi']

In [8]:
stemmed_alcohol_mental = [snowball_stemmer.stem(word) for word in ALCOHOL_MENTAL]
stemmed_alcohol_mental

['anxieti',
 'debaucheri',
 'depress',
 'dispomania',
 'dissolut',
 'distraught',
 'drunken',
 'inebrieti',
 'insobrieti',
 'intemper',
 'intox',
 'bibul']

In [9]:
stemmed_negation = [snowball_stemmer.stem(word) for word in NEGATION]
stemmed_negation

['no',
 'without',
 'stop',
 "n't",
 'not',
 'h/o',
 'never',
 'none',
 'nor',
 'non',
 'rare',
 'previous',
 'prior',
 'histori',
 'deni',
 'negat']

In [10]:
# MAIN SCRIPT
met_predictions = set()
not_met_predictions = set()

for file_name in os.listdir(path_data):
    tree=ET.parse(path_data + file_name)
    raw_text = tree.find('.//TEXT').text

    clean_text = re.sub('\\n', ' ', raw_text)
    clean_text = re.sub('\\t','', clean_text)
    clean_text = re.sub('[\s]{2,}', ' ', clean_text)

    sentences = nltk.sent_tokenize(clean_text)
    # sentences = nltk.sent_tokenize(raw_text)
    
    hotspot_lines = set()

    for i in sentences:
        # filter out anything non-alphabetical characters and a few special characters
        tokenizer = RegexpTokenizer(r'[a-zA-Z\/\']+')
        token = tokenizer.tokenize(i)

        # Different Stemmers

        # wordnet_lemmatizer = WordNetLemmatizer()
        # lemmatized_tokens = [wordnet_lemmatizer.lemmatize(word) for word in tagged_token]

        # lancaster_stemmer = LancasterStemmer()
        # stemmed_tokens = [lancaster_stemmer.stem(word) for word in token]

        # porter_stemmer = PorterStemmer()
        # stemmed_tokens = [porter_stemmer.stem(word) for word in token]

        snowball_stemmer = SnowballStemmer("english")
        stemmed_tokens = [snowball_stemmer.stem(word.lower()) for word in token]

        drink_score = 0
        abuse_score = 0

        for j in range(len(stemmed_tokens)):
            if stemmed_tokens[j] in stemmed_alcohol:
                drink_score += 1
                # Negation Detection in negative direction
                if (j > 0) and (stemmed_tokens[j - 1] in stemmed_negation):
                    drink_score = 0
                elif (j > 1) and (stemmed_tokens[j - 2] in stemmed_negation):
                    drink_score = 0
                elif (j > 2) and (stemmed_tokens[j - 3] in stemmed_negation):
                    drink_score = 0
                elif (j > 3) and (stemmed_tokens[j - 4] in stemmed_negation):
                    drink_score = 0
                elif (j > 4) and (stemmed_tokens[j - 5] in stemmed_negation):
                    drink_score = 0
                # Negation Detection in positive direction
                elif (j < len(stemmed_tokens) - 1) and (stemmed_tokens[j + 1] in stemmed_negation):
                    drink_score = 0
                elif (j < len(stemmed_tokens) - 2) and (stemmed_tokens[j + 2] in stemmed_negation):
                    drink_score = 0
                elif (j < len(stemmed_tokens) - 3) and (stemmed_tokens[j + 3] in stemmed_negation):
                    drink_score = 0
                # Positive Modifer Detection
                elif (j > 0) and (stemmed_tokens[j - 1] in stemmed_alcohol_modifer):
                    abuse_score += 1
                elif (j > 1) and (stemmed_tokens[j - 2] in stemmed_alcohol_modifer):
                    abuse_score += 1
                elif (j < len(stemmed_tokens) - 1) and (stemmed_tokens[j + 1] in stemmed_alcohol_modifer):
                    abuse_score += 1
                elif (j < len(stemmed_tokens) - 2) and (stemmed_tokens[j + 2] in stemmed_alcohol_modifer):
                    abuse_score += 1
            # Mental Health Detection    
            elif stemmed_tokens[j] in stemmed_alcohol_mental:
                abuse_score += 1
        if drink_score >= 1 and abuse_score >= 1:
            hotspot_lines.add(i)

    if hotspot_lines:
        met_predictions.add(file_name)
    else:
        not_met_predictions.add(file_name)

In [11]:
met_predictions

{'110.xml',
 '159.xml',
 '176.xml',
 '187.xml',
 '188.xml',
 '210.xml',
 '212.xml',
 '319.xml',
 '325.xml',
 '344.xml',
 '356.xml'}

In [12]:
# Performance Results

P = counter1
N = counter2 

# True positive is the event that test makes a positive prediction and the
# subject has a positive result under the gold standard
TP = 0
for file in met_files:
    if file in met_predictions:
        TP += 1
        
# True negative is the event that the test makes a negative prediction and 
#  the subject has a negative result under the gold standard
TN = 0
for file in not_met_files:
    if file in not_met_predictions:
        TN += 1 
        
# False positive is the event that the test makes a positive prediction and 
# the subject has a negative result under the gold standard. 
FP = N - TN

# False negative is the event that the test makes a negative prediction and
# the subject has a positive result under the gold standard
FN = P - TP

# The fraction of relevant positive instances among retrieved positive instances
precision = TP / (TP + FP)

# Closeness of a measured value to standard
accuracy = (TP + TN) / (TP + FP + FN + TN)
f1_score = (2 * TP) / ((2 * TP) + FP + FN)

# Print Results 
print("True positives: %d" %TP)
print("False positives: %d" %(FP))
print("True negatives: %d" %TN)
print("False negatives: %d" %(FN))
print("Sensitivity (True Positive Rate): %0.2f" %(TP / P))
print("Specificity (True Negative Rate): %0.2f" %(TN / N))
print("Precision: %0.2f" %precision)
print("Accuracy: %0.2f" %accuracy)
print("F1 score: %0.2f" %f1_score)

True positives: 6
False positives: 5
True negatives: 190
False negatives: 1
Sensitivity (True Positive Rate): 0.86
Specificity (True Negative Rate): 0.97
Precision: 0.55
Accuracy: 0.97
F1 score: 0.67


In [9]:
# def combine(i):
#     """ This recursive functinon will add consecutively occuring lines together """
#     if i < N and trimmed_lines[i]:
#         line = trimmed_lines[i]
#         trimmed_lines[i] = ''
#         return line + ' ' + combine(i+1)
#     return ''