In [10]:
# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
from nltk.corpus import wordnet
syns = wordnet.synsets("program")

# things we need for Tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random
import re
import unicodedata
import sys

In [11]:
import urllib.request
import urllib.response
import sys
import os, glob
import http.client, urllib
import json
import http.client, urllib.request, urllib.parse, urllib.error, base64

# Replace the accessKey string value with your valid access key.
accessKey = '82e4612615634c398bfd26e7d6327833'
url = 'westcentralus.api.cognitive.microsoft.com'
path = '/text/analytics/v2.0/keyPhrases'

def extract_keywords(body):
    headers = {'Ocp-Apim-Subscription-Key': accessKey}
    conn = http.client.HTTPSConnection(url)
    body_req = {'documents':[{'language': 'en', 'id': 1, 'text': body}]}
    body_json = json.dumps(body_req)
    conn.request ("POST", path, body_json, headers)
    response = conn.getresponse ()
    string = response.read().decode('utf-8')
    json_obj = json.loads(string)
    print("json_obj: ", json_obj)
    keyphrases_list = ((json_obj['documents'])[0])['keyPhrases']
    print("keyphrases_list: ", keyphrases_list)
    return keyphrases_list

In [21]:
# medical conditions training data
training_data = []

import csv

# adds every medical condition along with associated info from csv file
with open('Disease CSV.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count != 0:
            training_data.append({"class":row[1], "synonyms":row[2], 
                                  "sentence":row[3], "treatments":row[4], "danger level":row[5]})
        line_count += 1

print ("%s total classes in training data" % len(training_data))

for term in training_data:
    print(term["class"] + " | " + term["synonyms"] + " | " + term["sentence"] + 
          " | " + term["treatments"] + " | " + term["danger level"])

28 total classes in training data
Common Cold | Upper Respiratory Tract;Nose and Throat Infection | Runny or stuffy nose;sore throat;cough;congestion;body aches;sneezing | Stay hydrated;rest;sooth a sore throat using saltwater gargle;take over the counter cold and cough medications
 | 1
Allergies | Allergic Reaction | Eye irritation;runny or stuffy nose;puffy watery eyes;sneezing;inflamed;itchy nose and throat | Remove the cause of allergy; 
otherwise take Antihistamines to relieve sneezing;Decongestants to relieve congestions in nasal membranes;Anti inflamary agents to reduce;Allergy shots | 1
Conjunctivitis | Pink Eye | Redness of eyes;itchy eyes;tearing or burning sensation of eyeball | See a doctor;Bacterial cases can be treated with antibiotic eye drops;Allergic reactions can be treated with other eye drops | 1
Irritable Bowel Syndrome (IBS) | Diarrhea | Watery, loose stools;frequent bowel movement;stomachache nausea;bloody stool | Diarrhea will typically go away in 2 days;Consult

In [28]:
# a table structure to hold the different punctuation used
tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                    if unicodedata.category(chr(i)).startswith('P'))

# method to remove punctuations from sentences.
def remove_punctuation(text):
     return text.translate(tbl)

# method to slice strings
def slicer(my_str,sub):
    index=my_str.find(sub)
    if index !=-1 :
         return my_str[index:] 
    else :
         raise Exception('Sub string not found!')

# find synonyms of all the words in a sentence
def find_syns(sentence):
    words = []
    for word in sentence.split():
        if word != '':
            synonyms = []
            for syn in wordnet.synsets(word):
                for l in syn.lemmas():
                    if l.name() not in synonyms and l.name() != word:
                        synonyms.append(l.name())
            word = (word + ' ' + ' '.join(synonyms)).strip()
            words.append(word)
    print("SYNONYMS:",words)
    return words

# clean all synonyms and words
def clean_syns(arg):
    clean_synonyms = []
    for syn_set in synonyms:
        syn_set = ' '.join(s for s in syn_set.split() if not any(c.isdigit() for c in s))
        syn_set = re.sub(r'\d+', '', syn_set)
        syn_set = syn_set.replace("_"," ")
        syn_set = remove_punctuation(syn_set)
        clean_synonyms.append(syn_set)
    print("CLEAN SYNONYMS: ",clean_synonyms)
    return clean_synonyms

In [29]:
import string

words = []
words_total = []
classes = []
classes_details = []
documents = []
ignore_words = ['?']

# loop through each description per symptom
for pattern in training_data:
    symptoms = pattern['sentence']
    word_list = symptoms.split(';')
    print("WORD_LIST:",word_list)
    word_list_new = []
    for word in word_list:
        word = remove_punctuation(word)
        word = word.strip().lower()
        word_list_new.append(word)
    print("WORD_LIST_NEW:",word_list_new)
    joined_symps = ' '.join(s for s in word_list_new)
    print("joined_symps:",joined_symps)
    
    #find synonyms of each word in description
    synonyms = find_syns(joined_symps)
    synonyms_clean = clean_syns(synonyms)
    
    # tokenize each word in the description
    w_list = []
    for syn_set in synonyms_clean:
        w_token = nltk.word_tokenize(syn_set)
        w = [stemmer.stem(i.lower()) for i in w_token if i not in ignore_words]
        w = sorted(list(set(w)))
    
        # strip duplicates
        for item in w:
            if not item in words_total and item != '':
                words_total.append(item.strip())
            else:
                w.remove(item)
    
        w_syn = ' '.join(w)
        w_list.append(w_syn)
    
    #add each string to words
    for item in w_list:
        if not item in words and item != '':
            #print("Item is in words array already.")
            words.append(item.strip())
     
    # add to documents in our corpus
    documents.append((w_list, pattern['class']))
    
    # add to our classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])
        classes_details.append({"class":pattern['class'], "synonyms":pattern['synonyms'],
                                "treatments":pattern['treatments'], "danger level":pattern['danger level']})
        
#remove duplicates in words
words = sorted(list(set(words)))

# remove duplicates in classes
classes = sorted(list(set(classes)))

print (len(documents), "documents", documents)
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)
print ("classes_details", classes_details)

WORD_LIST: ['Runny or stuffy nose', 'sore throat', 'cough', 'congestion', 'body aches', 'sneezing']
WORD_LIST_NEW: ['runny or stuffy nose', 'sore throat', 'cough', 'congestion', 'body aches', 'sneezing']
joined_symps: runny or stuffy nose sore throat cough congestion body aches sneezing
SYNONYMS: ['runny fluid', 'or Oregon Beaver_State OR operating_room operating_theater operating_theatre surgery', 'stuffy airless close unaired stodgy', 'nose olfactory_organ nozzle intrude horn_in pry poke scent wind nuzzle', 'sore sensitive raw tender afflictive painful huffy mad', 'throat pharynx', 'cough coughing', 'congestion over-crowding', 'body organic_structure physical_structure dead_body torso trunk consistency consistence eubstance soundbox personify', 'aches ache aching hurt suffer yearn yen pine languish smart', 'sneezing sneeze sternutation']
CLEAN SYNONYMS:  ['runny fluid', 'or Oregon BeaverState OR operatingroom operatingtheater operatingtheatre surgery', 'stuffy airless close unaired s

WORD_LIST: ['Tender,swollen breats', 'fatigue', 'slight bleeding or cramping', 'missing period', 'bloating', 'constipation', 'crying', 'gas', 'mood swings', 'lower back pain']
WORD_LIST_NEW: ['tenderswollen breats', 'fatigue', 'slight bleeding or cramping', 'missing period', 'bloating', 'constipation', 'crying', 'gas', 'mood swings', 'lower back pain']
joined_symps: tenderswollen breats fatigue slight bleeding or cramping missing period bloating constipation crying gas mood swings lower back pain
SYNONYMS: ['tenderswollen', 'breats', 'fatigue weariness tiredness fatigue_duty tire pall weary jade wear_upon tire_out wear wear_out outwear wear_down fag_out fag', 'slight rebuff cold-shoulder little flimsy fragile tenuous thin slender slim svelte', 'bleeding hemorrhage haemorrhage shed_blood bleed leech phlebotomize phlebotomise run', 'or Oregon Beaver_State OR operating_room operating_theater operating_theatre surgery', 'cramping cramp hamper halter strangle', 'missing miss lose neglect pr

333 unique stemmed words ['a adenin amp angstrom angstromunit antiophthalmicfact axerophthol deoxyadenosinemonophosph group type vitamin', 'ab abdomin abdominalmusc', 'abbrevy abridg boildown comedown contract cutback cutdown decoct deoxid dilut foreshort keepdown meltoff quash repress rockbottom scaledown shrink slim slimdown subdu subjug thinout trim trimback trimdown', 'abdom abid bear bel breadbasket brook digest putup stand stickout stomach support tol tum tummy vent', 'abdomin', 'abdominalcav stomach', 'abid bel brook end stand stomach support tum vent', 'abject blu bringdown brok crush deplet depress dispirit down downcast downheart downinthemou frown gloom grim humbl humy letdown lour low lowdown lowerber lowpitch lowspirit lowton mis scummy scurvy smal takedown turndown', 'ablaz conflagr enkindl fireup heat ignit inflam kindl red stirup wak', 'abomin atrocy dread pain terr unspeak', 'about almost approach approxim cheesep comenear comeon dear drawclos drawnear good goup most n

In [6]:
# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    print("TOKENIZED WORDS: ",pattern_words)
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)

# create train and test lists
train_x = list(training[:,0])
train_y = list(training[:,1])

TOKENIZED WORDS:  ['bowel diarrhe diarrhoe loos of the', 'cramp halt hamp iron musc spasm strangle', 'abdomin pain']
TOKENIZED WORDS:  ['eyelid lid palpebr', 'epiphor epistroph', 'conjunctivit pinkey']
TOKENIZED WORDS:  ['head tilt', 'incoordin', 'atax ataxy dyssynerg mot', 'trem', 'pares']
TOKENIZED WORDS:  ['pap', 'nod tuberc']
TOKENIZED WORDS:  ['defect judg', 'ach cephalalg concern headach vex worry', 'detery mem', 'down duty fag fatigu jad out outwear pal tir upon wear weary', 'difficul difficult troubl', 'insomn']
TOKENIZED WORDS:  ['barf be cast cat chuck disgorg emes honk puk purg regorg regurgit retch sick spew spu throw up upchuck vomit', 'ment stat', 'papilledem']
TOKENIZED WORDS:  ['acquit arc assoil clear complet discharg dismiss dismit dispatch drop eject elect emit empty exculp exhaust exon expel fir fre go lib must off outpo put releas run sack set spark unload vent waiv', 'memb pen phall', 'scrotal swel', 'ancestry blood bloodlin desc lin origin par pedigr proflig rak 

In [13]:
# reset underlying graph data
tf.reset_default_graph()
# Build neural network
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net)

# Define model and setup tensorboard
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Start training (apply gradient descent algorithm)
model.fit(train_x, train_y, n_epoch=10000, batch_size=8, show_metric=True)
model.save('model.tflearn')

Training Step: 79999  | total loss: [1m[32m0.04529[0m[0m | time: 0.010s
| Adam | epoch: 20000 | loss: 0.04529 - acc: 0.9973 -- iter: 24/25
Training Step: 80000  | total loss: [1m[32m0.04183[0m[0m | time: 0.014s
| Adam | epoch: 20000 | loss: 0.04183 - acc: 0.9976 -- iter: 25/25
--
INFO:tensorflow:C:\Users\Steven\Documents\HTN PROJECT 2\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [9]:
# save all of our data structures
import pickle
pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y, 'classes_details':classes_details}, open( "training_data", "wb" ) )