In [38]:
# Data handling and scikit-learn (model selection helpers and linear models).
import pandas as pd
import sklearn.model_selection as ms
import sklearn.linear_model as lm

# NLP tooling: NLTK tokenizer, the Lancaster stemmer, and the WordNet corpus.
import nltk
from nltk.stem.lancaster import LancasterStemmer
# Single shared stemmer instance used by the later cells.
stemmer = LancasterStemmer()
from nltk.corpus import wordnet
# NOTE(review): `syns` is not referenced again in the visible notebook;
# presumably a leftover WordNet experiment -- confirm before removing.
syns = wordnet.synsets("program")

# Numeric and standard-library helpers (none of these imports is Keras itself).
import numpy as np
import random
import re
import unicodedata
import sys

In [2]:
# medical conditions training data: one dict per condition read from the CSV
training_data = []

import csv

# The highest column index read below is 6, so a usable row needs 7 fields.
MIN_COLUMNS = 7

# Load every medical condition, with its associated info, from the CSV file.
with open('Disease Database NEW 2 CSV.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(csv_reader, None)
    skipped_rows = 0
    for row in csv_reader:
        # Blank or truncated rows would raise IndexError on row[6]; skip and
        # count them instead of aborting the whole load.
        if len(row) < MIN_COLUMNS:
            skipped_rows += 1
            continue
        training_data.append({
            "class": row[1],
            "synonyms": row[2],
            # Columns 3 and 6 are merged into one ';'-separated string.
            "sentence": row[3] + ';' + row[6],
            "treatments": row[4],
            "danger level": row[5],
        })

if skipped_rows:
    print("skipped %s malformed rows" % skipped_rows)

print ("%s total classes in training data" % len(training_data))

# Echo each loaded record as a single ' | '-separated line.
for term in training_data:
    print(" | ".join([term["class"], term["synonyms"], term["sentence"],
                      term["treatments"], term["danger level"]]))

14 total classes in training data
Common Cold | Upper Respiratory Tract;Nose and Throat Infection | Hi, I have a runny nose all the time and I don't know what to do about it. I was sniffing all the time in the past few days and it was getting very annoying;One of my nausal was blocked and I couldn't breath properly;My throat hurts after a cold shower last night, I am not sure what to do about it;It feels like there was something in my throat and I've been dry coughing for several days already;I recently developed a condition that requires frequent washroom visits througout the day;I am so tired this morning, every single one of my muscle hurts so much;All my joints are sour since last morning;I feel so cold all the time, I sneezed all the time;I am not so conformatble;I caught a cold;I am shiverring;My throat hurts;I an tired all the time and I don't want to move;Runny or stuffy nose;sore throat;cough;congestion;body aches;sneezing | Stay hydrated;rest;sooth a sore throat using saltwat

In [3]:
# method to slice strings
def slicer(my_str,sub):
    """Return the tail of ``my_str`` starting at the first occurrence of ``sub``.

    The returned slice includes ``sub`` itself.

    Raises:
        Exception: if ``sub`` does not occur in ``my_str``.
    """
    start = my_str.find(sub)
    # str.find signals "not found" with -1; bail out before slicing.
    if start == -1:
        raise Exception('Sub string not found!')
    return my_str[start:]

In [10]:
import string

# Corpus-level accumulators filled by the loop below.
words = []            # every token seen, later stemmed and de-duplicated
words_total = []
classes = []          # unique condition names
classes_details = []  # per-condition metadata, in first-seen order
documents = []        # (token list, condition name) pairs
ignore_words = ['?']

# loop through each description per symptom
for pattern in training_data:
    symptoms = pattern['sentence']
    # 'sentence' holds several ';'-separated phrases.
    word_list = symptoms.split(';')
    print("WORD_LIST:",word_list)
    # Normalise every phrase: trim surrounding whitespace and lower-case it.
    word_list_new = [phrase.strip().lower() for phrase in word_list]
    print("WORD_LIST_NEW:",word_list_new)
    joined_symps = ' '.join(word_list_new)
    print("joined_symps:",joined_symps)

    # tokenize each word in the description
    w = nltk.word_tokenize(joined_symps)
    # add to our words list
    words.extend(w)
    # add to documents in our corpus
    documents.append((w, pattern['class']))
    # add to our classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])
        classes_details.append({"class":pattern['class'], "synonyms":pattern['synonyms'],
                                "treatments":pattern['treatments'], "danger level":pattern['danger level']})

# Stem and lower each word, then remove duplicates.
# Tokens are decoded only when they are byte strings: Python 2 `str` is bytes
# and needs decoding, while Python 3 `str` has no .decode() and would raise
# AttributeError.
words = [stemmer.stem((tok.decode('utf-8') if isinstance(tok, bytes) else tok).lower())
         for tok in words if tok not in ignore_words]
words = sorted(set(words))

# Remove duplicates and sort.
# NOTE(review): `classes` is sorted here but `classes_details` keeps first-seen
# order, so the two lists are not index-aligned -- look details up by the
# "class" key rather than by position.
classes = sorted(set(classes))

print (len(documents), "documents", documents)
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)
print ("classes_details", classes_details)

('WORD_LIST:', ["Hi, I have a runny nose all the time and I don't know what to do about it. I was sniffing all the time in the past few days and it was getting very annoying", "One of my nausal was blocked and I couldn't breath properly", 'My throat hurts after a cold shower last night, I am not sure what to do about it', "It feels like there was something in my throat and I've been dry coughing for several days already", 'I recently developed a condition that requires frequent washroom visits througout the day', 'I am so tired this morning, every single one of my muscle hurts so much', 'All my joints are sour since last morning', 'I feel so cold all the time, I sneezed all the time', 'I am not so conformatble', 'I caught a cold', 'I am shiverring', 'My throat hurts', "I an tired all the time and I don't want to move", 'Runny or stuffy nose', 'sore throat', 'cough', 'congestion', 'body aches', 'sneezing'])
('WORD_LIST_NEW:', ["hi, i have a runny nose all the time and i don't know what 

In [48]:
# create our training data: one [bag-of-words, one-hot label] pair per document
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    print("TOKENIZED WORDS: ",pattern_words)
    # Stem each word. Decode only byte strings: Python 3 `str` has no
    # .decode(), so an unconditional decode raises AttributeError there.
    pattern_words = [
        stemmer.stem((word.decode('utf-8') if isinstance(word, bytes) else word).lower())
        for word in pattern_words
    ]
    # Set membership keeps the vocabulary scan linear instead of quadratic.
    pattern_stems = set(pattern_words)
    # Bag of words: 1 where the vocabulary stem occurs in this document.
    bag = [1 if w in pattern_stems else 0 for w in words]
    print("pattern_words:",pattern_words)
    print("BAG: ",bag)

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# Shuffle the (features, label) pairs together so they stay aligned.
# `training` is deliberately left as a list: bag and output_row have different
# lengths, and np.array() on such ragged pairs raises ValueError on current
# NumPy releases.
random.shuffle(training)

# Feature matrix: one row per document, one column per vocabulary stem.
train_x = np.array([features for features, _label_row in training])
print("TRAIN_X:",train_x)
print("X SHAPE:",train_x.shape)
# Labels are read from the shuffled one-hot rows, so train_y[i] always belongs
# to train_x[i]. A fixed [0, 1, ..., 13] vector would be mismatched with the
# shuffled feature rows. Each label is the index of the document's class in
# the sorted `classes` list.
train_y = np.array([label_row.index(1) for _features, label_row in training])
print("TRAIN_y:",train_y)
print("Y SHAPE:",train_y.shape)

# NOTE(review): `X` is a random scratch matrix that no visible cell reads;
# presumably leftover debugging -- confirm before removing.
X = np.random.randint(5, size=(6, 100))
print("X:",X)

('TOKENIZED WORDS: ', ['hi', ',', 'i', 'have', 'a', 'runny', 'nose', 'all', 'the', 'time', 'and', 'i', 'do', "n't", 'know', 'what', 'to', 'do', 'about', 'it', '.', 'i', 'was', 'sniffing', 'all', 'the', 'time', 'in', 'the', 'past', 'few', 'days', 'and', 'it', 'was', 'getting', 'very', 'annoying', 'one', 'of', 'my', 'nausal', 'was', 'blocked', 'and', 'i', 'could', "n't", 'breath', 'properly', 'my', 'throat', 'hurts', 'after', 'a', 'cold', 'shower', 'last', 'night', ',', 'i', 'am', 'not', 'sure', 'what', 'to', 'do', 'about', 'it', 'it', 'feels', 'like', 'there', 'was', 'something', 'in', 'my', 'throat', 'and', 'i', "'ve", 'been', 'dry', 'coughing', 'for', 'several', 'days', 'already', 'i', 'recently', 'developed', 'a', 'condition', 'that', 'requires', 'frequent', 'washroom', 'visits', 'througout', 'the', 'day', 'i', 'am', 'so', 'tired', 'this', 'morning', ',', 'every', 'single', 'one', 'of', 'my', 'muscle', 'hurts', 'so', 'much', 'all', 'my', 'joints', 'are', 'sour', 'since', 'last', 'mor

In [50]:
# Alternative classifier, currently disabled.
#from sklearn.naive_bayes import MultinomialNB

# Fit a logistic-regression classifier on the bag-of-words matrix (train_x)
# and the label vector (train_y) produced by the previous cell.
model = lm.LogisticRegression().fit(train_x,train_y)
#clf = lm.LogisticRegression(train_x,train_y)
#clf = MultinomialNB().fit(train_x, train_y)

# coremltools is only used for the export step below.
import coremltools

# Names handed to the converter for the model's input and output features.
input_features = "Symptoms"
output_feature = "Names"

# Convert the fitted scikit-learn model to Core ML and write it to disk.
coreml_model = coremltools.converters.sklearn.convert(model, input_features, output_feature)
coreml_model.save("Medicine.mlmodel")

# def clean_up_sentence(sentence):
#     #strip punctuations, numbers, and words containing numbers
#     no_nums1 = ' '.join(s for s in sentence.split() if not any(c.isdigit() for c in s))
#     no_nums2 = re.sub(r'\d+', '', no_nums1)
# #     keywords = extract_keywords(no_nums2)
# #     keyword_str = ' '.join(s for s in keywords)
#     # tokenize the pattern
#     sentence_words = nltk.word_tokenize(no_nums2)
#     # stem each word
#     sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
#     print("sentence after stemming: ", sentence_words)
#     return sentence_words

# # return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
# def bow(sentence, words, show_details=False):
#     # tokenize the pattern
#     sentence_words = clean_up_sentence(sentence)
#     # bag of words
#     bag = [0]*len(words)  
#     for s in sentence_words:
#         for i,w in enumerate(words):
#             if w == s: 
#                 bag[i] = 1
#                 if show_details:
#                     print ("found in bag: %s" % w)

#     return(np.array(bag))

# docs_new = 'runny nose and sneezing last night along with coughing'
# X_new_counts = clean_up_sentence(docs_new)
# X_new_tfidf = bow(X_new_counts,words,show_details=False)

# predicted = clf.predict(X_new_tfidf)

# print('%r => %s' % (docs_new, twenty_train.target_names[category]))

In [None]:
# med = pd.read_csv("Disease Database NEW 3 CSV.csv", usecols=[1, 3])
# med.head()

In [None]:
# X, y = med.iloc[:, -1], med.iloc[:, :-1]
# print("X",X)
# print("Y",y)

In [None]:
# X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# regr = lm.LinearRegression()  # 1
# regr.fit(X_train, y_train)    # 2
# regr.score(X_test, y_test)    # 3