# Symptom Identifier
This document contains exploration to build the NLP model to determine probability or similarity of how much given text indicates each symptom 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import joblib

from gensim.models import Word2Vec
import gensim.downloader
from scipy.spatial.distance import cosine

# Extracting Symptoms
Now I will write some simple nlp steps to extract symptoms from descriptions. Until we collect enough text data, we will rely on the vector space provided by Spacy  to determine how closely a description is related to the each symptoms. We'll also see if the public domain medical transcription data will create a better vector space to identify the symptoms from descriptions.

In [2]:
# importing symptoms
symptoms = pd.read_json('data/symptoms.json', orient = 'table')

### Preprocessing
first, preprocessing steps for text input

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import bigrams
from nltk.stem import WordNetLemmatizer

In [None]:
def remove_punctuations(text, punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~�0123456789'):
    ''' remove punctuations '''
    table_ = str.maketrans('', '', punctuations)
    return text.translate(table_)

def ascii_only(text):
    ''' remove non-ascii words '''
    return text.encode("ascii", "ignore").decode()

def lemmatize(word):
    ''' lemmatize text'''
    wnl = WordNetLemmatizer()
    return wnl.lemmatize(word)

#### Spelling
To increase the accuracy, we will apply an autocorrect model.

In [None]:
def autocorrect(text):
    ''' to-do '''
    pass

In [3]:



def preprocess(text, sw = ['i', 'me', 'my', 'myself', 'we', 'our',
                         'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
                         'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
                         "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
                         'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
                         'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
                         'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
                         'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
                         'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
                         'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
                         'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
                         'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
                         'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
                         's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
                         'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
                         'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                         "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
                         'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
                         "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'felt', 'feel', 'feels']):
    '''
    Takes a text as an input
    Preprocess (remove punctuations, turn lower case, lemmatize, remove stop words)
    Return a nested array with a tokens per sentence
    '''
    if isinstance(text, str):
        text = ascii_only(text.lower())

        text_tokens = []
        for sentence in sent_tokenize(text): 
            sentence = remove_punctuations(sentence)
            tokens = word_tokenize(sentence)
            text_tokens.append([lemmatize(word) for word in tokens if word not in sw])
        return text_tokens
    else: 
        return 'no input'
    

### Word2Vec training
Below is the method to create a vector space by training with word2vec. But for the prototype we will use the pretrained model. But once we obtain enough text data, we can retrain the vector space with more targeted language.

In [12]:
# for now we will use GloVe 
# for initial run 
model = gensim.downloader.load('glove-wiki-gigaword-300')

In [4]:
#model.save("model/word2vec.model")
model = Word2Vec.load("model/word2vec.model")

AttributeError: Can't get attribute 'Word2VecKeyedVectors' on <module 'gensim.models.deprecated.keyedvectors' from '/opt/anaconda3/lib/python3.7/site-packages/gensim/models/deprecated/keyedvectors.py'>

In [738]:
model.init_sims(replace=True) # normalize if we need to retrain, remove replace

In [11]:
model.save("model/word2vec_norm.model")
#model = Word2Vec.load("model/word2vec_norm.model")

AttributeError: Can't get attribute 'Word2VecKeyedVectors' on <module 'gensim.models.deprecated.keyedvectors' from '/opt/anaconda3/lib/python3.7/site-packages/gensim/models/deprecated/keyedvectors.py'>

We now have a vector space. 
For now, we'll use average vector of sentence to estimate. But in the future, we will turn this into doc2vec and check the sentence similarity instead of word similarity to increase accuracy.

In [807]:
def get_avg_vectors(text, model):
    if isinstance(text, str):
        text_input = preprocess(text)
    else: 
        text_input = text.copy()
    avg_vec = []
    for sentence in text_input:
        vectors = []
        for word in sentence:
            try:
                vectors.append(model[word])
            except KeyError:
                print(f'{word} not exists')
                pass
        avg = np.average(vectors, axis = 0)
        avg_vec.append(avg)
    return avg_vec

In [773]:
symptoms = keys.symptom.values

In [774]:
symptom_vectors = get_avg_vectors('. '.join(symptoms), model)

neurofibroma not exists
neurofibroma not exists
neurofibroma not exists
neurofibroma not exists
mpnst not exists
moyamoya not exists


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [775]:
symptom_vectors = dict(zip(symptoms, symptom_vectors))

In [776]:
joblib.dump(symptom_vectors, 'symptom_vectors.pkl')

['symptom_vectors.pkl']

In [854]:
# for each sentence, see how close they are to target symptoms

def identify_symptom(text, symptom_vectors, model, threshold = 0.5):
    '''
    Find the closest symptom per sentences
    '''
    avg_vec = get_avg_vectors(text, model)
    pred_symptoms = {}

    for sent_vec in avg_vec: 
        
        # for each sentence
        max_ = threshold

        for symptom, sym_vec in symptom_vectors.items():
            similarity =  1 - cosine(sent_vec, sym_vec)
            if similarity > max_:

                max_ = similarity
                max_symptom = symptom
        if max_ > threshold:
            try: 
                # if symptom already exists, update if similarity is higher
                if max_ > pred_symptoms[max_symptom] : 
                    pred_symptoms[max_symptom] = max_

            except: 
                # add symptom if it does not exist
                pred_symptoms[max_symptom] = max_
    return [k for k, v in sorted(pred_symptoms.items(), key = lambda item: item[1])]        

In [860]:
text = "I have a high fever and headache. Things look a little strange. Pain on the left side of the body. Also felt a bit of numbness. It seems like I had hard time moving my limbs"
result = identify_symptom(text, symptom_vectors, model)

In [936]:
keys.shape

(65, 3)

In [926]:
def return_symptom_id(symptom_list, keys):
    return [keys[keys.symptom == x].index[0] for x in symptom_list]

In [927]:
# return symptom_id for this
result_symptom_id = return_symptom_id(result, keys)
result_symptom_id

[12, 25, 24, 21, 58]

For the next iteration, we can present users with the select list of symptoms so they can provide a feedback as to how accurate our model is, then retrain based on their answer.

# Testing
Testing using medical transcription data

In [527]:
mt = pd.read_csv('data/medical_transcription_samples.csv', index_col = 0)

In [537]:
mt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 0 to 4998
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   description        4999 non-null   object
 1   medical_specialty  4999 non-null   object
 2   sample_name        4999 non-null   object
 3   transcription      4966 non-null   object
 4   keywords           3931 non-null   object
dtypes: object(5)
memory usage: 234.3+ KB


In [6]:
# preprocessing all transcriptions
text_input = [preprocess(x) for x in mt.transcription]

NameError: name 'mt' is not defined

In [590]:
from itertools import chain
# unnesting once
text_input = list(chain(*text_input))

In [591]:
len(text_input)

140476

In [None]:
# this is when we train our own vector space using text_input preprocessed above
#model = Word2Vec(sentences = text_input)