In [131]:
from sklearn.feature_extraction.text import *
import numpy as np
from scipy.sparse import *
import cPickle
from sklearn.metrics.pairwise import linear_kernel
import re, string
import nltk
from sklearn.feature_extraction.stop_words import *
from collections import Counter

In [132]:
# Re-define (rather than import) the same helper functions that were used when the vectorizer was fitted, so that queries go through identical preprocessing.

def gettext(text):
    """
    Strip the "enable JavaScript" sharing notice from ``text`` and return
    only the portion that precedes the first occurrence of "References".
    If "References" does not occur, the whole cleaned text is returned.
    """
    javascript_notice = "To use the sharing features on this page, please enable JavaScript."
    without_notice = text.replace(javascript_notice, '')
    # Everything from the first "References" marker onwards is discarded.
    body, _, _ = without_notice.partition("References")
    return body

def tokenize_custom(text):
    """
    Split ``text`` into a list of word tokens (duplicates are kept).

    The text is lower-cased; punctuation, digits, carriage returns, tabs
    and newlines are each replaced by a space; the result is tokenized
    with nltk.word_tokenize. Tokens shorter than 3 characters and tokens
    found in ENGLISH_STOP_WORDS are dropped.
    """
    junk_chars = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n' + ']')
    cleaned = junk_chars.sub(' ', text.lower())

    kept = []
    for token in nltk.word_tokenize(cleaned):
        # Skip very short tokens and stop words.
        if len(token) < 3 or token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.

    Uses the stemmer's ``stem_word`` method when it exists (the method
    this code was originally written against) and falls back to ``stem``
    on NLTK releases that no longer provide ``stem_word``, instead of
    failing with an AttributeError.

    NOTE(review): stems produced through the ``stem`` fallback may differ
    slightly from those produced by ``stem_word``; if the vectorizer was
    fitted with the older method, confirm the vocabulary still matches.
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # Prefer the legacy per-word API for identical results on old NLTK.
    stem_one = getattr(stemmer, 'stem_word', None) or stemmer.stem
    return [stem_one(word) for word in words]

def tokenizer_custom(text):
    """
    Full preprocessing pipeline for one document: tokenize ``text`` with
    tokenize_custom, then stem every resulting token with stemwords.
    Returns the list of stemmed tokens.
    """
    tokens = tokenize_custom(text)
    return stemwords(tokens)

In [133]:
# The query is written to a text file because disease_match passes a file path (not the query string) to the vectorizer's transform();
# presumably the vectorizer was fitted with input='filename' -- confirm.

def write_query(query, file_dest = "user_query.txt"):
    """
    Write the user's query text to a file, replacing any previous content.

    Parameters
    ----------
    query : str
        The query text to store.
    file_dest : str
        Path of the file to (over)write. Defaults to "user_query.txt".
    """
    # The with-block closes the file even if write() raises.
    with open(file_dest, 'w') as f:
        f.write(query)

In [134]:
def load_vectorizer(filename):
    """
    Load and return the pickled object stored in ``filename``.

    Parameters
    ----------
    filename : str
        Path to a pickle file (opened in binary mode).

    Returns
    -------
    The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    # The with-block closes the file even if unpickling fails.
    with open(filename, 'rb') as fp:
        return cPickle.load(fp)

In [135]:
def load_sparse_csr(filename):
    """
    Rebuild a CSR sparse matrix from a NumPy archive.

    The archive at ``filename`` must contain the entries 'data',
    'indices', 'indptr' and 'shape'; they are passed to csr_matrix as
    the (data, indices, indptr) triple and the matrix shape.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    # np.load keeps the archive file open until it is closed; the
    # with-block releases it once the arrays have been read.
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape = loader['shape'])

In [136]:
def load_disease_names(filename):
    """
    Read ``filename`` and return its lines as a list of strings with
    leading/trailing whitespace (including the newline) stripped.

    Lines are returned in file order and none are filtered out, so a
    blank line yields an empty string. disease_match indexes this list
    by position, so the order must be preserved.
    """
    # The with-block closes the file even if reading fails.
    with open(filename, 'r') as f:
        return [line.strip() for line in f]

In [137]:
def disease_match(query_file = "user_query.txt",
                  vectorizer_loc = "diseases_data/vectorizer.pk",
                  disease_matrix_loc = "diseases_data/disease_matrix.npz",
                  disease_names_loc = "diseases_data/diseases.txt",
                  top_n = 5):
    """
    Rank the stored disease documents by similarity to the user's query.

    Parameters
    ----------
    query_file : str
        Path of the file holding the query (see write_query).
    vectorizer_loc : str
        Path of the pickled vectorizer.
    disease_matrix_loc : str
        Path of the .npz archive holding the document matrix in CSR form.
    disease_names_loc : str
        Path of the text file with one document name per line; line i is
        used as the name of matrix row i.
    top_n : int
        Number of best matches to return (default 5, the previous
        hard-coded value). Fewer are returned if fewer documents exist.

    Returns
    -------
    list of (name, score) tuples, highest score first.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    diseases = load_disease_names(disease_names_loc)
    tfidf = load_vectorizer(vectorizer_loc)
    disease_matrix = load_sparse_csr(disease_matrix_loc)

    # The file *path* is handed to transform(), not the query text;
    # presumably the vectorizer was fitted with input='filename' -- confirm.
    query_vector = tfidf.transform([query_file])
    # linear_kernel is a plain dot product; it equals cosine similarity only
    # if both sides are L2-normalised -- assumed here, TODO confirm.
    cosine_similarities = linear_kernel(query_vector, disease_matrix).flatten()

    # argsort is ascending, so reverse it and keep the first top_n indices.
    related_docs_indices = cosine_similarities.argsort()[::-1][:top_n]

    return [(diseases[idx], cosine_similarities[idx]) for idx in related_docs_indices]

    
    

In [142]:
# Test query: joint symptoms. The query is stored in user_query.txt and then matched.
write_query("back pain joint swelling", file_dest="user_query.txt")
disease_match(query_file="user_query.txt")

[('Joint swelling.txt', 0.80407964952685362),
 ('Joint pain.txt', 0.68293841285188439),
 ('Osteoarthritis.txt', 0.61580092291340971),
 ('Synovial fluid analysis.txt', 0.55550425565384876),
 ('Hypermobile joints.txt', 0.516365922606973)]

In [143]:
# Test query: free-text description of mite bites. The query is stored in user_query.txt and then matched.
write_query("I think mites have bitten me", file_dest="user_query.txt")
disease_match(query_file="user_query.txt")

[('Scabies.txt', 0.22795127755014302),
 ('Rickettsialpox.txt', 0.16896717298726763),
 ('Chiggers.txt', 0.11327264032014427),
 ('Allergen.txt', 0.085123588268646841),
 ('Yellow fever.txt', 0.076080354181726442)]

In [144]:
# Test query: cold and cough symptoms. The query is stored in user_query.txt and then matched.
write_query("I have a cold and cough", file_dest="user_query.txt")
disease_match(query_file="user_query.txt")

[('Common cold.txt', 0.62659839161061781),
 ('Cough.txt', 0.53201503192351374),
 ('Coughing up blood.txt', 0.44308745531623112),
 ('Cold intolerance.txt', 0.43211978373513765),
 ('Vitamin C and colds.txt', 0.29329506399237626)]

In [145]:
# Test query: fatigue and pale skin, written as two sentences. The query is stored in user_query.txt and then matched.
write_query("I feel fatigued. My skin is pale and looks white", file_dest="user_query.txt")
disease_match(query_file="user_query.txt")

[('Paleness.txt', 0.53892340505888825),
 ('Fatigue.txt', 0.33874794670379049),
 ('Pupil - white spots.txt', 0.24364046824057434),
 ('Felty syndrome.txt', 0.15570268288106281),
 ('Skin care and incontinence.txt', 0.14670889378321389)]

In [146]:
# Test query: a named condition rather than symptoms. The query is stored in user_query.txt and then matched.
write_query("I have autoimmune hemolytic anemia", file_dest="user_query.txt")
disease_match(query_file="user_query.txt")

[('Hemolytic anemia.txt', 0.48362230206912082),
 ('Immune hemolytic anemia.txt', 0.47096286216783023),
 ('Drug-induced immune hemolytic anemia.txt', 0.44732235035296464),
 ('Hemolytic anemia caused by chemicals and toxins.txt', 0.40190572954635301),
 ('Anemia of chronic disease.txt', 0.37709923065132905)]

In [147]:
# Test query: a broad disease term. The query is stored in user_query.txt and then matched.
write_query("I need to get treated for cancer", file_dest="user_query.txt")
disease_match(query_file="user_query.txt")

[('Cancer.txt', 0.66312498889843174),
 ('Lung cancer.txt', 0.47391585043495138),
 ('Colon cancer.txt', 0.43705885884448636),
 ('Vaginal cancer.txt', 0.43614152511923837),
 ('Breast cancer in men.txt', 0.43512193887205863)]