In [29]:
import sys, os, numpy, re, string
import cPickle
from collections import Counter
from os import listdir
from os.path import isfile, join

import gensim
import nltk
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction.stop_words import *

In [30]:
def gettext(text):
    """
    Strip the "enable JavaScript" banner from a page's text and keep only
    the part that precedes the first "References" marker.

    :param text: raw document text
    :return: the text with every occurrence of the banner removed, cut off
             at the first occurrence of "References" (returned whole when
             the marker is absent)
    """
    banner = "To use the sharing features on this page, please enable JavaScript."
    without_banner = text.replace(banner, '')
    body, _marker, _tail = without_banner.partition("References")
    return body

def tokenize_custom(text):
    """
    Split text into a list of lowercase word tokens (duplicates are kept).

    Punctuation, digits, tabs, carriage returns and newlines are replaced by
    spaces before tokenizing with nltk.word_tokenize; tokens shorter than 3
    characters or present in ENGLISH_STOP_WORDS are discarded.
    """
    noise_chars = re.escape(string.punctuation) + '0-9\\r\\t\\n'
    noise_pattern = re.compile('[' + noise_chars + ']')
    cleaned = noise_pattern.sub(' ', text.lower())

    kept = []
    for token in nltk.word_tokenize(cleaned):
        # Drop short tokens and stop words; everything else is kept in order
        if len(token) < 3 or token in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def stemwords(words):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.

    :param words: iterable of word strings
    :return: list of stems, in the same order as the input
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # Older NLTK releases expose the per-word entry point as `stem_word`;
    # newer ones only provide `stem`. Prefer the legacy method when it exists
    # so existing results are unchanged, and fall back to `stem` otherwise
    # instead of failing with AttributeError.
    stem_one = getattr(stemmer, 'stem_word', None) or stemmer.stem
    return [stem_one(word) for word in words]

In [31]:
def tokenizer_custom(text):
    """
    Tokenize text with tokenize_custom, then stem the resulting tokens with
    stemwords. Returns the list of stems.
    """
    tokens = tokenize_custom(text)
    return stemwords(tokens)

In [32]:
def filelist(root, skip_files ='resources'):
    """
    Walk `root` recursively and collect the path of every file found.

    Files whose name starts with '.' or contains the substring `skip_files`
    (case-sensitive match, 'resources' by default) are left out. Paths are
    returned in the order os.walk yields them.
    """
    return [
        os.path.join(folder, fname)
        for folder, _subdirs, fnames in os.walk(root)
        for fname in fnames
        if not (fname.startswith('.') or skip_files in fname)
    ]

In [33]:
# These are functions for loading and saving scipy sparse matrices

def save_sparse_csr(filename,array):
    """
    Store the CSR components of `array` (its data, indices, indptr and shape
    attributes) in a NumPy archive written by np.savez.

    Note that np.savez appends ".npz" to `filename` when it is a string that
    does not already end with that extension.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

def load_sparse_csr(filename):
    """
    Rebuild a scipy CSR matrix from an archive written by save_sparse_csr.

    :param filename: path of the archive (or an open file object accepted by
                     np.load). A string path may be given without the ".npz"
                     extension, i.e. exactly as it was passed to
                     save_sparse_csr.
    :return: scipy.sparse.csr_matrix built from the stored data, indices,
             indptr and shape arrays
    """
    path = filename
    # np.savez appends ".npz" when the name lacks it, so the name used for
    # saving may not exist on disk as-is; fall back to the suffixed name.
    if isinstance(path, str) and not path.endswith('.npz') and not os.path.exists(path):
        path = path + '.npz'

    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the arrays have been read into memory.
    with np.load(path) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=shape)

In [34]:
# This is a function to dump the vectorizer object to disk. We need it later to get disease matches.

def save_vectorizer(vectorizer, filename = "diseases_data/vectorizer.pk"):
    """
    Pickle `vectorizer` to `filename`, opened in binary write mode.

    pickle stores functions by reference, so any custom callables attached
    to the vectorizer must be defined under the same names when the file is
    loaded again.
    """
    with open(filename, 'wb') as pickle_file:
        cPickle.dump(vectorizer, pickle_file)
    

In [43]:
def save_disease_names(filename, filenames_list):
    """
    Write the entries of `filenames_list` to `filename`, one per line and in
    the given order. An existing file is overwritten.

    :param filename: path of the text file to create
    :param filenames_list: iterable of names; each is formatted with "%s"
    """
    # The with-block closes the file on exit, so no explicit close() is needed
    with open(filename, 'w') as out:
        out.writelines("%s\n" % disease for disease in filenames_list)

In [44]:
# filelist("diseases_data/curated_diseases/")

In [45]:
# Collect the corpus: every eligible file under the curated diseases directory
root_dir = "diseases_data/curated_diseases/"
filenames_list = filelist(root_dir)

# TF-IDF vectorizer that reads each document from its file path
tfidf = TfidfVectorizer(
    input='filename',            # fit/transform take a list of file paths
    decode_error='ignore',       # do not fail on undecodable bytes
    analyzer='word',
    preprocessor=gettext,        # drop the JavaScript banner and the references
    tokenizer=tokenizer_custom,  # tokenize, stem
    stop_words='english',        # strip out stop words
)

In [46]:
# Fit the vectorizer on the corpus files and build the TF-IDF document-term
# matrix; its rows follow the order of filenames_list.
doc_matrix = tfidf.fit_transform(filenames_list)

In [47]:
# Persist the fitted vectorizer to the default path (diseases_data/vectorizer.pk)
save_vectorizer(tfidf)

In [48]:
# Persist the TF-IDF matrix; np.savez appends ".npz" to the given name, so the
# file on disk is diseases_data/disease_matrix.npz
save_sparse_csr("diseases_data/disease_matrix", doc_matrix)

In [49]:
# Save the filenames in the same order that was passed to fit_transform, so
# that line i of the text file names the document behind row i of the matrix
save_disease_names("diseases_data/diseases.txt", filenames_list)