In [21]:
import cPickle as pickle

import scipy
import numpy as np
import pandas as pd
import re

# Load English stopwords from NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, RegexpTokenizer, WhitespaceTokenizer
import nltk.data

from gensim.models import word2vec
import logging

import os

# Printed last so the cell output confirms that every import above succeeded.
print ("Required Libraries loaded.")

Required Libraries loaded.


In [22]:
# Text-cleaning helper: turns raw article text into a list of lowercase word tokens

def clean_text(text, stem_words = False, remove_stopwords = False):
    """
    text - the string to clean
    stem_words - boolean, whether to reduce each token to its Porter stem
    remove_stopwords - boolean, whether to drop the NLTK English stop words

    clean_text: Function to normalize a piece of text into word tokens. Every
    character that is not an ASCII letter is replaced by a space, the result
    is lowercased and then split on whitespace.
    returns: a list of tokens (always a list, possibly empty)
    """
    # Anything outside a-z / A-Z (digits, punctuation, accents) becomes a separator.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    clean_corpus = letters_only.lower().split()

    if remove_stopwords:
        # Filter BEFORE stemming: the stop list holds surface forms ("this",
        # "was"), which would no longer match once stemmed ("thi", "wa").
        stops = set(stopwords.words("english"))
        clean_corpus = [w for w in clean_corpus if w not in stops]

    if stem_words:
        # Apply the stemmer to the tokens themselves; the result must stay a
        # list so callers always receive the same return type.
        porter = nltk.PorterStemmer()
        clean_corpus = [porter.stem(w) for w in clean_corpus]

    return clean_corpus

In [23]:
# Use the punkt tokenizer for sentence splitting

# Load the punkt tokenizer once at module level. create_sentences() reads this
# global and passes it on to article_to_sentences().
# NOTE(review): presumably requires the NLTK 'punkt' resource to be installed
# locally beforehand - confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def article_to_sentences(article, tokenizer, stem_words=False, remove_stopwords=False):
    """
    article - article to use as input to create the wordlist; either a UTF-8
              encoded byte string or an already-decoded text string
    tokenizer - the tokenizer to use to split into sentences (must provide a
                tokenize(text) method returning the sentences)
    stem_words - boolean, whether to use the stemmer function or not
    remove_stopwords - boolean, whether to remove the stopwords function or not

    article_to_sentences: Function to convert a document to a sequence of sentences,
    optionally stemming the words and removing stop words.
    returns: a sequence of sentences where each sentence is itself a sequence of words
    """
    # Decode only byte strings. Calling .decode() on text fails outright on
    # Python 3 and triggers an implicit ASCII encode on Python 2 unicode,
    # which raises as soon as the article contains a non-ASCII character.
    if isinstance(article, bytes):
        article = article.decode('utf-8')

    raw_sentences = tokenizer.tokenize(article.strip())   # Punkt tokenize into sentences
    sentences = []   # list of sentences, each one a list of words
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call clean_text to get a list of words
            sentences.append(clean_text(raw_sentence, stem_words, remove_stopwords))

    # returns a list of lists: list of sentences composed of lists of words
    return sentences

In [24]:
from __future__ import unicode_literals
def create_sentences(corpus, name):
    """
    corpus - iterable of articles, each one handed to article_to_sentences
    name - label of the data set, used only in the progress message

    create_sentences: Function to split every article of the corpus into
    sentences using the module-level punkt tokenizer and collect them all
    into one flat list.
    returns: a list of sentences where each sentence is itself a list of words
    """
    print ("Begin sentences creation %s..." % name)

    print ("Parsing sentences from training set")
    # Flatten: one entry per sentence, in article order.
    all_sentences = [
        sentence
        for document in corpus
        for sentence in article_to_sentences(document, tokenizer)
    ]
    print ("Finished data loading and creating sentences")
    return all_sentences

In [25]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
def train_save_model(sentences, filename, output_dir="C:/EPC_Data/complete_data/word2vec"):
    """
    sentences - list of sentences (each a list of words) to train on
    filename - name of the data set; selects the vector dimensionality and is
               used as the prefix of the saved model's name
    output_dir - folder the trained model is written to

    train_save_model: Function to train a Word2Vec model on the sentences and
    save it as "<filename>_chi2_features_10minwords_15context" in output_dir.
    returns: None
    raises: ValueError if no dimensionality is configured for filename
    """
    # Word vector dimensionality per data set.
    # NOTE(review): presumably the number of chi-squared-selected features for
    # each data set (cf. the "chi2_features" model name) - confirm.
    feature_counts = {
        "ACEInhibitors": 210,
        "ADHD": 80,
        "Antihistamines": 29,
        "AtypicalAntipsychotics": 381,
        "BetaBlockers": 194,
        "CalciumChannelBlockers": 329,
        "Estrogens": 233,
        "NSAIDS": 242,
        "Opiods": 55,
        "OralHypoglycemics": 234,
        "ProtonPumpInhibitors": 206,
        "SkeletalMuscleRelaxants": 11,
        "Statins": 467,
        "Triptans": 121,
        "UrinaryIncontinence": 215,
    }
    # Fail early with a clear message instead of hitting an unbound
    # num_features at the Word2Vec call.
    if filename not in feature_counts:
        raise ValueError(
            "No word vector dimensionality configured for data set %r; known data sets: %s"
            % (filename, ", ".join(sorted(feature_counts))))
    num_features = feature_counts[filename]

    # Configure logging so that Word2Vec reports its progress
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

    # Set values for various parameters
    min_word_count = 10   # Minimum word count
    num_workers = 8       # Number of threads to run in parallel
    context = 15          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print ("Training model...")
    model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count,
                              window = context, sample = downsampling)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # Save the model under a meaningful name for later use.
    # You can load it later using Word2Vec.load()
    model_name = filename + "_chi2_features_10minwords_15context"
    print ("Saving the model: %s...." % model_name)
    model.save(os.path.join(output_dir, model_name))

    # The % formatting must sit inside the parentheses: outside them it only
    # works as a Python 2 print statement and fails when print is a function.
    print ("Finished Model training and saving for %s..." % filename)

In [26]:
# Iterate through the raw-data directory: load each file, use its TiAbsMesh column as the corpus, and train a model named after the file
from __future__ import unicode_literals
# Folder holding one comma-separated file per data set; the part of the file
# name before the extension is the data set name passed to train_save_model.
path = "C:/EPC_Data/complete_data/raw"
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
for entry in sorted(os.listdir(path)):
    current_file = os.path.join(path, entry)
    # Skip sub-directories; only regular files can be read as data sets.
    if not os.path.isfile(current_file):
        continue
    # splitext copes with names containing several dots or none at all, where
    # unpacking entry.split(".") into two names raises ValueError.
    filename, _ = os.path.splitext(entry)

    file_obj = pd.read_csv(current_file, sep=",", index_col='PMID')
    # Missing texts are read as NaN (a float), which cannot be tokenized.
    corpus = file_obj.TiAbsMesh.dropna()

    train_save_model(create_sentences(corpus, filename), filename)

Begin sentences creation ACEInhibitors...
Parsing sentences from training set


2017-04-03 15:55:54,059 : INFO : collecting all words and their counts
2017-04-03 15:55:54,059 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:55:54,101 : INFO : PROGRESS: at sentence #10000, processed 218649 words, keeping 7797 word types
2017-04-03 15:55:54,142 : INFO : PROGRESS: at sentence #20000, processed 436426 words, keeping 10410 word types
2017-04-03 15:55:54,180 : INFO : collected 11827 word types from a corpus of 617799 raw words and 28186 sentences
2017-04-03 15:55:54,180 : INFO : Loading a fresh vocabulary
2017-04-03 15:55:54,237 : INFO : min_count=10 retains 3441 unique words (29% of original 11827, drops 8386)


Finished data loading and creating sentences
Training model...


2017-04-03 15:55:54,253 : INFO : min_count=10 leaves 595078 word corpus (96% of original 617799, drops 22721)
2017-04-03 15:55:54,253 : INFO : deleting the raw counts dictionary of 11827 items
2017-04-03 15:55:54,253 : INFO : sample=0.001 downsamples 58 most-common words
2017-04-03 15:55:54,253 : INFO : downsampling leaves estimated 452196 word corpus (76.0% of prior 595078)
2017-04-03 15:55:54,269 : INFO : estimated required memory for 3441 words and 210 dimensions: 7501380 bytes
2017-04-03 15:55:54,269 : INFO : resetting layer weights
2017-04-03 15:55:54,328 : INFO : training model with 8 workers on 3441 vocabulary and 210 features, using sg=0 hs=0 sample=0.001 negative=5 window=15
2017-04-03 15:55:54,329 : INFO : expecting 28186 sentences, matching count from corpus used for vocabulary survey
2017-04-03 15:55:55,332 : INFO : PROGRESS: at 71.10% examples, 1607488 words/s, in_qsize 0, out_qsize 1
2017-04-03 15:55:55,765 : INFO : worker thread finished; awaiting finish of 7 more thread

Saving the model: ACEInhibitors_chi2_features_10minwords_15context....
Finished Model training and saving for ACEInhibitors...
Begin sentences creation ADHD...
Parsing sentences from training set


2017-04-03 15:55:56,510 : INFO : collecting all words and their counts
2017-04-03 15:55:56,510 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:55:56,556 : INFO : collected 7801 word types from a corpus of 206969 raw words and 9917 sentences
2017-04-03 15:55:56,559 : INFO : Loading a fresh vocabulary
2017-04-03 15:55:56,572 : INFO : min_count=10 retains 2034 unique words (26% of original 7801, drops 5767)
2017-04-03 15:55:56,572 : INFO : min_count=10 leaves 191428 word corpus (92% of original 206969, drops 15541)
2017-04-03 15:55:56,582 : INFO : deleting the raw counts dictionary of 7801 items
2017-04-03 15:55:56,584 : INFO : sample=0.001 downsamples 61 most-common words
2017-04-03 15:55:56,585 : INFO : downsampling leaves estimated 146242 word corpus (76.4% of prior 191428)
2017-04-03 15:55:56,586 : INFO : estimated required memory for 2034 words and 80 dimensions: 2318760 bytes
2017-04-03 15:55:56,598 : INFO : resetting layer weights
2017-04-03

Finished data loading and creating sentences
Training model...


2017-04-03 15:55:56,990 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:55:56,990 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:55:56,990 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:55:56,990 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:55:56,990 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:55:56,990 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:55:57,006 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:55:57,006 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:55:57,006 : INFO : training on 1034845 raw words (730838 effective words) took 0.4s, 1984309 effective words/s
2017-04-03 15:55:57,006 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:55:57,023 : INFO : saving Word2Vec object under C:/EPC_

Saving the model: ADHD_chi2_features_10minwords_15context....
Finished Model training and saving for ADHD...
Begin sentences creation Antihistamines...
Parsing sentences from training set


2017-04-03 15:55:57,282 : INFO : collecting all words and their counts
2017-04-03 15:55:57,283 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:55:57,301 : INFO : collected 4986 word types from a corpus of 73593 raw words and 3562 sentences
2017-04-03 15:55:57,302 : INFO : Loading a fresh vocabulary
2017-04-03 15:55:57,308 : INFO : min_count=10 retains 985 unique words (19% of original 4986, drops 4001)
2017-04-03 15:55:57,309 : INFO : min_count=10 leaves 63553 word corpus (86% of original 73593, drops 10040)
2017-04-03 15:55:57,316 : INFO : deleting the raw counts dictionary of 4986 items
2017-04-03 15:55:57,319 : INFO : sample=0.001 downsamples 71 most-common words
2017-04-03 15:55:57,322 : INFO : downsampling leaves estimated 44789 word corpus (70.5% of prior 63553)
2017-04-03 15:55:57,323 : INFO : estimated required memory for 985 words and 29 dimensions: 721020 bytes
2017-04-03 15:55:57,328 : INFO : resetting layer weights
2017-04-03 15:55:5

Finished data loading and creating sentences
Training model...


2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:55:57,500 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:55:57,500 : INFO : training on 367965 raw words (223699 effective words) took 0.2s, 1464722 effective words/s
2017-04-03 15:55:57,519 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:55:57,526 : INFO : saving Word2Vec object under C:/EPC_D

Saving the model: Antihistamines_chi2_features_10minwords_15context....
Finished Model training and saving for Antihistamines...
Begin sentences creation AtypicalAntipsychotics...
Parsing sentences from training set


2017-04-03 15:55:58,292 : INFO : collecting all words and their counts
2017-04-03 15:55:58,292 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:55:58,328 : INFO : PROGRESS: at sentence #10000, processed 208348 words, keeping 7571 word types
2017-04-03 15:55:58,338 : INFO : collected 8154 word types from a corpus of 254082 raw words and 12183 sentences
2017-04-03 15:55:58,339 : INFO : Loading a fresh vocabulary
2017-04-03 15:55:58,348 : INFO : min_count=10 retains 2124 unique words (26% of original 8154, drops 6030)
2017-04-03 15:55:58,349 : INFO : min_count=10 leaves 237966 word corpus (93% of original 254082, drops 16116)
2017-04-03 15:55:58,358 : INFO : deleting the raw counts dictionary of 8154 items
2017-04-03 15:55:58,358 : INFO : sample=0.001 downsamples 62 most-common words
2017-04-03 15:55:58,359 : INFO : downsampling leaves estimated 177181 word corpus (74.5% of prior 237966)
2017-04-03 15:55:58,361 : INFO : estimated required memory for

Finished data loading and creating sentences
Training model...


2017-04-03 15:55:59,075 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:55:59,075 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:55:59,075 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:55:59,075 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:55:59,092 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:55:59,092 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:55:59,092 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:55:59,092 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:55:59,092 : INFO : training on 1270410 raw words (885714 effective words) took 0.7s, 1278276 effective words/s
2017-04-03 15:55:59,092 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:55:59,117 : INFO : saving Word2Vec object under C:/EPC_

Saving the model: AtypicalAntipsychotics_chi2_features_10minwords_15context....
Finished Model training and saving for AtypicalAntipsychotics...
Begin sentences creation BetaBlockers...
Parsing sentences from training set


2017-04-03 15:56:00,688 : INFO : collecting all words and their counts
2017-04-03 15:56:00,688 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:00,733 : INFO : PROGRESS: at sentence #10000, processed 219997 words, keeping 7887 word types
2017-04-03 15:56:00,769 : INFO : PROGRESS: at sentence #20000, processed 434892 words, keeping 10653 word types
2017-04-03 15:56:00,786 : INFO : collected 11342 word types from a corpus of 514859 raw words and 23631 sentences
2017-04-03 15:56:00,786 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:00,802 : INFO : min_count=10 retains 3318 unique words (29% of original 11342, drops 8024)
2017-04-03 15:56:00,802 : INFO : min_count=10 leaves 493241 word corpus (95% of original 514859, drops 21618)
2017-04-03 15:56:00,812 : INFO : deleting the raw counts dictionary of 11342 items
2017-04-03 15:56:00,813 : INFO : sample=0.001 downsamples 57 most-common words
2017-04-03 15:56:00,813 : INFO : downsampling leaves 

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:01,888 : INFO : PROGRESS: at 85.31% examples, 1607144 words/s, in_qsize 0, out_qsize 0
2017-04-03 15:56:02,015 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:02,015 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:02,015 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:02,015 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:02,032 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:02,032 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:02,036 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:02,039 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:02,039 : INFO : training on 2574295 raw words (1904319 effective words) took 1.2s, 1638967 effective words/s
2017-04-03 15:56:02,040 : INFO : precomput

Saving the model: BetaBlockers_chi2_features_10minwords_15context....
Finished Model training and saving for BetaBlockers...
Begin sentences creation CalciumChannelBlockers...
Parsing sentences from training set


2017-04-03 15:56:02,920 : INFO : collecting all words and their counts
2017-04-03 15:56:02,920 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:02,959 : INFO : PROGRESS: at sentence #10000, processed 212303 words, keeping 7884 word types
2017-04-03 15:56:02,976 : INFO : collected 8989 word types from a corpus of 305335 raw words and 14276 sentences
2017-04-03 15:56:02,977 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:02,987 : INFO : min_count=10 retains 2335 unique words (25% of original 8989, drops 6654)
2017-04-03 15:56:02,990 : INFO : min_count=10 leaves 287306 word corpus (94% of original 305335, drops 18029)
2017-04-03 15:56:02,997 : INFO : deleting the raw counts dictionary of 8989 items
2017-04-03 15:56:02,999 : INFO : sample=0.001 downsamples 59 most-common words
2017-04-03 15:56:03,000 : INFO : downsampling leaves estimated 215870 word corpus (75.1% of prior 287306)
2017-04-03 15:56:03,003 : INFO : estimated required memory for

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:03,776 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:03,792 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:03,792 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:03,792 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:03,806 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:03,806 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:03,806 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:03,806 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:03,806 : INFO : training on 1526675 raw words (1079269 effective words) took 0.8s, 1418480 effective words/s
2017-04-03 15:56:03,806 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:03,831 : INFO : saving Word2Vec object under C:/EPC

Saving the model: CalciumChannelBlockers_chi2_features_10minwords_15context....
Finished Model training and saving for CalciumChannelBlockers...
Begin sentences creation Estrogens...
Parsing sentences from training set


2017-04-03 15:56:04,135 : INFO : collecting all words and their counts
2017-04-03 15:56:04,135 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:04,151 : INFO : collected 5487 word types from a corpus of 105634 raw words and 4793 sentences
2017-04-03 15:56:04,151 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:04,170 : INFO : min_count=10 retains 1234 unique words (22% of original 5487, drops 4253)
2017-04-03 15:56:04,171 : INFO : min_count=10 leaves 94514 word corpus (89% of original 105634, drops 11120)
2017-04-03 15:56:04,177 : INFO : deleting the raw counts dictionary of 5487 items
2017-04-03 15:56:04,178 : INFO : sample=0.001 downsamples 66 most-common words
2017-04-03 15:56:04,180 : INFO : downsampling leaves estimated 68932 word corpus (72.9% of prior 94514)
2017-04-03 15:56:04,180 : INFO : estimated required memory for 1234 words and 233 dimensions: 2917176 bytes
2017-04-03 15:56:04,184 : INFO : resetting layer weights
2017-04-03 1

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:04,434 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:04,434 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:04,434 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:04,434 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:04,450 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:04,450 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:04,450 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:04,450 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:04,450 : INFO : training on 528170 raw words (344902 effective words) took 0.2s, 1387114 effective words/s
2017-04-03 15:56:04,450 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:04,470 : INFO : saving Word2Vec object under C:/EPC_D

Saving the model: Estrogens_chi2_features_10minwords_15context....
Finished Model training and saving for Estrogens...
Begin sentences creation NSAIDS...
Parsing sentences from training set


2017-04-03 15:56:04,753 : INFO : collecting all words and their counts
2017-04-03 15:56:04,753 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:04,769 : INFO : collected 5506 word types from a corpus of 96245 raw words and 4426 sentences
2017-04-03 15:56:04,769 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:04,788 : INFO : min_count=10 retains 1104 unique words (20% of original 5506, drops 4402)
2017-04-03 15:56:04,789 : INFO : min_count=10 leaves 84902 word corpus (88% of original 96245, drops 11343)
2017-04-03 15:56:04,793 : INFO : deleting the raw counts dictionary of 5506 items
2017-04-03 15:56:04,795 : INFO : sample=0.001 downsamples 67 most-common words
2017-04-03 15:56:04,798 : INFO : downsampling leaves estimated 61582 word corpus (72.5% of prior 84902)
2017-04-03 15:56:04,799 : INFO : estimated required memory for 1104 words and 242 dimensions: 2689344 bytes
2017-04-03 15:56:04,803 : INFO : resetting layer weights
2017-04-03 15:

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:05,036 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:05,052 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:05,052 : INFO : training on 481225 raw words (307909 effective words) took 0.2s, 1379240 effective words/s
2017-04-03 15:56:05,052 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:05,052 : INFO : saving Word2Vec object under C:/EPC_D

Saving the model: NSAIDS_chi2_features_10minwords_15context....
Finished Model training and saving for NSAIDS...
Begin sentences creation Opiods...
Parsing sentences from training set


2017-04-03 15:56:06,375 : INFO : collecting all words and their counts
2017-04-03 15:56:06,375 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:06,418 : INFO : PROGRESS: at sentence #10000, processed 199329 words, keeping 8371 word types
2017-04-03 15:56:06,453 : INFO : PROGRESS: at sentence #20000, processed 395954 words, keeping 11313 word types
2017-04-03 15:56:06,466 : INFO : collected 11827 word types from a corpus of 445067 raw words and 22510 sentences
2017-04-03 15:56:06,467 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:06,477 : INFO : min_count=10 retains 3060 unique words (25% of original 11827, drops 8767)
2017-04-03 15:56:06,480 : INFO : min_count=10 leaves 421743 word corpus (94% of original 445067, drops 23324)
2017-04-03 15:56:06,487 : INFO : deleting the raw counts dictionary of 11827 items
2017-04-03 15:56:06,490 : INFO : sample=0.001 downsamples 55 most-common words
2017-04-03 15:56:06,490 : INFO : downsampling leaves 

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:07,342 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:07,342 : INFO : training on 2225335 raw words (1577638 effective words) took 0.8s, 1951460 effective words/s
2017-04-03 15:56:07,342 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:07,372 : INFO : saving Word2Vec object under C:/EPC

Saving the model: Opiods_chi2_features_10minwords_15context....
Finished Model training and saving for Opiods...
Begin sentences creation OralHypoglycemics...
Parsing sentences from training set


2017-04-03 15:56:07,763 : INFO : collecting all words and their counts
2017-04-03 15:56:07,763 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:07,795 : INFO : collected 5950 word types from a corpus of 132877 raw words and 6140 sentences
2017-04-03 15:56:07,795 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:07,795 : INFO : min_count=10 retains 1356 unique words (22% of original 5950, drops 4594)
2017-04-03 15:56:07,811 : INFO : min_count=10 leaves 120674 word corpus (90% of original 132877, drops 12203)
2017-04-03 15:56:07,816 : INFO : deleting the raw counts dictionary of 5950 items
2017-04-03 15:56:07,818 : INFO : sample=0.001 downsamples 63 most-common words
2017-04-03 15:56:07,819 : INFO : downsampling leaves estimated 86797 word corpus (71.9% of prior 120674)
2017-04-03 15:56:07,821 : INFO : estimated required memory for 1356 words and 234 dimensions: 3216432 bytes
2017-04-03 15:56:07,825 : INFO : resetting layer weights
2017-04-03

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:08,117 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:08,117 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:08,131 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:08,131 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:08,131 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:08,131 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:08,131 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:08,131 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:08,131 : INFO : training on 664385 raw words (434233 effective words) took 0.3s, 1501491 effective words/s
2017-04-03 15:56:08,131 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:08,154 : INFO : saving Word2Vec object under C:/EPC_D

Saving the model: OralHypoglycemics_chi2_features_10minwords_15context....
Finished Model training and saving for OralHypoglycemics...
Begin sentences creation ProtonPumpInhibitors...
Parsing sentences from training set


2017-04-03 15:56:09,130 : INFO : collecting all words and their counts
2017-04-03 15:56:09,130 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:09,174 : INFO : PROGRESS: at sentence #10000, processed 200995 words, keeping 6858 word types
2017-04-03 15:56:09,197 : INFO : collected 8135 word types from a corpus of 330134 raw words and 16412 sentences
2017-04-03 15:56:09,198 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:09,211 : INFO : min_count=10 retains 2210 unique words (27% of original 8135, drops 5925)
2017-04-03 15:56:09,213 : INFO : min_count=10 leaves 314026 word corpus (95% of original 330134, drops 16108)
2017-04-03 15:56:09,220 : INFO : deleting the raw counts dictionary of 8135 items
2017-04-03 15:56:09,221 : INFO : sample=0.001 downsamples 61 most-common words
2017-04-03 15:56:09,223 : INFO : downsampling leaves estimated 228987 word corpus (72.9% of prior 314026)
2017-04-03 15:56:09,226 : INFO : estimated required memory for

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:09,986 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:09,986 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:09,986 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:09,986 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:10,002 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:10,002 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:10,002 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:10,002 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:10,002 : INFO : training on 1650670 raw words (1145368 effective words) took 0.7s, 1548017 effective words/s
2017-04-03 15:56:10,002 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:10,025 : INFO : saving Word2Vec object under C:/EPC

Saving the model: ProtonPumpInhibitors_chi2_features_10minwords_15context....
Finished Model training and saving for ProtonPumpInhibitors...
Begin sentences creation SkeletalMuscleRelaxants...
Parsing sentences from training set


2017-04-03 15:56:10,865 : INFO : collecting all words and their counts
2017-04-03 15:56:10,865 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:10,911 : INFO : PROGRESS: at sentence #10000, processed 191556 words, keeping 10641 word types
2017-04-03 15:56:10,931 : INFO : collected 12740 word types from a corpus of 290075 raw words and 15274 sentences
2017-04-03 15:56:10,933 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:10,947 : INFO : min_count=10 retains 2999 unique words (23% of original 12740, drops 9741)
2017-04-03 15:56:10,948 : INFO : min_count=10 leaves 264542 word corpus (91% of original 290075, drops 25533)
2017-04-03 15:56:10,957 : INFO : deleting the raw counts dictionary of 12740 items
2017-04-03 15:56:10,960 : INFO : sample=0.001 downsamples 46 most-common words
2017-04-03 15:56:10,961 : INFO : downsampling leaves estimated 205888 word corpus (77.8% of prior 264542)
2017-04-03 15:56:10,963 : INFO : estimated required memory

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:11,499 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:11,499 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:11,499 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:11,499 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:11,499 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:11,499 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:11,515 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:11,515 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:11,515 : INFO : training on 1450375 raw words (1029124 effective words) took 0.5s, 2037120 effective words/s
2017-04-03 15:56:11,515 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:11,536 : INFO : saving Word2Vec object under C:/EPC

Saving the model: SkeletalMuscleRelaxants_chi2_features_10minwords_15context....
Finished Model training and saving for SkeletalMuscleRelaxants...
Begin sentences creation Statins...
Parsing sentences from training set


2017-04-03 15:56:13,490 : INFO : collecting all words and their counts
2017-04-03 15:56:13,490 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:13,530 : INFO : PROGRESS: at sentence #10000, processed 218883 words, keeping 8506 word types
2017-04-03 15:56:13,569 : INFO : PROGRESS: at sentence #20000, processed 439430 words, keeping 11972 word types
2017-04-03 15:56:13,607 : INFO : PROGRESS: at sentence #30000, processed 658132 words, keeping 13993 word types
2017-04-03 15:56:13,622 : INFO : collected 14553 word types from a corpus of 732006 raw words and 33435 sentences
2017-04-03 15:56:13,624 : INFO : Loading a fresh vocabulary


Finished data loading and creating sentences
Training model...


2017-04-03 15:56:13,690 : INFO : min_count=10 retains 4126 unique words (28% of original 14553, drops 10427)
2017-04-03 15:56:13,693 : INFO : min_count=10 leaves 703412 word corpus (96% of original 732006, drops 28594)
2017-04-03 15:56:13,703 : INFO : deleting the raw counts dictionary of 14553 items
2017-04-03 15:56:13,703 : INFO : sample=0.001 downsamples 58 most-common words
2017-04-03 15:56:13,706 : INFO : downsampling leaves estimated 544761 word corpus (77.4% of prior 703412)
2017-04-03 15:56:13,707 : INFO : estimated required memory for 4126 words and 467 dimensions: 17477736 bytes
2017-04-03 15:56:13,719 : INFO : resetting layer weights
2017-04-03 15:56:13,780 : INFO : training model with 8 workers on 4126 vocabulary and 467 features, using sg=0 hs=0 sample=0.001 negative=5 window=15
2017-04-03 15:56:13,782 : INFO : expecting 33435 sentences, matching count from corpus used for vocabulary survey
2017-04-03 15:56:14,782 : INFO : PROGRESS: at 42.56% examples, 1156257 words/s, in_

Saving the model: Statins_chi2_features_10minwords_15context....
Finished Model training and saving for Statins...
Begin sentences creation Triptans...
Parsing sentences from training set


2017-04-03 15:56:16,674 : INFO : collecting all words and their counts
2017-04-03 15:56:16,674 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:16,709 : INFO : collected 6417 word types from a corpus of 149167 raw words and 7324 sentences
2017-04-03 15:56:16,710 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:16,720 : INFO : min_count=10 retains 1482 unique words (23% of original 6417, drops 4935)
2017-04-03 15:56:16,721 : INFO : min_count=10 leaves 136034 word corpus (91% of original 149167, drops 13133)
2017-04-03 15:56:16,726 : INFO : deleting the raw counts dictionary of 6417 items
2017-04-03 15:56:16,727 : INFO : sample=0.001 downsamples 66 most-common words
2017-04-03 15:56:16,730 : INFO : downsampling leaves estimated 96962 word corpus (71.3% of prior 136034)
2017-04-03 15:56:16,730 : INFO : estimated required memory for 1482 words and 121 dimensions: 2175576 bytes
2017-04-03 15:56:16,736 : INFO : resetting layer weights
2017-04-03

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:17,017 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-04-03 15:56:17,017 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-04-03 15:56:17,017 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-04-03 15:56:17,017 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-04-03 15:56:17,017 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-04-03 15:56:17,033 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-03 15:56:17,033 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:17,033 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:17,033 : INFO : training on 745835 raw words (485279 effective words) took 0.3s, 1777927 effective words/s
2017-04-03 15:56:17,033 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:17,049 : INFO : saving Word2Vec object under C:/EPC_D

Saving the model: Triptans_chi2_features_10minwords_15context....
Finished Model training and saving for Triptans...
Begin sentences creation UrinaryIncontinence...
Parsing sentences from training set


2017-04-03 15:56:17,292 : INFO : collecting all words and their counts
2017-04-03 15:56:17,292 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-03 15:56:17,305 : INFO : collected 5386 word types from a corpus of 72183 raw words and 3664 sentences
2017-04-03 15:56:17,308 : INFO : Loading a fresh vocabulary
2017-04-03 15:56:17,312 : INFO : min_count=10 retains 1003 unique words (18% of original 5386, drops 4383)
2017-04-03 15:56:17,313 : INFO : min_count=10 leaves 60999 word corpus (84% of original 72183, drops 11184)
2017-04-03 15:56:17,318 : INFO : deleting the raw counts dictionary of 5386 items
2017-04-03 15:56:17,319 : INFO : sample=0.001 downsamples 55 most-common words
2017-04-03 15:56:17,319 : INFO : downsampling leaves estimated 43743 word corpus (71.7% of prior 60999)
2017-04-03 15:56:17,321 : INFO : estimated required memory for 1003 words and 215 dimensions: 2226660 bytes
2017-04-03 15:56:17,323 : INFO : resetting layer weights
2017-04-03 15:

Finished data loading and creating sentences
Training model...


2017-04-03 15:56:17,500 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-03 15:56:17,500 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-03 15:56:17,500 : INFO : training on 360915 raw words (218491 effective words) took 0.2s, 1425940 effective words/s
2017-04-03 15:56:17,500 : INFO : precomputing L2-norms of word weight vectors
2017-04-03 15:56:17,500 : INFO : saving Word2Vec object under C:/EPC_Data/complete_data/word2vec\UrinaryIncontinence_chi2_features_10minwords_15context, separately None
2017-04-03 15:56:17,515 : INFO : not storing attribute syn0norm
2017-04-03 15:56:17,516 : INFO : not storing attribute cum_table
2017-04-03 15:56:17,542 : INFO : saved C:/EPC_Data/complete_data/word2vec\UrinaryIncontinence_chi2_features_10minwords_15context


Saving the model: UrinaryIncontinence_chi2_features_10minwords_15context....
Finished Model training and saving for UrinaryIncontinence...
