In [1]:
import cPickle as pickle

import scipy
import numpy as np
import pandas as pd
import re

# Load English stopwords from NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, RegexpTokenizer, WhitespaceTokenizer
import nltk.data

from gensim.models import word2vec
import logging

import os

print ("Required Libraries loaded.")

Required Libraries loaded.


In [2]:
# Helper for string cleaning: turns raw text into a list of lowercase word tokens

def clean_text(text, stem_words = False, remove_stopwords = False):
    """
    text - value to clean; it is converted with str() before processing
    stem_words - boolean, whether to Porter-stem every remaining token
    remove_stopwords - boolean, whether to drop NLTK English stop words

    clean_text: Function to normalise raw text into word tokens. Every
    character that is not an ASCII letter is replaced by a space, the result
    is lower-cased and then split on whitespace.
    returns: a list of word strings (empty when text contains no letters)
    """
    # Keep letters only; digits, punctuation and symbols become separators.
    letters_only = re.sub("[^a-zA-Z]", " ", str(text))
    clean_corpus = letters_only.lower().split()

    if remove_stopwords:   # Optionally remove stop words
        # Filter BEFORE stemming: the stop list holds unstemmed words, so a
        # stemmed token would no longer match its stop-list entry.
        stops = set(stopwords.words("english"))
        clean_corpus = [w for w in clean_corpus if w not in stops]

    if stem_words:   # Optionally reduce each token to its Porter stem
        porter = nltk.PorterStemmer()
        clean_corpus = [porter.stem(w) for w in clean_corpus]

    return clean_corpus

In [3]:
# Use the punkt tokenizer for sentence splitting

# Load the English Punkt sentence tokenizer from the NLTK data files once, at
# module level. create_sentences() reads this global `tokenizer` and passes it
# to article_to_sentences() for every article.
# NOTE(review): presumably requires the NLTK 'punkt' resource to be downloaded
# beforehand - confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def article_to_sentences(article, tokenizer, stem_words=False, remove_stopwords=False):
    """
    article - text of one article; surrounding whitespace is stripped first
    tokenizer - object whose tokenize() method splits text into sentences
    stem_words - boolean, forwarded to clean_text to enable stemming
    remove_stopwords - boolean, forwarded to clean_text to drop stop words

    article_to_sentences: Function to split an article into sentences and
    clean each non-empty sentence into a list of words with clean_text.
    returns: a list of sentences where each sentence is itself a list of words
    """
    # Empty sentence strings produced by the tokenizer are skipped.
    return [
        clean_text(sentence_text, stem_words, remove_stopwords)
        for sentence_text in tokenizer.tokenize(article.strip())
        if len(sentence_text) > 0
    ]

In [4]:
def create_sentences(corpus, name):
    """
    corpus - iterable of article texts
    name - label used only in the progress message

    create_sentences: Function to turn every article of the corpus into
    cleaned sentences, using the module-level tokenizer, and collect them in
    one flat list.
    returns: a list of sentences where each sentence is itself a list of words
    """
    print ("Begin sentences creation %s..." % name)
    print ("Parsing sentences from training set")

    collected = []
    for document in corpus:
        # Sentences of all articles end up in the same flat list.
        collected.extend(article_to_sentences(document, tokenizer))

    print ("Finished data loading and creating sentences")
    return collected

In [5]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
def train_save_model(sentences, filename):
    """
    sentences - list of sentences, each a list of words, used as training data
    filename - drug-class key; selects the word vector dimensionality and is
               the prefix of the saved model's name

    train_save_model: Function to train a Word2Vec model on the sentences and
    save it as "<filename>_chi2_features_10minwords_15context" in the
    hard-coded output folder.
    raises: ValueError if no dimensionality is configured for filename
    returns: None
    """
    # Word vector dimensionality for each drug-class corpus. The keys must
    # match the prefixes of the input file names exactly (including the
    # existing "opiods" spelling).
    num_features_by_class = {
        "aceinhibitor": 210,
        "adhd": 80,
        "antihistamines": 29,
        "atypicalAntipsychotics": 381,
        "betaBlockers": 194,
        "calciumChannelBlockers": 329,
        "estrogens": 233,
        "nsaids": 242,
        "opiods": 55,
        "oralHypoglycemics": 234,
        "protonPumpInhibitors": 206,
        "skeletalMuscleRelaxants": 11,
        "statins": 467,
        "triptans": 121,
        "urinaryIncontinence": 215,
    }
    # Fail fast with a clear message instead of hitting an unbound
    # num_features only when the model is constructed.
    if filename not in num_features_by_class:
        raise ValueError(
            "No num_features configured for %r; expected one of: %s"
            % (filename, ", ".join(sorted(num_features_by_class))))
    num_features = num_features_by_class[filename]

    # Configure logging so that Word2Vec reports its progress.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

    # Set values for various parameters
    min_word_count = 10   # Minimum word count
    num_workers = 8       # Number of threads to run in parallel
    context = 15          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print ("Training model...")
    model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count,
                              window = context, sample = downsampling)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and
    # save the model for later use. You can load it later using Word2Vec.load()
    model_name = filename + "_chi2_features_10minwords_15context"
    # Raw string: the backslashes are path separators, not escape sequences.
    folder = r"C:\Users\CR107\Dropbox\PhD\Experiments\TREC_2004\data\word2vec"
    print ("Saving the model: %s...." % model_name)
    model_name = os.path.join(folder, model_name)
    model.save(model_name)

    # Format inside the call so the message is correct whether print is a
    # statement or a function.
    print ("Finished Model training and saving for %s..." % filename)

In [6]:
# Iterate through the input directory and train one model per file.
# The drug-class key is the part of the file name before the first underscore;
# the training text is read from the file's TIABSMh column.
# Raw string: the backslashes are path separators, not escape sequences.
path = r"C:\EPC_Data\TREC_BROKEN\No_Mh_Tag"
for csv_name in os.listdir(path):
    current_file = os.path.join(path, csv_name)
    # Take only the leading segment so the number of underscores in the rest
    # of the file name does not matter.
    filename = csv_name.split("_")[0]

    file_obj = pd.read_csv(current_file, sep=",", index_col='PMID')
    corpus = file_obj.TIABSMh

    train_save_model(create_sentences(corpus, filename), filename)

Begin sentences creation aceinhibitor...
Parsing sentences from training set


2017-01-23 18:00:50,151 : INFO : collecting all words and their counts
2017-01-23 18:00:50,151 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:00:50,196 : INFO : PROGRESS: at sentence #10000, processed 253082 words, keeping 11950 word types
2017-01-23 18:00:50,230 : INFO : PROGRESS: at sentence #20000, processed 504463 words, keeping 16753 word types
2017-01-23 18:00:50,252 : INFO : collected 18630 word types from a corpus of 633837 raw words and 24993 sentences
2017-01-23 18:00:50,276 : INFO : min_count=10 retains 3900 unique words (drops 14730)
2017-01-23 18:00:50,276 : INFO : min_count leaves 599341 word corpus (94% of original 633837)
2017-01-23 18:00:50,288 : INFO : deleting the raw counts dictionary of 18630 items
2017-01-23 18:00:50,292 : INFO : sample=0.001 downsamples 56 most-common words
2017-01-23 18:00:50,292 : INFO : downsampling leaves estimated 458832 word corpus (76.6% of prior 599341)
2017-01-23 18:00:50,295 : INFO : estimated r

Finished data loading and creating sentences
Training model...


2017-01-23 18:00:50,362 : INFO : training model with 8 workers on 3900 vocabulary and 210 features, using sg=0 hs=0 sample=0.001 negative=5
2017-01-23 18:00:50,364 : INFO : expecting 24993 sentences, matching count from corpus used for vocabulary survey
2017-01-23 18:00:51,469 : INFO : PROGRESS: at 65.50% examples, 1480399 words/s, in_qsize 0, out_qsize 0
2017-01-23 18:00:52,032 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:00:52,042 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:00:52,043 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:00:52,049 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:00:52,052 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:00:52,053 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:00:52,058 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-

Saving the model: aceinhibitor_chi2_features_10minwords_15context....
Finished Model training and saving for aceinhibitor...
Begin sentences creation adhd...
Parsing sentences from training set


2017-01-23 18:00:52,851 : INFO : collecting all words and their counts
2017-01-23 18:00:52,851 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:00:52,891 : INFO : collected 10626 word types from a corpus of 214896 raw words and 8900 sentences
2017-01-23 18:00:52,903 : INFO : min_count=10 retains 2144 unique words (drops 8482)
2017-01-23 18:00:52,905 : INFO : min_count leaves 194197 word corpus (90% of original 214896)
2017-01-23 18:00:52,913 : INFO : deleting the raw counts dictionary of 10626 items
2017-01-23 18:00:52,914 : INFO : sample=0.001 downsamples 62 most-common words
2017-01-23 18:00:52,917 : INFO : downsampling leaves estimated 148129 word corpus (76.3% of prior 194197)
2017-01-23 18:00:52,920 : INFO : estimated required memory for 2144 words and 80 dimensions: 2444160 bytes
2017-01-23 18:00:52,927 : INFO : resetting layer weights
2017-01-23 18:00:52,960 : INFO : training model with 8 workers on 2144 vocabulary and 80 features, using s

Finished data loading and creating sentences
Training model...


2017-01-23 18:00:53,404 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:00:53,404 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:00:53,404 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:00:53,404 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:00:53,404 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:00:53,404 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:00:53,414 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:00:53,414 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:00:53,414 : INFO : training on 1074480 raw words (740802 effective words) took 0.4s, 2041971 effective words/s
2017-01-23 18:00:53,414 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:00:53,440 : INFO : saving Word2Vec object under C:\User

Saving the model: adhd_chi2_features_10minwords_15context....
Finished Model training and saving for adhd...


2017-01-23 18:00:53,844 : INFO : collecting all words and their counts
2017-01-23 18:00:53,844 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Begin sentences creation antihistamines...
Parsing sentences from training set
Finished data loading and creating sentences
Training model...


2017-01-23 18:00:53,854 : INFO : collected 6224 word types from a corpus of 74720 raw words and 3091 sentences
2017-01-23 18:00:53,865 : INFO : min_count=10 retains 987 unique words (drops 5237)
2017-01-23 18:00:53,868 : INFO : min_count leaves 62623 word corpus (83% of original 74720)
2017-01-23 18:00:53,874 : INFO : deleting the raw counts dictionary of 6224 items
2017-01-23 18:00:53,875 : INFO : sample=0.001 downsamples 67 most-common words
2017-01-23 18:00:53,878 : INFO : downsampling leaves estimated 44141 word corpus (70.5% of prior 62623)
2017-01-23 18:00:53,882 : INFO : estimated required memory for 987 words and 29 dimensions: 722484 bytes
2017-01-23 18:00:53,891 : INFO : resetting layer weights
2017-01-23 18:00:53,911 : INFO : training model with 8 workers on 987 vocabulary and 29 features, using sg=0 hs=0 sample=0.001 negative=5
2017-01-23 18:00:53,913 : INFO : expecting 3091 sentences, matching count from corpus used for vocabulary survey
2017-01-23 18:00:54,086 : INFO : wo

Saving the model: antihistamines_chi2_features_10minwords_15context....
Finished Model training and saving for antihistamines...
Begin sentences creation atypicalAntipsychotics...
Parsing sentences from training set


2017-01-23 18:00:55,154 : INFO : collecting all words and their counts
2017-01-23 18:00:55,154 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:00:55,190 : INFO : PROGRESS: at sentence #10000, processed 246004 words, keeping 10842 word types
2017-01-23 18:00:55,194 : INFO : collected 11162 word types from a corpus of 262285 raw words and 10669 sentences
2017-01-23 18:00:55,207 : INFO : min_count=10 retains 2255 unique words (drops 8907)
2017-01-23 18:00:55,207 : INFO : min_count leaves 241151 word corpus (91% of original 262285)
2017-01-23 18:00:55,216 : INFO : deleting the raw counts dictionary of 11162 items
2017-01-23 18:00:55,217 : INFO : sample=0.001 downsamples 62 most-common words
2017-01-23 18:00:55,219 : INFO : downsampling leaves estimated 180424 word corpus (74.8% of prior 241151)
2017-01-23 18:00:55,220 : INFO : estimated required memory for 2255 words and 381 dimensions: 8000740 bytes
2017-01-23 18:00:55,230 : INFO : resetting layer 

Finished data loading and creating sentences
Training model...


2017-01-23 18:00:56,012 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:00:56,012 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:00:56,012 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:00:56,022 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:00:56,022 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:00:56,032 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:00:56,033 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:00:56,036 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:00:56,038 : INFO : training on 1311425 raw words (902425 effective words) took 0.7s, 1351462 effective words/s
2017-01-23 18:00:56,039 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:00:56,052 : INFO : saving Word2Vec object under C:\User

Saving the model: atypicalAntipsychotics_chi2_features_10minwords_15context....
Finished Model training and saving for atypicalAntipsychotics...
Begin sentences creation betaBlockers...
Parsing sentences from training set


2017-01-23 18:00:58,029 : INFO : collecting all words and their counts
2017-01-23 18:00:58,029 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:00:58,069 : INFO : PROGRESS: at sentence #10000, processed 252058 words, keeping 11801 word types
2017-01-23 18:00:58,101 : INFO : PROGRESS: at sentence #20000, processed 503517 words, keeping 17273 word types
2017-01-23 18:00:58,108 : INFO : collected 17636 word types from a corpus of 530446 raw words and 21051 sentences
2017-01-23 18:00:58,125 : INFO : min_count=10 retains 3667 unique words (drops 13969)
2017-01-23 18:00:58,127 : INFO : min_count leaves 497632 word corpus (93% of original 530446)
2017-01-23 18:00:58,135 : INFO : deleting the raw counts dictionary of 17636 items
2017-01-23 18:00:58,138 : INFO : sample=0.001 downsamples 49 most-common words
2017-01-23 18:00:58,140 : INFO : downsampling leaves estimated 384294 word corpus (77.2% of prior 497632)
2017-01-23 18:00:58,141 : INFO : estimated r

Finished data loading and creating sentences
Training model...


2017-01-23 18:00:59,286 : INFO : PROGRESS: at 85.69% examples, 1638047 words/s, in_qsize 3, out_qsize 0
2017-01-23 18:00:59,424 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:00:59,424 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:00:59,436 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:00:59,438 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:00:59,441 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:00:59,443 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:00:59,444 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:00:59,447 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:00:59,447 : INFO : training on 2652230 raw words (1921647 effective words) took 1.2s, 1653664 effective words/s
2017-01-23 18:00:59,448 : INFO : precomput

Saving the model: betaBlockers_chi2_features_10minwords_15context....
Finished Model training and saving for betaBlockers...
Begin sentences creation calciumChannelBlockers...
Parsing sentences from training set


2017-01-23 18:01:00,436 : INFO : collecting all words and their counts
2017-01-23 18:01:00,436 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:00,474 : INFO : PROGRESS: at sentence #10000, processed 245782 words, keeping 11844 word types
2017-01-23 18:01:00,484 : INFO : collected 13052 word types from a corpus of 312962 raw words and 12682 sentences
2017-01-23 18:01:00,497 : INFO : min_count=10 retains 2515 unique words (drops 10537)
2017-01-23 18:01:00,499 : INFO : min_count leaves 287903 word corpus (91% of original 312962)
2017-01-23 18:01:00,507 : INFO : deleting the raw counts dictionary of 13052 items
2017-01-23 18:01:00,509 : INFO : sample=0.001 downsamples 58 most-common words
2017-01-23 18:01:00,512 : INFO : downsampling leaves estimated 216876 word corpus (75.3% of prior 287903)
2017-01-23 18:01:00,513 : INFO : estimated required memory for 2515 words and 329 dimensions: 7876980 bytes
2017-01-23 18:01:00,519 : INFO : resetting layer

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:01,430 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:01,440 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:01,440 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:01,451 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:01,454 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:01,457 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:01,460 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:01,461 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:01,463 : INFO : training on 1564810 raw words (1084530 effective words) took 0.8s, 1329894 effective words/s
2017-01-23 18:01:01,463 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:01,477 : INFO : saving Word2Vec object under C:\Use

Saving the model: calciumChannelBlockers_chi2_features_10minwords_15context....
Finished Model training and saving for calciumChannelBlockers...
Begin sentences creation estrogens...
Parsing sentences from training set


2017-01-23 18:01:01,993 : INFO : collecting all words and their counts
2017-01-23 18:01:01,996 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:02,012 : INFO : collected 7208 word types from a corpus of 107545 raw words and 4316 sentences
2017-01-23 18:01:02,019 : INFO : min_count=10 retains 1244 unique words (drops 5964)
2017-01-23 18:01:02,020 : INFO : min_count leaves 93369 word corpus (86% of original 107545)
2017-01-23 18:01:02,026 : INFO : deleting the raw counts dictionary of 7208 items
2017-01-23 18:01:02,029 : INFO : sample=0.001 downsamples 68 most-common words
2017-01-23 18:01:02,030 : INFO : downsampling leaves estimated 68098 word corpus (72.9% of prior 93369)
2017-01-23 18:01:02,032 : INFO : estimated required memory for 1244 words and 233 dimensions: 2940816 bytes
2017-01-23 18:01:02,039 : INFO : resetting layer weights
2017-01-23 18:01:02,059 : INFO : training model with 8 workers on 1244 vocabulary and 233 features, using sg=0

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:02,371 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:02,371 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:02,381 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:02,381 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:02,381 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:02,391 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:02,394 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:02,395 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:02,397 : INFO : training on 537725 raw words (340899 effective words) took 0.2s, 1396630 effective words/s
2017-01-23 18:01:02,398 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:02,408 : INFO : saving Word2Vec object under C:\Users

Saving the model: estrogens_chi2_features_10minwords_15context....
Finished Model training and saving for estrogens...
Begin sentences creation nsaids...
Parsing sentences from training set


2017-01-23 18:01:02,815 : INFO : collecting all words and their counts
2017-01-23 18:01:02,815 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:02,825 : INFO : collected 6881 word types from a corpus of 96702 raw words and 3861 sentences
2017-01-23 18:01:02,841 : INFO : min_count=10 retains 1124 unique words (drops 5757)
2017-01-23 18:01:02,842 : INFO : min_count leaves 83184 word corpus (86% of original 96702)
2017-01-23 18:01:02,848 : INFO : deleting the raw counts dictionary of 6881 items
2017-01-23 18:01:02,848 : INFO : sample=0.001 downsamples 64 most-common words
2017-01-23 18:01:02,849 : INFO : downsampling leaves estimated 60256 word corpus (72.4% of prior 83184)
2017-01-23 18:01:02,851 : INFO : estimated required memory for 1124 words and 242 dimensions: 2738064 bytes
2017-01-23 18:01:02,855 : INFO : resetting layer weights
2017-01-23 18:01:02,871 : INFO : training model with 8 workers on 1124 vocabulary and 242 features, using sg=0 h

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:03,151 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:03,151 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:03,161 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:03,161 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:03,161 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:03,161 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:03,161 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:03,161 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:03,171 : INFO : training on 483510 raw words (301113 effective words) took 0.2s, 1408312 effective words/s
2017-01-23 18:01:03,171 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:03,171 : INFO : saving Word2Vec object under C:\Users

Saving the model: nsaids_chi2_features_10minwords_15context....
Finished Model training and saving for nsaids...
Begin sentences creation opiods...
Parsing sentences from training set


2017-01-23 18:01:04,759 : INFO : collecting all words and their counts
2017-01-23 18:01:04,759 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:04,796 : INFO : PROGRESS: at sentence #10000, processed 234961 words, keeping 12120 word types
2017-01-23 18:01:04,825 : INFO : collected 16497 word types from a corpus of 453592 raw words and 19470 sentences
2017-01-23 18:01:04,839 : INFO : min_count=10 retains 3230 unique words (drops 13267)
2017-01-23 18:01:04,842 : INFO : min_count leaves 422371 word corpus (93% of original 453592)
2017-01-23 18:01:04,851 : INFO : deleting the raw counts dictionary of 16497 items
2017-01-23 18:01:04,854 : INFO : sample=0.001 downsamples 58 most-common words
2017-01-23 18:01:04,855 : INFO : downsampling leaves estimated 318452 word corpus (75.4% of prior 422371)
2017-01-23 18:01:04,858 : INFO : estimated required memory for 3230 words and 55 dimensions: 3036200 bytes
2017-01-23 18:01:04,865 : INFO : resetting layer 

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:05,727 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:05,727 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:05,727 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:05,737 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:05,737 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:05,737 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:05,737 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:05,737 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:05,747 : INFO : training on 2267960 raw words (1592179 effective words) took 0.8s, 2087628 effective words/s
2017-01-23 18:01:05,749 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:05,765 : INFO : saving Word2Vec object under C:\Use

Saving the model: opiods_chi2_features_10minwords_15context....
Finished Model training and saving for opiods...
Begin sentences creation oralHypoglycemics...
Parsing sentences from training set


2017-01-23 18:01:06,394 : INFO : collecting all words and their counts
2017-01-23 18:01:06,394 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:06,404 : INFO : collected 7880 word types from a corpus of 137638 raw words and 5580 sentences
2017-01-23 18:01:06,424 : INFO : min_count=10 retains 1445 unique words (drops 6435)
2017-01-23 18:01:06,424 : INFO : min_count leaves 122123 word corpus (88% of original 137638)
2017-01-23 18:01:06,430 : INFO : deleting the raw counts dictionary of 7880 items
2017-01-23 18:01:06,430 : INFO : sample=0.001 downsamples 63 most-common words
2017-01-23 18:01:06,431 : INFO : downsampling leaves estimated 88707 word corpus (72.6% of prior 122123)
2017-01-23 18:01:06,433 : INFO : estimated required memory for 1445 words and 234 dimensions: 3427540 bytes
2017-01-23 18:01:06,437 : INFO : resetting layer weights
2017-01-23 18:01:06,457 : INFO : training model with 8 workers on 1445 vocabulary and 234 features, using sg

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:06,838 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:06,839 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:06,844 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:06,846 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:06,851 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:06,854 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:06,855 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:06,858 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:06,858 : INFO : training on 688190 raw words (443328 effective words) took 0.3s, 1431576 effective words/s
2017-01-23 18:01:06,859 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:06,868 : INFO : saving Word2Vec object under C:\Users

Saving the model: oralHypoglycemics_chi2_features_10minwords_15context....
Finished Model training and saving for oralHypoglycemics...
Begin sentences creation protonPumpInhibitors...
Parsing sentences from training set


2017-01-23 18:01:08,061 : INFO : collecting all words and their counts
2017-01-23 18:01:08,061 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:08,099 : INFO : PROGRESS: at sentence #10000, processed 230302 words, keeping 9660 word types
2017-01-23 18:01:08,115 : INFO : collected 11243 word types from a corpus of 333475 raw words and 14535 sentences
2017-01-23 18:01:08,125 : INFO : min_count=10 retains 2362 unique words (drops 8881)
2017-01-23 18:01:08,125 : INFO : min_count leaves 312001 word corpus (93% of original 333475)
2017-01-23 18:01:08,134 : INFO : deleting the raw counts dictionary of 11243 items
2017-01-23 18:01:08,134 : INFO : sample=0.001 downsamples 61 most-common words
2017-01-23 18:01:08,137 : INFO : downsampling leaves estimated 230368 word corpus (73.8% of prior 312001)
2017-01-23 18:01:08,138 : INFO : estimated required memory for 2362 words and 206 dimensions: 5073576 bytes
2017-01-23 18:01:08,144 : INFO : resetting layer w

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:08,964 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:08,974 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:08,974 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:08,974 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:08,974 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:08,984 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:08,984 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:08,984 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:08,984 : INFO : training on 1667375 raw words (1151854 effective words) took 0.7s, 1577631 effective words/s
2017-01-23 18:01:08,984 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:09,006 : INFO : saving Word2Vec object under C:\Use

Saving the model: protonPumpInhibitors_chi2_features_10minwords_15context....
Finished Model training and saving for protonPumpInhibitors...
Begin sentences creation skeletalMuscleRelaxants...
Parsing sentences from training set


2017-01-23 18:01:10,059 : INFO : collecting all words and their counts
2017-01-23 18:01:10,059 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:10,094 : INFO : PROGRESS: at sentence #10000, processed 236263 words, keeping 16488 word types
2017-01-23 18:01:10,105 : INFO : collected 18425 word types from a corpus of 298513 raw words and 12676 sentences
2017-01-23 18:01:10,121 : INFO : min_count=10 retains 3075 unique words (drops 15350)
2017-01-23 18:01:10,121 : INFO : min_count leaves 263712 word corpus (88% of original 298513)
2017-01-23 18:01:10,131 : INFO : deleting the raw counts dictionary of 18425 items
2017-01-23 18:01:10,131 : INFO : sample=0.001 downsamples 45 most-common words
2017-01-23 18:01:10,134 : INFO : downsampling leaves estimated 202852 word corpus (76.9% of prior 263712)
2017-01-23 18:01:10,134 : INFO : estimated required memory for 3075 words and 11 dimensions: 1808100 bytes
2017-01-23 18:01:10,144 : INFO : resetting layer 

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:10,726 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:10,726 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:10,726 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:10,726 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:10,726 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:10,736 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:10,736 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:10,736 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:10,736 : INFO : training on 1492565 raw words (1014274 effective words) took 0.5s, 2149997 effective words/s
2017-01-23 18:01:10,736 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:10,757 : INFO : saving Word2Vec object under C:\Use

Saving the model: skeletalMuscleRelaxants_chi2_features_10minwords_15context....
Finished Model training and saving for skeletalMuscleRelaxants...
Begin sentences creation statins...
Parsing sentences from training set


2017-01-23 18:01:13,124 : INFO : collecting all words and their counts
2017-01-23 18:01:13,124 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:13,164 : INFO : PROGRESS: at sentence #10000, processed 257783 words, keeping 13546 word types
2017-01-23 18:01:13,197 : INFO : PROGRESS: at sentence #20000, processed 513081 words, keeping 19771 word types
2017-01-23 18:01:13,227 : INFO : collected 23692 word types from a corpus of 741783 raw words and 28886 sentences
2017-01-23 18:01:13,249 : INFO : min_count=10 retains 4579 unique words (drops 19113)
2017-01-23 18:01:13,250 : INFO : min_count leaves 697558 word corpus (94% of original 741783)
2017-01-23 18:01:13,262 : INFO : deleting the raw counts dictionary of 23692 items
2017-01-23 18:01:13,263 : INFO : sample=0.001 downsamples 52 most-common words
2017-01-23 18:01:13,265 : INFO : downsampling leaves estimated 542657 word corpus (77.8% of prior 697558)
2017-01-23 18:01:13,266 : INFO : estimated r

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:13,348 : INFO : training model with 8 workers on 4579 vocabulary and 467 features, using sg=0 hs=0 sample=0.001 negative=5
2017-01-23 18:01:13,348 : INFO : expecting 28886 sentences, matching count from corpus used for vocabulary survey
2017-01-23 18:01:14,436 : INFO : PROGRESS: at 37.39% examples, 1014908 words/s, in_qsize 16, out_qsize 0
2017-01-23 18:01:15,431 : INFO : PROGRESS: at 79.38% examples, 1077119 words/s, in_qsize 14, out_qsize 0
2017-01-23 18:01:15,881 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:15,881 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:15,891 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:15,900 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:15,907 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:15,910 : INFO : worker thread finished; awaiting finish of 2 mor

Saving the model: statins_chi2_features_10minwords_15context....
Finished Model training and saving for statins...
Begin sentences creation triptans...
Parsing sentences from training set


2017-01-23 18:01:16,546 : INFO : collecting all words and their counts
2017-01-23 18:01:16,556 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:16,566 : INFO : collected 8395 word types from a corpus of 151688 raw words and 6328 sentences
2017-01-23 18:01:16,584 : INFO : min_count=10 retains 1540 unique words (drops 6855)
2017-01-23 18:01:16,585 : INFO : min_count leaves 135313 word corpus (89% of original 151688)
2017-01-23 18:01:16,592 : INFO : deleting the raw counts dictionary of 8395 items
2017-01-23 18:01:16,592 : INFO : sample=0.001 downsamples 66 most-common words
2017-01-23 18:01:16,595 : INFO : downsampling leaves estimated 96975 word corpus (71.7% of prior 135313)
2017-01-23 18:01:16,595 : INFO : estimated required memory for 1540 words and 121 dimensions: 2260720 bytes
2017-01-23 18:01:16,601 : INFO : resetting layer weights
2017-01-23 18:01:16,621 : INFO : training model with 8 workers on 1540 vocabulary and 121 features, using sg

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:16,970 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:16,971 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:16,973 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:16,977 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:16,980 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:16,982 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:16,984 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:16,986 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:16,987 : INFO : training on 758440 raw words (484883 effective words) took 0.3s, 1736543 effective words/s
2017-01-23 18:01:16,990 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:16,997 : INFO : saving Word2Vec object under C:\Users

Saving the model: triptans_chi2_features_10minwords_15context....
Finished Model training and saving for triptans...
Begin sentences creation urinaryIncontinence...
Parsing sentences from training set


2017-01-23 18:01:17,357 : INFO : collecting all words and their counts
2017-01-23 18:01:17,357 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-23 18:01:17,368 : INFO : collected 6502 word types from a corpus of 72639 raw words and 3126 sentences
2017-01-23 18:01:17,381 : INFO : min_count=10 retains 1004 unique words (drops 5498)
2017-01-23 18:01:17,384 : INFO : min_count leaves 59811 word corpus (82% of original 72639)
2017-01-23 18:01:17,388 : INFO : deleting the raw counts dictionary of 6502 items
2017-01-23 18:01:17,388 : INFO : sample=0.001 downsamples 56 most-common words
2017-01-23 18:01:17,391 : INFO : downsampling leaves estimated 42611 word corpus (71.2% of prior 59811)
2017-01-23 18:01:17,391 : INFO : estimated required memory for 1004 words and 215 dimensions: 2228880 bytes
2017-01-23 18:01:17,395 : INFO : resetting layer weights
2017-01-23 18:01:17,410 : INFO : training model with 8 workers on 1004 vocabulary and 215 features, using sg=0 h

Finished data loading and creating sentences
Training model...


2017-01-23 18:01:17,680 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-01-23 18:01:17,688 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-01-23 18:01:17,690 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-01-23 18:01:17,691 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-01-23 18:01:17,694 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-01-23 18:01:17,696 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-01-23 18:01:17,697 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-01-23 18:01:17,697 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-01-23 18:01:17,698 : INFO : training on 363195 raw words (213090 effective words) took 0.2s, 1081025 effective words/s
2017-01-23 18:01:17,700 : INFO : precomputing L2-norms of word weight vectors
2017-01-23 18:01:17,707 : INFO : saving Word2Vec object under C:\Users

Saving the model: urinaryIncontinence_chi2_features_10minwords_15context....
Finished Model training and saving for urinaryIncontinence...
