In [1]:
import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pymongo as pm
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix



In [2]:
client = pm.MongoClient('localhost', 27017)
db = client['full_texts'] # creates database if not there
#documents = db['texts'] # creates new collection if not there
documents = db['train_val']

get data with each label

In [3]:
df_conclusion = pd.DataFrame(list(documents.find({'label': 'conclusion'}).limit(10000)))
df_result = pd.DataFrame(list(documents.find({'label': 'result'}).limit(10000)))
df_method = pd.DataFrame(list(documents.find({'label': 'method'}).limit(10000)))
df_intro = pd.DataFrame(list(documents.find({'label': 'introduction'}).limit(10000)))

dfs = [df_conclusion, df_result, df_method, df_intro]

# merge the dfs!
merged_df = pd.concat(dfs)
merged_df.sort_values('date', inplace = True)

write data to tsv to check stuff out...

In [7]:
merged_df.to_csv("dev_data.tsv", sep = "\t", index = False)

drop short rows

In [67]:
merged_df = merged_df.loc[merged_df['text'].map(len) > 400]
merged_df.reset_index(inplace = True, drop = True)
print(merged_df.shape)

(38933, 6)


In [52]:
merged_df = merged_df.sample(200) # use a small set for playin'

convert labels for binary classification

In [68]:
pattern = r'^(?!method).*$'
merged_df['label'].replace(pattern, 'other', regex = True, inplace = True) # check this is working

split up sections into sentences

In [17]:
from nltk.tokenize import sent_tokenize

In [21]:
merged_df['sentences'] = merged_df.apply(lambda row: sent_tokenize(row['text']), axis=1)

In [24]:
sentences = []
label = []
doc_id = []

for i, row in merged_df.iterrows():
    for sentence in row['sentences']:
        sentences.append(sentence)
        label.append(row['label'])
        doc_id.append(row['article'])

In [27]:
sentence_df = pd.DataFrame({'id': doc_id, 'label': label, 'sentence': sentences})

Split up the data. Note that I am in a development phase and really the test set is a validation set.

my non-random way

In [53]:
X_train = merged_df.loc[:31999, 'text']
y_train = merged_df.loc[:31999, 'label']
print(y_train[y_train.isin(['conclusion'])].shape)
print(y_train[y_train.isin(['method'])].shape)
print(y_train[y_train.isin(['introduction'])].shape)
print(y_train[y_train.isin(['result'])].shape)
X_test = merged_df.loc[32000:, 'text']
y_test = merged_df.loc[32000:, 'label']
print(y_test[y_test.isin(['method'])].shape)

# # and if using sentences...
# X_train = sentence_df.loc[:122967, ['sentence', 'article']]
# y_train = sentence_df.loc[:122967, 'label']
# print(y_train[y_train.isin(['method'])].shape)

# X_test = sentence_df.loc[:122968, ['sentence', 'article']]
# y_test = sentence_df.loc[:122968, 'label']
# print(y_test[y_test.isin(['method'])].shape)

KeyError: 31999

sklearn way

In [75]:
# sklearn way, that does things randomly
X_train, X_test, y_train, y_test = train_test_split(
    #merged_df[['text', 'article', 'label']], # use when I want doc id'
    merged_df['text'],
    merged_df['label'],
    test_size=0.2,
    stratify = merged_df['label'])

strip all whitespace 

In [70]:
X_train = [re.sub(r'\s+', ' ', doc) for doc in X_train]
X_test = [re.sub(r'\s+', ' ', doc) for doc in X_test]

tokenize with nltk

In [60]:
train_docs = [TaggedDocument(word_tokenize(row), ["{}".format(i)]) for i, row in enumerate(X_train)]
test_docs = [TaggedDocument(word_tokenize(row), ["{}".format(i)]) for i, row in enumerate(X_test)]

using doc2vec. If I want to use unique id for each document, use lines of code that include 'article' key.
Otherwise, use lines that tag docs with label

In [73]:
# using doc pub# as doc id. why does model drop some with this method?
test_docs = [TaggedDocument(gensim.utils.simple_preprocess(row['text']), [row['article']]) for i, row in X_test.iterrows()]
train_docs = [TaggedDocument(gensim.utils.simple_preprocess(row['text']), [row['article']]) for i, row in X_train.iterrows()]
# Here's an alternative way
#tagged = X_train.apply(lambda r: TaggedDocument(words=gensim.utils.simple_preprocess(r['text']), tags=[r['article']]), axis=1)
# use tag.values as input to Doc2Vec

# unique doc id as doc tag
# there is something wird going on with iterrows. model drops docs. However, below method drops no values
# train_docs = [TaggedDocument(gensim.utils.simple_preprocess(row), ["{}".format(i)]) for i, row in enumerate(X_train)]
# test_docs = [TaggedDocument(gensim.utils.simple_preprocess(row), ["{}".format(i)]) for i, row in enumerate(X_test)]

# labels as doc tag
# train_docs = [TaggedDocument(gensim.utils.simple_preprocess(row), [label]) for label, row in zip(y_train, X_train)]
# test_docs = [TaggedDocument(gensim.utils.simple_preprocess(row), [label]) for label, row in zip(y_test, X_test)] 

print(len(train_docs), len(test_docs))

31146 7787


In [67]:
import multiprocessing

In [68]:
multiprocessing.cpu_count()

8

Build doc2vec model

In [74]:
model = Doc2Vec(train_docs, vector_size = 100, epochs = 20, window=10, workers = 5) #min_count=0)min_count=3, sample=1e-4 # window of 10 is used in paper

# if I don't input train_docs to Doc2Vec, I can build vocab and train in separate steps
#model = Doc2Vec(train_docs, epochs = 20, window=100)
#model.build_vocab(train_docs)
#model.train(train_docs, total_examples=model.corpus_count, epochs=model.epochs)

model.docvecs.count # make sure model is same length as training data

14051

save model

In [70]:
model.save("doc2vec_classid.model")

or load new model

In [2]:
#loaded_model = Doc2Vec.load("test_doc2vec_binary.model")
# on monday, 8-6, load model
model = Doc2Vec.load("doc2vec_uniqueid_retrained.model")

In [12]:
model.most_similar([""])

  """Entry point for launching an IPython kernel.


KeyError: "word '1' not in vocabulary"

How good is the model?

do some assesment here https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb

In [71]:
ranks = []
second_ranks = []
for doc_id in range(len(train_docs)):
    inferred_vector = model.infer_vector(train_docs[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])
collections.Counter(ranks)

  if np.issubdtype(vec.dtype, np.int):


ValueError: 0 is not in list

In [None]:
print('Document ({}): {}\n'.format(doc_id, ' '.join(train_docs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_docs[sims[index][0]].words)))

If using unique doc ids, get vectors for sklearn from model, and use infer_vector to generate embeddings for testing

In [155]:
embeddings_train = [model[doc] for doc in range(len(model.docvecs))] # get d
labels_train = y_train # set labels as y_train to keep variables consistent
labels_test, embeddings_test = zip(*[(doc.tags[0], model.infer_vector(doc.words, epochs = 20)) for doc in test_docs])
labels_test = y_test # the labels_test is no good if not using classes as labels

If I have used the class labels, infer vectors for training and val data. 
Or, even if I have used unique ids, use the training data (seen data) to infer vectors anyway. 

In [72]:
# gives labels and vectors, though I already have the labels in y_train
labels_train, embeddings_train = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in train_docs])
labels_test, embeddings_test = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in test_docs])
labels_train = y_train
labels_test = y_test

and then run through sklearn

In [18]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

In [28]:
rskf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 5, random_state = 1)

In [156]:
#sk_model = LinearSVC()
#sk_model = SVC(kernel='linear')
#sk_model = BernoulliNB()
sk_model = LogisticRegression(random_state = 0)
sk_model.fit(embeddings_train, labels_train)

y_predictions = sk_model.predict(embeddings_test)

or try out the one vs rest classifier:

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [81]:
model_ovr = OneVsRestClassifier(LinearSVC(random_state=0)).fit(embeddings_train, labels_train)
y_predictions = model_ovr.predict(embeddings_test)

then score it up!

In [157]:
# print("f1", f1_score(labels_test, y_predictions, average = 'macro'))
# print("recall", recall_score(labels_test, y_predictions, average = 'macro'))
# print("accuracy", accuracy_score(labels_test, y_predictions))
# print("precision", precision_score(labels_test, y_predictions, average = 'macro'))
# print("confusion_matrix\n", confusion_matrix(labels_test, y_predictions))

# binary label
print("f1", f1_score(labels_test, y_predictions, pos_label = 'method', average = "binary"))
print("recall", recall_score(labels_test, y_predictions, pos_label = 'method', average = "binary"))
print("accuracy", accuracy_score(labels_test, y_predictions))
print("precision", precision_score(labels_test, y_predictions, pos_label = 'method', average = "binary"))
print(confusion_matrix(labels_test, y_predictions, labels=['method', 'other']))

f1 0.5984023238925199
recall 0.6163051608077786
accuracy 0.8404730996682532
precision 0.5815102328863797
[[ 824  513]
 [ 593 5003]]


and here a cool thing: a classification report

In [21]:
from sklearn.metrics import classification_report

In [161]:
report = classification_report(labels_test, y_predictions)#, target_names = ['method', 'other'])
print(report)

             precision    recall  f1-score   support

     method       0.58      0.62      0.60      1337
      other       0.91      0.89      0.90      5596

avg / total       0.84      0.84      0.84      6933



check out the misclassified sections

In [33]:
X_test.index

RangeIndex(start=32000, stop=38933, step=1)

In [46]:
np_y_test = np.asarray(labels_test)
misclassified = np.where(np_y_test != y_predictions) # returns indices where condition is true
print(misclassified)

(array([   6,    7,   10,   11,   12,   13,   14,   16,   18,   21,   24,
         26,   28,   30,   31,   32,   33,   34,   35,   36,   40,   41,
         42,   45,   49,   50,   53,   55,   56,   58,   60,   63,   65,
         68,   69,   70,   73,   74,   75,   76,   78,   81,   83,   84,
         87,   88,   89,   90,   91,   92,   94,   96,   99,  101,  102,
        103,  106,  107,  108,  111,  113,  115,  116,  117,  118,  120,
        123,  126,  129,  130,  132,  133,  135,  138,  139,  142,  143,
        144,  145,  147,  151,  155,  156,  160,  162,  163,  165,  166,
        167,  169,  172,  178,  230,  238,  310,  329,  334,  335,  340,
        352,  357,  381,  396,  398,  409,  444,  448,  452,  459,  533,
        549,  652,  739,  763,  767,  768,  773,  774,  775,  778,  779,
        781,  810,  815,  825,  828,  829,  830,  831,  833,  834,  836,
        837,  838,  839,  841,  843,  846,  847,  848,  853,  857,  858,
        861,  863,  865,  867,  868,  872,  873,  

In [62]:
cnt = 0
for index in misclassified[0][500:]:
    cnt += 1
    print(index, "prediction:", y_predictions[index], "true label:", np_y_test[index])
    print(X_test.iloc[index],"\n\n\n")
    if cnt == 20:
        break

2714 prediction: method true label: other
<sec xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" sec-type="results"><title>RESULTS</title><p>The results from the measurements aboard the vessels and from the headspace of the linseed oil are summarized in <xref ref-type="table" rid="tbl1">Table 1</xref>. Particularly notable are the high CO and CO<sub>2</sub> concentrations and the low oxygen levels. No indication of the presence of H<sub>2</sub>S was seen. When the oxygen level from each vessel was plotted as function of the CO level, a high degree of correlation was obtained (<italic>r</italic><sup>2</sup>) = 0.92, see <xref ref-type="fig" rid="fig2">Fig. 2</xref>. Good correlation was also found between CO and butane equivalents (<italic>r</italic><sup>2</sup> = 0.93) and CO<sub>2</sub> and propylene (<italic>r</italic><sup>2</sup> = 0.98).</p><table-wrap id="tbl1" position="float"><label>Table 1.</label><caption><p>Summary of all measurements f

In [50]:
index = 7
print("prediction:", y_predictions[index], "true label:", np_y_test[index])
X_test.iloc[index] # works even though X_test index is between 32000-40000

prediction: other true label: method


'<sec xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" sec-type="" id="sec2"><title>Experimental</title><sec id="sec2.1"><title/><sec id="sec2.1.1"><title>Crystal data</title><p>\n                  <list list-type="simple"><list-item><p>C<sub>22</sub>H<sub>22</sub>N<sub>2</sub>\n                        </p></list-item><list-item><p>\n                           <italic>M</italic>\n                           <italic><sub>r</sub></italic> = 314.42</p></list-item><list-item><p>Trigonal, <inline-formula><inline-graphic xlink:href="e-64-00o78-efi4.jpg" mimetype="image" mime-subtype="gif"/></inline-formula>\n                        </p></list-item><list-item><p>\n                           <italic>a</italic> = 21.173 (8) Å</p></list-item><list-item><p>\n                           <italic>c</italic> = 10.476 (2) Å</p></list-item><list-item><p>\n                           <italic>V</italic> = 4067 (2) Å<sup>3</sup>\n                        </p></list-ite

or use cross validation 

In [29]:
from sklearn.model_selection import cross_validate

In [115]:
sk_model = LogisticRegression(random_state = 0)
scoring = ['f1_macro', 'precision_macro', 'recall_macro', 'accuracy']
scores = cross_validate(sk_model, embeddings_train, labels_train, 
                        scoring=scoring,cv=5, return_train_score=True) # cv =rskf

In [31]:
scores

{'fit_time': array([0.93449569, 1.12037468, 1.04741478, 1.00343919, 1.06040764]),
 'score_time': array([0.11991429, 0.1769011 , 0.16690731, 0.19689083, 0.10893917]),
 'test_f1_macro': array([0.87622222, 0.87481792, 0.87889957, 0.87110543, 0.87286268]),
 'train_f1_macro': array([0.87727107, 0.87789996, 0.87684717, 0.87924096, 0.87814096]),
 'test_precision_macro': array([0.88884046, 0.89740086, 0.89900688, 0.88791116, 0.88719244]),
 'train_precision_macro': array([0.89524   , 0.89488418, 0.89458883, 0.89642344, 0.89599976]),
 'test_recall_macro': array([0.86546091, 0.85742225, 0.86299287, 0.85743794, 0.86088778]),
 'train_recall_macro': array([0.86277175, 0.86405312, 0.86250291, 0.86524919, 0.8637068 ]),
 'test_accuracy': array([0.90974843, 0.91052052, 0.91302296, 0.90688896, 0.90767537]),
 'train_accuracy': array([0.91149294, 0.91177164, 0.91114606, 0.91275801, 0.91208964])}

There is also a sklearn wrapper that I don't know how to use and isn't super well documented.
I'd have to change up how I am using my dfs. See here:
https://github.com/RaRe-Technologies/gensim/blob/master/docs/notebooks/sklearn_api.ipynb
and docs:
https://radimrehurek.com/gensim/sklearn_api/d2vmodel.html

In [60]:
from gensim.sklearn_api import D2VTransformer

In [61]:
model_D2V = D2VTransformer(min_count=1)
#embedding_train = model_D2V.fit(X_train)
# do some more stuff
#embedding_test = model_D2V.transform(X_test)

and then use those doc2vec embeddings with sklearn

In [None]:
rskf = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 5, random_state = 1)

In [None]:
model_svc = LinearSVC()
model_svc.fit(embedding_train, y_train)
y_predictions = model_svc.predict(embedding_test)

how does the model do?

In [None]:
print("f1", f1_score(y_test, y_predictions, average = 'macro'))
print("recall", recall_score(y_test, y_predictions, average = 'macro'))
print("accuracy", accuracy_score(y_test, y_predictions))
print("precision", precision_score(y_test, y_predictions, average = 'macro'))
print("confusion_matrix", confusion_matrix(y_test, y_predictions))