In [1]:
"""Doc2Vec model on left and right articles. I followed along to the IMDB tutorial from gojomo"""
import utils
import pandas as pd
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Gets left and right data files
df1 = pd.read_csv('../../data/all_left_filtered.csv')
df2 = pd.read_csv('../../data/all_right_filtered.csv')

In [3]:
articles1 = list(df1['content'])
articles2 = list(df2['content'])

#Runs the preprocessing method on both articles
left = utils.preprocess(articles1)
right = utils.preprocess(articles2)

In [4]:
def merge(list1, list2): 
    """Merge lists together into tuple"""
    merged_list = tuple(zip(list1, list2))  
    return merged_list 

denial0 = list(df1['denial?'])
denial1 = list(df2['denial?'])

left_tuples = merge(left, denial0)
right_tuples = merge(right, denial1)
all_articles = left_tuples+right_tuples
print(all_articles[0][0][0:10], all_articles[0][1])

['Key', 'event', '1988', 'EPA', 'decided', 'classify', 'oil', 'gas', 'waste', 'non'] 0


In [5]:
%%time

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

# this data object class suffices as a `TaggedDocument` (with `words` and `tags`) 
LabelledDocument = namedtuple('LabelledDocument', 'words tags denial')

alldocs = []

#Converts each article to a TaggedDocument and puts in the alldocs list
for i in range(len(all_articles)):
    words = all_articles[i][0]
    denial = all_articles[i][1]
    tags = [i]
    alldocs.append(LabelledDocument(words, tags, denial))
    
print(alldocs[0])

LabelledDocument(words=['Key', 'event', '1988', 'EPA', 'decided', 'classify', 'oil', 'gas', 'waste', 'non', 'hazardous', 'even', 'though', 'contains', 'dangerous', 'chemicals', 'small', 'town', 'Nordheim', 'Texas', 'residents', 'trying', 'stop', 'commercial', 'oil', 'gas', 'waste', 'facility', 'proposed', 'large', 'plot', 'land', 'less', 'mile', 'away', 'worry', 'Texas', 'wind', 'carry', 'toxic', 'air', 'emissions', 'town', 'across', 'campus', 'local', 'school', 'residents', 'effort', 'hampered', 'U', 'Environmental', 'Protection', 'Agency', 'decision', '1988', 'classify', 'oil', 'gas', 'waste', 'non', 'hazardous', 'even', 'though', 'contains', 'chemicals', 'including', 'benzene', 'known', 'cause', 'health', 'problems', 'industry', 'lobbied', 'hard', 'non', 'hazardous', 'classification', 'arguing', 'cost', 'treating', 'waste', 'hazardous', 'would', 'exorbitant', 'look', 'exemption', 'came', 'recent', 'effort', 'repeal', 'READ', 'Open', 'Pits', 'Offer', 'Cheap', 'Disposal', 'Fracking', 

In [6]:
#Shuffles the doclist
from random import shuffle
doc_list = alldocs[:]  
shuffle(doc_list)

In [7]:
%%time
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

#Trains 3 models with different params
simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05'),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0, 
            epochs=20, workers=cores),
]

for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

Doc2Vec(dbow,d100,n5,mc2,t4) vocabulary scanned & state initialized
Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t4) vocabulary scanned & state initialized
Doc2Vec(dm/c,d100,n5,w5,mc2,t4) vocabulary scanned & state initialized
Wall time: 37.8 s


In [8]:
#Some other models to test
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])

In [9]:
from collections import defaultdict

#Training models
for model in simple_models: 
    print("Training %s" % model)
    %time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

Training Doc2Vec(dbow,d100,n5,mc2,t4)
Wall time: 5min 17s
Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t4)
Wall time: 6min 27s
Training Doc2Vec(dm/c,d100,n5,w5,mc2,t4)
Wall time: 10min 2s


In [10]:
import random

#Gets most similar, somewhat similar and least similar articles compared to a target article
doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))

TARGET (12498): «Fridge freezer left ditch Malcolm Campbell CC SA 2 0 via Wikimedia Commons Guest essay Eric Worrall According Time Magazine forcing everyone switch CFC refrigerants climate friendly refrigerants job creation opportunity One Climate Change Deal Trump Administration Might Back JUSTIN WORLAND February 6 2018 Trump Administration hesitated throw key deal reached 2016 phase pollutant found air conditioners factor climate change part American companies think could huge business opportunity Administration official declined say Monday whether Trump would send Kigali amendment Senate ratification president wants make sure international environment agreement harm U workers said George David Banks White House international energy environment advisor event Hudson Institute Monday president decide support Kigali largely wants create U jobs want global leaders like alway industry says Stephen Yurek CEO Air Conditioning Heating Refrigeration Institute pick products technology pick de

In [11]:
word_models = simple_models[:]

#Saves the models
for i in range(0, len(word_models)):
    model.save("d2v{}.model".format(i+1))
    print("Model Saved")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model Saved


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model Saved


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model Saved


In [12]:
len(alldocs)

model3_vectors = []
model3_labels = []

# all_vectors2 = []
# all_labels2 = []

# all_vectors3 = []
# all_labels3 = []

#Putting vectors in one array, labels in another
for i in range(len(alldocs)):
    model3_vectors.append(simple_models[2].docvecs[i])
    model3_labels.append(alldocs[i].denial)
    
import pickle

with open('model3_vectors', 'wb') as fp:
    pickle.dump(model3_vectors, fp)
    
with open('model3_vectors', 'wb') as fp:
    pickle.dump(model3_labels, fp)
# alldocs[367]
# model['[367]']
# simple_models[0]['[367]']
# alldocs[366]
# similar_doc = simple_models[0].docvecs.most_similar(0)
# similar_doc
# print(alldocs[367])
# simple_models[0]['367']
# simple_models[0][0]

# simple_models[0].docvecs.count
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
# print(model.docvecs[0])

In [13]:
# all_labels1[0]
#Training rf
# utils.train_rf(all_vectors1, all_labels1)

In [14]:
utils.train_svm(model3_vectors, model3_labels, list)

Fitting 3 folds for each of 75 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 44.4min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed: 94.8min finished


{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Model accuracy on test set: 86.14521220624343


OSError: [Errno 22] Invalid argument: "best_svm_<class 'list'>.pb"