In [1]:
"""Doc2Vec model on left and right articles. I followed along to the IMDB tutorial from gojomo"""
import utils
import pandas as pd
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the filtered "left" and "right" article tables into DataFrames
LEFT_CSV = '../../data/all_left_filtered.csv'
RIGHT_CSV = '../../data/all_right_filtered.csv'
df1 = pd.read_csv(LEFT_CSV)
df2 = pd.read_csv(RIGHT_CSV)

In [3]:
# Pull the `content` column of each frame out as a plain Python list
articles1, articles2 = list(df1['content']), list(df2['content'])

# Run the shared preprocessing helper over each side (left first, then right)
left, right = utils.preprocess(articles1), utils.preprocess(articles2)

In [4]:
def merge(list1, list2):
    """Pair up the items of two sequences position by position.

    Returns a tuple of ``(item_from_list1, item_from_list2)`` pairs. Pairing
    stops at the end of the shorter input, so extra items in the longer one
    are dropped.
    """
    return tuple((first, second) for first, second in zip(list1, list2))

# `denial?` column of each frame, as plain lists
denial0, denial1 = list(df1['denial?']), list(df2['denial?'])

# Pair every preprocessed article with its label, then concatenate left + right
left_tuples, right_tuples = merge(left, denial0), merge(right, denial1)
all_articles = left_tuples + right_tuples

# Sanity check: first ten tokens and the label of the first record
first_words, first_label = all_articles[0]
print(first_words[0:10], first_label)

['Key', 'event', '1988', 'EPA', 'decided', 'classify', 'oil', 'gas', 'waste', 'non'] 0


In [5]:
%%time

import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple

# this data object class suffices as a `TaggedDocument` (with `words` and `tags`) 
LabelledDocument = namedtuple('LabelledDocument', 'words tags denial')

alldocs = []

#Converts each article to a TaggedDocument and puts in the alldocs list
for i in range(len(all_articles)):
    words = all_articles[i][0]
    denial = all_articles[i][1]
    tags = [i]
    alldocs.append(LabelledDocument(words, tags, denial))
    
print(alldocs[0])

LabelledDocument(words=['Key', 'event', '1988', 'EPA', 'decided', 'classify', 'oil', 'gas', 'waste', 'non', 'hazardous', 'even', 'though', 'contains', 'dangerous', 'chemicals', 'small', 'town', 'Nordheim', 'Texas', 'residents', 'trying', 'stop', 'commercial', 'oil', 'gas', 'waste', 'facility', 'proposed', 'large', 'plot', 'land', 'less', 'mile', 'away', 'worry', 'Texas', 'wind', 'carry', 'toxic', 'air', 'emissions', 'town', 'across', 'campus', 'local', 'school', 'residents', 'effort', 'hampered', 'U', 'Environmental', 'Protection', 'Agency', 'decision', '1988', 'classify', 'oil', 'gas', 'waste', 'non', 'hazardous', 'even', 'though', 'contains', 'chemicals', 'including', 'benzene', 'known', 'cause', 'health', 'problems', 'industry', 'lobbied', 'hard', 'non', 'hazardous', 'classification', 'arguing', 'cost', 'treating', 'waste', 'hazardous', 'would', 'exorbitant', 'look', 'exemption', 'came', 'recent', 'effort', 'repeal', 'READ', 'Open', 'Pits', 'Offer', 'Cheap', 'Disposal', 'Fracking', 

In [6]:
#Shuffles the doclist
from random import Random, shuffle

# Fixed seed so the shuffled order is identical on every run of the notebook
SHUFFLE_SEED = 42

# Shuffle a copy: `alldocs` itself must stay in tag order because later cells
# look documents up by position (alldocs[doc_id])
doc_list = alldocs[:]
# A dedicated Random instance avoids reseeding the module-level generator
Random(SHUFFLE_SEED).shuffle(doc_list)

In [7]:
%%time
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

#Creates model
# PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
model = Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0,
                epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05')

model.build_vocab(alldocs)
print("%s vocabulary scanned & state initialized" % model)

Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t4) vocabulary scanned & state initialized
Wall time: 14.5 s


In [8]:
print("Training %s" % model)
%time model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

Training Doc2Vec("alpha=0.05",dm/m,d100,n5,w10,mc2,t4)
Wall time: 8min 46s


In [12]:
import random

# Compare one randomly chosen target article against every other article and
# show the most, median and least similar ones. Re-run the cell for more examples.
doc_id = np.random.randint(model.docvecs.count)
# Asking for topn == total document count returns the full similarity ranking
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)

print(u'TARGET (%d): «%s»\n' % (doc_id, ' '.join(alldocs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)

# Positions in the ranking to display: best match, middle, worst match
rank_positions = [
    ('MOST', 0),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims) - 1),
]
for label, index in rank_positions:
    hit = sims[index]  # (document tag, similarity) pair
    hit_text = ' '.join(alldocs[hit[0]].words)
    print(u'%s %s: «%s»\n' % (label, hit, hit_text))

TARGET (1100): «Seventeen countries home 25 global population facing extremely high water stress according World Resources Institute research non profit Several drought stricken places around world experienced water crises recent years populated cities like Cape Town Sao Paolo Chennai inching toward Day Zero day taps run dry water longer available 17 countries facing water risk India Middle East North Africa agriculture industry municipalities sucking 80 available surface groundwater every year according WRI Aqueduct Water Risk Atlas tool ranks water stress drought risk riverine flood risk across 189 countries means even small dry shocks nonprofit says likely increase due climate change produce dire consequences Water stress biggest crisis one talking WRI president CEO Dr Andrew Steer said statement consequences plain sight form food insecurity conflict migration financial instability water stressed regions Earth Middle East North Africa also known MENA region home 12 17 risk countries

In [13]:
#Saves model for later
# The filename contains no format placeholders, so it is passed as a plain
# literal; the file is written relative to the current working directory.
model.save("d2v.model")
print("Model Saved")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model Saved


In [14]:
import pickle

# One trained vector per document, looked up by integer tag in `alldocs` order
model_vectors = [model.docvecs[idx] for idx in range(len(alldocs))]
# The matching `denial` labels, in the same order as the vectors
model_labels = [doc.denial for doc in alldocs]

# Write each list to its own pickle file (vectors first, then labels)
for filename, payload in (('model_vectors', model_vectors),
                          ('model_labels', model_labels)):
    with open(filename, 'wb') as fp:
        pickle.dump(payload, fp)