# Make local embeddings for each document
- global embeddings meant all articles were clustered together. let's try local

In [12]:
import pickle
import pandas as pd, numpy as np

import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [13]:
### load df

DATA_PATH = './data/cleaned/'
DF_NAME = 'concatenated_df_cleaned_glove.pkl'

with open(DATA_PATH + DF_NAME, 'rb') as f:
    df = pickle.load(f)

In [14]:
class TaggedDocumentIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield TaggedDocument(words=doc, tags=[self.labels_list[idx]])

In [15]:
df.head()

Unnamed: 0,created_at,user,id_str,retweeted_status,Hash words,link,entities,Topic Label,urls,no_link,...,month_year,week_month_year,user_id,cluster,is_retweet,hashtags,link_shorteners,article_text,article_text_cleaned,glove_avg
0,2020-01-31 23:08:33+00:00,"{'id': 19031057, 'id_str': '19031057', 'name':...",1223382589689356288,,#coronavirus,https://twitter.com/user/status/12233825896893...,"{'hashtags': [{'text': 'coronavirus', 'indices...","(43, 0.999852366207409)",[https://www.kron4.com/news/national/when-will...,False,...,2020-01,5,19031057,43,False,[coronavirus],[False],"KRON4 by: Aubree Gordon, University of Michig...","[kron, aubree, gordon, university, michigan, c...","[0.14527764461177867, 1.301592749657575, 0.487..."
1,2020-01-31 23:11:29+00:00,"{'id': 798925214, 'id_str': '798925214', 'name...",1223383328843280384,{'created_at': 'Fri Jan 31 20:41:20 +0000 2020...,#coronavirus #vaccine:,https://twitter.com/user/status/12233833288432...,"{'hashtags': [{'text': 'Coronavirus', 'indices...","(6, 0.9883827140131066)",[https://www.cnbc.com/2020/01/31/coronavirus-w...,False,...,2020-01,5,798925214,6,True,"[Coronavirus, vaccine]",[False],U.S. and international health officials are s...,"[international, health, officials, speeding, w...","[0.4456857605208643, 1.4679776202002541, 0.697..."
2,2020-01-31 23:12:03+00:00,"{'id': 806144538049970176, 'id_str': '80614453...",1223383471999127552,,No hashtags,https://twitter.com/user/status/12233834719991...,"{'hashtags': [], 'symbols': [], 'user_mentions...","(74, 0.9999928079220538)",[https://www.businessinsider.com/australia-suc...,False,...,2020-01,5,806144538049970176,74,False,[None],[False],A leading-edge research firm focused on digit...,"[leading_edge, research, firm_focused, active,...","[-0.012559317511040717, 0.5841067982127425, 0...."
3,2020-01-31 23:35:42+00:00,"{'id': 61298849, 'id_str': '61298849', 'name':...",1223389423045206016,{'created_at': 'Tue Jan 28 18:26:16 +0000 2020...,No hashtags,https://twitter.com/user/status/12233894230452...,"{'hashtags': [], 'symbols': [], 'user_mentions...","(36, 0.9999993854913211)",[https://www.greenmedinfo.com/blog/examining-r...,False,...,2020-01,5,61298849,36,True,[None],[False],Mr. Kennedy is in very safe territory by rep...,"[mr, kennedy, safe, territory, reporting, cdc,...","[1.2950098205753602, 3.404326125156658, 2.0269..."
4,2020-01-31 23:37:23+00:00,"{'id': 1152822375567654912, 'id_str': '1152822...",1223389844719599616,,No hashtags,https://twitter.com/user/status/12233898447195...,"{'hashtags': [], 'symbols': [], 'user_mentions...","(74, 0.9999985797901156)",[https://www.dailymail.co.uk/news/article-7952...,False,...,2020-01,5,1152822375567654912,74,False,[None],[True],By Kylie Stevens and Stephen Gibbs and Nic Wh...,"[kylie, stevens, stephen, gibbs, nic, white, d...","[0.23033713073586115, 4.3884405878634425, 4.87..."


In [28]:
df['article_text_cleaned_string'] = df['article_text_cleaned'].apply(lambda x: ' '.join(x))
docs = list(df['article_text_cleaned_string']) # a list where each document is a string

# labels = list(df['id_str'])
labels = list(range(len(df)))

sentences = TaggedDocumentIterator(docs, labels)

In [29]:
model = Doc2Vec(vector_size=100,
                 window=5,
                 min_count=20,
                 workers=2,
                 epochs=100,
                 dm=1,        #1 = paragraph vector - distributed memory; 0 = dbow
                 seed=42)



In [30]:
# train model
model.build_vocab(sentences)


In [33]:
model.train(sentences,
            total_examples=model.corpus_count,
            epochs=model.epochs, 
            )



In [37]:
MODEL_PATH = './models/'
model_name = 'd2v_pv.model'

model.save(MODEL_PATH + model_name)

In [40]:
model.infer_vector(df['article_text_cleaned'][0]) # one document as a list of strings

array([-3.81713500e-03, -3.02705192e-03,  3.95734236e-03, -1.07510528e-03,
        3.87954619e-03,  3.68858757e-03,  4.76369634e-03,  2.89270538e-03,
       -8.39679968e-04,  3.80937057e-03,  4.13925899e-03,  4.03762190e-03,
        4.31357929e-03, -4.61425120e-03,  2.92414887e-04,  9.13685290e-05,
       -3.65538150e-03, -3.83670162e-03,  2.43769912e-03,  1.88678037e-03,
       -3.34629673e-03,  1.92392373e-03, -4.99365805e-03, -3.61639541e-03,
       -4.57807211e-03,  3.64909600e-03,  1.60619745e-03, -4.83668037e-03,
        4.78581106e-03, -2.20841239e-03,  4.38499672e-04,  2.25511496e-03,
       -4.99070762e-03,  2.69463897e-04,  2.97399762e-04,  4.98123839e-03,
        1.44247664e-03,  3.32286814e-03,  1.67207245e-03, -3.27352365e-03,
       -2.43683346e-03,  3.66754574e-03, -2.53688404e-03,  1.51808723e-03,
       -3.93362110e-03,  3.61599750e-03, -3.65189346e-03, -4.44446364e-03,
        3.59915383e-03,  2.27528391e-03, -3.78898985e-04,  4.33972804e-03,
       -3.10815126e-03, -

In [42]:
df['d2v'] = df['article_text_cleaned'].apply(lambda x: model.infer_vector(x))

In [46]:
### save df

DATA_PATH = './data/cleaned/'
DF_NAME = 'concatenated_df_cleaned_glove_and_d2v.pkl'

with open(DATA_PATH + DF_NAME, 'wb') as f:
    pickle.dump(df, f)