In [1]:
import multiprocessing
import pickle
from time import time

import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec

In [3]:
# Load the categorised articles and keep only the politics ones.
# The read has to be live: with it commented out, `df` does not exist on a
# fresh kernel and this cell only works through leftover state.
# NOTE(review): the path comes from the previously commented-out line —
# confirm it is the file that carries the `years` / `categ` columns.
articles_raw = pd.read_csv('data/articles_categs.csv', index_col=0)
# Filter from the raw frame (not from `df` itself) so re-running is idempotent.
df = articles_raw[articles_raw['categ'] == 'politics']

In [4]:
# Non-null entry count of every column within each `years` interval.
df.groupby('years').count()

Unnamed: 0_level_0,text_final,url,year,categ
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1981 - 1984,4744,4744,4744,4744
1985 - 1988,4870,4870,4870,4870
1989 - 1992,3555,3555,3555,3555
1993 - 1996,5969,5969,5969,5969
1997 - 2000,4417,4417,4417,4417
2001 - 2004,4811,4811,4811,4811
2005 - 2008,3632,3632,3632,3632
2009 - 2012,3175,3175,3175,3175
2013 - 2016,1173,1173,1173,1173


In [5]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,years,text_final,url,year,categ
0,1981 - 1984,Senate Republican leader tell White House offi...,https://www.nytimes.com/1981/10/17/us/gop-sena...,1981,politics
1,1981 - 1984,House Senate conferee today kill Senate - appr...,https://www.nytimes.com/1981/07/22/us/conferee...,1981,politics
2,1981 - 1984,"Senator Bob Dole , chairman tax - write Senate...",https://www.nytimes.com/1983/02/18/business/do...,1983,politics
3,1981 - 1984,1981 tax act draw fire provision allow company...,https://www.nytimes.com/1982/09/10/business/th...,1982,politics
4,1981 - 1984,Reagan Administration today forward 20-year_PO...,https://www.nytimes.com/1982/01/29/us/revampin...,1982,politics


In [6]:
def preprocess_text(df, window=10):
    '''Tokenise the articles of every ``years`` interval into sentences.

    Each ``text_final`` entry is run through the spaCy ``en_core_web_sm``
    pipeline and split into sentences. Tokens tagged ``PUNCT`` are dropped and
    only sentences with more than two remaining tokens are kept.

    Args:
        df: DataFrame with a ``years`` column (interval label) and a
            ``text_final`` column (article text).
        window: Unused here; kept so existing callers that pass it still work.

    Returns:
        dict mapping each distinct ``years`` value, in order of first
        appearance, to a list of sentences; each sentence is a list of token
        strings.
    '''
    final_text = {}
    nlp = spacy.load("en_core_web_sm")
    for interval in df['years'].unique():
        t0 = time()
        interval_texts = df.loc[df['years'] == interval, 'text_final']
        final_sentences = []
        # nlp.pipe streams the documents through the pipeline in batches,
        # which avoids the per-call overhead of nlp(text) on every row.
        for doc in nlp.pipe(interval_texts):
            for sent in doc.sents:
                tokens = [token.text for token in sent if token.pos_ != 'PUNCT']
                if len(tokens) > 2:
                    final_sentences.append(tokens)
        # Store the result once per interval rather than on every row.
        final_text[interval] = final_sentences
        print("done with %s categ in %0.3fs." % (interval, time() - t0))
    return final_text

In [7]:
# Tokenise every article, grouped by `years` interval. Slow: the spaCy
# pipeline runs over each article's full text.
processed_text = preprocess_text(df)

done with 1981 - 1984 categ in 432.009s.
done with 1989 - 1992 categ in 364.067s.
done with 1993 - 1996 categ in 728.934s.
done with 2009 - 2012 categ in 354.069s.
done with 2013 - 2016 categ in 141.184s.
done with 1997 - 2000 categ in 512.403s.
done with 2005 - 2008 categ in 417.873s.
done with 1985 - 1988 categ in 480.198s.
done with 2001 - 2004 categ in 541.974s.


In [8]:
# Persist the tokenised sentences so the slow spaCy pass need not be repeated.
# (`pickle` is imported in the notebook's top import cell.)
with open('data/wordvec_text.pickle', 'wb') as f:
    pickle.dump(processed_text, f)


In [9]:
def word2vec_models(final_text, window=10):
    '''Train one skip-gram Word2Vec model per interval.

    Args:
        final_text: dict mapping an interval label to its tokenised sentences
            (a list of lists of token strings).
        window: Context window size passed to ``Word2Vec``.

    Returns:
        dict mapping each interval label to the ``Word2Vec`` model trained on
        that interval's sentences.
    '''
    # Leave one core free, but never drop below one worker: on a single-core
    # host ``cpu_count() - 1`` would be 0, which is not a usable worker count.
    n_workers = max(1, multiprocessing.cpu_count() - 1)
    final_models = {}
    for interval, text in final_text.items():
        t0 = time()
        final_models[interval] = Word2Vec(sentences=text,
                                          workers=n_workers,
                                          window=window, sg=1)
        print("done with %s categ in %0.3fs." % (interval, time() - t0))
    return final_models

In [10]:
# Train one Word2Vec model per `years` interval from the tokenised sentences.
models = word2vec_models(processed_text)

done with 1981 - 1984 categ in 23.066s.
done with 1989 - 1992 categ in 19.664s.
done with 1993 - 1996 categ in 35.089s.
done with 2009 - 2012 categ in 18.405s.
done with 2013 - 2016 categ in 6.610s.
done with 1997 - 2000 categ in 27.689s.
done with 2005 - 2008 categ in 22.756s.
done with 1985 - 1988 categ in 25.174s.
done with 2001 - 2004 categ in 29.842s.


In [13]:
# Write the per-interval Word2Vec models to disk as a single pickle.
with open('data/wordvec_models.pickle', 'wb') as models_file:
    pickle.dump(models, models_file)