#### Have put together initial word2vec models on politics and business clusters since 1981, with all adjectives and adverbs marked.  Still searching for interesting topics for comparison over the years.

In [1]:
import pandas as pd
import numpy as np
import calendar
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from time import time
import spacy
from gensim.models import Word2Vec
import gensim, multiprocessing
import pickle


def preprocess_df(df_raw):
    '''preprocesses text, returning only relevant components for remaining analysis'''
    df_raw['body'] = df_raw['text_body'].apply(join_strs)
    df_raw['len_body'] = df_raw.apply(lambda row: len(row['body']), axis = 1)
    print(df_raw.shape[0]-df_raw[df_raw['len_body']>0].shape[0],'lost records of', df_raw.shape[0])
    df_raw=df_raw[df_raw['len_body']>0]
    df_raw = df_raw.reset_index()
    df_raw['pub_year'] = df_raw.apply(lambda row: row['date'].year, axis=1)
    df_raw = df_raw.drop(['index','date','text_body','title','len_body'], axis=1)
    return df_raw


def nmf_cluster(df, vectorizer_feats = 1000, n_topics = 5):
    '''clusters articles in to different groupings, uses TFIDF vectorizer by default
    returns labeled dataframe'''
    df = df.rename(columns={'index':'original_index'})
    df = df.reset_index()
    df= df.drop(['index'], axis = 1)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=2,
                               max_features=vectorizer_feats,
                               stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(df['body'])
    nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
    topic_score = pd.DataFrame(nmf.fit_transform(tfidf))
    topic_score['categ'] = topic_score[[n for n in range(n_topics)]].idxmax(axis=1)
    topic_score['categ_score'] = topic_score[[n for n in range(n_topics)]].max(axis=1)
    topic_score['categ_2score'] = pd.DataFrame(np.sort(topic_score[[n for n in range(n_topics)]].values)[:,-2:], \
                                           columns=['2nd-largest','largest'])['2nd-largest']
    topic_score['topic_diff'] = topic_score.apply(lambda row: row['categ_score'] - row['categ_2score'], axis = 1)
    df['categ_index'] = topic_score['categ']
    topic_keys = get_clusters(model = nmf, feature_names = tfidf_vectorizer.get_feature_names(), keywords = keywords, n_topics = n_topics)
    df["categ"] = df.apply(lambda row: topic_keys[row.categ_index] , axis=1)
    df = df.drop(['categ_index'], axis =1)
    df = df[df["categ"]!='NAP']
    return(df, nmf, tfidf)


keywords = {"politics": [ 'administration', 'al', 'american', 'budget', 'bush', 'campaign', 'chairman', 'clinton',\
                         'committee', 'congress', 'country', 'county', 'court', 'democrat', 'democratic',\
                         'democrats', 'department', 'election', 'federal', 'forces', 'gore', 'government',\
                         'governor', 'house', 'intelligence', 'iran', 'iraq', 'islamic', 'judge', 'law',\
                         'mccain', 'military', 'mondale', 'obama', 'officers', 'official', 'officials',\
                         'party', 'police', 'political', 'president', 'reagan', 'republican', 'republicans',\
                         'russia', 'russian', 'security', 'senate', 'senator', 'spending', 'state', 'states',\
                         'syria', 'tax', 'today', 'united', 'vote', 'war', 'white'],
            "business": ["company","companies","market","billion","million","banks","financial", "finance",\
                         "economy", "industry"]}


def get_clusters(model, feature_names, keywords, threshold = 5, n_top_words = 30, n_topics = 5):
    '''Assigns topic types based on keywords provided'''
    topic_key = {m:'NAP' for m in range(n_topics)}
    for topic_idx, topic in enumerate(model.components_):
        topic_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for key in keywords.keys():
            overlap = len(set(keywords[key]) & set(topic_words))
            if overlap>threshold:
                topic_key[topic_idx] = key
    return topic_key    
    
tags = ["JJ","JJR","JJS","RB","RBR","RBS"]

def marker(sentences, to_tag):
    ''' Adds "markedword" to all adjectives/adverbs, for easy reference later '''
    words_final = []
    for sentence in sent_tokenize(sentences):
        for word in pos_tag(word_tokenize(sentence)):
            if word[1] in to_tag:
                words_final.append(word[0]+word[1]+"markedword")
            else:
                words_final.append(word[0])
    return ' '.join(words_final)

def preprocess_text(df, tags = tags):
    '''Tag all words with adjectives/adverbs to find descriptors '''
    t0 = time()
    df['marked_text'] = df['body'].apply(marker, args=(tags,))
    df = df.drop(['url','body','pub_year'],axis=1)
    return df

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print("")
    
def word2vec_models(df):
    '''Takes all text from datafram and returns word2vec model for each topic'''
    final_models = {}
    categs = df.categ.unique()
    for categ in categs:
        input_string = u''
        temp_df = df[df['categ']==categ]
        for m in range(temp_df.shape[0]):
            input_string += temp_df.loc[temp_df.index[m]].marked_text
        nlp = spacy.load('en')
        processed_text = nlp(input_string)
        sentences = [s for s in processed_text.sents]
        processed_sentences = [sent.lemma_.split() for sent in processed_text.sents]
        attributes_model = Word2Vec(
                sentences=processed_sentences,
                workers=multiprocessing.cpu_count() - 1, # use all cores
                window=50, sg=1)
        final_models[categ] = attributes_model
        print("done with %s categ in %0.3fs." % (categ, time() - t0))
        del input_string, temp_df, sentences, processed_text, nlp, processed_sentences
    return final_models

Using Theano backend.


## Read in all data, and merge articles into one dataframe

In [3]:
df0 = pd.read_json('data/full_articles/1981_1995_articles.json')
df1 = pd.read_json('data/full_articles/1996_1999_articles.json')
df2 = pd.read_json('data/full_articles/2000_2009_articles.json')
df3 = pd.read_json('data/full_articles/2010_2015_articles.json')
df4 = pd.read_json('data/full_articles/2007_2016_2017_articles.json')

In [18]:
months = dict((v,k) for k,v in enumerate(calendar.month_abbr))

def date_convert(text):
    text = text.split(('Published: '))[1]
    month = months[text[0:3]]
    day = int(text.split(',')[0].split(' ')[1])
    year = int(text[-4:])
    return pd.Timestamp(datetime.datetime(year, month,day))

df0['date'] = df0['date'].apply(date_convert)


In [17]:
df = df0.append(df1, ignore_index=True).append(df2, ignore_index=True).append(df3, ignore_index=True).append(df4, ignore_index=True)

In [19]:
t0 = time()
df_final = preprocess_df(df)
print("done in %0.3fs." % (time() - t0))

(11782, 'lost records of', 331362)
done in 68.470s.


In [20]:
del df
df_final = df_final.dropna()
with open('data/pickles/model/df_final.pkl', 'wb') as f:
        pickle.dump(df_final, f)

In [19]:
with open('data/pickles/model/df_final.pkl', 'rb') as f:
        df_final = pickle.load(f)

In [22]:
df_final.head()

Unnamed: 0,url,body,pub_year
0,https://www.nytimes.com/1981/01/03/us/snowfall...,\n \nThe ski industry in the eastern half of t...,1981.0
1,https://www.nytimes.com/1981/01/03/nyregion/ma...,"\nMore than two-thirds of the nightclubs, disc...",1981.0
2,https://www.nytimes.com/1981/01/03/business/pr...,"\nBanks across the country, reversing a patter...",1981.0
3,https://www.nytimes.com/1981/01/03/world/hundr...,\n \nDespite a recent effort to create a fair ...,1981.0
4,https://www.nytimes.com/1981/01/03/us/gop-is-a...,"\n \nUsing campaign contributions, computer pr...",1981.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


## Cycle through each time period, number of topics; return an nmf, dataframe, and TFIDF Vectorizer 

In [23]:
intervals = [(m, m+3) for m in range(1981, 2017, 4)]

In [38]:
t0 = time()

nmf_models = {}
for interval in intervals:
    curr_df = df_final[(df_final['pub_year']>=interval[0]) & (df_final['pub_year']<=interval[1])]
    df, nmf, tfidf = nmf_cluster(curr_df, n_topics = 5)
    nmf_models[interval] = (df, nmf, tfidf)
    print(topic_keys)
    print(interval, "done in %0.3fs." % (time() - t0))

categ
((1981, 1984), 'done in 41.525s.')
categ
((1985, 1988), 'done in 84.723s.')
categ
((1989, 1992), 'done in 127.450s.')
categ
((1993, 1996), 'done in 173.848s.')
categ
((1997, 2000), 'done in 249.855s.')
categ
((2001, 2004), 'done in 337.796s.')
categ
((2005, 2008), 'done in 408.877s.')
categ
((2009, 2012), 'done in 459.560s.')
categ
((2013, 2016), 'done in 502.151s.')


In [44]:
with open('data/pickles/model/nmf/nmf_5topics.pkl', 'wb') as f:
        pickle.dump(nmf_models, f)

## Topic Summaries

In [45]:
with open('data/pickles/model/nmf/nmf_5topics.pkl', 'rb') as f:
        nmf_models = pickle.load(f)

In [46]:
for year in range(1981, 2014, 4):
    print((year, year+3))
    print_top_words(nmf_models[(year,year+3)][1], nmf_models[(year,year+3)][2].get_feature_names(), 30)
    print ('--------------------------------')

(1981, 1984)
Topic #0:
tax reagan budget house senate a1 president administration state billion congress republican spending federal democrats committee senator cuts taxes new fiscal white year deficit program democratic republicans percent programs economic

Topic #1:
percent company million market oil prices stock companies rates sales billion year quarter d1 bank rate rose price industry banks analysts shares new business corporation earnings share trading fell economy

Topic #2:
new city york people dr street game school years like year time 000 old season day just miss home building work children center state avenue park play mrs team police

Topic #3:
soviet united states israel american military israeli lebanon reagan union officials government administration beirut minister forces lebanese president missiles foreign a1 arms nuclear talks official nations west moscow troops today

Topic #4:
mr mondale company county party court state campaign judge president case investigation c

## Create Word2Vec Models Based on Dataframes

In [45]:
models1={}
for interval in intervals[0:3]:
    t0 = time()
    df_og = nmf_models[interval][0]
    df_curr = preprocess_text(df_og)
    models = word2vec_models(df_curr)
    models1[interval] = models
    print(interval, "done in %0.3fs." % (time() - t0))
    
models2={}
for interval in intervals[3:6]:
    t0 = time()
    df_og = nmf_models[interval][0]
    df_curr = preprocess_text(df_og)
    models = word2vec_models(df_curr)
    models2[interval] = models
    print(interval, "done in %0.3fs." % (time() - t0))

models3={}
for interval in intervals[6:9]:
    t0 = time()
    df_og = nmf_models[interval][0]
    df_curr = preprocess_text(df_og)
    models = word2vec_models(df_curr)
    models3[interval] = models
    print(interval, "done in %0.3fs." % (time() - t0))

models = {}
for key in models1.keys():
    models[key] = models1[key]

for key in models2.keys():
    models[key] = models2[key]

for key in models3.keys():
    models[key] = models3[key]    

done with business categ in 847.986s.
done with politics categ in 1489.791s.
((1981, 1984), 'done in 1490.050s.')
done with business categ in 1027.041s.
done with politics categ in 1591.042s.
((1985, 1988), 'done in 1591.246s.')
done with business categ in 982.018s.
done with politics categ in 1475.091s.
((1989, 1992), 'done in 1475.315s.')
done with politics categ in 1132.047s.
done with business categ in 1551.971s.
((1993, 1996), 'done in 1552.182s.')
done with politics categ in 1720.795s.
done with business categ in 2431.942s.
((1997, 2000), 'done in 2432.213s.')
done with business categ in 1659.723s.
done with politics categ in 2795.704s.
((2001, 2004), 'done in 2796.120s.')
done with business categ in 1617.136s.
done with politics categ in 2358.714s.
((2005, 2008), 'done in 2359.030s.')
done with business categ in 1353.609s.
done with politics categ in 1726.139s.
((2009, 2012), 'done in 1726.342s.')
done with business categ in 1063.195s.
done with politics categ in 1534.869s.
((20

In [3]:
with open('../data/pickles/model/word2vec_8.15.pkl', 'rb') as f:
        word2vec = pickle.load(f)

In [4]:
sorted(word2vec[(1981,1984)].keys())

def descibers_over_time_all(dict_models, categ, word):
    for key in sorted(dict_models.keys()):
        print(key)
        for w,sim in dict_models[key][categ].most_similar(word,topn=10):
            if (w[-10:]=="markedword"):
                print((w[:-12], sim))
            else:
                print((w, sim))
        print("------------")
        
def descibers_over_time(dict_models, categ, word):
    for key in sorted(dict_models.keys()):
        print(key)
        for w,sim in dict_models[key][categ].most_similar(word,topn=150):
            if (w[-10:]=="markedword"):
                print((w[:-12], sim))
        print("------------")

In [5]:
descibers_over_time(word2vec, 'politics', 'iran')

(1981, 1984)
(u'iranian', 0.9057307243347168)
(u'persian', 0.7032240629196167)
(u'unimaginable', 0.6348543167114258)
(u'islamic', 0.6282001733779907)
(u'shadowy', 0.614230751991272)
(u'algerian', 0.6050736308097839)
(u'arab', 0.574299693107605)
(u'gulf', 0.5737534165382385)
(u'frozen', 0.57230544090271)
(u'debatable', 0.5679643154144287)
(u'implacable', 0.5642850399017334)
(u'50-mile', 0.5468552112579346)
(u'insane', 0.5467159152030945)
(u'feeble', 0.5325487852096558)
(u'14-month', 0.530278205871582)
(u'israel', 0.5294326543807983)
(u'producing', 0.5207966566085815)
(u'fertile', 0.519006609916687)
(u'unambiguous', 0.5158731341362)
(u'qaddafi', 0.5152394771575928)
(u'undefined', 0.5118025541305542)
(u'imaginative', 0.5096311569213867)
------------
(1985, 1988)
(u'iranian', 0.7790703773498535)
(u'persian', 0.6299331188201904)
(u'hostages', 0.5693492293357849)
(u'iran', 0.5681303143501282)
(u'denial', 0.5594326257705688)
(u'gulf', 0.5429402589797974)
(u'diplomatically', 0.5420953035354614