In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import pyLDAvis.gensim

In [4]:
df = pd.read_csv('/Users/vkolhatk/Data/SOCC/raw/gnm_articles.csv')

In [5]:
df.shape

(10339, 8)

In [6]:
df.columns

Index(['article_id', 'title', 'article_url', 'author', 'published_date',
       'ncomments', 'ntop_level_comments', 'article_text'],
      dtype='object')

In [7]:
doc_complete = df['article_text'].tolist()

In [8]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 5
n_top_words = 20

In [9]:
def print_top_words(model, feature_names, n_top_words):
    topic_id_words_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_id_words_dict[topic_idx] = " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    return topic_id_words_dict                                                      

In [10]:
# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
#dataset = fetch_20newsgroups(shuffle=True, random_state=1,
#                             remove=('headers', 'footers', 'quotes'))
#data_samples = dataset.data[:n_samples]
data_samples = doc_complete
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 0.000s.


In [11]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 4.854s.


In [12]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf features for LDA...
done in 4.689s.



In [13]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
topic_id_words_dict = print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 3.588s.

Topics in NMF model (Frobenius norm):
Topic #0: canada government oil canadian cent health federal tax economic ontario energy canadians alberta care ottawa billion economy new trade provinces
Topic #1: mr party liberals ndp trudeau election quebec harper conservatives leader liberal conservative government ms campaign minister political voters prime vote
Topic #2: city ford mayor toronto mr transit vancouver council said police says street like cities people new montreal public park just
Topic #3: mr trump obama president war russia military state china syria states iran united world russian putin international canada iraq foreign
Topic #4: women people children court life like men police law school don ms just time young world know said work child



In [14]:
print(nmf)

NMF(alpha=0.1, beta=1, eta=0.1, init=None, l1_ratio=0.5, max_iter=200,
  n_components=5, nls_max_iter=2000, random_state=1, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)


In [15]:
doc_topic = nmf.transform(tf)

In [16]:
print(doc_topic[49])

[ 0.14319448  3.03803393  0.          0.          0.        ]


In [17]:
doc_index_topicid_dict = {}
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    #print("doc: {} topic: {}\n".format(n,topic_most_pr))
    doc_index_topicid_dict[doc_complete[n]] = topic_most_pr

In [18]:
print(topic_id_words_dict)

{0: 'canada government oil canadian cent health federal tax economic ontario energy canadians alberta care ottawa billion economy new trade provinces', 1: 'mr party liberals ndp trudeau election quebec harper conservatives leader liberal conservative government ms campaign minister political voters prime vote', 2: 'city ford mayor toronto mr transit vancouver council said police says street like cities people new montreal public park just', 3: 'mr trump obama president war russia military state china syria states iran united world russian putin international canada iraq foreign', 4: 'women people children court life like men police law school don ms just time young world know said work child'}


In [19]:
def get_topic_id(article_text):
    return doc_index_topicid_dict[article_text]

In [21]:
df['topic_id'] = df['article_text'].apply(get_topic_id)

In [22]:
df['topic_words'] = df['topic_id'].apply(lambda x: topic_id_words_dict[x])

In [23]:
df.to_csv('../CF_output/gnm_articles_with_topics.csv', index = False)