In [12]:
import pandas as pd
import pymysql
from sqlalchemy import create_engine
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
import numpy as np
import pprint

In [4]:
# Fetch the data from the mysql server and throw it into a dataframe
engine = create_engine('mysql+pymysql://newsfuzz:newsfuzzplease@newsfuzz.cuhvcgseshha.eu-west-2.rds.amazonaws.com:3306/newsfuzz', encoding='utf-8')
newsfuzz_db = pd.io.sql.read_sql('SELECT * FROM newsfuzz_db_test', engine, index_col='index')

In [13]:
# Check the data is good!
print(len(newsfuzz_db))
newsfuzz_db.head()
pprint.pprint(np.unique(newsfuzz_db['source_id']))

6533
array(['abc-news-au', 'al-jazeera-english', 'ars-technica',
       'associated-press', 'bbc-news', 'bbc-sport', 'bild', 'bloomberg',
       'breitbart-news', 'business-insider', 'business-insider-uk',
       'buzzfeed', 'cnbc', 'cnn', 'daily-mail', 'engadget',
       'entertainment-weekly', 'espn', 'espn-cric-info', 'financial-times',
       'focus', 'football-italia', 'fortune', 'four-four-two',
       'fox-sports', 'google-news', 'gruenderszene', 'hacker-news', 'ign',
       'independent', 'mashable', 'metro', 'mirror', 'mtv-news',
       'mtv-news-uk', 'national-geographic', 'new-scientist',
       'new-york-magazine', 'newsweek', 'nfl-news', 'polygon', 'recode',
       'reddit-r-all', 'reuters', 'spiegel-online', 't3n', 'talksport',
       'techcrunch', 'techradar', 'the-economist', 'the-guardian-au',
       'the-guardian-uk', 'the-hindu', 'the-huffington-post',
       'the-lad-bible', 'the-new-york-times', 'the-sport-bible',
       'the-telegraph', 'the-times-of-india', 'the-

In [21]:
# Extract guardian and daily mail articles
articles_guard_dm=newsfuzz_db[newsfuzz_db['source_id'].isin(['the-guardian-uk','daily-mail'])] ['article_raw'].tolist()

In [22]:
# Import some standard bits from nltk for language structure
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

In [23]:
# Function to clean raw text based on stope words and punctuation
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [25]:
# Clean the articles and assemble into a list
articles_clean = []
for article in articles_guard_dm:
    articles_clean.append(clean(article).split())

In [26]:
# Push articles into Document Term Matrix and create a dictionary
dictionary = corpora.Dictionary(articles_clean)
doc_term_matrix = [dictionary.doc2bow(article) for article in articles_clean]

In [27]:
# Set up the model
Lda = gensim.models.ldamodel.LdaModel

In [28]:
# Train the model
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary, passes=50)

In [30]:
print(ldamodel.print_topics(num_topics=50, num_words=1))
ldamodel.save('newsapi_lda.model')

[(0, '0.000*"show"'), (1, '0.034*"coach"'), (2, '0.012*"fire"'), (3, '0.000*"show"'), (4, '0.000*"star"'), (5, '0.010*"ussr"'), (6, '0.007*"prompted"'), (7, '0.003*"disgraceâ\x80\x99"'), (8, '0.000*"new"'), (9, '0.038*"may"'), (10, '0.010*"comment"'), (11, '0.000*"share"'), (12, '0.006*"new"'), (13, '0.014*"share"'), (14, '0.010*"u"'), (15, '0.000*"new"'), (16, '0.006*"star"'), (17, '0.000*"star"'), (18, '0.000*"show"'), (19, '0.000*"new"'), (20, '0.003*"booking"'), (21, '0.000*"show"'), (22, '0.000*"new"'), (23, '0.030*"skin"'), (24, '0.005*"steadfast"'), (25, '0.000*"show"'), (26, '0.026*"warhol"'), (27, '0.017*"hammond"'), (28, '0.028*"farron"'), (29, '0.041*"seed"'), (30, '0.038*"court"'), (31, '0.006*"comment"'), (32, '0.043*"drone"'), (33, '0.015*"inflation"'), (34, '0.032*"risk"'), (35, '0.022*"shot"'), (36, '0.021*"antibiotic"'), (37, '0.009*"trump"'), (38, '0.007*"london"'), (39, '0.003*"denouncing"'), (40, '0.024*"khrushchev"'), (41, '0.003*"parliamenthe"'), (42, '0.020*"shar

In [None]:
#  Alternative approach using Scikit learn below!

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [None]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [37]:
documents = articles_guard_dm

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
# display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
star new dress model trump son day old year looks
Topic 1:
edition switch opinion guardian business politics tech email supporter soccer
Topic 2:
tower grenfell block building residents london people floor west man
Topic 3:
bikini new shows reveals star figure ex james island baby
Topic 4:
brexit eu says britain talks theresa published british leave election
Topic 5:
watch mail mailonline police man post trump moment timeline comments
Topic 6:
london bridge attack market police killed victims final days published
Topic 7:
eu deal europe africa asia final figure film filmed filming
Topic 8:
manchester attack city police injured 22 victims attacks concert brother
Topic 9:
air force walking environment london street city final finally filming
Topic 10:
court charlie government appeal parents baby support anti street case
Topic 11:
north korea south president dennis asia kim washington selected east
Topic 12:
sessions russia campaign jeff comey calls meeting general private fbi
To