## Website: https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated row by row,
            one row of term weights per topic.
        feature_names: Sequence that maps a column index to its term.
        no_top_words: Number of top-weighted terms to print for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() orders indices by ascending weight, so the reversed tail
        # slice yields the heaviest terms first.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic %d:" % (topic_idx), " ".join(top_terms))

# Load the 20 Newsgroups posts with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Upper bound on the vocabulary size used by both vectorizers.
no_features = 1000

# NMF is able to use tf-idf
# NOTE(review): get_feature_names() is removed in newer scikit-learn releases in
# favour of get_feature_names_out() -- confirm the target version before upgrading.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA needs raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Number of topics extracted by each model.
no_topics = 10

# Run NMF
# NOTE(review): `alpha` is replaced by alpha_W / alpha_H in newer scikit-learn
# releases (with different scaling) -- confirm the target version.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# `n_components` replaces the `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and later removed.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0).fit(tf)

# Number of terms printed per topic.
no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\n')
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)



NMF
Topic 0: don just people think like know time good right ve
Topic 1: card video monitor drivers cards bus vga driver color ram
Topic 2: god jesus bible christ faith believe christian christians church sin
Topic 3: game team year games season players play hockey win player
Topic 4: car new 00 sale 10 price offer condition shipping 20
Topic 5: thanks does know advance mail hi anybody info looking help
Topic 6: windows file use files dos window program using problem running
Topic 7: edu soon cs university com email internet article ftp send
Topic 8: key chip encryption clipper keys government escrow public use algorithm
Topic 9: drive scsi drives hard disk ide controller floppy cd mac


LDA
Topic 0: people gun armenian armenians war turkish states israel said children
Topic 1: government people law mr use president don think right public
Topic 2: space program output entry data nasa use science research build
Topic 3: key car chip used keys bike use bit clipper number
Topic 4: edu fil

In [18]:
# Show the first entry of `documents`.
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

## Bleacher Report articles

In [3]:
import pickle, codecs, json

# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only load these files when they come from a trusted source.
docs_raw_filename = "docs_raw_saturday2"
with open(docs_raw_filename, 'rb') as fp:
    raw_docs = pickle.load(fp)

docs_cleaned_filename = "docs_cleaned_saturday2"
with open(docs_cleaned_filename, 'rb') as fp:
    cleaned_docs = pickle.load(fp)

# The article metadata file holds one JSON object per line.
# Built-in open() is used instead of codecs.open(..., 'rU', ...): the 'U' mode
# flag was removed in Python 3.11, and the codecs reader also breaks lines on
# Unicode separators that may legitimately occur inside a JSON string.
article_dicts_filename = "article_dicts_saturday2"
with open(article_dicts_filename, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline) rather than failing in json.loads.
    all_articles = [json.loads(line) for line in f if line.strip()]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print one line per topic listing its top-weighted terms.

    Args:
        model: Object with a ``components_`` attribute that yields one row of
            term weights per topic when iterated.
        feature_names: Indexable collection giving the term for each column.
        no_top_words: How many terms to list per topic, heaviest first.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Reverse-slice the ascending argsort to rank terms by descending weight.
        ranked = topic.argsort()[:-no_top_words - 1:-1]
        label = "Topic %d:" % (topic_idx)
        print(label, " ".join(feature_names[i] for i in ranked))

# Model the raw article texts loaded into `raw_docs`.
documents = raw_docs

# Upper bound on the vocabulary size used by both vectorizers.
no_features = 1000

# NMF is able to use tf-idf
# NOTE(review): get_feature_names() is removed in newer scikit-learn releases in
# favour of get_feature_names_out() -- confirm the target version before upgrading.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA needs raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Number of topics extracted by each model.
no_topics = 10

# Run NMF
# NOTE(review): `alpha` is replaced by alpha_W / alpha_H in newer scikit-learn
# releases (with different scaling) -- confirm the target version.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# `n_components` replaces the `n_topics` keyword, which was deprecated in
# scikit-learn 0.19 and later removed.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online',
                                learning_offset=50., random_state=0).fit(tf)

# Number of terms printed per topic.
no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)
print('\n')
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)



NMF
Topic 0: points game series celtics rebounds season games rockets assists jazz
Topic 1: nba triple mj 30 way best liangelo chanceraptors classmikal clowning
Topic 2: draft nfl browns lb landry football app wnba journey st
Topic 3: year season said team nba players league like knicks million
Topic 4: james cleveland cavaliers lebron cavs thomas lue trade love nance
Topic 5: warriors curry durant golden state kerr thompson green stephen klay
Topic 6: betting oddsshark odds cover picks games check nba point total
Topic 7: embiid philadelphia 76ers sixers heat miami fultz simmons joel game
Topic 8: tournament ncaa michigan villanova kansas duke state wildcats sg sf
Topic 9: leonard spurs antonio san popovich kawhi aldridge wright wojnarowski meeting


LDA
Topic 0: percent season points year game point shooting nba just ball
Topic 1: game games nba vs conference state warriors round series rockets
Topic 2: million free trade year season team lakers pick deal contract
Topic 3: draft tour

In [11]:
# NOTE(review): the saved output for this cell shows the whole article dict,
# not just its 'title' value -- re-run the cell to refresh the output.
all_articles[0]['title']

{'author': 'Paul Kasabian',
 'description': 'The NBA All-Star Game will have a much different look this year. The traditional Eastern Conference vs. Western Conference format has been scrapped in favor of a playground-style game in which two captains pick sides. Those captains are Golden State Warriors …',
 'publishedAt': '2018-02-18T14:00:00Z',
 'source': {'id': 'bleacher-report', 'name': 'Bleacher Report'},
 'title': 'NBA All-Star Game 2018: Format, TV Schedule, Live Stream, Rosters and Prediction',
 'url': 'https://bleacherreport.com/articles/2760021-nba-all-star-game-2018-format-tv-schedule-live-stream-rosters-and-prediction',
 'urlToImage': 'https://img.bleacherreport.net/img/images/photos/003/724/691/hi-res-b0f424980cc73292ab2c9198c683a567_crop_exact.jpg?w=1200&h=1200&q=75'}

In [9]:
# Show the first entry of `raw_docs`.
raw_docs[0]

"The NBA All-Star Game will have a much different look this year.The traditional Eastern Conference vs. Western Conference format has been scrapped in favor of a playground-style game in which two captains pick sides.Those captains are Golden State Warriors point guard Stephen Curry and Cleveland Cavaliers forward LeBron James, who were given a pool of players to choose from for their respective teams (named Team Stephen and Team LeBron).First, James and Curry each had to pick four players from a pool of eight contenders who were voted as starters prior to the draft. Then, they each selected seven reserves to round out their 12-person teams.Nothing about the game itself has changed: It will be a typical 48-minute NBA contest with four 12-minute quarters.Here's a look at the basic information surrounding the game, including the television schedule and live-stream links. You can also find the rosters, a few notable storylines and a prediction below.\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0