---
# <center> Topic Modeling with LDA and NMF </center>
---

## 1. Imports and data

In [None]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from nltk.stem.porter import PorterStemmer
import re

We'll be using a dataset of articles from a newspaper in Kenya, "The Daily Nation".

In [None]:
# path to the articles CSV, relative to the current working directory
fileName = "../../data/NationMediaArticles.csv"
news = pd.read_csv(fileName)
# preview the first rows of the loaded table
news.head()

## 2. Prepare the data

Stem or lemmatize the text, and remove punctuation, digits and other non-alphabetic characters.

First we'll declare some helper functions.

In [None]:
# shared instances: lm is used by lemmatize(), ps by clean_string()
lm = WordNetLemmatizer()
ps = PorterStemmer()

def lemmatize(word):
    """
    Returns the lemma of a single word
    """
    # tag the word on its own; the lemmatizer needs a part of speech
    tag = nltk.pos_tag([word])[0][1]

    # the first letter of the tag selects the wordnet POS; anything else is a noun
    wn_by_initial = {"V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    pos_wn = wn_by_initial.get(tag[:1], wordnet.NOUN)

    return lm.lemmatize(word, pos = pos_wn)


def clean_string(string, lemmas = True):
    """
    Keeps only ASCII letters, lowercases, then lemmatizes or stems every word

    string: the text to clean; null values (None / NaN) give "" and any other
            non-string value is converted with str() first
    lemmas: True to lemmatize each word with lemmatize(), False to stem it
            with the Porter stemmer (ps)
    returns: the processed words joined by single spaces
    """
    if pd.isnull(string):
        return ""
    # a non-null value is not necessarily text (e.g. a number); re.sub would raise on it
    if not isinstance(string, str):
        string = str(string)
    # replace every run of characters other than A-Z / a-z (digits included) with a space
    letters_only = re.sub(r'[^A-Za-z]+', ' ', string)
    words = letters_only.lower().split()
    # lemmatize or stem each word
    if lemmas:
        words = [lemmatize(x) for x in words]
    else:
        words = [ps.stem(x) for x in words]
    return " ".join(words)

Now, clean the data.

`lemmas = False` will produce stems. You can pass `lemmas = True` to have `clean_string()` produce lemmas instead of stems, but this is a slower process.

In [None]:
# clean the headlines; lemmas = False makes clean_string() stem instead of lemmatize
title_cleaned = [clean_string(x, lemmas = False) for x in news.headline]

In [None]:
# inspect the first five cleaned headlines
title_cleaned[:5]

## 3. Create term-document matrices

Create term-document matrices from the documents. We'll use two types of weights: tf-idf and raw term counts (bag-of-words).

In [None]:
# use tf-idf and bag-of-words representations
# max_df = 0.95 drops terms found in more than 95% of the documents,
# min_df = 2 drops terms found in fewer than two documents

max_features = 10000

# tf-idf
tf = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = max_features, stop_words = 'english')
features_tfidf = tf.fit_transform(title_cleaned)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() (1.0+) replaces it
feature_names_tfidf = tf.get_feature_names_out()

# bag-of-words
bow = CountVectorizer(max_df = 0.95, min_df = 2, max_features = max_features, stop_words = 'english')
features_bow = bow.fit_transform(title_cleaned)
feature_names_bow = bow.get_feature_names_out()

## 4. Train the models

We'll train two types of topic models: Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF). Both serve the same purpose (discovering topics in the documents), but while LDA uses a probabilistic approach, NMF uses linear algebra. NMF may also produce more meaningful topics on smaller datasets.

In [None]:
# specify the number of topics in the documents
topics_count = 5

# train an NMF model
# NMF's single `alpha` argument was removed in scikit-learn 1.2. Its replacements
# alpha_W / alpha_H (1.0+) are multiplied by n_features / n_samples in the objective,
# so the former alpha = 0.1 is divided by those sizes to keep the same penalty.
nmf = NMF(n_components = topics_count, random_state = 0, alpha_W = 0.1 / features_bow.shape[1], alpha_H = 0.1 / features_bow.shape[0], l1_ratio = 0.5, init = "nndsvd")
nmf.fit(features_bow)

# train an LDA model
# NOTE(review): LDA models word counts and is usually fit on the bag-of-words
# matrix rather than tf-idf; confirm this pairing is intended
lda = LatentDirichletAllocation(n_components = topics_count, max_iter = 5, random_state = 0, learning_method = "online")
lda.fit(features_tfidf)

## 5. Display topics

We'll use a helper function that displays the top terms associated with each topic.

In [None]:
# display topics

def show_topics(model, feature_names, top_words = 10):
    """
    Prints the model's class name, then the highest-weighted terms of each topic
    """
    model_name = model.__class__.__name__
    print("Model: %s" % model_name)
    for topic_id, weights in enumerate(model.components_):
        # argsort is ascending, so read it backwards from the end to get
        # the top_words largest weights first
        ranked = weights.argsort()[:-top_words - 1: -1]
        terms = [feature_names[idx] for idx in ranked]
        print("Topic %i\n%r\n" % (topic_id, ", ".join(terms)))

In [None]:
# print the topics obtained from the NMF model (fit on the bag-of-words features)
show_topics(nmf, feature_names_bow)

In [None]:
# print the topics obtained from the LDA model (fit on the tf-idf features)
show_topics(lda, feature_names_tfidf)

## 6. Exercise. Train a topic model on the BBC news dataset

Use a dataset of news from the BBC to train your own topic model: LDA or NMF, or both. The news items in the BBC dataset are grouped into five categories:
* business
* tech
* entertainment
* sport
* politics

The dataset is in `'../../data/bbc.csv'`.

In [None]:
# Your code here
# load the BBC news CSV (path is relative to the current working directory)
fileName = "../../data/bbc.csv"
bbc = pd.read_csv(fileName)
# preview the first rows of the loaded table
bbc.head()

In [None]:
# clean the `content` column; lemmas = False makes clean_string() stem instead of lemmatize
content_cleaned = [clean_string(x, lemmas = False) for x in bbc.content]
# inspect the first five cleaned items
content_cleaned[:5]

In [None]:
# create term-document matrices
# max_df = 0.95 drops terms found in more than 95% of the documents,
# min_df = 2 drops terms found in fewer than two documents

max_features = 10000

# tf-idf
tf = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = max_features, stop_words = 'english')
features_tfidf = tf.fit_transform(content_cleaned)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() (1.0+) replaces it
feature_names_tfidf = tf.get_feature_names_out()

# bag-of-words
bow = CountVectorizer(max_df = 0.95, min_df = 2, max_features = max_features, stop_words = 'english')
features_bow = bow.fit_transform(content_cleaned)
feature_names_bow = bow.get_feature_names_out()

In [None]:
# train the models
# specify the number of topics in the documents
topics_count = 5

# train an NMF model
# NMF's single `alpha` argument was removed in scikit-learn 1.2. Its replacements
# alpha_W / alpha_H (1.0+) are multiplied by n_features / n_samples in the objective,
# so the former alpha = 0.1 is divided by those sizes to keep the same penalty.
nmf = NMF(n_components = topics_count, random_state = 0, alpha_W = 0.1 / features_bow.shape[1], alpha_H = 0.1 / features_bow.shape[0], l1_ratio = 0.5, init = "nndsvd")
nmf.fit(features_bow)

# train an LDA model
# NOTE(review): LDA models word counts and is usually fit on the bag-of-words
# matrix rather than tf-idf; confirm this pairing is intended
lda = LatentDirichletAllocation(n_components = topics_count, max_iter = 5, random_state = 0, learning_method = "online")
lda.fit(features_tfidf)

In [None]:
# display the topics: NMF model (fit on the bag-of-words features)
show_topics(nmf, feature_names_bow)

In [None]:
# display the topics: LDA model
# the LDA model was fit on features_tfidf, so its components are labelled with the tf-idf vocabulary
show_topics(lda, feature_names_tfidf)