In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.utils.extmath import randomized_svd

In [None]:
# Load the pre-processed corpus. `read_csv` lives on the pandas module, not on
# a DataFrame, and `df` does not exist yet on a fresh kernel.
df = pd.read_csv('../data/lem_stem_text.csv')

In [None]:
# Keyword arguments handed unchanged to both CountVectorizer and
# TfidfVectorizer so the two representations share one vocabulary definition.
# NOTE(review): `stopwords` is not imported in the cells shown here; it looks
# like nltk.corpus.stopwords -- confirm and add the import to the imports cell.
vectorizer_options = dict(
    analyzer='word',
    stop_words=stopwords.words('english'),
    ngram_range=(1, 1),            # unigrams only
    token_pattern='[a-z]{3,}',     # runs of three or more lowercase letters
    min_df=0.01,                   # float => minimum fraction of documents
    lowercase=True,
)

# Shared notebook settings: vectorizer options plus the text column to model.
# 'components' is kept for compatibility; no cell shown here reads it.
params = dict(
    vectorizer=vectorizer_options,
    raw_documents=df['stemmed_text'],
    components=4,
)

# CountVectorizer

In [None]:
# Fit the count vectorizer and build the sparse document-term matrix.
cv = CountVectorizer(**params['vectorizer'])
X_cv = cv.fit_transform(raw_documents=params['raw_documents'])

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# (`get_feature_names_out`, available since 1.0) and fall back on older releases.
cv_feature_names = (cv.get_feature_names_out()
                    if hasattr(cv, 'get_feature_names_out')
                    else cv.get_feature_names())

# Dense frame: one row per document, one column per vocabulary term.
cv_doc_word = pd.DataFrame(X_cv.toarray(), columns=cv_feature_names)
cv_doc_word

### Matrix Decomposition

In [None]:
# Truncated SVD of the count matrix, keeping the k largest singular values.
k = 80
# randomized_svd is stochastic; a fixed seed (instead of random_state=None)
# makes Sigma identical across Restart & Run All.
U, Sigma, VT = randomized_svd(X_cv,
                              n_components=k,
                              n_iter=5,
                              random_state=42)

In [None]:
# Singular-value profile for the count matrix, saved next to the notebook.
plt.figure(figsize=(18, 6))
plt.title("Sigma vs. K Topics")
plt.ylabel("Sigma")
plt.xlabel("K Topics")
# x and y must be passed by keyword: positional x/y was deprecated in
# seaborn 0.11 and raises a TypeError from 0.12 on.
sns.lineplot(x=np.arange(len(Sigma)), y=Sigma)
plt.savefig("sigma_topics_cv.png");

In [None]:
# First 15 (singular value, component index) pairs of the current Sigma.
[(sigma, idx) for idx, sigma in enumerate(Sigma[:15])]

# TF-IDF

In [None]:
# Fit the tf-idf vectorizer and build the sparse document-term matrix.
tfidf = TfidfVectorizer(**params['vectorizer'])
X_tfidf = tfidf.fit_transform(raw_documents=params['raw_documents'])

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# (`get_feature_names_out`, available since 1.0) and fall back on older releases.
tfidf_feature_names = (tfidf.get_feature_names_out()
                       if hasattr(tfidf, 'get_feature_names_out')
                       else tfidf.get_feature_names())

# Dense frame: one row per document, one column per vocabulary term.
tfidf_doc_word = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_feature_names)
tfidf_doc_word

### Matrix Decomposition

In [None]:
# Truncated SVD of the tf-idf matrix, keeping the k largest singular values.
# Note that U, Sigma and VT are rebound here and replace any earlier values.
k = 80
# randomized_svd is stochastic; a fixed seed (instead of random_state=None)
# makes Sigma identical across Restart & Run All.
U, Sigma, VT = randomized_svd(X_tfidf,
                              n_components=k,
                              n_iter=5,
                              random_state=42)

In [None]:
# Singular-value profile for the tf-idf matrix, saved next to the notebook.
plt.figure(figsize=(18, 6))
plt.title("Sigma vs. K Topics")
plt.ylabel("Sigma")
plt.xlabel("K Topics")
# x and y must be passed by keyword: positional x/y was deprecated in
# seaborn 0.11 and raises a TypeError from 0.12 on.
sns.lineplot(x=np.arange(len(Sigma)), y=Sigma)
plt.savefig("sigma_topics_tfidf.png");

In [None]:
# Walk the singular values in order, printing each one, and stop at the first
# adjacent pair whose string forms share the same first three characters.
# NOTE(review): a string-prefix match is a coarse proxy for "nearly equal"
# (it depends on magnitude and formatting); consider a numeric tolerance.
pair_sig_topic = list(zip(Sigma, range(k)))
n_pairs = len(pair_sig_topic)
for i in range(n_pairs):
    print(pair_sig_topic[i][0])
    # The final entry has no successor; stop instead of indexing past the end.
    if i + 1 >= n_pairs:
        break
    if str(pair_sig_topic[i][0])[:3] == str(pair_sig_topic[i + 1][0])[:3]:
        print(pair_sig_topic[i][0], pair_sig_topic[i + 1][0])
        print(pair_sig_topic[i][1], pair_sig_topic[i + 1][1])
        # Entry i + 2 is only available when the match is not at the tail.
        if i + 2 < n_pairs:
            print(pair_sig_topic[i + 2])
            print(str(pair_sig_topic[i + 2][0])[:3])
        break

# Non-Negative Matrix Factorization (NMF)

In [None]:
# Use NMF to look for 15 topics
n_topics = 15
# Fixed seed so the factorization (and the topics printed below) is repeatable.
model = NMF(n_components=n_topics, random_state=42)
model.fit(X_tfidf)

# Print the top 10 words
n_words = 10
# `get_feature_names` was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back on older releases.
feature_names = (tfidf.get_feature_names_out()
                 if hasattr(tfidf, 'get_feature_names_out')
                 else tfidf.get_feature_names())

# One label per topic, built from its three highest-weighted terms.
topic_list = []
for topic_idx, topic in enumerate(model.components_):
    # argsort is ascending, so reverse it and keep the n_words largest weights.
    top_n = [feature_names[i] for i in topic.argsort()[::-1][:n_words]]
    top_features = ' '.join(top_n)
    topic_list.append(f"topic_{'_'.join(top_n[:3])}")

    print(f"Topic {topic_idx}: {top_features}")

# Looking for best parameters with GridSearchCV

In [None]:
%%time

# Beware it will try *all* of the combinations, so it'll take ages
search_params = {
  'n_components': [5, 10, 15, 20, 25, 30, 40, 50],
  'learning_decay': [.2, .5, .7]
}

# Set up LDA with the options we'll keep static
model = LatentDirichletAllocation(learning_method='online')

# Try all of the options
gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)
gridsearch.fit(X_tfidf)

# What did we find?
print("Best Model's Params: ", gridsearch.best_params_)
print("Best Log Likelihood Score: ", gridsearch.best_score_)
lda_params = gridsearch.best_params_.copy()

In [None]:
%%time

# Use LDA to look for 5 topics
learning_decay, n_topics = lda_params.values()
model = LatentDirichletAllocation(learning_method='online', n_components=n_topics, learning_decay=learning_decay)
model.fit(X_tfidf)

# Print the top 10 words per topic
n_words = 10
feature_names = tfidf.get_feature_names()

topic_list = []
for topic_idx, topic in enumerate(model.components_):
    top_n = [feature_names[i]
             for i in topic.argsort()
             [-n_words:]][::-1]
    top_features = ' '.join(top_n)
    topic_list.append(f"topic_{'_'.join(top_n[:3])}") 

    print(f"Topic {topic_idx}: {top_features}")

In [None]:
# Per-document output of the fitted model, scaled by 100.
doc_topic_weights = model.transform(X_tfidf)
amounts = 100 * doc_topic_weights

# Tabulate with one column per topic label collected in `topic_list`.
topics = pd.DataFrame(data=amounts, columns=topic_list)
topics.head()

In [None]:
# Score every document with the model that is already fitted. Calling
# `fit_transform` here would retrain both the vectorizer and the LDA model, so
# the assigned topics would no longer match the topics printed earlier.
# Assumes X_tfidf rows are still in the same order as df['stemmed_text'].
label = df['stemmed_text'].index
doc_topic_matrix = model.transform(X_tfidf).round(5)
doc_topic = pd.DataFrame(doc_topic_matrix,
                         index=label,
                         columns=['topic{}'.format(i + 1)
                                  for i in range(doc_topic_matrix.shape[1])])

# Label each document with the column name of its highest-weighted topic.
df['top_topic'] = doc_topic.idxmax(axis=1)