In [None]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
from pathlib import Path
import json

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction import text as text1
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

LOGS_ENABLED = True  # global switch read by log_to_console

#region Functions

def log_to_console(logs):
    """Pretty-print ``logs`` when LOGS_ENABLED is truthy; otherwise do nothing."""
    if not LOGS_ENABLED:
        return
    pprint(logs)


def basic_clean(df):
    """Return the 'CONTENT' column of ``df`` as a list of cleaned strings.

    Each document has e-mail-like tokens removed, whitespace runs (including
    newlines) collapsed to a single space, and single quotes dropped, in that
    order.
    """
    cleaned = []

    for doc in df['CONTENT'].tolist():
        # Remove Emails (any token containing '@', plus one trailing space)
        without_emails = re.sub("\\S*@\\S*\\s?", '', doc)
        # Collapse new lines and other whitespace runs
        single_spaced = re.sub("\\s+", ' ', without_emails)
        # Remove distracting single quotes
        cleaned.append(re.sub("\\'", "", single_spaced))

    return cleaned


def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Every item is coerced to ``str`` and tokenized with gensim's
    ``simple_preprocess``; ``deacc=True`` is passed through unchanged.
    """
    for raw_sentence in sentences:
        as_text = str(raw_sentence)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens


def tokenize(data):
    """Tokenize every document in ``data`` and return the token lists as a list."""
    return [tokens for tokens in sent_to_words(data)]



def build_dw_matrix(data, stop_words_path="stop_words", min_df=10):
    """Fit a TF-IDF vectorizer on ``data`` and build the document-word matrix.

    Args:
        data: iterable of text documents (strings) to vectorize.
        stop_words_path: path of a text file whose first line holds the
            custom stop words, separated by commas.
        min_df: minimum number of documents a word must occur in to be kept.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``TfidfVectorizer`` and
        the result of ``fit_transform(data)``.

    Raises:
        OSError: if the stop-word file cannot be opened.
    """
    # Only the first line of the file is read; the context manager makes sure
    # the handle is closed even if reading fails.
    with open(stop_words_path, 'r') as stop_words_file:
        first_line = stop_words_file.readline()

    # Strip surrounding whitespace and skip empty entries (e.g. trailing comma).
    custom_stop_words = [
        word.strip() for word in first_line.split(",") if word.strip()
    ]

    # Newer scikit-learn releases validate `stop_words` and only accept a list
    # (not a set/frozenset); sorting keeps the list deterministic.
    my_stop_words = sorted(text1.ENGLISH_STOP_WORDS.union(custom_stop_words))

    vectorizer = TfidfVectorizer(
        analyzer='word',
        min_df=min_df,                    # minimum required occurrences of a word
        stop_words=my_stop_words,         # remove stop words
        lowercase=True,                   # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
        # max_features=50000,             # max number of uniq words
    )

    return vectorizer, vectorizer.fit_transform(data)


def build_lda_model(data, n_components=20):
    """Fit an online LDA topic model on a document-word matrix.

    Args:
        data: document-word matrix to fit on (e.g. the matrix returned by
            ``build_dw_matrix``).
        n_components: number of topics to learn.

    Returns:
        A ``(lda_model, result_matrix)`` tuple: the fitted
        ``LatentDirichletAllocation`` estimator and the output of
        ``fit_transform(data)``.
    """
    lda_model = LatentDirichletAllocation(
        # `n_topics` was renamed to `n_components` in scikit-learn and the old
        # keyword no longer exists in current releases.
        n_components=n_components,  # Number of topics
        max_iter=10,                # Max learning iterations
        learning_method='online',
        random_state=100,           # Random state
        batch_size=128,             # n docs in each learning iter
        evaluate_every=-1,          # compute perplexity every n iters, default: Don't
        n_jobs=-1,                  # Use all available CPUs
    )
    result_matrix = lda_model.fit_transform(data)

    log_to_console(lda_model)  # Model attributes

    return lda_model, result_matrix


def diagnose_model(model, data):
    """Log the model's score, perplexity and parameters for ``data``.

    Output goes through ``log_to_console``, so nothing is shown when logging
    is disabled.
    """
    # Log likelihood: the higher the better
    log_likelihood = model.score(data)
    log_to_console("Log Likelihood: {}".format(log_likelihood))

    # Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
    perplexity = model.perplexity(data)
    log_to_console("Perplexity: {}".format(perplexity))

    # Model parameters
    params = model.get_params()
    log_to_console(params)


#endregion

In [None]:
# Import Dataset
# df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
df = pd.read_json('./entries.json', lines=True)
# log_to_console(df.target_names.unique())

# Only the first row of the loaded frame is used: each of its values becomes
# one row of a new single-column 'CONTENT' frame, which then replaces `df`.
values = df.values.tolist()[0]
columns = ['CONTENT']

data = [[value] for value in values]
new_df = pd.DataFrame(data, columns=columns)
df = new_df

# spaCy language model used by the later parsing cells
lang_model_name = 'en_core_web_md'
nlp = spacy.load(lang_model_name)





# data_words = basic_clean(df)
# data_tokenized = tokenize(data_words)


In [None]:
# Accumulator for per-document token lists; the lemmatization loop appends to it.
data_words=[]

In [None]:
texts = df['CONTENT']

# Parse every text with the spaCy pipeline. nlp.pipe processes the texts as a
# batched stream and yields the Doc objects in input order, which is
# considerably faster than calling nlp(text) once per document.
docs = list(nlp.pipe(texts))
        

In [None]:
# Number of documents parsed by the spaCy pipeline
print(len(docs))

In [None]:
# Both result lists are (re)created here so that re-running this cell starts
# from a clean slate instead of appending duplicates to `data_words`.
data_words = []        # per document: list of unique noun lemmas
data_lemmatized = []   # per document: the same lemmas joined by spaces

for doc in docs:
    seen = set()
    unique_lemmas = []

    for word in doc:
        lemma = word.lemma_
        # Keep nouns only; skip spaCy's '-PRON-' placeholder lemma and any
        # lemma already collected for this document (first occurrence wins).
        if word.pos_ != 'NOUN' or lemma == '-PRON-' or lemma in seen:
            continue
        seen.add(lemma)
        unique_lemmas.append(lemma)

    data_words.append(unique_lemmas)
    data_lemmatized.append(" ".join(unique_lemmas))


In [None]:
# Preview the first three lemmatized documents
data_lemmatized[0:3]


In [None]:
# def lemmatization(nlp, texts, allowed_postags):
    
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent))
#         texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
#     return texts_out


# # Do lemmatization keeping only nouns (allowed_postags=['NOUN'])
# data_lemmatized = lemmatization(nlp, data_tokenized, allowed_postags=['NOUN'])

# Create the Document-Word matrix: fit the TF-IDF vectorizer on the lemmatized
# documents and keep both the fitted vectorizer and the resulting matrix.
vectorizer, data_vectorized = build_dw_matrix(data_lemmatized)

In [None]:
# Init the Model
lda_model = LatentDirichletAllocation()
lda_model.learning_decay=0.9
lda_model.n_components=10

# Create Document - Topic Matrix
lda_output = lda_model.fit_transform(data_vectorized)


In [None]:
# First row of the document-topic matrix (topic weights of the first document)
lda_output[0]


In [None]:
lth = data_vectorized.shape[0]

# Sanity check: print the width of (up to) the first 10 rows of the matrix.
# The range is capped at the row count so small corpora do not raise an
# IndexError, and the width is read from the sparse row's shape instead of
# converting each row to a dense array first.
for i in range(min(10, lth)):
    print(data_vectorized[i, :].shape[1])

In [None]:
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    x = list(vectorizer.get_feature_names_out())
else:
    x = vectorizer.get_feature_names()

In [None]:
# Length of the first row of the document-word matrix once converted to a dense array
len(data_vectorized[0,:].toarray()[0])

In [None]:
# Shape of the document-topic matrix returned by fit_transform
lda_output.shape

In [None]:
# Shape of the fitted LDA model's components_ array
lda_model.components_.shape

In [None]:
# Current value of the LDA model's verbose setting
lda_model.verbose

In [None]:
# NMF is already imported in the notebook's first (imports) cell, so it is not
# re-imported here.

n_samples = 2000    # not used in this cell
n_features = 1000   # not used in this cell
n_topics = 11       # number of NMF components (topics)
n_top_words = 15    # words printed per topic

# Factorize the TF-IDF document-word matrix with NMF.
tfidf = data_vectorized
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old method on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    # Indices of the n_top_words largest weights, highest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print("Topic #%d:" % topic_idx)
    print(" ".join(feature_names[i] for i in top_indices))
    print()