In [51]:
from __future__ import print_function
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import metaknowledge as mk
from stop_words import get_stop_words #Stop-word lists for many languages https://pypi.org/project/stop-words/
from nltk.tokenize import RegexpTokenizer
import seaborn as sns
import numpy
import matplotlib as plt
import pandas as pd
import os

# Imports for gensim.
import gensim #Python framework for fast Vector Space Modelling https://pypi.org/project/gensim/
from gensim import corpora, models

# Imports for pyLDAvis.
import pyLDAvis.gensim as gensimvis
import pyLDAvis


# Apply the white background and the smaller font scale in ONE call.
# sns.set() re-applies a complete seaborn theme whose default style is
# "darkgrid", so a separate sns.set_style("white") issued before it would be
# silently overridden.
sns.set(style="white", font_scale=.7)

# NOTE: in this notebook `plt` is bound to the top-level matplotlib package
# (not matplotlib.pyplot); matplotlib.rc is what raises the savefig resolution.
plt.rc("savefig", dpi=300) # improve default resolution of graphics

# '.' keeps the current working directory; replace it with the project folder
# if the data files live elsewhere.
os.chdir('.')

In [53]:
# Load the bibliographic records found under 'raw_data' into a metaknowledge
# RecordCollection.
# NOTE(review): cached=True presumably reuses a cache written by an earlier
# load - confirm against the metaknowledge documentation.
RC_docs = mk.RecordCollection('raw_data', cached = True)
# Number of records in the collection (displayed as the cell output).
len(RC_docs)

3915

In [54]:
# Transform the record collection into a format for use with natural language processing applications.
# 'topic_model.csv' is the output file name handed to forNLP; the keyword flags
# ask for lower-casing and for numbers, non-words and extra whitespace to be
# removed, while copyright statements are kept (removeCopyright=False).
raw = RC_docs.forNLP('topic_model.csv', lower=True, removeNumbers=True,
                removeNonWords=True, removeWhitespace=True, removeCopyright=False)

In [55]:
# Pull the abstracts out of the NLP export (`raw` is indexed by field name).
# presumably one abstract string per record - verify against the forNLP output
documents = raw['abstract']

In [56]:
# For use with scikit-learn, convert the abstracts to a numpy array.
# `docs` is what both vectorizers below are fitted on.
docs = numpy.asarray(documents)

In [57]:
# Tunable settings for the scikit-learn models.
# Increasing the number of features will give a better model, but it may increase the runtime.
features = 1000   # passed as max_features to both vectorizers
topics = 50       # passed as n_components to the scikit-learn LDA model
top_words = 10    # number of words printed per topic by print_top_words

In [58]:
# Initialize the TF-IDF vectorizer.
# max_df=0.95 / min_df=2 drop terms that occur in more than 95% of the
# documents or in fewer than two documents; the vocabulary is capped at
# `features` terms and scikit-learn's built-in English stop words are removed.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=features,
                                   stop_words='english')

In [59]:
# Learn the vocabulary from the documents and build the TF-IDF
# document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(docs)

In [60]:
# Define the output function.
def print_top_words(model, feature_names, top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_``; each entry is one
        topic's weight vector and must support ``argsort()``.
    feature_names : sequence of str
        Vocabulary terms, indexable by the positions used in
        ``components_``.
    top_words : int
        How many words to print per topic.

    Output is a ``Topic #<index>:`` header followed by the space-separated
    words for each topic, then one blank line after the last topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so walk it backwards to take the
        # `top_words` largest weights first.
        ranked_positions = weights.argsort()[:-top_words - 1:-1]
        words = [feature_names[position] for position in ranked_positions]
        print("Topic #%d:" % topic_number)
        print(" ".join(words))
    print()

In [61]:
# Set up the raw term-count vectorizer whose output feeds the LDA model.
# It uses the same filters as the TF-IDF vectorizer (max_df, min_df,
# max_features, English stop words).
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=features,
                                stop_words='english')

In [62]:
# Learn the vocabulary and build the document-term count matrix for LDA.
tf = tf_vectorizer.fit_transform(docs)

In [63]:
# Extract topics for the LDA model, and fit the model.
# Note: older scikit-learn releases named this argument n_topics; it is now
# n_components. random_state=0 fixes the seed so the fit is repeatable.
lda = LatentDirichletAllocation(n_components=topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# Fit on the term-count matrix; the fitted estimator is shown as the cell output.
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=50, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [64]:
# Print the list of topics and their contents.
# Note that the constraints can be modified by changing the number of topics or number of words in each topic.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back to the old one so
# the cell runs on both old and new installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, top_words)

Topic #0:
governance ecosystems technology technologies partners capabilities assets risks mechanisms variability
Topic #1:
network networks game capability networking structure base local adaptive service
Topic #2:
entrepreneurial entrepreneurship entrepreneurs ecosystems ecosystem university economic support venture universities
Topic #3:
technology business industry current ecosystem results appropriate increasing provides paper
Topic #4:
small greater platform strategies benefits interoperability performance software intellectual sample
Topic #5:
space virtual physical principles new article challenges development commercial public
Topic #6:
carbon industry partners emerging construction systematic driving project primary development
Topic #7:
water land sustainability utilization local availability supply importance regions case
Topic #8:
information search results users formal forms terms form relationship ontology
Topic #9:
connectivity integrating private life single simple ris

In [65]:
# Read the user-maintained stop-word file; it has no header row.
df2 = pd.read_csv('stopwords_personal.csv', delimiter=',', header=None)

# Turn every DataFrame row into a plain Python list.
list_of_rows = list(map(list, df2.values))

# NOTE: only the FIRST row of the file is used as the stop-word list.
stopwords_personal = list_of_rows[0]

In [66]:
# Initialize the tokenizer. Explanation and further detail: https://people.revoledu.com/kardi/tutorial/Python/NLP1.html
# (Author's note: bigram, trigram.)
tokenizer = RegexpTokenizer(r'\w+') # To separate a sentence into words without punctuation - https://people.revoledu.com/kardi/tutorial/Python/NLP1.html

In [67]:
# Container for the tokenized documents (one list of tokens per document).
tokens = []

In [68]:
# Tokenize every document, producing one list of tokens per document.
# `tokens` is rebuilt from scratch here rather than appended to, so re-running
# this cell never duplicates documents in the list.
tokens = [tokenizer.tokenize(document) for document in documents]

In [69]:
# Container for the stop-word-filtered documents (one token list per document).
cleaned_tokens = []

In [70]:
# Keep tokens only if they do not appear in the list of stopwords.
# A set gives constant-time membership tests (the list lookup was linear in the
# number of stopwords for every single token), and `cleaned_tokens` is rebuilt
# from scratch so re-running this cell never duplicates documents.
_stopword_lookup = set(stopwords_personal)
cleaned_tokens = [
    [token for token in document_tokens if token not in _stopword_lookup]
    for document_tokens in tokens
]

In [71]:
# Create the gensim dictionary from the cleaned tokens; it is used below to
# build the bag-of-words corpus and as the id2word mapping of the LDA model.
dictionary = corpora.Dictionary(cleaned_tokens)

In [72]:
# Convert the cleaned tokens into a numpy array.
# The per-document token lists have different lengths; without an explicit
# dtype=object, NumPy >= 1.24 raises ValueError for such ragged input (earlier
# versions only emitted a deprecation warning).
array = numpy.asarray(cleaned_tokens, dtype=object)

In [73]:
# Build the bag-of-words corpus: each entry of `array` holds the tokens of one
# document, which the dictionary converts with doc2bow.
corpus = [dictionary.doc2bow(document_tokens) for document_tokens in array]

In [74]:
# Generate the LDA model using gensim.
# The topic count comes from the `topics` setting instead of a second
# hard-coded 50, and random_state=0 (the same seed as the scikit-learn model)
# makes the fitted topics reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=topics,
                                           id2word=dictionary, passes=20,
                                           random_state=0)

In [75]:
# Printing all 50 topics is hard to read, so show a sample of 10 topics with the top 5 words of each.
# You can change the number of topics and number of words as you wish.
# The result is a list of (topic id, weighted-word string) pairs, displayed as the cell output.
ldamodel.print_topics(num_topics=10, num_words=5)

[(49,
  '0.018*"ecosystem" + 0.015*"properties" + 0.013*"populations" + 0.013*"bay" + 0.011*"residents"'),
 (7,
  '0.021*"products" + 0.018*"model" + 0.017*"variability" + 0.013*"pss" + 0.010*"design"'),
 (46,
  '0.017*"journals" + 0.016*"board" + 0.012*"partner" + 0.010*"odamy" + 0.010*"directors"'),
 (16,
  '0.035*"resource" + 0.028*"ecommerce" + 0.024*"information" + 0.019*"electronic" + 0.014*"macro"'),
 (11,
  '0.136*"knowledge" + 0.016*"ee" + 0.014*"sharing" + 0.007*"communities" + 0.006*"women"'),
 (45,
  '0.084*"smart" + 0.044*"city" + 0.040*"urban" + 0.038*"cities" + 0.016*"tourism"'),
 (2,
  '0.019*"bricolage" + 0.009*"systems" + 0.009*"acquisitions" + 0.008*"removal" + 0.008*"integrators"'),
 (22,
  '0.142*"business" + 0.046*"value" + 0.025*"model" + 0.025*"models" + 0.020*"ecosystem"'),
 (15,
  '0.017*"water" + 0.013*"health" + 0.011*"c" + 0.009*"ecosystem" + 0.009*"elsevier"'),
 (25,
  '0.044*"industrial" + 0.025*"ecosystem" + 0.013*"system" + 0.012*"model" + 0.010*"develo

In [76]:
# Prepare the visualization data from the gensim model, its bag-of-words
# corpus and the dictionary.
vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)

In [77]:
# Visualize the topic model: render the prepared pyLDAvis data as the cell output.
pyLDAvis.display(vis_data)

## References

* Blei, David M.; Ng, Andrew Y.; Jordan, Michael I (January 2003). Lafferty, John (ed.). "Latent Dirichlet Allocation". Journal of Machine Learning Research. 3 (4–5): pp. 993–1022. doi:10.1162/jmlr.2003.3.4-5.993
* McLevey, J. & McIlroy-Young, R. (2017). Introducing metaknowledge: Software for computational research in information science, network analysis, and science of science. Journal of Informetrics, 11(1), 176‑197. doi:10.1016/j.joi.2016.12.005
* Moore, J. (1993). Predators and Prey: A new ecology of competition. Harvard Business Review.
* Vasconcelos Gomes, L. A., Figueiredo Facin, A. L., Salerno, M. S. & Ikenami, R. K. (2018). Unpacking the innovation ecosystem construct: Evolution, gaps and trends. Technological Forecasting and Social Change, 136, 30‑48. doi:10.1016/j.techfore.2016.11.009
