## Imports

In [4]:
# !python -m spacy download en

In [5]:
import pandas as pd

from sklearn.pipeline import Pipeline

import spacy
spacy_lemmatizer = spacy.load('en', disable=['parser', 'ner'])

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn import preprocessing

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from pprint import pprint

from timeit import default_timer

## Utilities

In [6]:
def lemmatize_spacy(text):
    """Apply Lemmatization using Spacy Lemmatization"""
    lemma_text = None
    if isinstance(text, str):
        lemma_text = ""

        # Parse the sentence using the loaded 'en' model object `nlp`
        doc = spacy_lemmatizer(text)

        # Extract the lemma for each token
        lemmas = []
        for token in doc:
            lemma = token.lemma_
            if lemma == '-PRON-': #https://spacy.io/api/annotation#lemmatization
                lemma = token.text
            if lemma.isalpha():
                lemmas.append(lemma)
        lemma_text = " ".join(lemmas)
        if len(lemma_text) == 0:
            return None
    return lemma_text

In [7]:
def convert_sec(no_of_secs):
    """return no_of_secs to min or hrs string"""
    if no_of_secs < 60:
        return "{:06.4f}    sec".format(no_of_secs)
    elif no_of_secs < 3600:
        return "{:06.4f}    min".format(no_of_secs/60)
    else:
        return "{:06.4f}    hr".format(no_of_secs/3600)

## Load Data

In [8]:
articles_df = pd.read_csv('../data/shared_articles.csv')
articles_df.fillna('UNKOWN', inplace=True)
articles_df.shape

(3122, 13)

In [9]:
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,UNKOWN,UNKOWN,UNKOWN,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


### Filtering Shared Articles

In [10]:
articles_df['eventType'].value_counts()

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64

In [11]:
shared_articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
shared_articles_df.shape

(3047, 13)

### Filtering Shared Articles in English

In [12]:
shared_articles_df['lang'].value_counts()

en    2211
pt     829
la       3
es       2
ja       2
Name: lang, dtype: int64

In [13]:
eng_shared_articles_df = shared_articles_df[shared_articles_df['lang'] == 'en']
eng_shared_articles_df.shape

(2211, 13)

In [14]:
eng_shared_articles_df = eng_shared_articles_df[['contentId', 'url', 'title', 'text']]
eng_shared_articles_df.head()

Unnamed: 0,contentId,url,title,text
1,-4110354420726924665,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...
2,-7292285110016212249,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...
3,-6151852268067518688,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...
4,2448026894306402386,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...
5,-2826566343807132236,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...


In [15]:
len(eng_shared_articles_df['contentId'].unique())

2211

### Combine title and text to create text_content

In [16]:
eng_shared_articles_df['text_content'] = eng_shared_articles_df[['title', 'text']].apply(lambda x: '. '.join(x), axis=1)
eng_shared_articles_df.head()

Unnamed: 0,contentId,url,title,text,text_content
1,-4110354420726924665,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,"Ethereum, a Virtual Currency, Enables Transact..."
2,-7292285110016212249,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,Bitcoin Future: When GBPcoin of Branson Wins O...
3,-6151852268067518688,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,Google Data Center 360° Tour. We're excited to...
4,2448026894306402386,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,"IBM Wants to ""Evolve the Internet"" With Blockc..."
5,-2826566343807132236,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,IEEE to Talk Blockchain at Cloud Computing Oxf...


In [17]:
eng_shared_articles_df.drop(['title', 'text'], inplace=True, axis=1)
eng_shared_articles_df.head()

Unnamed: 0,contentId,url,text_content
1,-4110354420726924665,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact..."
2,-7292285110016212249,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...
3,-6151852268067518688,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour. We're excited to...
4,2448026894306402386,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc..."
5,-2826566343807132236,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...


## Get Unique Text Content

In [18]:
eng_shared_articles_df.head()

Unnamed: 0,contentId,url,text_content
1,-4110354420726924665,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact..."
2,-7292285110016212249,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...
3,-6151852268067518688,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour. We're excited to...
4,2448026894306402386,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc..."
5,-2826566343807132236,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...


### Multiple ContentIds, Urls for Text_Content

In [19]:
len(eng_shared_articles_df['contentId'].unique()), len(eng_shared_articles_df['url'].unique()), len(eng_shared_articles_df['text_content'].unique())

(2211, 2191, 2185)

In [20]:
url_contents_df = eng_shared_articles_df[['url', 'text_content']].drop_duplicates()
url_contents_df.shape

(2191, 2)

In [21]:
url_contents_df.head()

Unnamed: 0,url,text_content
1,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact..."
2,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...
3,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour. We're excited to...
4,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc..."
5,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...


In [22]:
len(url_contents_df['url'].unique()), len(url_contents_df['text_content'].unique())

(2191, 2185)

In [23]:
len(url_contents_df['url'].unique()) - len(url_contents_df['text_content'].unique())

6

### Multiple Urls for Text Content

In [24]:
content_urls = url_contents_df.groupby('text_content').apply(lambda x : list(x['url'].unique()))
for content in content_urls.index:
    if len(content_urls[content]) > 1:
        #print(content)
        pprint(content_urls[content])

['http://android-developers.blogspot.com.br/2016/04/android-n-developer-preview-2-out-today.html',
 'http://android-developers.blogspot.com/2016/04/android-n-developer-preview-2-out-today.html']
['https://www.oreilly.com/learning/how-to-build-a-robot-that-sees-with-100-and-tensorflow?twitter=@bigdata&utm_source=hackernewsletter&utm_medium=email&utm_term=fav',
 'https://www.oreilly.com/learning/how-to-build-a-robot-that-sees-with-100-and-tensorflow']
['https://www.blog.google/products/g-suite/introducing-google-cloud-search-g-suite/',
 'https://blog.google/products/g-suite/introducing-google-cloud-search-g-suite/']
['https://www.ted.com/talks/linus_torvalds_the_mind_behind_linux',
 'http://www.ted.com/talks/linus_torvalds_the_mind_behind_linux']
['https://www.thinkwithgoogle.com/articles/youtube-empowering-ads-engage.html?utm_source=Gplus&utm_medium=social&utm_campaign=Think',
 'https://www.thinkwithgoogle.com/articles/youtube-empowering-ads-engage.html?utm_medium=email-d&utm_source=201

In [25]:
eng_shared_articles_content_df = eng_shared_articles_df['text_content'].drop_duplicates()\
                                                                       .reset_index()\
                                                                       .rename(columns={'index' : 'text_content_id'})
eng_shared_articles_content_df.shape

(2185, 2)

In [26]:
eng_shared_articles_content_df.head()

Unnamed: 0,text_content_id,text_content
0,1,"Ethereum, a Virtual Currency, Enables Transact..."
1,2,Bitcoin Future: When GBPcoin of Branson Wins O...
2,3,Google Data Center 360° Tour. We're excited to...
3,4,"IBM Wants to ""Evolve the Internet"" With Blockc..."
4,5,IEEE to Talk Blockchain at Cloud Computing Oxf...


In [27]:
eng_shared_articles_text_id_df = eng_shared_articles_df.merge(eng_shared_articles_content_df)
eng_shared_articles_text_id_df.head()

Unnamed: 0,contentId,url,text_content,text_content_id
0,-4110354420726924665,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",1
1,-7292285110016212249,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,2
2,-6151852268067518688,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour. We're excited to...,3
3,2448026894306402386,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",4
4,-2826566343807132236,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,5


In [28]:
eng_shared_articles_text_id_df['text_content_id'].value_counts()

80      4
337     4
618     3
1513    2
681     2
       ..
957     1
955     1
954     1
953     1
3121    1
Name: text_content_id, Length: 2185, dtype: int64

In [29]:
eng_shared_articles_text_id_df[eng_shared_articles_text_id_df['text_content_id'] == 337]

Unnamed: 0,contentId,url,text_content,text_content_id
280,8160885002406274828,https://medium.com/quality-functions/preventin...,Preventing Software Bugs from Ever Occurring -...,337
281,-4248528062574538011,https://medium.com/quality-functions/preventin...,Preventing Software Bugs from Ever Occurring -...,337
282,-3265730906936163775,https://medium.com/quality-functions/preventin...,Preventing Software Bugs from Ever Occurring -...,337
283,2824996248683640175,https://medium.com/quality-functions/preventin...,Preventing Software Bugs from Ever Occurring -...,337


## Extract Topics of Articles

In [30]:
eng_shared_articles_content_df.shape

(2185, 2)

In [31]:
eng_shared_articles_content_df.head()

Unnamed: 0,text_content_id,text_content
0,1,"Ethereum, a Virtual Currency, Enables Transact..."
1,2,Bitcoin Future: When GBPcoin of Branson Wins O...
2,3,Google Data Center 360° Tour. We're excited to...
3,4,"IBM Wants to ""Evolve the Internet"" With Blockc..."
4,5,IEEE to Talk Blockchain at Cloud Computing Oxf...


In [32]:
eng_shared_articles_content_df.loc[:, 'processed_text'] = eng_shared_articles_content_df['text_content'].apply(lemmatize_spacy)

In [33]:
eng_shared_articles_content_df.head()

Unnamed: 0,text_content_id,text_content,processed_text
0,1,"Ethereum, a Virtual Currency, Enables Transact...",Ethereum a Virtual Currency Enables Transactio...
1,2,Bitcoin Future: When GBPcoin of Branson Wins O...,Bitcoin Future when GBPcoin of Branson Wins ov...
2,3,Google Data Center 360° Tour. We're excited to...,Google Data Center Tour We be excite to share ...
3,4,"IBM Wants to ""Evolve the Internet"" With Blockc...",IBM want to evolve the internet with Blockchai...
4,5,IEEE to Talk Blockchain at Cloud Computing Oxf...,ieee to talk Blockchain at Cloud Computing Oxf...


In [34]:
# bow_lda_pipeline = Pipeline(steps=[('vectorizer', CountVectorizer()),
#                                    ('dim_reduce', LatentDirichletAllocation())
#                               ])

# bow_lda_pipeline_params = {
#     'vectorizer__strip_accents': 'unicode',
#     'vectorizer__stop_words': 'english',
#     'vectorizer__lowercase': True,
#     'vectorizer__token_pattern': r'\b[a-zA-Z]{3,}\b',
#     'vectorizer__min_df': 5,
#     'vectorizer__max_df': 0.5,
    
#     'dim_reduce__n_components': 30,
#     'dim_reduce__random_state': 0
# }
# bow_lda_pipeline.set_params(**bow_lda_pipeline_params)

# start_time = default_timer()
# bow_lda_pipeline.fit(eng_shared_articles_content_df['text_content'])
# end_time = default_timer()

# time_taken_str = convert_sec(end_time - start_time)
# print("Time Taken : {}".format(time_taken_str))


# start_time = default_timer()
# pyLDAvis.sklearn.prepare(bow_lda_pipeline.named_steps['dim_reduce'], 
#                          bow_lda_pipeline.named_steps['vectorizer'].fit_transform(eng_shared_articles_content_df['text_content']), 
#                          bow_lda_pipeline.named_steps['vectorizer'])
# end_time = default_timer()

# time_taken_str = convert_sec(end_time - start_time)
# print("Time Taken : {}".format(time_taken_str))

In [35]:
def get_topics(model, feature_names, n_top_words=10):
    topics_df = pd.DataFrame(model.components_, columns=feature_names)   
    topics = dict()
    for i, topic_words in topics_df.iterrows():
        topic_id_str = 'topic_' + str(i)        
        topic_words_dict = topic_words.sort_values(ascending=False).head(n_top_words).to_dict()
        topic_words_str = ', '.join(['#'+str(word) for word in topic_words_dict.keys()])
        print(topic_id_str + " : " + topic_words_str)
        topics[topic_id_str] = dict()
        topics[topic_id_str]['words_str'] = topic_words_str
        topics[topic_id_str]['words'] = topic_words_dict
    return topics

### Term Frequency LDA

In [36]:
tf_vectorizer = CountVectorizer(strip_accents = 'ascii',
                                stop_words = 'english',
                                lowercase = True,
                                analyzer = 'word',
                                token_pattern = r'\b[a-zA-Z]{5,}\b',
                                ngram_range = (1, 1),
                                max_df = 0.8, min_df = 1)
lda_tf = LatentDirichletAllocation(n_components=15, n_jobs=-1, random_state=0)

start_time = default_timer()

dtm_tf = tf_vectorizer.fit_transform(eng_shared_articles_content_df['processed_text'])
lda_tf.fit(dtm_tf)

end_time = default_timer()
time_taken_str = convert_sec(end_time - start_time)
print("Time Taken : {}".format(time_taken_str))

tf_feature_names = tf_vectorizer.get_feature_names()
topics = get_topics(lda_tf, tf_feature_names, n_top_words=10)

# n_components=5
# Time Taken : 10.7162    sec
# topic_0 : #people, #world, #woman, #company, #country, #think, #thing, #million, #space, #write
# topic_1 : #change, #build, #create, #datum, #example, #application, #write, #project, #event, #start
# topic_2 : #google, #cloud, #service, #apple, #platform, #company, #android, #developer, #product, #device
# topic_3 : #company, #customer, #business, #product, #digital, #technology, #drupal, #experience, #datum, #organization
# topic_4 : #machine, #datum, #learn, #learning, #people, #model, #thing, #image, #human, #think

# n_components=10
# Time Taken : 14.4023    sec
# topic_0 : #woman, #people, #country, #world, #music, #black, #space, #brazil, #america, #write
# topic_1 : #software, #change, #project, #build, #process, #development, #design, #organization, #people, #agile
# topic_2 : #google, #apple, #product, #company, #cloud, #platform, #android, #service, #developer, #build
# topic_3 : #drupal, #module, #content, #acquia, #commerce, #feature, #build, #service, #support, #vehicle
# topic_4 : #machine, #learning, #learn, #datum, #people, #human, #computer, #thing, #google, #image
# topic_5 : #company, #customer, #digital, #business, #technology, #datum, #product, #blockchain, #market, #experience
# topic_6 : #datum, #example, #event, #write, #method, #function, #value, #class, #result, #create
# topic_7 : #cloud, #google, #mobile, #docker, #application, #storage, #device, #security, #datum, #password
# topic_8 : #google, #service, #create, #instance, #cloud, #message, #datum, #update, #windows, #application
# topic_9 : #docker, #build, #container, #image, #command, #start, #version, #server, #support, #application
    
# n_components = 15
# Time Taken : 23.2102    sec
# topic_0 : #woman, #country, #brazil, #black, #music, #trend, #quantum, #plague, #death, #america
# topic_1 : #software, #change, #project, #build, #development, #process, #agile, #organization, #service, #developer
# topic_2 : #product, #company, #business, #people, #design, #digital, #service, #build, #platform, #experience
# topic_3 : #drupal, #module, #content, #acquia, #community, #support, #build, #developer, #commerce, #feature
# topic_4 : #machine, #learning, #datum, #learn, #model, #image, #human, #intelligence, #computer, #algorithm
# topic_5 : #customer, #company, #technology, #datum, #digital, #blockchain, #business, #bitcoin, #service, #industry
# topic_6 : #datum, #example, #method, #event, #function, #write, #value, #create, #result, #class
# topic_7 : #password, #token, #kotlin, #field, #label, #place, #swarm, #aggregation, #value, #google
# topic_8 : #google, #android, #facebook, #developer, #search, #windows, #content, #create, #message, #mobile
# topic_9 : #docker, #container, #command, #build, #image, #version, #server, #start, #application, #create
# topic_10 : #apple, #people, #thing, #company, #email, #device, #think, #phone, #feature, #microsoft
# topic_11 : #language, #people, #write, #think, #thing, #story, #change, #learn, #world, #really
# topic_12 : #payment, #account, #store, #customer, #consumer, #online, #credit, #retailer, #wallet, #chart
# topic_13 : #cloud, #google, #service, #application, #platform, #datum, #server, #instance, #support, #network
# topic_14 : #mobile, #people, #experience, #thing, #website, #start, #world, #sound, #content, #think



# n_components = 20
# Time Taken : 32.3148    sec
# topic_0 : #woman, #music, #trend, #quantum, #light, #brazil, #power, #circuit, #gender, #world
# topic_1 : #software, #project, #change, #build, #service, #agile, #microservice, #asset, #component, #development
# topic_2 : #company, #product, #service, #platform, #business, #build, #google, #design, #slack, #developer
# topic_3 : #vehicle, #internet, #technology, #drive, #autonomous, #sensor, #driver, #device, #airbnb, #china
# topic_4 : #machine, #learning, #learn, #human, #google, #intelligence, #computer, #datum, #company, #people
# topic_5 : #company, #technology, #blockchain, #bitcoin, #customer, #financial, #datum, #service, #industry, #business
# topic_6 : #event, #datum, #example, #method, #function, #content, #write, #result, #database, #search
# topic_7 : #google, #field, #design, #label, #place, #aggregation, #apple, #bucket, #swarm, #different
# topic_8 : #google, #facebook, #notification, #message, #search, #android, #drive, #create, #desktop, #share
# topic_9 : #docker, #container, #command, #image, #build, #server, #linux, #windows, #application, #start
# topic_10 : #apple, #email, #people, #thing, #feature, #company, #device, #phone, #microsoft, #iphone
# topic_11 : #people, #thing, #think, #story, #change, #write, #really, #start, #point, #problem
# topic_12 : #consumer, #customer, #brand, #store, #company, #datum, #online, #market, #growth, #percent
# topic_13 : #cloud, #google, #service, #datum, #application, #platform, #instance, #server, #network, #request
# topic_14 : #mobile, #people, #website, #experience, #content, #digital, #account, #world, #thing, #marketing
# topic_15 : #datum, #model, #learn, #language, #machine, #image, #learning, #network, #example, #algorithm
# topic_16 : #password, #event, #salesforce, #trello, #include, #storage, #million, #space, #board, #create
# topic_17 : #customer, #company, #drupal, #business, #product, #organization, #experience, #change, #process, #digital
# topic_18 : #woman, #write, #people, #japanese, #technical, #english, #death, #speak, #black, #release
# topic_19 : #developer, #module, #build, #javascript, #android, #support, #library, #project, #create, #framework

Time Taken : 7.5867    sec
topic_0 : #woman, #country, #brazil, #black, #music, #trend, #technical, #plague, #death, #white
topic_1 : #software, #change, #project, #build, #development, #agile, #process, #organization, #service, #developer
topic_2 : #product, #company, #business, #design, #people, #digital, #build, #service, #platform, #experience
topic_3 : #drupal, #module, #content, #acquia, #community, #support, #react, #build, #developer, #commerce
topic_4 : #machine, #learning, #learn, #datum, #model, #image, #human, #computer, #algorithm, #intelligence
topic_5 : #customer, #company, #datum, #technology, #business, #blockchain, #digital, #bitcoin, #service, #industry
topic_6 : #datum, #example, #method, #function, #event, #write, #value, #create, #class, #result
topic_7 : #password, #token, #kotlin, #field, #label, #place, #design, #swarm, #google, #aggregation
topic_8 : #google, #android, #facebook, #search, #service, #message, #create, #windows, #notification, #video
topic_9 : #



In [37]:
dtm_tf.shape

(2185, 29751)

In [38]:
lda_tf.components_.shape

(15, 29751)

In [39]:
reduced_dtm_tf = pd.DataFrame(lda_tf.transform(dtm_tf), columns=['topic_' + str(i) for i in range(lda_tf.n_components)])
reduced_dtm_tf.shape

(2185, 15)

In [40]:
reduced_dtm_tf.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
0,0.000168,0.012941,0.309521,0.000168,0.072652,0.564456,0.000168,0.000168,0.000168,0.000168,0.000168,0.000168,0.000168,0.038751,0.000168
1,0.00045,0.00045,0.00045,0.00045,0.111673,0.113502,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.577152,0.00045,0.192718
2,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.618954,0.00068,0.131277,0.00068,0.00068,0.241605,0.00068
3,0.000151,0.037151,0.000151,0.000151,0.000151,0.528791,0.000151,0.000151,0.027521,0.000151,0.000151,0.000151,0.000151,0.000151,0.404874
4,0.061285,0.000538,0.000538,0.000538,0.19578,0.504231,0.000538,0.053001,0.000538,0.000538,0.000538,0.000538,0.000538,0.180326,0.000538


In [41]:
def get_topic_names(x):
    return x[x>0.1].to_dict()
reduced_dtm_tf.loc[:, 'mostly_about'] = reduced_dtm_tf.apply(get_topic_names, axis=1)

In [42]:
reduced_dtm_tf.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,mostly_about
0,0.000168,0.012941,0.309521,0.000168,0.072652,0.564456,0.000168,0.000168,0.000168,0.000168,0.000168,0.000168,0.000168,0.038751,0.000168,"{'topic_2': 0.30952064779929167, 'topic_5': 0...."
1,0.00045,0.00045,0.00045,0.00045,0.111673,0.113502,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.577152,0.00045,0.192718,"{'topic_4': 0.11167252677737557, 'topic_5': 0...."
2,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.618954,0.00068,0.131277,0.00068,0.00068,0.241605,0.00068,"{'topic_8': 0.6189542836324868, 'topic_10': 0...."
3,0.000151,0.037151,0.000151,0.000151,0.000151,0.528791,0.000151,0.000151,0.027521,0.000151,0.000151,0.000151,0.000151,0.000151,0.404874,"{'topic_5': 0.5287913666013795, 'topic_14': 0...."
4,0.061285,0.000538,0.000538,0.000538,0.19578,0.504231,0.000538,0.053001,0.000538,0.000538,0.000538,0.000538,0.000538,0.180326,0.000538,"{'topic_4': 0.19577970746143628, 'topic_5': 0...."


In [43]:
reduced_dtm_tf.loc[:, 'no_of_topics'] = reduced_dtm_tf['mostly_about'].apply(lambda x: len(x))

In [44]:
reduced_dtm_tf.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,mostly_about,no_of_topics
0,0.000168,0.012941,0.309521,0.000168,0.072652,0.564456,0.000168,0.000168,0.000168,0.000168,0.000168,0.000168,0.000168,0.038751,0.000168,"{'topic_2': 0.30952064779929167, 'topic_5': 0....",2
1,0.00045,0.00045,0.00045,0.00045,0.111673,0.113502,0.00045,0.00045,0.00045,0.00045,0.00045,0.00045,0.577152,0.00045,0.192718,"{'topic_4': 0.11167252677737557, 'topic_5': 0....",4
2,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.00068,0.618954,0.00068,0.131277,0.00068,0.00068,0.241605,0.00068,"{'topic_8': 0.6189542836324868, 'topic_10': 0....",3
3,0.000151,0.037151,0.000151,0.000151,0.000151,0.528791,0.000151,0.000151,0.027521,0.000151,0.000151,0.000151,0.000151,0.000151,0.404874,"{'topic_5': 0.5287913666013795, 'topic_14': 0....",2
4,0.061285,0.000538,0.000538,0.000538,0.19578,0.504231,0.000538,0.053001,0.000538,0.000538,0.000538,0.000538,0.000538,0.180326,0.000538,"{'topic_4': 0.19577970746143628, 'topic_5': 0....",3


In [45]:
reduced_dtm_tf['no_of_topics'].value_counts()

2    825
3    767
4    287
1    254
5     50
6      2
Name: no_of_topics, dtype: int64

In [46]:
reduced_dtm_tf[reduced_dtm_tf['no_of_topics'] == 0]

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,mostly_about,no_of_topics


In [47]:
topics

{'topic_0': {'words_str': '#woman, #country, #brazil, #black, #music, #trend, #technical, #plague, #death, #white',
  'words': {'woman': 295.8352776157405,
   'country': 152.8645733002662,
   'brazil': 139.74893830198724,
   'black': 121.70954171216943,
   'music': 104.792084141382,
   'trend': 74.04677792260088,
   'technical': 69.39006419048542,
   'plague': 68.85755816731393,
   'death': 65.58216656728166,
   'white': 62.11874189062583}},
 'topic_1': {'words_str': '#software, #change, #project, #build, #development, #agile, #process, #organization, #service, #developer',
  'words': {'software': 957.3085578975662,
   'change': 703.2097343339267,
   'project': 676.1621800295693,
   'build': 574.807926265171,
   'development': 544.4248146285677,
   'agile': 481.9616304688019,
   'process': 472.0548293029117,
   'organization': 372.59456240478437,
   'service': 344.17199591254683,
   'developer': 341.8456758187348}},
 'topic_2': {'words_str': '#product, #company, #business, #design, #pe

In [48]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  head(R).drop('saliency', 1)


#### Using different MDS functions

With `sklearn` installed, other MDS functions, such as MMDS and TSNE can be used for plotting if the default PCoA is not satisfactory.

In [49]:
# pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

In [50]:
# pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')

### Tf-IDf LDA

In [51]:
tf_idf_vectorizer = TfidfVectorizer(strip_accents = 'ascii',
                                    stop_words = 'english',
                                    lowercase = True,
                                    analyzer = 'word',
                                    token_pattern = r'\b[a-zA-Z]{5,}\b',
                                    ngram_range = (1, 1),
                                    max_df = 0.8, min_df = 1,
                                    #max_features = 1000,
                                    norm='l2', 
                                    use_idf=True, smooth_idf=True, sublinear_tf=True)
lda_tf_idf = LatentDirichletAllocation(n_components=15, random_state=0)

start_time = default_timer()

dtm_tf_idf = tf_idf_vectorizer.fit_transform(eng_shared_articles_content_df['text_content'])
lda_tf_idf.fit(dtm_tf_idf)

end_time = default_timer()
time_taken_str = convert_sec(end_time - start_time)
print("Time Taken : {}".format(time_taken_str))

tf_idf_feature_names = tf_idf_vectorizer.get_feature_names()
topics = get_topics(lda_tf_idf, tf_idf_feature_names, n_top_words=10)

Time Taken : 6.2821    sec
topic_0 : #kittlaus, #bryant, #bluemix, #shots, #whiteboard, #jamboard, #sussman, #instrumentation, #disclosure, #bootloader
topic_1 : #firebase, #swarm, #balancer, #microservice, #stackdriver, #downtime, #gcloud, #layout, #niantic, #proxy
topic_2 : #aggregations, #aggregation, #elasticsearch, #facets, #raspberry, #sorted, #buffer, #gradle, #macos, #sierra
topic_3 : #mycroft, #thunder, #burda, #kickstarter, #tesla, #skully, #walgreens, #publishing, #interviewer, #coupons
topic_4 : #layout, #arguments, #debate, #bigquery, #graphs, #constraints, #angular, #optimization, #micro, #checklist
topic_5 : #sprints, #kotlin, #optionals, #dagger, #kibana, #espresso, #retrolambda, #subcommittee, #unicode, #safari
topic_6 : #female, #scandic, #portrait, #kotlin, #bigquery, #sarcastic, #istanbul, #tattoo, #vintage, #dzone
topic_7 : #acquia, #protocol, #whatsapp, #ubuntu, #dynamic, #keyboard, #caching, #optimization, #exposed, #dropbox
topic_8 : #evaluations, #martech, #bra



In [52]:
dtm_tf_idf.shape

(2185, 36915)

In [53]:
lda_tf_idf.components_.shape

(15, 36915)

In [54]:
reduced_dtm_tf_idf = lda_tf_idf.transform(dtm_tf_idf)
reduced_dtm_tf_idf.shape

(2185, 15)

In [55]:
pyLDAvis.sklearn.prepare(lda_tf_idf, dtm_tf_idf, tf_idf_vectorizer)

  head(R).drop('saliency', 1)


#### Using different MDS functions

With `sklearn` installed, other MDS functions, such as MMDS and TSNE can be used for plotting if the default PCoA is not satisfactory.

In [56]:
# pyLDAvis.sklearn.prepare(lda_tf_idf, dtm_tf_idf, tf_idf_vectorizer, mds='mmds')

In [57]:
# pyLDAvis.sklearn.prepare(lda_tf_idf, dtm_tf_idf, tf_idf_vectorizer, mds='tsne')

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
