# Getting the topic distribution for each article

- Start with smaller success_urls df with goal13 completions
- Run LDA on it
- Get topic names printed
- Get topic distributions per html element
- Run same process on complete academy df

## 1. Importing test data

In [None]:
import pickle
import pandas as pd

In [None]:
# Load the pickled success_urls data (used below as a DataFrame) and show it.
pkl_path = '../02_LDA_model/pkl_cellar/success_urls.pkl'
with open(pkl_path, 'rb') as pkl_file:
    success_urls = pickle.load(pkl_file)

success_urls

## 2. Run LDA
Refer to 03_LDA_implement for more info on these functions

In [None]:
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
def remove_stopwords(texts):
    """Tokenize each document and drop English stop words.

    Args:
        texts: Iterable of documents; each one is converted with ``str()``
            before being tokenized by gensim's ``simple_preprocess``.

    Returns:
        A list with one list of tokens per document, in input order, with
        every token found in the module-level ``stop_words`` removed.
    """
    # Build the lookup set once per call: testing membership against the
    # stop_words list would rescan the whole list for every single token.
    stop_set = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in stop_set]
        for doc in texts
    ]

In [None]:
def bigrams(words, bi_min=15, tri_min=10):
    """Fit a gensim phrase model on tokenized documents.

    Args:
        words: Tokenized documents (an iterable of token lists) to learn from.
        bi_min: Passed to ``Phrases`` as ``min_count``.
        tri_min: Accepted but not used by this function.

    Returns:
        The ``Phraser`` built from the fitted ``Phrases`` model.
    """
    phrase_model = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrase_model)

In [None]:
def get_corpus(df):
    """Turn the ``content`` attribute of *df* into a bag-of-words corpus.

    Args:
        df: Object exposing ``content`` with one document per entry
            (called below with the success_urls data).

    Returns:
        A ``(corpus, id2word, bigram)`` tuple: the bag-of-words vector of each
        document, the gensim ``Dictionary`` they were built from, and the
        per-document token lists after applying the phrase model.
    """
    tokens = remove_stopwords(list(df.content))
    phrase_model = bigrams(tokens)
    phrased_docs = [phrase_model[doc] for doc in tokens]

    id2word = gensim.corpora.Dictionary(phrased_docs)
    # Pruning of the most common and rarest words is currently disabled
    # (the author was unsure whether to keep it):
    # id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()

    corpus = [id2word.doc2bow(doc) for doc in phrased_docs]
    return corpus, id2word, phrased_docs

In [None]:
# Build the bag-of-words corpus, dictionary and phrase-merged tokens for success_urls.
corpus, id2word, train_bigram = get_corpus(success_urls)

In [None]:
# Train a 10-topic LDA model on the success_urls corpus; random_state is
# pinned so repeated runs start from the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [25]:
# Show every topic as a formatted string of weighted top words.
lda_model.print_topics()

[(0,
  '0.048*"plan" + 0.038*"compani" + 0.030*"sustain" + 0.023*"first" + 0.023*"busi" + 0.020*"project" + 0.017*"carbon" + 0.016*"come" + 0.014*"fight" + 0.012*"bank"'),
 (1,
  '0.028*"chang" + 0.028*"way" + 0.022*"earli" + 0.016*"year" + 0.016*"today" + 0.016*"new" + 0.014*"one" + 0.014*"climat_chang" + 0.014*"believ" + 0.014*"futur"'),
 (2,
  '0.021*"sustain" + 0.017*"posit" + 0.015*"man" + 0.015*"open" + 0.015*"hope" + 0.013*"choic" + 0.013*"planetwid" + 0.013*"wednesday" + 0.013*"water" + 0.012*"ever"'),
 (3,
  '0.017*"emiss" + 0.016*"system" + 0.016*"energi" + 0.013*"make" + 0.013*"product" + 0.013*"environment" + 0.012*"use" + 0.011*"hand" + 0.011*"well" + 0.011*"help"'),
 (4,
  '0.026*"individu" + 0.018*"peopl" + 0.018*"climat" + 0.017*"climat_chang" + 0.016*"sloth" + 0.015*"also" + 0.015*"woman" + 0.015*"earth" + 0.015*"local" + 0.014*"clean"'),
 (5,
  '0.045*"ecolog" + 0.015*"natur" + 0.015*"opportun" + 0.013*"get" + 0.012*"journey" + 0.012*"becom" + 0.011*"transact" + 0.011

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim
#import matplotlib.pyplot as plt
#import warnings
#warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Visualize the topics
#pyLDAvis.enable_notebook(sort=True)
#vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
#pyLDAvis.display(vis)

## 3. Get the topic names printed

In [26]:
# Fetch the topics as (topic_id, [(word, weight), ...]) tuples rather than
# formatted strings, then display them: 10 topics with their top 10 words
# and weights.

model_topics = lda_model.show_topics(formatted=False)
model_topics

[(0,
  [('plan', 0.048151694),
   ('compani', 0.03785444),
   ('sustain', 0.030131614),
   ('first', 0.023210242),
   ('busi', 0.022793675),
   ('project', 0.019507527),
   ('carbon', 0.0165984),
   ('come', 0.016305203),
   ('fight', 0.01438183),
   ('bank', 0.012409103)]),
 (1,
  [('chang', 0.028057825),
   ('way', 0.027781893),
   ('earli', 0.022348398),
   ('year', 0.016183186),
   ('today', 0.015972517),
   ('new', 0.015830645),
   ('one', 0.0144107435),
   ('climat_chang', 0.014323255),
   ('believ', 0.014182873),
   ('futur', 0.013918093)]),
 (2,
  [('sustain', 0.020788154),
   ('posit', 0.016575467),
   ('man', 0.015145073),
   ('open', 0.01506497),
   ('hope', 0.014829749),
   ('choic', 0.013383618),
   ('planetwid', 0.013217439),
   ('wednesday', 0.0126706455),
   ('water', 0.012572415),
   ('ever', 0.012389411)]),
 (3,
  [('emiss', 0.016839137),
   ('system', 0.0157152),
   ('energi', 0.01552342),
   ('make', 0.013466747),
   ('product', 0.01310676),
   ('environment', 0.012

In [None]:
# Build one row per article: the probability of every topic, ordered by
# topic id, followed by the article's index label in success_urls.
num_topics = lda_model.num_topics  # avoids hard-coding the topic count
topic_dist = []

for row_label, bow in zip(success_urls.index, corpus):
    top_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    # get_document_topics returns (topic_id, probability) pairs and may leave
    # out topics with a vanishingly small probability even when the threshold
    # is 0.0, so look values up by topic id instead of by list position.
    probs_by_topic = dict(top_topics)
    topic_vec = [probs_by_topic.get(topic_id, 0.0) for topic_id in range(num_topics)]
    topic_vec.append(row_label)
    topic_dist.append(topic_vec)


In [None]:
# Inspect the rows: topic probabilities followed by the article index label.
topic_dist

## 4. Use top 2 words to name topics

In [None]:
# Name each topic after its two highest-weighted words, joined by "_".
topic_titles = []

for _topic_id, top_words in model_topics:
    (word, _), (word1, _) = top_words[:2]
    topic_titles.append(f'{word}_{word1}')
    

In [27]:
# Inspect the generated titles (one per topic, in model_topics order).
topic_titles

['plan_compani',
 'chang_way',
 'sustain_posit',
 'emiss_system',
 'individu_peopl',
 'ecolog_natur',
 'realiti_august',
 'top_iso',
 'go_ye',
 'day_year']

## 5. Assign topic names to values in topic_dist

- create dict with article_index:probability_list
- attach topic names to probabilities for readability
- sort each article's probabilities in descending order

In [None]:
# STEP 1: INDICES DICTIONARY
# Convert every row of topic_dist into a key:value pair of
# article index : list of topic probabilities.
# The index label is the last element of each row; everything before it
# is the probability vector.

dist = {row[-1]: row[:-1] for row in topic_dist}

dist

In [None]:
# STEP 2: ADD TOPIC NAMES TO PROBABILITIES

# Preview on the first article: pair each topic title with its probability.
# Titles are matched to probabilities purely by list position, so the
# original order of both lists must be maintained.
first_probs = list(dist.values())[0]
topic_dict = dict(zip(topic_titles, first_probs))
topic_dict


In [None]:
# Replace each article's probability list with a {topic_title: probability}
# mapping. Only existing keys are reassigned, so iterating dist is safe.
# NOTE: run this cell once; a second run would zip the titles against the
# keys of the mapping instead of against probabilities.
for article_idx, probs in dist.items():
    dist[article_idx] = dict(zip(topic_titles, probs))


In [None]:
# Inspect the per-article {topic_title: probability} mappings.
dist

In [None]:
# First item in dict.
# Look the first key up instead of hard-coding 12, so this cell still works
# when the data's index does not contain 12 (e.g. a different DataFrame).

first_article = next(iter(dist))
dist[first_article]

In [None]:
# Preview the sort on the first article: topics ordered by descending
# probability. The first key is looked up rather than hard-coded as 12.
first_article = next(iter(dist))
dict(sorted(dist[first_article].items(), key=lambda x: x[1], reverse=True))

In [28]:
# Reorder every article's topics from most to least probable.
for article_idx, topic_probs in dist.items():
    ranked = sorted(topic_probs.items(), key=lambda pair: pair[1], reverse=True)
    dist[article_idx] = dict(ranked)

dist

{12: {'realiti_august': 0.5028347,
  'plan_compani': 0.22393243,
  'go_ye': 0.15211035,
  'emiss_system': 0.039846957,
  'individu_peopl': 0.02663375,
  'chang_way': 0.02100196,
  'day_year': 0.01212562,
  'sustain_posit': 0.011783422,
  'ecolog_natur': 0.008098455,
  'top_iso': 0.001632354},
 13: {'emiss_system': 0.7738138,
  'ecolog_natur': 0.08818717,
  'realiti_august': 0.057266843,
  'plan_compani': 0.020330243,
  'individu_peopl': 0.01714788,
  'chang_way': 0.013785017,
  'go_ye': 0.0131573975,
  'day_year': 0.007663212,
  'sustain_posit': 0.007614712,
  'top_iso': 0.0010337471},
 14: {'ecolog_natur': 0.558914,
  'plan_compani': 0.21178292,
  'emiss_system': 0.057654023,
  'chang_way': 0.04362093,
  'individu_peopl': 0.038511124,
  'go_ye': 0.029591896,
  'realiti_august': 0.024015065,
  'day_year': 0.017029589,
  'sustain_posit': 0.01658323,
  'top_iso': 0.0022972757},
 15: {'plan_compani': 0.51487213,
  'emiss_system': 0.123102605,
  'individu_peopl': 0.08210042,
  'chang_way':