# Getting the topic distribution for each article

- Start with smaller success_urls df with goal13 completions
- Run LDA on it
- Get topic names printed
- Get topic distributions per html element

## 1. Importing test data

In [None]:
import pickle
import pandas as pd


In [None]:
# Load the pickled corpus that is passed to LdaModel as `corpus` below.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../02_LDA_model/pkl_cellar/new_success_corpus.pkl', 'rb') as file:
    success_corpus = pickle.load(file)

In [None]:
# Load the pickled id-to-word mapping that is passed to LdaModel as `id2word` below.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../02_LDA_model/pkl_cellar/new_success_id2word.pkl', 'rb') as file:
    success_id2word = pickle.load(file)

In [None]:
# Load the pickled bigram object saved alongside the corpus.
# NOTE(review): success_bigram is not referenced by any later cell visible here --
# confirm whether it is still needed.
with open('../02_LDA_model/pkl_cellar/new_success_train_bigram.pkl', 'rb') as file:
    success_bigram = pickle.load(file)

## 2. Running the model

In [None]:
import gensim


In [None]:
# Train a 10-topic LDA model on the success corpus.
# Per gensim's LdaModel documentation:
#   random_state=42      -> fixed seed so the learned topics are reproducible
#   update_every=1, chunksize=100 -> online training, updating after each 100-document chunk
#   passes=10            -> ten passes over the whole corpus
#   alpha='auto'         -> the document-topic prior is learned from the data
#   per_word_topics=True -> the model also tracks the most likely topics per word
lda_model = gensim.models.ldamodel.LdaModel(corpus=success_corpus,
                                           id2word=success_id2word,
                                           num_topics=10,
                                           random_state=42,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [17]:
# Show each topic as a formatted "weight*word + ..." string of its top words.
lda_model.print_topics()

[(0,
  '0.044*"sustain" + 0.031*"compani" + 0.026*"busi" + 0.020*"emiss" + 0.017*"way" + 0.016*"impact" + 0.016*"environ" + 0.015*"like" + 0.013*"product" + 0.011*"make"'),
 (1,
  '0.031*"bank" + 0.021*"act" + 0.021*"includ" + 0.021*"campaign" + 0.019*"invest" + 0.017*"solar" + 0.017*"financ" + 0.012*"fuel" + 0.010*"demonstr" + 0.010*"lead"'),
 (2,
  '0.018*"system" + 0.017*"energi" + 0.015*"use" + 0.013*"project" + 0.013*"resourc" + 0.012*"go" + 0.012*"clean" + 0.012*"hand" + 0.012*"one" + 0.011*"may"'),
 (3,
  '0.024*"bear" + 0.023*"long" + 0.019*"polar" + 0.018*"social" + 0.017*"europ" + 0.016*"pop" + 0.015*"die" + 0.014*"coral" + 0.014*"reef" + 0.014*"awar"'),
 (4,
  '0.024*"chang" + 0.021*"plan" + 0.020*"wast" + 0.015*"climat" + 0.015*"human" + 0.014*"individu" + 0.014*"work" + 0.014*"well" + 0.013*"woman" + 0.012*"peopl"'),
 (5,
  '0.051*"climat_chang" + 0.026*"ye" + 0.019*"earli" + 0.016*"local" + 0.016*"earth" + 0.015*"datum" + 0.014*"today" + 0.014*"understand" + 0.013*"footpr

## 3. Topic titles and distributions

In [None]:
# Load the pickled success_urls DataFrame that the topic columns are attached to
# later in this notebook, then preview its first 10 rows.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('../02_LDA_model/pkl_cellar/success_urls.pkl', 'rb') as file:
    success_urls = pickle.load(file)
    
success_urls.head(10)

In [None]:
# Displays the 10 topics and their top 10 words and weights.
# With formatted=False the result is a list of (topic_id, [(word, weight), ...])
# pairs, which is the structure unpacked below to build the topic titles.

model_topics = lda_model.show_topics(formatted=False)
model_topics


In [None]:
# Name every topic after its first two listed words, joined by an underscore
# (e.g. "sustain_compani"). Each entry of model_topics is a
# (topic_id, [(word, weight), ...]) pair.
topic_titles = [
    f'{terms[0][0]}_{terms[1][0]}'
    for _topic_id, terms in model_topics
]
    

In [None]:
# Inspect the generated topic titles (one "word_word" string per topic).
topic_titles

In [None]:
# Build one row per document: the probability of every topic (ordered by topic
# id) followed by the document's index label in success_urls.
#
# Probabilities are looked up by topic id rather than by list position because
# gensim never returns entries below 1e-8 even with minimum_probability=0.0;
# positional indexing would then shift values into the wrong topic slot or
# raise IndexError. Missing topics are recorded as 0.0.
#
# NOTE(review): assumes success_corpus[k] is the bag-of-words for the k-th row
# of success_urls -- confirm the two were built in the same order.
topic_dist = []

for row_pos, row_label in enumerate(success_urls.index):
    doc_topics = dict(
        lda_model.get_document_topics(success_corpus[row_pos], minimum_probability=0.0)
    )
    # Size the vector from the model instead of a hard-coded 10.
    topic_vec = [doc_topics.get(topic_id, 0.0) for topic_id in range(lda_model.num_topics)]
    topic_vec.append(row_label)
    topic_dist.append(topic_vec)
    

In [None]:
# Inspect the rows: each is the per-topic probabilities plus the trailing index label.
topic_dist

## 4. Adding topic probabilities to the df as columns

In [None]:
# Split topic_dist column-wise: top_k collects the k-th value of every row,
# i.e. the probability of topic k for each document. The trailing index label
# stored at the end of each row is not used here.
(top_0, top_1, top_2, top_3, top_4,
 top_5, top_6, top_7, top_8, top_9) = (
    [row[k] for row in topic_dist] for k in range(10)
)


In [None]:
# Attach one probability column per topic to success_urls. Column names come
# from topic_titles (position k names topic k) so they always match the fitted
# model instead of being typed by hand, and the values are read straight from
# the k-th entry of every topic_dist row.
if len(set(topic_titles)) != len(topic_titles):
    # Two topics sharing the same top-2 words would silently overwrite a column.
    raise ValueError('topic_titles contains duplicates; topic columns would overwrite each other')

success_urls = success_urls.assign(**{
    title: [row[topic_id] for row in topic_dist]
    for topic_id, title in enumerate(topic_titles)
})


In [18]:
# Display the DataFrame with the per-topic probability columns appended.
success_urls


Unnamed: 0,url,content,target_g13,sustain_compani,bank_act,system_energi,bear_long,chang_plan,climat_chang_ye,day_year,ecolog_flower,plana_less,go_countri
12,https://plana.earth/academy/release-carbon-man...,proud present major updat softwar product plan...,1.0,0.093297,0.007912,0.046300,0.008560,0.056794,0.502548,0.008783,0.254924,0.003447,0.017434
13,https://plana.earth/academy/release-carbon-man...,make seriou upgrad help measur accur carbon fo...,1.0,0.145241,0.005025,0.025333,0.005456,0.744880,0.020995,0.005598,0.034342,0.002197,0.010933
14,https://plana.earth/academy/release-carbon-man...,want get inform book demo call discov plan car...,1.0,0.088589,0.011018,0.053965,0.011963,0.740156,0.044545,0.012274,0.009204,0.004817,0.023470
15,https://plana.earth/academy/release-carbon-man...,carbon account busi,1.0,0.374308,0.023452,0.251044,0.025464,0.123921,0.095894,0.026127,0.019576,0.010254,0.049959
16,https://plana.earth/academy/release-carbon-man...,emiss dashboard add depth context carbon footp...,1.0,0.597571,0.004254,0.021230,0.004619,0.058250,0.129509,0.004739,0.003551,0.167199,0.009078
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3081,https://plana.earth/academy/earth-overshoot-day/,pick futur,1.0,0.197277,0.027304,0.128433,0.029646,0.132877,0.107347,0.030418,0.022792,0.011938,0.311968
3082,https://plana.earth/academy/earth-overshoot-day/,movethed plana,1.0,0.128445,0.027304,0.128431,0.029646,0.132876,0.101471,0.030418,0.022791,0.340453,0.058164
3156,https://plana.earth/academy/a-message-from-the...,month intens investig topic sloth sloth habita...,1.0,0.028792,0.005525,0.178165,0.006001,0.357185,0.063034,0.272055,0.004612,0.002416,0.082216
3157,https://plana.earth/academy/a-message-from-the...,wednesday may plan launch first planetwid camp...,1.0,0.060877,0.089431,0.136143,0.013790,0.287210,0.355187,0.014150,0.010602,0.005553,0.027056
