# Getting the topic distribution for each article

- Start with smaller success_urls df with goal13 completions
- Run LDA on it
- Get topic names printed
- Get topic distributions per html element

## 1. Importing test data

In [None]:
import pickle
import pandas as pd


In [None]:
# Load the pickled corpus that the LDA model is trained on.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('../04_Data/success_g13_corpus.pkl', mode='rb') as corpus_file:
    success_corpus = pickle.load(corpus_file)

In [None]:
# Load the pickled id -> word mapping that is passed to the LDA model as `id2word`.
with open('../04_Data/success_g13_id2word.pkl', mode='rb') as id2word_file:
    success_id2word = pickle.load(id2word_file)

In [None]:
# Load the pickled bigram object.
# NOTE(review): presumably the bigram-processed training texts or phrase model;
# it is not referenced again in the cells shown here - confirm it is still needed.
with open('../04_Data/success_g13_train_bigram.pkl', mode='rb') as bigram_file:
    success_bigram = pickle.load(bigram_file)

## 2. Running the model

In [None]:
import gensim


In [None]:
# Fit a 10-topic LDA model on the corpus and dictionary loaded above.
lda_model = gensim.models.LdaModel(
    corpus=success_corpus,
    id2word=success_id2word,
    num_topics=10,
    random_state=42,       # fixed seed so the fitted topics are reproducible
    passes=10,             # number of full sweeps over the corpus
    chunksize=100,         # documents per training chunk
    update_every=1,        # update the model after every chunk (online training)
    alpha='auto',          # learn the document-topic prior from the data
    per_word_topics=True,  # also compute the most likely topics per word
)

In [22]:
# Show every topic as a formatted string of its highest-weighted terms ("weight*term + ...").
lda_model.print_topics()

[(0,
  '0.018*"find" + 0.015*"datum" + 0.014*"also" + 0.013*"begin" + 0.012*"look" + 0.011*"one" + 0.010*"solut" + 0.010*"everi" + 0.010*"new" + 0.009*"creatur"'),
 (1,
  '0.034*"climat" + 0.023*"sloth" + 0.021*"hand" + 0.021*"individu" + 0.019*"bank" + 0.017*"would" + 0.017*"habitat" + 0.016*"group" + 0.015*"level" + 0.014*"want"'),
 (2,
  '0.032*"emiss" + 0.029*"energi" + 0.016*"lca" + 0.014*"process" + 0.013*"life_cycl" + 0.012*"long" + 0.012*"product" + 0.011*"bitcoin" + 0.011*"blockchain" + 0.010*"scope"'),
 (3,
  '0.027*"approach" + 0.022*"result" + 0.020*"act" + 0.015*"economi" + 0.014*"carbon_footprint" + 0.013*"drive" + 0.013*"transit" + 0.012*"effici" + 0.012*"associ" + 0.012*"stakehold"'),
 (4,
  '0.030*"credit" + 0.028*"entir" + 0.023*"month" + 0.021*"away" + 0.014*"read" + 0.013*"oper" + 0.013*"repres" + 0.012*"live" + 0.012*"happen" + 0.011*"still"'),
 (5,
  '0.034*"plastic" + 0.032*"posit" + 0.023*"man" + 0.021*"water" + 0.019*"planetwid" + 0.017*"increas" + 0.016*"woman

## 3. Topic titles and distributions

In [None]:
# Load the goal-13 success dataframe that the topic columns will be attached to.
with open('../04_Data/success_g13.pkl', mode='rb') as frame_file:
    success_g13 = pickle.load(frame_file)

# Preview the first ten rows.
success_g13.head(10)

In [None]:
# Collect every topic as (topic_id, [(word, weight), ...]) with its top 10 words.
# num_topics is passed explicitly: the show_topics default of 10 returns a
# reordered subset of topics when the model has more than 10, which would
# break the positional topic -> title mapping built from this list.
model_topics = lda_model.show_topics(
    num_topics=lda_model.num_topics,
    num_words=10,
    formatted=False,
)
model_topics


In [None]:
# Name each topic after its two highest-weighted terms, joined by an underscore
# (e.g. "emiss_energi"). Each entry of model_topics is (topic_id, [(term, weight), ...]).
topic_titles = [
    f'{terms[0][0]}_{terms[1][0]}'
    for _topic_id, terms in model_topics
]
    

In [None]:
# Inspect the generated topic titles.
topic_titles

In [None]:
# Build one row per document: [p(topic 0), ..., p(topic K-1), dataframe index label].
# Assumes success_corpus[i] is the bag-of-words for the i-th row of success_g13
# (same order, same length) - TODO confirm against the notebook that built the pickles.
n_topics = lda_model.num_topics
topic_dist = []

for row_pos, row_label in enumerate(success_g13.index):
    # get_document_topics returns a sparse list of (topic_id, probability) pairs.
    # gensim clamps minimum_probability to 1e-8, so a topic can still be missing;
    # look probabilities up by topic id instead of by list position so a dropped
    # topic cannot shift the remaining columns or raise IndexError.
    doc_topics = dict(
        lda_model.get_document_topics(success_corpus[row_pos], minimum_probability=0.0)
    )
    topic_vec = [doc_topics.get(topic_id, 0.0) for topic_id in range(n_topics)]
    topic_vec.append(row_label)
    topic_dist.append(topic_vec)
    

In [None]:
# Preview only the first few rows; the full list holds one entry per document
# and would flood the cell output.
topic_dist[:5]

## 4. Adding topic distributions to the df as columns

In [None]:
# Split topic_dist column-wise into one list of probabilities per topic.
# Only the first ten positions of each row are read; the trailing element
# (the dataframe index label) is ignored here.
(top_0, top_1, top_2, top_3, top_4,
 top_5, top_6, top_7, top_8, top_9) = [
    [row[topic_pos] for row in topic_dist]
    for topic_pos in range(10)
]


In [None]:
# Display the topic titles again for reference: they are the names of the
# per-topic probability columns added to the dataframe next.
topic_titles

In [None]:
# Attach one probability column per topic to the dataframe.
# Column names come from topic_titles (in topic order) instead of being copied
# by hand, so they cannot drift out of sync with the fitted model.
topic_columns = [top_0, top_1, top_2, top_3, top_4,
                 top_5, top_6, top_7, top_8, top_9]

# Duplicate or missing titles would silently drop columns when zipped into a dict.
if len(topic_titles) != len(topic_columns) or len(set(topic_titles)) != len(topic_titles):
    raise ValueError(
        f'expected {len(topic_columns)} unique topic titles, got {topic_titles!r}'
    )

success_g13 = success_g13.assign(**dict(zip(topic_titles, topic_columns)))


In [23]:
# Check that the per-topic probability columns were added.
success_g13.head()


Unnamed: 0,url,content,target_g13,find_datum,climat_sloth,emiss_energi,approach_result,credit_entir,plastic_posit,bear_polar,go_wast,alreadi_earli,sustain_climat_chang
134,https://plana.earth/academy/why-president-of-e...,"This Tuesday 1st December, the annual Digital ...",1.0,0.01289,0.007099,0.010526,0.103633,0.004172,0.002692,0.001743,0.008558,0.382059,0.466629
135,https://plana.earth/academy/why-president-of-e...,BREAKING: “Plan a is using scientific models t...,1.0,0.019767,0.010847,0.016415,0.0059,0.006402,0.004131,0.002674,0.012825,0.013341,0.907698
136,https://plana.earth/academy/why-president-of-e...,BREAKING: “Plan a is using scientific models t...,1.0,0.022347,0.012253,0.018589,0.006665,0.007231,0.004666,0.003021,0.014487,0.01507,0.89567
137,https://plana.earth/academy/why-president-of-e...,What is digital sustainability?,1.0,0.083676,0.046089,0.068636,0.025074,0.027208,0.017556,0.011366,0.054507,0.056702,0.609185
138,https://plana.earth/academy/why-president-of-e...,Digitization is an inherent part of our daily ...,1.0,0.019991,0.010071,0.015119,0.005485,0.005945,0.003836,0.002484,0.012075,0.012446,0.912549
