# Getting the topic distribution for each article

- Start with smaller success_urls df with goal13 completions
- Run LDA on it
- Get topic names printed
- Get topic distributions per html element

## 1. Importing test data

In [None]:
import pickle
import pandas as pd


In [None]:
# Load the pickled corpus that is later handed to the LDA model as `corpus`.
with open('../04_Data/success_g13_corpus.pkl', 'rb') as corpus_file:
    success_corpus = pickle.load(corpus_file)

In [None]:
# Load the pickled id2word mapping that is later handed to the LDA model as `id2word`.
with open('../04_Data/success_g13_id2word.pkl', 'rb') as id2word_file:
    success_id2word = pickle.load(id2word_file)

In [None]:
# Load the pickled bigram object.
# NOTE(review): success_bigram is not referenced again in the visible part of this notebook.
with open('../04_Data/success_g13_train_bigram.pkl', 'rb') as bigram_file:
    success_bigram = pickle.load(bigram_file)

## 2. Running the model

In [None]:
import gensim


In [None]:
# Train a 10-topic LDA model on the success corpus.
# random_state is fixed so that re-running the cell uses the same seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=success_corpus,
    id2word=success_id2word,
    num_topics=10,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [26]:
# Display every topic as a formatted string of weighted words (see the output below).
lda_model.print_topics()

[(0,
  '0.018*"find" + 0.015*"datum" + 0.014*"also" + 0.013*"begin" + 0.012*"look" + 0.011*"one" + 0.010*"solut" + 0.010*"everi" + 0.010*"new" + 0.009*"creatur"'),
 (1,
  '0.034*"climat" + 0.023*"sloth" + 0.021*"hand" + 0.021*"individu" + 0.019*"bank" + 0.017*"would" + 0.017*"habitat" + 0.016*"group" + 0.015*"level" + 0.014*"want"'),
 (2,
  '0.032*"emiss" + 0.029*"energi" + 0.016*"lca" + 0.014*"process" + 0.013*"life_cycl" + 0.012*"long" + 0.012*"product" + 0.011*"bitcoin" + 0.011*"blockchain" + 0.010*"scope"'),
 (3,
  '0.027*"approach" + 0.022*"result" + 0.020*"act" + 0.015*"economi" + 0.014*"carbon_footprint" + 0.013*"drive" + 0.013*"transit" + 0.012*"effici" + 0.012*"associ" + 0.012*"stakehold"'),
 (4,
  '0.030*"credit" + 0.028*"entir" + 0.023*"month" + 0.021*"away" + 0.014*"read" + 0.013*"oper" + 0.013*"repres" + 0.012*"live" + 0.012*"happen" + 0.011*"still"'),
 (5,
  '0.034*"plastic" + 0.032*"posit" + 0.023*"man" + 0.021*"water" + 0.019*"planetwid" + 0.017*"increas" + 0.016*"woman

## 3. Topic titles and distributions

In [None]:
# Load the goal-13 success dataframe; the topic columns are attached to it further down.
with open('../04_Data/success_g13.pkl', 'rb') as frame_file:
    success_g13 = pickle.load(frame_file)

success_g13.head(10)

In [None]:
# Unformatted topics: one (topic_id, [(word, weight), ...]) pair per topic.
# With show_topics' default limits this is the 10 topics and their top 10 words and weights.

model_topics = lda_model.show_topics(formatted=False)
model_topics


In [None]:
# Name each topic after the first two words in its word list, e.g. "find_datum".
topic_titles = [
    f'{ranked_words[0][0]}_{ranked_words[1][0]}'
    for _topic_id, ranked_words in model_topics
]
    

In [None]:
# Inspect the generated topic titles.
topic_titles

In [None]:
# One row per document: the probability of every topic (ordered by topic id),
# followed by the document's index label in success_g13.
topic_dist = []

for doc_pos, doc_label in enumerate(success_g13.index):
    doc_topics = lda_model.get_document_topics(success_corpus[doc_pos], minimum_probability=0.0)
    # get_document_topics returns sparse (topic_id, probability) pairs, so place each
    # probability by its topic id instead of assuming exactly 10 entries in positional order.
    topic_vec = [0.0] * lda_model.num_topics
    for topic_id, prob in doc_topics:
        topic_vec[topic_id] = prob
    topic_vec.append(doc_label)
    topic_dist.append(topic_vec)
    

In [None]:
# Inspect the per-document topic vectors (each row ends with the document's index label).
topic_dist

## 4. Adding topic probabilities to the df as columns

In [None]:
# Split topic_dist column-wise: top_k holds topic k's probability for every document.
(top_0, top_1, top_2, top_3, top_4,
 top_5, top_6, top_7, top_8, top_9) = (
    [row[k] for row in topic_dist] for k in range(10)
)


In [None]:
# Show the titles again; the column names in the next cell are written out by hand.
topic_titles

In [None]:
# Column name -> per-document probabilities, in topic order (topic 0 first).
topic_columns = {
    'find_datum': top_0,
    'climat_sloth': top_1,
    'emiss_energi': top_2,
    'approach_result': top_3,
    'credit_entir': top_4,
    'plastic_posit': top_5,
    'bear_polar': top_6,
    'go_wast': top_7,
    'alreadi_earli': top_8,
    'sustain_climat_chang': top_9,
}
success_g13 = success_g13.assign(**topic_columns)


In [None]:
# Check that the topic columns were added.
success_g13.head()


## 5. LDA model function
A df goes in; the same df with one topic-probability column per topic comes out.

### 5.1 Setting up the LDA model

In [None]:
def run_lda(lda_corpus, lda_id2word, num_topics=10, random_state=42, passes=10):
    """Train and return a gensim LDA model.

    Parameters
    ----------
    lda_corpus :
        Corpus passed to ``LdaModel`` as ``corpus``.
    lda_id2word :
        Mapping passed to ``LdaModel`` as ``id2word``.
    num_topics : int, optional
        Number of topics to fit (default 10, the value previously hard-coded).
    random_state : int, optional
        Seed forwarded to ``LdaModel`` (default 42).
    passes : int, optional
        Number of training passes (default 10).

    Returns
    -------
    The trained ``gensim.models.ldamodel.LdaModel`` instance.
    """
    # The remaining hyper-parameters keep the values used throughout this notebook.
    return gensim.models.ldamodel.LdaModel(corpus=lda_corpus,
                                           id2word=lda_id2word,
                                           num_topics=num_topics,
                                           random_state=random_state,
                                           update_every=1,
                                           chunksize=100,
                                           passes=passes,
                                           alpha='auto',
                                           per_word_topics=True)

### 5.2 Getting the topic titles

In [None]:
def topic_titles(lda_model):
    """Build a short title for every topic of ``lda_model``.

    Each title is the topic's first two listed words joined by an underscore,
    e.g. ``"find_datum"``. A topic that lists fewer than two words gets a
    title made of whatever words it has.

    Parameters
    ----------
    lda_model :
        Object exposing ``show_topics(num_topics=..., formatted=False)`` that
        returns ``(topic_id, [(word, weight), ...])`` pairs.

    Returns
    -------
    list of str
        One title per topic, in the order returned by ``show_topics``.
    """
    # num_topics=-1 requests every topic instead of the default cap of 10,
    # so a model with more topics still gets one title per topic.
    model_topics = lda_model.show_topics(num_topics=-1, formatted=False)

    titles = []
    for _topic_id, word_list in model_topics:
        # word_list holds (word, weight) pairs; only the leading two words are used.
        leading_words = [str(word) for word, _weight in word_list[:2]]
        titles.append('_'.join(leading_words))

    return titles

### 5.3 Get topic distribution

In [None]:
def topic_distribution(df, lda_model, corpus):
    """Compute the topic probability vector for every row of ``df``.

    Row ``k`` of ``df`` is matched with ``corpus[k]``.

    Parameters
    ----------
    df :
        Frame whose ``index`` supplies one label per document.
    lda_model :
        Model exposing ``num_topics`` and
        ``get_document_topics(bow, minimum_probability=...)``.
    corpus :
        Indexable collection of documents, position-aligned with ``df``.

    Returns
    -------
    list of list
        One entry per row: ``num_topics`` probabilities ordered by topic id,
        followed by the row's index label as the last element.
    """
    num_topics = lda_model.num_topics
    topic_dist = []

    for row_pos, row_label in enumerate(df.index):
        doc_topics = lda_model.get_document_topics(corpus[row_pos], minimum_probability=0.0)
        # The model returns sparse (topic_id, probability) pairs; fill a dense vector
        # by topic id so a missing topic becomes 0.0 instead of shifting positions.
        topic_vec = [0.0] * num_topics
        for topic_id, prob in doc_topics:
            topic_vec[topic_id] = prob
        topic_vec.append(row_label)
        topic_dist.append(topic_vec)

    return topic_dist

### 5.4 Add topic distributions to df

In [None]:
def get_lda_results(topic_titles, topic_dist, df):
    """Attach one column per topic title to ``df`` and return it.

    The k-th title receives the k-th element of every row in ``topic_dist``.
    ``df`` is modified in place; the same object is returned.
    """
    position = 0
    for title in topic_titles:
        column = []
        for row in topic_dist:
            column.append(row[position])
        df[title] = column
        position += 1

    return df


### 5.5 Final function

In [None]:
def lda_results_to_df(corpus, id2word, df):
    """Run the whole pipeline: fit LDA, title the topics, add the topic columns.

    Trains a model with ``run_lda``, derives column names with
    ``topic_titles``, computes per-row topic vectors with
    ``topic_distribution`` and hands everything to ``get_lda_results``,
    whose return value is returned unchanged.
    """
    model = run_lda(corpus, id2word)
    column_names = topic_titles(model)
    row_vectors = topic_distribution(df, model, corpus)

    return get_lda_results(column_names, row_vectors, df)
    

## 6. Using the function on the full academy df

In [None]:
# Load the three full-academy inputs for lda_results_to_df:
# the corpus, the id2word mapping and the posts dataframe.
with open('../04_Data/full_academy_corpus.pkl', 'rb') as corpus_file:
    academy_corpus = pickle.load(corpus_file)

with open('../04_Data/full_academy_id2word.pkl', 'rb') as id2word_file:
    academy_id2word = pickle.load(id2word_file)

with open('../04_Data/academy_posts.pkl', 'rb') as posts_file:
    academy_df = pickle.load(posts_file)

In [27]:
# Fit LDA on the full academy corpus and attach one probability column per topic.
# NOTE: get_lda_results assigns the columns onto the frame it receives and returns it,
# so academy_df itself gains these columns (full_academy is the same object).
full_academy = lda_results_to_df(academy_corpus, academy_id2word, academy_df)
full_academy.head(10)

Unnamed: 0,url,title,published,content,tag,human_one,seem_focu,less_decid,even_long,plan_sustain,cultur_nativ,bee_music,tree_natur,water_credit,year_understand
0,https://plana.earth/academy/how-sustainable-is...,How sustainable is your office Christmas party?,2020-12-18,Christmas is just around the corner! Unfortuna...,p,0.12948,0.064434,0.003364,0.027903,0.233645,0.017146,0.03877,0.240459,0.014376,0.230422
1,https://plana.earth/academy/how-sustainable-is...,How sustainable is your office Christmas party?,2020-12-18,"Before we start, here are a few statistics on ...",p,0.170685,0.015211,0.003975,0.033012,0.046098,0.020283,0.015379,0.255159,0.017005,0.423192
2,https://plana.earth/academy/how-sustainable-is...,How sustainable is your office Christmas party?,2020-12-18,It is time for the office Christmas Party Quiz!,h2,0.201742,0.021895,0.005719,0.047522,0.065292,0.029197,0.022136,0.293736,0.024476,0.288286
3,https://plana.earth/academy/how-sustainable-is...,How sustainable is your office Christmas party?,2020-12-18,Find out how sustainable your Christmas Party ...,p,0.193324,0.020962,0.005484,0.045488,0.104568,0.027951,0.021197,0.350031,0.023436,0.207561
4,https://plana.earth/academy/how-sustainable-is...,How sustainable is your office Christmas party?,2020-12-18,May the Merry Force be with you!,h2,0.26924,0.024003,0.006269,0.052097,0.071576,0.032007,0.024266,0.352234,0.026832,0.141477
5,https://plana.earth/academy/how-joe-biden-u-s-...,How will Biden and the US rejoin the Paris Agr...,2020-12-15,It is the fifth anniversary of the Paris Clima...,p,0.124986,0.013552,0.003545,0.029408,0.067596,0.018071,0.013704,0.209292,0.015151,0.504694
6,https://plana.earth/academy/how-joe-biden-u-s-...,How will Biden and the US rejoin the Paris Agr...,2020-12-15,The US President-Elect Joe Biden vows to rejoi...,p,0.122715,0.013347,0.003475,0.028964,0.039784,0.017789,0.01348,0.176903,0.014908,0.568635
7,https://plana.earth/academy/how-joe-biden-u-s-...,How will Biden and the US rejoin the Paris Agr...,2020-12-15,This follows the United States’ withdrawal fro...,p,0.086032,0.022989,0.146809,0.03194,0.022954,0.025552,0.007729,0.154619,0.008546,0.492831
8,https://plana.earth/academy/how-joe-biden-u-s-...,How will Biden and the US rejoin the Paris Agr...,2020-12-15,"Today, the Trump Administration officially lef...",p,0.148504,0.016179,0.037333,0.035093,0.048196,0.021547,0.016324,0.217574,0.018054,0.441196
9,https://plana.earth/academy/how-joe-biden-u-s-...,How will Biden and the US rejoin the Paris Agr...,2020-12-15,"Today, the Trump Administration officially lef...",p,0.153953,0.016729,0.038197,0.036291,0.04986,0.022291,0.016897,0.22636,0.018685,0.420737


In [None]:
# Persist the academy dataframe together with its topic columns.
with open('../04_Data/lda_separated_posts.pkl', 'wb') as out_file:
    pickle.dump(full_academy, out_file, protocol=pickle.HIGHEST_PROTOCOL)