# Getting the topic distribution for each article

- Start with smaller success_urls df with goal13 completions
- Run LDA on it
- Get topic names printed
- Get topic distributions per html element
- Run same process on complete academy df

## 1. Importing test data

In [None]:
import pickle
import pandas as pd

In [None]:
# Load the pickled success_urls DataFrame and display it.
success_urls_path = '../02_LDA_model/pkl_cellar/success_urls.pkl'
with open(success_urls_path, 'rb') as file:
    success_urls = pickle.load(file)

success_urls

## 2. Run LDA
Refer to 03_LDA_implement for more info on these functions

In [None]:
import gensim
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
# English stop words from NLTK; remove_stopwords() filters tokens against this.
stop_words = stopwords.words('english')

In [None]:
def remove_stopwords(texts):
    """Tokenize each document and drop English stop words.

    Every item of ``texts`` is converted with ``str()``, tokenized with
    gensim's ``simple_preprocess`` and filtered against the module-level
    ``stop_words`` collection.

    Args:
        texts: Iterable of documents (anything ``str()`` accepts).

    Returns:
        A list with one list of tokens per document, in input order.
    """
    # Build the lookup set once per call: testing every token against the
    # stop-word list directly is a linear scan each time.
    stop_set = set(stop_words)
    return [
        [token for token in simple_preprocess(str(doc))
         if token not in stop_set]
        for doc in texts
    ]

In [None]:
def bigrams(words, bi_min=15, tri_min=10):
    """Train a bigram phrase detector on tokenized documents.

    Args:
        words: Tokenized documents (a list of token lists).
        bi_min: Passed to ``gensim.models.Phrases`` as ``min_count``.
        tri_min: Accepted for interface compatibility; not used here.

    Returns:
        A ``Phraser`` built from the trained ``Phrases`` model; index it
        with a token list to get that list with detected bigrams merged.
    """
    phrases = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrases)

In [None]:
def get_corpus(df):
    """Turn the ``content`` column of ``df`` into LDA training inputs.

    Steps: remove stop words, merge detected bigrams, build a gensim
    dictionary, then convert every document to bag-of-words form.

    Args:
        df: Object exposing the documents as ``df.content``.

    Returns:
        A tuple ``(corpus, id2word, bigram)``: the bag-of-words corpus,
        the gensim ``Dictionary``, and the bigram-merged token lists.
    """
    tokenized_docs = remove_stopwords(list(df.content))
    phraser = bigrams(tokenized_docs)
    phrased_docs = [phraser[doc] for doc in tokenized_docs]

    dictionary = gensim.corpora.Dictionary(phrased_docs)

    # Filtering out the most common and the rarest words is currently
    # disabled; it may not be needed at all:
    # dictionary.filter_extremes(no_below=10, no_above=0.35)

    dictionary.compactify()
    bow_corpus = [dictionary.doc2bow(doc) for doc in phrased_docs]

    return bow_corpus, dictionary, phrased_docs

In [None]:
# Build the bag-of-words corpus, the dictionary and the bigram-merged
# token lists for the success_urls articles.
corpus, id2word, train_bigram = get_corpus(success_urls)

In [None]:
# Fit the LDA topic model on the bag-of-words corpus.
# The training settings are gathered in one place so they are easy to tweak.
lda_settings = {
    'num_topics': 10,
    'random_state': 42,  # fixed seed
    'update_every': 1,
    'chunksize': 100,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           **lda_settings)

In [None]:
# Inspect the fitted topics (the notebook displays the returned value).
lda_model.print_topics()

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim
#import matplotlib.pyplot as plt
#import warnings
#warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Visualize the topics
#pyLDAvis.enable_notebook(sort=True)
#vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
#pyLDAvis.display(vis)

## 3. Get the topic names printed

In [None]:
# Fetch the topics as structured data instead of formatted strings
# (formatted=False) and display them: the 10 topics with their top 10
# words and weights. Each entry is later unpacked as
# (topic index, [(word, weight), ...]).

model_topics = lda_model.show_topics(formatted=False)
model_topics

In [None]:
# Build one row per article: the probability of every topic (ordered by
# topic id) followed by the article's index label in success_urls.
topic_dist = []

# corpus was built from success_urls.content in row order, so the two can
# be walked in lockstep.
for doc_bow, article_index in zip(corpus, success_urls.index):
    top_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    # Look probabilities up by topic id rather than by list position:
    # gensim may leave very unlikely topics out of the result even with
    # minimum_probability=0.0, which would shift or truncate a positional
    # read. Missing topics are recorded as 0.0.
    prob_by_topic = dict(top_topics)
    topic_vec = [prob_by_topic.get(topic_id, 0.0)
                 for topic_id in range(lda_model.num_topics)]
    topic_vec.append(article_index)
    topic_dist.append(topic_vec)


In [None]:
# Inspect the per-article vectors: the topic probabilities followed by the
# article's index label.
topic_dist

## 4. Use top 2 words to name topics

In [None]:
# Name every topic after its two highest-ranked words, joined by "_".
# Each model_topics entry is (topic id, [(word, weight), ...]).
topic_titles = [
    f'{first_word}_{second_word}'
    for _topic_id, ((first_word, _), (second_word, _), *_) in model_topics
]
    

In [None]:
# Inspect the generated topic names.
topic_titles

## 5. Assign topic names to values in topic_dist

- create dict with article_index:probability_list
- attach topic names to probabilities for readability
- order list of probabilities in DESC

## 6. Adding the topic probabilities to the df (one column per topic)

In [None]:
# Starting point: the articles DataFrame before the topic columns are added.

success_urls.head()

In [None]:
# The per-article topic vectors that will become the new columns.
topic_dist

In [None]:
# Vector of the first article.
topic_dist[0]

In [None]:
# Print the topic-0 probability of every article, one per line.
for article_vector in topic_dist:
    print(article_vector[0])

In [None]:
# Topic-0 probability of every article, in topic_dist order.
top_0 = [article_vector[0] for article_vector in topic_dist]

In [None]:
# Add the topic-0 probabilities as a new column, modifying success_urls
# in place.
success_urls['top_0'] = top_0

In [None]:
# Check that the top_0 column was added.
success_urls

In [None]:
# Pull topics 1-9 out of every article vector, one list per topic
# (topic 0 is extracted separately as top_0).
# NOTE: this could be folded into the loop that builds topic_dist.

(top_1, top_2, top_3,
 top_4, top_5, top_6,
 top_7, top_8, top_9) = ([article_vector[topic_id] for article_vector in topic_dist]
                         for topic_id in range(1, 10))

In [None]:
# Attach the remaining topic columns. assign() returns a new DataFrame,
# so the result is bound back to success_urls.
remaining_topic_columns = {
    'top_1': top_1,
    'top_2': top_2,
    'top_3': top_3,
    'top_4': top_4,
    'top_5': top_5,
    'top_6': top_6,
    'top_7': top_7,
    'top_8': top_8,
    'top_9': top_9,
}
success_urls = success_urls.assign(**remaining_topic_columns)

In [55]:
# Final DataFrame with one probability column per topic (top_0 ... top_9).
success_urls

Unnamed: 0,url,content,target_g13,top_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9
12,https://plana.earth/academy/release-carbon-man...,proud present major updat softwar product plan...,1.0,0.223932,0.021002,0.011783,0.039847,0.026634,0.008098,0.502835,0.001632,0.152110,0.012126
13,https://plana.earth/academy/release-carbon-man...,make seriou upgrad help measur accur carbon fo...,1.0,0.020330,0.013785,0.007615,0.773814,0.017148,0.088187,0.057267,0.001034,0.013157,0.007663
14,https://plana.earth/academy/release-carbon-man...,want get inform book demo call discov plan car...,1.0,0.211783,0.043621,0.016583,0.057654,0.038511,0.558914,0.024015,0.002297,0.029592,0.017030
15,https://plana.earth/academy/release-carbon-man...,carbon account busi,1.0,0.514872,0.065822,0.036323,0.123103,0.082100,0.024935,0.052141,0.005032,0.058371,0.037301
16,https://plana.earth/academy/release-carbon-man...,emiss dashboard add depth context carbon footp...,1.0,0.095629,0.013309,0.006307,0.508645,0.014854,0.004371,0.009633,0.000874,0.088323,0.258056
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3081,https://plana.earth/academy/earth-overshoot-day/,pick futur,1.0,0.093717,0.091450,0.042673,0.132630,0.096453,0.029275,0.061256,0.005912,0.068576,0.378059
3082,https://plana.earth/academy/earth-overshoot-day/,movethed plana,1.0,0.443342,0.076058,0.042674,0.132631,0.096453,0.029276,0.061256,0.005912,0.068576,0.043822
3156,https://plana.earth/academy/a-message-from-the...,month intens investig topic sloth sloth habita...,1.0,0.166513,0.117698,0.042813,0.188631,0.376733,0.005635,0.011869,0.001138,0.013242,0.075728
3157,https://plana.earth/academy/a-message-from-the...,wednesday may plan launch first planetwid camp...,1.0,0.422576,0.036065,0.176416,0.063210,0.048285,0.013165,0.108261,0.002658,0.031050,0.098313
