In [8]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
import os
import json
from bs4 import BeautifulSoup

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import nltk

%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


# Make sure the NLTK 'stopwords' package is available locally before it is
# read two statements below.
nltk.download('stopwords')
# Define functions for stopwords, bigrams, trigrams and lemmatization
# English stop-word list; remove_stopwords() filters tokens against it.
stop_words = stopwords.words('english')
# Optional corpus-specific additions (currently disabled):
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def remove_stopwords(texts):
    """Tokenise every document and drop the words listed in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is passed through ``str()`` and
            then ``simple_preprocess``, so raw strings and token lists are
            both accepted.

    Returns:
        A list holding, for each input document, the list of tokens that are
        not in the module-level ``stop_words`` collection.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the ``stop_words`` list scans it linearly for every token.
    blocked = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every tokenised document through the bigram phraser.

    Reads the module-level ``bigram_mod``, which must have been assigned
    before the first non-empty call.

    Returns:
        A list with one ``bigram_mod[doc]`` result per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Run every tokenised document through the bigram, then trigram phraser.

    Reads the module-level ``bigram_mod`` and ``trigram_mod``, which must have
    been assigned before the first non-empty call.

    Returns:
        A list with one ``trigram_mod[bigram_mod[doc]]`` result per document.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists. Each list is joined with single spaces
            and passed to the module-level spaCy pipeline ``nlp``.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is a tuple rather than a list so that the shared default
            object cannot be mutated between calls; lists are still accepted.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (per the gensim
    documentation this strips accent marks from the tokens).

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

        
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a per-document table of the dominant LDA topic.

    Args:
        ldamodel: Trained topic model. ``ldamodel[corpus]`` must yield one
            entry per document whose first element is a list of
            ``(topic_id, probability)`` pairs, and
            ``ldamodel.show_topic(topic_id)`` must return ``(word, weight)``
            pairs.
            NOTE(review): the ``row[0]`` access matches a model created with
            ``per_word_topics=True``, as done in this file - confirm before
            reusing with other settings.
        corpus: Corpus object passed unchanged to ``ldamodel[...]``.
        texts: One entry per document; attached as the last column.

    Returns:
        DataFrame with the columns ``Dominant_Topic`` (integer topic id),
        ``Perc_Contribution`` (probability rounded to 4 places),
        ``Topic_Keywords`` (comma-separated topic words) followed by one
        unnamed column containing ``texts``. A document for which the model
        reports no topic gets a NaN / empty-string row so that rows stay
        aligned with ``texts``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keyword_cache = {}  # topic id -> keyword string; show_topic() runs once per topic
    records = []

    for row in ldamodel[corpus]:
        topics = row[0]
        if not topics:
            # Keep a placeholder instead of skipping the document, otherwise
            # every later row would be paired with the wrong text.
            records.append((float('nan'), float('nan'), ''))
            continue

        # max() returns the first pair with the highest probability, the same
        # element a stable descending sort would put first.
        topic_num, prop_topic = max(topics, key=lambda pair: pair[1])
        topic_id = int(topic_num)
        if topic_id not in keyword_cache:
            keyword_cache[topic_id] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_id, round(prop_topic, 4), keyword_cache[topic_id]))

    # Build the frame in one go: DataFrame.append copied the whole frame on
    # every call and no longer exists in pandas 2.x. Passing ``columns`` here
    # also makes an empty corpus produce an empty, correctly labelled frame.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

youtuber_list_path="subscriber_network.json"

# Read the network export. JSON is decoded as UTF-8 explicitly so the result
# does not depend on the platform's default encoding.
with open(youtuber_list_path, 'r', encoding='utf-8') as fp:
    youtubers = json.load(fp)["nodes"]

# Flatten node -> video -> comment and strip HTML markup from each comment.
comments=[]
for you in youtubers:
    for vi in you["video_list"]:
        for comment in vi["comment_list"]:
            cleantext = BeautifulSoup(comment["text"],"html.parser").text
            comments.append(cleantext)

# Tokenise once. The previous second sent_to_words() pass re-tokenised the
# str() form of each token list, which only repeated the same work.
comment_words = list(sent_to_words(comments))
data_words = comment_words
print(data_words[0:10])

# Build the bigram and trigram models
# Both Phrases models are fitted on data_words, i.e. before stop words are removed.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100) 

# Faster way to get a sentence clubbed as a trigram/bigram
# These two globals are what make_bigrams() / make_trigrams() read.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# Only bigrams are applied here; make_trigrams() is not called in the visible
# part of this file.
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy English pipeline with the parser and NER disabled
# (neither is needed for the POS-filtered lemmatization below).
# Install the model with: python3 -m spacy download en_core_web_sm
try:
    # Legacy shortcut link; kept as first choice so existing spaCy 2.x
    # environments load exactly the model they loaded before.
    nlp = spacy.load('en', disable=['parser', 'ner'])
except OSError:
    # spaCy 3 removed shortcut links, so fall back to the full package name.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print("lemmatized")

# Token <-> integer id mapping built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from
texts = data_lemmatized

# Bag-of-words form of every document
corpus = list(map(id2word.doc2bow, texts))

# Preview: first ten documents with ids translated back to tokens
print([
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:10]
])


# Fit a 3-topic LDA model on the bag-of-words corpus built above.
# random_state is fixed (presumably so repeated runs give the same topics).
# per_word_topics=True matters downstream: format_topics_sentences() takes
# element [0] of every per-document result the model returns.
# NOTE(review): confirm the remaining hyper-parameters against the gensim docs.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# Print the topics the model reports.
pprint(lda_model.show_topics(formatted=True))

# One row per document: dominant topic, its contribution, the topic keywords
# and the lemmatized text.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_lemmatized)

# Format
# reset_index() turns the row index into a leading column, hence five names below.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# # Show
# df_dominant_topic.head(500)

# youtuber_save_path="comment_data.csv"
# with open(youtuber_save_path, mode='w+', encoding="utf-8", newline='') as fp:
#     csv_writer = csv.writer(fp, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     first = True
#     for v in youtubers:
#         if first:
#             csv_writer.writerow([key for key, value in df_dominant_topic.])
#             first = False
#         csv_writer.writerow([value for key, value in v.items()])

# Pick the single most representative document for each topic
# (the row with the highest Perc_Contribution inside every Dominant_Topic group).
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect the best row of every group first and concatenate once, instead of
# growing a frame with pd.concat on each iteration starting from an empty one.
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
)

# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

print(sent_topics_sorteddf_mallet.head())
# Show
sent_topics_sorteddf_mallet.head()


# # Number of Documents for Each Topic
# topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# print("topic_counts")
# print(topic_counts)

# # Percentage of Documents for Each Topic
# topic_contribution = round(topic_counts/topic_counts.sum(), 5)

# print("topic_contribution")
# print(topic_contribution)

# # Topic Number and Keywords
# topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# # Concatenate Column wise
# df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# # Change Column names
# df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# # # Show
# # df_dominant_topics
# # # Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shuon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


[['if', 'youtube', 'doesn', 'allow', 'for', 'donations', 'in', 'your', 'area', 'just', 'go', 'to', 'teamtrees', 'org', 'to', 'donate', 'love', 'you', 'guys'], ['we', 'hit', 'it'], ['do', 'in', 'poland', 'that', 'in', 'city', 'named', 'bydgoszcz'], ['dear', 'mrbeast', 'can', 'you', 'please', 'help', 'the', 'bushfires', 'in', 'hell', 'mean', 'australia'], ['can', 'just', 'see', 'years', 'later', 'the', 'news', 'saying', 'years', 'ago', 'youtuber', 'named', 'mr', 'beast', 'planted', 'mil', 'trees', 'is', 'less', 'than', 'months', 'it', 'was', 'and', 'still', 'is', 'the', 'biggest', 'fundraiser', 'in', 'the', 'world'], ['ok', 'we', 'donated', 'million', 'dollars', 'but', 'who', 'gonna', 'plant', 'them', 'all', 'don', 'think', 'that', 'possible'], ['faith', 'in', 'humanity', 'went'], ['ecosia', 'is', 'the', 'answer', 'also', 'you', 'could', 'get', 'lots', 'of', 'seeds', 'from', 'lemons', 'pomegranates', 'apples', 'the', 'list', 'goes', 'on', 'on', 'there', 'are', 'currently', 'trillion', 't

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.991,"tree, plant, go, make, would, think, say, peop...","[tree, tree, tree, tree, tree, tree, tree, tre..."
1,1.0,0.9002,"video, get, thing, time, look, even, watch, ni...","[awesome, awesome, awesome, awesome, awesome, ..."
2,2.0,0.9707,"love, teamtree, good, see, great, know, thank,...","[teamtree, teamtree, teamtree, teamtree, teamt..."
