In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from __future__ import unicode_literals

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from collections import Counter


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import nltk
# NLTK resources: the stop-word list used for token filtering and the lexicon
# that SentimentIntensityAnalyzer loads when it is constructed below.
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared VADER analyzer used by VaderSentiment()
sid = SentimentIntensityAnalyzer()

# English stop words plus a few extra tokens to ignore
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, HdpModel

import spacy
try:
    from spacymoji import Emoji
except:
    !pip install spacymoji
    from spacymoji import Emoji

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
emoji = Emoji(nlp, merge_spans=False)
nlp.add_pipe(emoji, first=True)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        

#pd.options.plotting.backend = "plotly"
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Output folders for the saved models and dictionary.
# exist_ok=True keeps the cell re-runnable when the folders already exist.
os.makedirs('./Models', exist_ok=True)
os.makedirs('./Dictionary', exist_ok=True)

In [None]:
## Time Features

def get_timefeatures(df: pd.DataFrame, col:str):
    """Add time-derived feature columns for ``df[col]`` to ``df`` in place.

    Parameters
    ----------
    df : DataFrame that is mutated (new columns are added to it).
    col : name of the source column.

    Returns
    -------
    list of str or None
        * datetime column: ``[col, col_date, col_doy, col_dow, col_time,
          col_hour, col_week, col_day, col_floorT, col_floorH]``
        * float column: ``[col, col_age]`` where age is ``max - value``
        * any other dtype: ``None`` and ``df`` is left untouched.
    """
    from pandas.api.types import is_datetime64_any_dtype as is_datetime
    from pandas.api.types import is_float_dtype as is_float

    # "any" rather than "ns" so datetime columns stored at another resolution
    # are not silently skipped.
    if is_datetime(df[col]):
        stamps = df[col].dt
        df[col+'_date'] = stamps.date
        df[col+'_doy'] = stamps.day_of_year
        df[col+'_dow'] = stamps.day_of_week
        df[col+'_time'] = stamps.time
        df[col+'_hour'] = stamps.hour
        # `.dt.week` was removed from pandas; isocalendar() is its replacement.
        df[col+'_week'] = stamps.isocalendar().week
        # Days elapsed since the earliest day-of-year present in the column
        df[col+'_day'] = df[col+'_doy'] - df[col+'_doy'].min()
        # Lower-case aliases: the upper-case 'H' alias is deprecated in recent pandas.
        df[col+'_floorT'] = stamps.floor('min')
        df[col+'_floorH'] = stamps.floor('h')
        return [col,col+'_date',col+'_doy', col+'_dow',
                col+'_time',col+'_hour',col+'_week',col+'_day', col+'_floorT', col+'_floorH']
    elif is_float(df[col]):
        df[col+'_age'] = df[col].max() - df[col]
        return [col, col+'_age']
    return None
    
## Feature Engineering

def remove_outliers(df: pd.DataFrame, max_doy: int = 273):
    """Keep only the rows whose ``timestamp_doy`` is strictly below ``max_doy``.

    Parameters
    ----------
    df : frame expected to carry a ``timestamp_doy`` column.
    max_doy : exclusive day-of-year cutoff (default 273, the original value).

    Returns
    -------
    The filtered frame, or ``df`` unchanged (after printing a notice) when the
    column is missing or cannot be compared with ``max_doy``.
    """
    try:
        return df[df['timestamp_doy'] < max_doy]
    except (AttributeError, KeyError, TypeError):
        # Best-effort filter: only "column missing / not comparable" is
        # tolerated; anything else should propagate.
        print('It is not possible to filter on day of year column.')
        return df
def VaderSentiment(text: str):
    """Score ``text`` with the shared VADER analyzer ``sid``.

    Returns the tuple ``(pos, neu, neg, compound)`` read from the dictionary
    produced by ``sid.polarity_scores``.
    """
    scores = sid.polarity_scores(text)
    return tuple(scores[key] for key in ('pos', 'neu', 'neg', 'compound'))
def NumbersFromText(txt: str):
    """Count alphabetic tokens and emoji tokens in ``txt``.

    The text is parsed with the shared spaCy pipeline ``nlp``.

    Returns
    -------
    (int, int)
        ``(number_of_alpha_tokens, number_of_emoji_tokens)``; ``(0, 0)`` when
        the text cannot be processed (for example a non-string value).
    """
    try:
        doc = nlp(txt)
        n_words = sum(1 for token in doc if token.is_alpha)
        n_emojis = sum(1 for token in doc if token._.is_emoji)
        return n_words, n_emojis
    except Exception:
        # Deliberate best-effort fallback, but no longer a bare `except:` so
        # KeyboardInterrupt / SystemExit are not swallowed.
        return 0, 0

def get_TextFeatures(df: pd.DataFrame):
    """Add text-derived columns to ``df`` in place (returns ``None``).

    Columns written:
      * ``full_text``   - title and body joined by a space (missing values -> '')
      * ``text_lenght`` - character length of ``full_text`` (column name kept
        as spelled, since it is part of the output schema)
      * ``VS_pos``, ``VS_neu``, ``VS_neg``, ``compound`` - VADER scores
      * ``sentiment``   - 'pos' if compound > 0.1, 'neg' if < -0.1, else 'neu'
    """
    # A missing title would make the whole concatenation NaN and crash the
    # length / sentiment steps below, so both parts are filled.
    df['full_text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')
    df['text_lenght'] = df['full_text'].str.len()

    score_cols = ['VS_pos', 'VS_neu', 'VS_neg', 'compound']
    scores = [VaderSentiment(text) for text in df['full_text']]
    # Column assignment aligns on the index: build the score frame on df's own
    # index so frames without a 0..n-1 RangeIndex get the right rows.
    df[score_cols] = pd.DataFrame(scores, columns=score_cols, index=df.index)

    df['sentiment'] = df['compound'].apply(lambda c: 'pos' if c>0.1 else 'neg' if c<-0.1 else 'neu')
    #df[['num_of_words','num_of_emojis']] = pd.DataFrame(df.title.apply(lambda x: NumbersFromText(x)).to_list(), columns=['num_of_words','num_of_emojis'])

def extract_emoji(sent):
    """Return the emoji tokens found in ``sent`` as one space-separated string.

    ``sent`` is parsed with the shared spaCy pipeline ``nlp``; a text with no
    emoji yields an empty string.
    """
    emoji_tokens = []
    for token in nlp(sent):
        if token._.is_emoji:
            emoji_tokens.append(token.text)
    joined = ' '.join(emoji_tokens)
    return joined.strip()

## Text Preprocessing

def remove_url(text):
    """Remove every http(s) URL from ``text``.

    The previous pattern was anchored at line start, so URLs inside a sentence
    were kept while a line that began with a URL lost all of its remaining
    text. Only the URL itself (up to the next whitespace) is removed now.
    """
    return re.sub(r'https?://\S+', '', text)
def remove_newline(text):
    """Collapse every run of whitespace (newlines, tabs, spaces) into one space."""
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)
def remove_squote(text):
    """Strip every single-quote character from ``text``."""
    quote_pattern = "\'"
    stripped = re.sub(quote_pattern, "", text)
    return stripped

def sent_to_words(texts):
    """Yield, for each text, the list of lower-cased lemmas it contains.

    Every text is first cleaned (URLs removed, whitespace collapsed, single
    quotes stripped) and then parsed with ``nlp``; only tokens that are emoji,
    alphabetic or digits are kept.
    """
    cleaned = [remove_squote(remove_newline(remove_url(raw))) for raw in texts]
    for sentence in cleaned:
        lemmas = []
        for token in nlp(sentence):
            if token._.is_emoji or token.is_alpha or token.is_digit:
                lemmas.append(token.lemma_.lower())
        yield lemmas
#    for sent in texts:
#        yield(gensim.utils.simple_preprocess(str(sent), deacc=True))

def remove_stopwords(texts):
    """Drop every token found in the module-level ``stop_words`` list.

    ``texts`` is an iterable of token lists; a new list of filtered token
    lists is returned in the same order.
    """
    # Build the lookup once: membership in the list is a linear scan per token.
    blocked = set(stop_words)
    return [[word for word in text if word not in blocked] for text in texts]
#[[word for word in doc if word not in stop_words] for doc in texts]



def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'), is_emoji=True):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    POS tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of token lists; each list is re-joined with spaces and
        parsed with the shared ``nlp`` pipeline.
    allowed_postags : coarse POS tags (``token.pos_``) to keep.
    is_emoji : when true, emoji tokens are kept whatever their POS tag.

    Returns
    -------
    list of list of str - the lemmas kept for each document.
    """
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents in batches instead of one call per document.
    return [
        [token.lemma_ for token in doc
         if token.pos_ in allowed_postags or (is_emoji and token._.is_emoji)]
        for doc in nlp.pipe(joined)
    ]

# equivalent to build_corpus but optimized with spacy
def preprocess(texts: pd.Series, allowed_postags:list):
    """Tokenize and lemmatize ``texts`` with spaCy in a single pass.

    Parameters
    ----------
    texts : iterable of raw strings, fed to ``nlp.pipe``.
    allowed_postags : coarse POS tags (``token.pos_``) a token must have to be kept.

    Returns
    -------
    list of list of str - lower-cased lemmas, one list per input text.
    """
    allowed = set(allowed_postags)
    data_words = []
    # Stream the docs instead of materialising every parsed Doc at once:
    # only the extracted lemmas need to stay in memory.
    for doc in nlp.pipe(texts):
        # NOTE(review): because the clauses are OR-ed, `is_stop == False` never
        # excludes anything by itself - an alphabetic stop word still passes
        # via `is_alpha`. Confirm whether stop words were meant to be dropped.
        data_words.append([
            token.lemma_.lower() for token in doc
            if (token._.is_emoji or token.is_alpha or token.is_digit or not token.is_stop)
            and token.pos_ in allowed
        ])
    #data_words = remove_stopwords(data_words)
    return data_words
    
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to each token list."""
    phrased = []
    for tokens in texts:
        phrased.append(bigram_mod[tokens])
    return phrased

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each token list."""
    phrased = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased.append(trigram_mod[with_bigrams])
    return phrased
    

def build_corpus(texts:pd.Series, dictionary, allowed_postags = ('NOUN', 'ADJ', 'VERB', 'ADV','PROPN')):
    """Turn raw texts into a bag-of-words corpus for ``dictionary``.

    Parameters
    ----------
    texts : iterable of raw strings.
    dictionary : object exposing ``doc2bow`` (a gensim Dictionary in this
        notebook); it is used to encode every document.
    allowed_postags : coarse POS tags kept during preprocessing.

    Returns
    -------
    list - one ``dictionary.doc2bow(...)`` result per input text.
    """
    # Reuse preprocess() so the token filter cannot drift from the one used to
    # build the dictionary, then apply the trained bigram model.
    data_words = make_bigrams(preprocess(texts, allowed_postags))
    # The `dictionary` argument used to be ignored in favour of the global
    # `id2word`; honour the argument.
    return [dictionary.doc2bow(text) for text in data_words]

# outdated function?
# def build_corpus(texts:pd.Series, dictionary):
#    data_words = list(sent_to_words(texts))
#    data_words = remove_stopwords(data_words)
#    data_words = make_bigrams(data_words)
#    data_words = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
#    corpus = [id2word.doc2bow(text) for text in lemmas]
#    return corpus

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the sweep)
    start : First number of topics tried
    step : Increment between two consecutive numbers of topics
    random_state : Optional seed forwarded to every LdaModel so a sweep can be
        reproduced; None keeps the previous unseeded behaviour

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in; the global `id2word`
        # was used here before, silently ignoring the argument.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                alpha='auto',
                                                eta='auto',  # learn asymmetric priors
                                                random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Topic Modeling

In this notebook I host the topic modeling part previously coded [here](https://www.kaggle.com/radema/yolo-explorative-analysis-on-wallstreetbets). I decided to split the notebooks to improve readability and to separate the different subtasks (topic modeling, feature engineering, data cleansing, etc.). For a detailed treatment of the theory behind LDA and topic modeling, I suggest the links in the References section.

In [None]:
path = '/kaggle/input/reddit-wallstreetsbets-posts/reddit_wsb.csv'
# `infer_datetime_format` is deprecated in pandas 2.x; parse_dates alone
# already converts the timestamp column.
df = pd.read_csv(path, parse_dates=['timestamp'])

The dataset has a total of 8 columns. From [an official page on GitHub of the API Wrapper](https://github.com/reddit-archive/reddit/wiki/JSON) we have the following:
- **title**: the title of the link. may contain newlines for some reason
- **score**: the net score of the link. Note: a submission's score is simply the number of upvotes minus the number of downvotes. If five users like the submission and three users don't, it will have a score of 2. Please note that the vote numbers are not "real" numbers; they have been "fuzzed" to prevent spam bots etc. So, taking the above example, if five users upvoted the submission and three users downvoted it, the upvote/downvote numbers may say 23 upvotes and 21 downvotes, or 12 upvotes and 10 downvotes. The points score is correct, but the vote totals are "fuzzed".
- **id**: this item's identifier, e.g. "8xwlg"
- **url**: the link of this post. the permalink if this is a self-post
- **comms_num**: the number of comments that belong to this link. includes removed comments.
- **created**: the time of creation in local epoch-second format
- **body**:  the raw text. this is the unformatted text which includes the raw markup characters such as ** for bold. <, >, and & are escaped.
- **timestamp**: datetime about the related activity 

In [None]:
%%time
import gc

gc.collect()

# Both helpers add their columns to `df` in place.
get_TextFeatures(df)
get_timefeatures(df,'timestamp')
# Drops rows by day of year; the surviving rows keep their original index
# labels, so `df` may no longer have a contiguous 0..n-1 index afterwards.
df = remove_outliers(df)

## LDA Topic Modeling

In [None]:
%%time
## Let's build text corpus
texts = df.full_text.sample(frac=1).dropna().unique()
data_words = preprocess(texts, ['PROPN','NOUN', 'ADJ', 'VERB', 'ADV'])
## Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
#
## Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
data_words = make_bigrams(data_words)


In [None]:
### Let's build text corpus
#texts = df.full_text.sample(frac=1).dropna().unique()
#data_words = list(sent_to_words(texts))
#
## Build the bigram and trigram models
#bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
#trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
#
## Faster way to get a sentence clubbed as a trigram/bigram
#bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)
#
## Remove Stop Words
#data_words_nostops = remove_stopwords(data_words)
#
## Form Bigrams
#data_words_bigrams = make_bigrams(data_words_nostops)
#
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['PROPN','NOUN', 'ADJ', 'VERB', 'ADV'])


In [None]:
%%time
gc.collect()

# Create Dictionary
id2word = corpora.Dictionary(data_words)
id2word.filter_extremes(no_below = 10,no_above=0.25)
# Create Corpus
lemmas = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_words]

In [None]:
%%time
limit=25; start=3; step=3;
model_list, coherence_values = compute_coherence_values(dictionary=id2word, 
                                                        corpus=corpus, 
                                                        texts=data_words, 
                                                        start=start, limit=limit, step=step)
# Show graph
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model used in the rest of the notebook.
# NOTE(review): the index is hard-coded to the second candidate of the sweep
# (6 topics with start=3, step=3); the commented-out argmax would pick the
# model with the highest coherence instead - confirm this choice is intended.
optimal_model = model_list[1]#np.argmax(coherence_values)]

## Topic Interpretability

In [None]:
def format_topics_sentences(ldamodel, corpus=None, texts=None):
    """Build one row per document describing its dominant topic.

    Parameters
    ----------
    ldamodel : model such that ``ldamodel[corpus]`` yields, per document, a
        list of ``(topic_id, probability)`` pairs and that exposes
        ``show_topic(topic_id, topn=...)``.
    corpus : bag-of-words corpus; defaults to the notebook-level ``corpus``.
    texts : original documents, in corpus order; defaults to the
        notebook-level ``texts``.

    Returns
    -------
    DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``,
    ``Topic_Keywords`` followed by the unnamed text column (label ``0``).
    """
    # Resolve the notebook-level defaults at call time: binding them in the
    # signature freezes whatever objects existed when the cell was run.
    if corpus is None:
        corpus = globals()['corpus']
    if texts is None:
        texts = globals()['texts']

    per_word_topics = getattr(ldamodel, 'per_word_topics', False)
    keywords_by_topic = {}  # topic id -> "w1, w2, ..." (computed once per topic)
    records = []
    for row in ldamodel[corpus]:
        if per_word_topics:
            # With per_word_topics the topic list is the first tuple element
            row = row[0]
        if not row:
            # Keep a placeholder so rows stay aligned with `texts`
            records.append((float('nan'), float('nan'), ''))
            continue
        # Dominant topic = highest probability (first one wins on ties)
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num, topn=7)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; DataFrame.append no longer exists in pandas 2
    # and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


# Dominant topic, its share and its keywords for every training document
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=texts)

# Format: expose the row position as a document number and rename all columns
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first ten documents
df_dominant_topic.head(10)

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf,
                                      grp.sort_values(['Perc_Contribution'], 
                                                      ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[
    ['Dominant_Topic', 'Topic_Keywords']
].drop_duplicates()

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords.sort_values(by='Dominant_Topic').reset_index() , 
                                topic_counts.sort_index()
                                , topic_contribution.sort_index()
                               ], axis=1).drop(columns='index')

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.sort_values(by='Num_Documents', ascending=False)

### Apply LDA model to all texts

In [None]:
new_corpus = build_corpus(df.full_text, dictionary=id2word)
topics_docs= list(optimal_model.get_document_topics(new_corpus, minimum_probability=0.0))
# Most probable (topic, probability) pair of each document
lda_topics = [max(topics, key=lambda pair: pair[1]) for topics in topics_docs]
# `df` was filtered earlier, so its index is not guaranteed to be 0..n-1.
# Build the topic frame on df's own index: concat(axis=1) aligns on labels and
# would otherwise attach topics to the wrong rows and add all-NaN rows.
topic_frame = pd.DataFrame(lda_topics, columns=['lda_topic','lda_topic_prob'], index=df.index)
# Drop stale topic columns first so re-running the cell does not duplicate them
df = pd.concat([df.drop(columns=list(topic_frame.columns), errors='ignore'), topic_frame], axis = 1)
del topics_docs, lda_topics

In [None]:
for num in range(optimal_model.num_topics):
    print('\n')
    print('###### Topic no.',num)
    print('\n')
    # `&` binds tighter than `>`: the comparison must be parenthesised,
    # otherwise the 0.95 threshold is applied to the AND-ed mask, not to the
    # probability column.
    confident = df[(df.lda_topic == num) & (df.lda_topic_prob > 0.95)]
    # NOTE(review): ascending sort + head(1) shows the least confident of the
    # >0.95 documents - confirm that is the intended example.
    for i,row in confident.sort_values(by='lda_topic_prob').head(1).iterrows():
        print('At: ', row.timestamp)
        print('Text: ',row.full_text)
        print('Url :', row.url)
        print('DataFrame index: ', i)
        print('\n')

In [None]:
### Store dictionary and lda model

# NOTE(review): `datapath` is imported but not used in the visible cells.
from gensim.test.utils import datapath

# Save the selected LDA model into ./Models
optimal_model.save('./Models/lda_model')

# Save the dictionary into ./Dictionary
id2word.save('./Dictionary/dictionary')

In [None]:
# Write the current `df` to a CSV file in the working directory
df.to_csv('./processes_wsb_data.csv')

In [None]:
# Save the Phraser wrappers into ./Models
bigram_mod.save('./Models/bigram_mod')
trigram_mod.save('./Models/trigram_mod')
# Save the underlying Phrases models into the working directory
bigram.save('./bigram')
trigram.save('./trigram')

# References

## Other Notebooks by Radema 
* [YOLO - Explorative analysis on WallStreetBets, by Radema](https://www.kaggle.com/radema/yolo-explorative-analysis-on-wallstreetbets)

## External Reference

* [NLTK Documentation](https://www.nltk.org/)
* [Gensim Documentation](https://radimrehurek.com/gensim/auto_examples/index.html)
* [Spacy Usage - Linguistic Features](https://spacy.io/usage/linguistic-features)
* [Universal POS Tags](https://universaldependencies.org/docs/u/pos/)
* [*Topic Modeling with Gensim* by Selva Prabhakaran](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)