In [1]:
import pandas as pd
import numpy as np

import os
import re
import csv
import glob
import os.path
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

from gensim import models
from gensim.models import CoherenceModel

In [2]:
stop_words = set(stopwords.words('english'))
stop_words.update(['http'] )

def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    cleaned = cleaned.replace("\s+"," ")

    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def removeStopWords(sentence, stop_words):
    #global re_stop_words
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
    return re_stop_words.sub(" ", sentence)

### Import and text cleaning

In [3]:
# Import intermediate Twitter dataframe
df_tw = pd.read_csv('df_tw_sentiment.csv')

# Clean text
stemmer = PorterStemmer()
df_tw['clean_text'] = df_tw.Text.str.lower()
df_tw['clean_text'] = df_tw['clean_text'].apply(cleanHtml)
df_tw['clean_text'] = df_tw['clean_text'].apply(cleanPunc)
df_tw['clean_text'] = df_tw['clean_text'].apply(keepAlpha)
df_tw['clean_text'] = df_tw['clean_text'].apply(lambda x: removeStopWords(x, stop_words))
df_tw['clean_text'] = df_tw['clean_text'].str.split().apply(lambda x: ' '.join(w for w in x if len(w)>2))
df_tw['clean_text'] = df_tw['clean_text'].str.split().apply(lambda x: ' '.join([stemmer.stem(w) for w in x]))

### Tokenisation and lemmatisation

In [4]:
from nltk.tokenize import sent_tokenize
df_tw['sentences'] = df_tw['clean_text'].apply(sent_tokenize)

from nltk.tokenize import word_tokenize
df_tw['tokens_sentences'] = df_tw['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

from nltk import pos_tag
df_tw['POS_tokens'] = df_tw['tokens_sentences'].apply(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])

# Inspired from https://stackoverflow.com/a/15590384
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatizing each word with its POS tag, in each sentence
df_tw['tokens_sentences_lemmatized'] = df_tw['POS_tokens'].apply(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

from itertools import chain # to flatten list of sentences of tokens into list of tokens

df_tw['tokens'] = df_tw['tokens_sentences_lemmatized'].map(lambda sentences: chain.from_iterable(sentences))
df_tw['tokens'] = df_tw['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in stop_words and len(token)>1])

# Topic tagging

### Preparing a dictionary

In [5]:
# Preparing the corpus and the dictionary

from gensim import corpora
dictionary = corpora.Dictionary(df_tw.tokens)
dictionary.filter_extremes(no_below=20) #, no_above=0.15)
df_tw['corpus'] = [dictionary.doc2bow(tok) for tok in df_tw.tokens]
print(dictionary)

Dictionary(3023 unique tokens: ['ban', 'candl', 'close', 'dip', 'enter']...)


### Preparing train and test datasets

In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_tw, random_state=20, test_size=0.20, shuffle=True)

print(train.shape)
print(test.shape)

(35158, 23)
(8790, 23)


### Training a model

In [7]:
# Define number of topics
num_topics = 3

passes = 10
lda_model = models.LdaModel(corpus=train['corpus'],\
                                  num_topics=num_topics, \
                                  id2word=dictionary, \
                                  random_state=100, \
                                  passes=passes, \
                                  #iterations=100,
                                  alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary.keys())
                                  #alpha='auto', \
                                  #eta='auto'
                                  #per_word_topics=True
                                 )

### Evaluate performance

In [8]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=test['tokens'], dictionary=dictionary, coherence='c_v')

print('Coherence per topic:')
print(coherence_model_lda.get_coherence_per_topic())
print()
print('==========================================')
print('Topics as word lists:')
print(coherence_model_lda.top_topics_as_word_lists(lda_model, dictionary, topn=20))
print()
print('==========================================')
print('Topics:')
print(coherence_model_lda.topics)
print()
print('==========================================')
print('Performance on the test set:')
# Compute Perplexity
#print('Perplexity: ', lda_model.log_perplexity(test['corpus']))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=test['tokens'], dictionary=dictionary, coherence='c_v')
coherence_list = coherence_model_lda.get_coherence_per_topic()
print('Mean coherence of the model:      {}'.format(coherence_model_lda.get_coherence()))
print('Median coherence of the model:    {}'.format(np.median(coherence_list)))
print('Maximal coherence of the model:   {}'.format(max(coherence_list)))
print('Minimal coherence of the model:   {}'.format(min(coherence_list)))

Coherence per topic:
[0.3003413386274456, 0.6344079018829462, 0.3655478909042293]

Topics as word lists:
[['btc', 'buy', 'time', 'get', 'like', 'price', 'year', 'think', 'market', 'high', 'see', 'go', 'peopl', 'sell', 'day', 'make', 'dont', 'look', 'still', 'long'], ['btc', 'amp', 'follow', 'eth', 'crypto', 'like', 'give', 'xrp', 'retweet', 'get', 'giveaway', 'friend', 'one', 'good', 'link', 'win', 'send', 'ethereum', 'tag', 'hour'], ['money', 'gold', 'asset', 'buy', 'valu', 'world', 'invest', 'bank', 'use', 'market', 'say', 'peopl', 'billion', 'fiat', 'dollar', 'currenc', 'worth', 'one', 'make', 'year']]

Topics:
[array([ 26,  53,  32,   5,  56,  68,  35, 169, 143,  37,  92,  66,  50,
       167, 128,  28,  42, 628, 245,  88]), array([ 26, 256,  95,  71, 118,  56, 173, 215, 429,   5,  96, 229, 131,
       179, 142, 112,  81, 120, 430, 526]), array([244, 328,  61,  53, 147, 126, 294, 105,  51, 143,  75,  50, 546,
       107, 824, 762,  33, 131,  28,  35])]

Performance on the test set:

### Visualisation

In [9]:
import pyLDAvis
import pyLDAvis.gensim
data_ = pyLDAvis.gensim.prepare(lda_model, df_tw['corpus'], dictionary)
pyLDAvis.display(data_)

# Applying topic tagging

In [10]:
from operator import itemgetter

def get_dominant_topic(l):
    return max(l,key=itemgetter(1))[0]

df_tw['categorisation_full'] = df_tw['corpus'].apply(lambda x: lda_model.get_document_topics(x, minimum_probability=0))
df_tw['tw_topic'] = df_tw['categorisation_full'].apply(lambda x: get_dominant_topic(x))

df_tw.drop(columns=['tw_sentiments', 'clean_text', 'tokens_sentences', 'tokens', 'POS_tokens', 'sentences',
       'tokens_sentences_lemmatized', 'corpus', 'categorisation_full'], inplace=True)

  and should_run_async(code)


### Save the df to csv

In [12]:
df_tw.to_csv('df_tw_topic.csv', index=False)

  and should_run_async(code)
