In [None]:
# One time run
# gensim stopped Mallet LDA wrapper support past 3.8.3

! pip install gensim==3.8.3 spacy pyLDAvis nltk

In [None]:
# One time run

import nltk
nltk.download("stopwords")

In [52]:
import re, os, pickle

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

import spacy
from nltk.corpus import stopwords

import pyLDAvis
import pyLDAvis.gensim_models

import pandas as pd

In [53]:
# The idea is to load in all dataset files and extract samplepct (declared below) percent of tweets and
# store the rest in another directory for unseen data evaluation later on

sentiments_dir = "sentiments" # Input files directory, all sentiment files
eval_files_dir = "output" # Test files (the other 80 pc) directory
samplepct = 0.04 # 4 percent

source_hashtags = ["coronavirus", "covid19", "coronaoutbreak"]
# Lowercasing #Coronavirus, #COVID19, #CoronaOutbreak because we lowercase all tweets

In [54]:
def partition(lst, n):
    '''
    Chunks a python list object into smaller lists with n elements each
    '''
    for i in range(0, len(lst), n):
        yield lst[i:i+n]
        
def lemmatize(data):
    '''
    Lemmatizes a chunk of tweets
    Processes 10000 tweets at a time with batches of size 5000 (takes around 27s / 10000 tweets on my device)
    '''
    lemmatized = []
    chunksize = 10000
    
    for chunk in partition(data, chunksize):
        processed = list(nlp.pipe(chunk, batch_size=5000)) # running the Spacy pipeline

        for doc in processed:
            final = " ".join([token.lemma_ for token in doc]) # token.lemma_
            lemmatized.append(final)
        del processed
    
    return lemmatized

In [55]:
stop_words = {k:"" for k in stopwords.words("english")} # Making a dictionary of stopwords for O(1) lookup

rhashtag = re.compile(r"(^|[^\#\w])\#\s?(\w+)\b") # Compiling regex expressions to match/find later
rurl = re.compile(r"http\S+")

def clean_tweet(x):
    '''
    Removing URLs, tweet handles and English stopwords from the text
    '''
    x = re.sub(rurl, '', x)
    x = re.sub(rhashtag, '', x)
    x = ' '.join([word for word in x.strip().split() if word not in stop_words])
    return x

def profile_tweet(x):
    '''
    Profiling tweets in a dictionary based on hashtags
    
    Input: list of tweets
    Output: dictionary with hashtags as keys and pseudodocs as values
    '''
    hashtags = list(set(re.findall(rhashtag, x)))
    x = clean_tweet(x)
    for b, hashtag in hashtags:
        if hashtag not in source_hashtags:
            if hashtag in tweets:
                tweets[hashtag].append(x)
            else:
                tweets[hashtag] = [x]
    return x

In [56]:
# Loading the Spacy model disabling parser, named entity recognizer and text categorizer
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

In [57]:
tweets = {} # Dictionary which will have "{hashtag}": "{all tweets as a single text for the hashtag}", relevant function: profile_tweet()

# Loading in the dataset files, extract samplepct percent of tweets,
# lemmatize them, small case them, profile them into pseudodocs
# and remove hashtags and URLs from the tweets, then finally
# save (100 - samplepct) percent file into eval_files_dir directory

for file in os.listdir(sentiments_dir):
    print(f"processing file {file}")
    
    df = pd.read_csv(os.path.join(sentiments_dir, file))[["id", "text"]] # reading a sentiments file
    slice_at = int(samplepct * df.shape[0])
    
    train = df.iloc[:slice_at].text
    test = df.iloc[slice_at:]
    
    print(f"EXAMPLE ORIGINAL TWEET: {train[14]}")
    
    texts = train.apply(lambda x: x.lower()) # running lowercase
    texts = pd.DataFrame(lemmatize(texts), columns=["text"]).text.apply(profile_tweet) # preprocess and profile all tweets
    
    print(f"SMALL CASED AND WITHOUT STOPWORDS: {texts[14]}")
    
    test.to_csv(os.path.join(eval_files_dir, file), index=False)
    
    print(f"number of pseudodocs (total): {len(tweets)}")
    print("==============================================================================")
    
    assert len(train) + len(test) == df.shape[0], "Something is wrong because this should be equal"

processing file sentiments_001.csv


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


EXAMPLE ORIGINAL TWEET: #StayAtHomeSaveLives Don't get paid🤬 Bills piling up e.g house rent etc. Then the next BIG thing is STARVATION. UK GOVERNMENT help us too...We can't go out or even shop for food online. All delivery DATES ARE GONE 🚨 We are HUNGRY😭 #UKlockdown #Coronavirus #UKGovtHelpUsToo
SMALL CASED AND WITHOUT STOPWORDS: n't get pay 🤬 bill pile e.g house rent etc . next big thing starvation . uk government help ... ca n't go even shop food online . delivery date go 🚨 hungry 😭
number of pseudodocs (total): 12650
processing file sentiments_002.csv


KeyboardInterrupt: 

In [66]:
# Save the pseudodocs dictionary so that we don't have to partition train/test each time
pickle.dump(tweets, open("pseudo.docs", 'wb'))

In [58]:
import pickle
# Load the pseudodocs (on a new/fresh run)
tweets = pickle.load(open("0.04/pseudo.docs", "rb"))

In [67]:
# Joining the profiled tweet lists into a single string, pseudodoc
for hashtag in tweets:
    tweets[hashtag] = ' '.join(tweets[hashtag])

In [68]:
max((len(v), k) for k, v in tweets.items()) # The longest pseudodoc and its size

(2152379, 'covid_19')

In [69]:
tweetslist, hashtags = [], []
wordlimit = 200000 # Token limit, the pseudodocs will be capped at 200000 tokens

for hashtag, text in tweets.items():
    hashtags.append(hashtag)
    tweetslist.append(text[:wordlimit])

# Quick check
assert len(tweetslist) == len(hashtags), "Number of pseudodocs and hashtags unequal, look into it"

In [70]:
len(hashtags)

319658

In [71]:
# Display a sample hashtag and its pseudodoc
ix = 1001 # pseudodoc number

print(hashtags[ix])
if len(tweetslist[ix]) < 10000:
    print(tweetslist[ix])
else:
    print("selected pseudodoc too long to print on screen")

intercambio
📣 new podcast ! " crisis schooling begin " @spreaker ' I think hoax ' : patient 30 die attend ' covid party ' hold valid permit , approve study permit , restriction take effect march 18 , 2020 travel u offer emergency grant student affect by-19 | : ireland put lockdown varadkar ask nation ' forego freedom '


In [72]:
data = tweetslist

In [40]:
len(data) # No of pseudodocs

12650

In [41]:
# Tokenize words after simple preprocessing
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(data)

print (data_words[700][0:40])

['university', 'queensland', 'highly', 'anticipate', 'covid', 'vaccine', 'pass', 'another', 'crucial', 'milestone', 'week', 'bring', 'one', 'step', 'close', 'towards', 'become', 'reality', 'screen', 'announce', 'support', 'virtual', 'live', 'screen', 'culture', 'event']


In [42]:
# Word <> Id mapping
id2word = corpora.Dictionary(data_words)
id2word.filter_extremes(no_below=5, no_above=0.5) # filter words that appear in less than 5 docs and more than 0.5 (half) of the docs

corpus = []
for text in data_words:
    new = id2word.doc2bow(text)
    corpus.append(new)

print(corpus[700][0:40])
print(id2word[2]) #(2, 3) below means id2word[2] appears 3 times in the doc

[(13, 1), (56, 1), (128, 1), (129, 1), (174, 1), (205, 1), (273, 1), (323, 1), (452, 1), (730, 1), (853, 1), (879, 1), (1181, 1), (1307, 1), (1319, 1), (1353, 1), (1619, 1), (1991, 1), (2012, 1), (2074, 2), (2767, 1), (2950, 1), (3006, 1), (3161, 1), (3205, 1)]
heure


In [28]:
# Following are statements to save the python objects created so far
# (to load later without having to repeat the processes)

pickle.dump(data_words, open("data_words", "wb"))

In [9]:
pickle.dump(corpus, open("corpus", "wb")) # Save

# corpus = pickle.load(open("corpus", 'rb')) # Load

In [11]:
pickle.dump(id2word, open("id2word", "wb"))

# id2word = pickle.load(open("id2word", 'rb'))

In [27]:
# Save the lemmatized pseudodocs python object (to use later) and the LDA model, if necessary

pickle.dump(data, open("lemmatized_texts.p", 'wb'))

In [None]:
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=10,
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=1000,
#                                            passes=10,
#                                            alpha="auto")


In [44]:
# Train the Mallet LDA model

path = 'C:/mallet/bin/mallet'
mallet_model = LdaMallet(path, corpus=corpus, num_topics=10, id2word=id2word)

In [45]:
# Convert Mallet LDA model to gensim LDA model
# gensim LDA model has more features
# The function creates a gensim LDA model with the weights of the original model

model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)

In [15]:
# Save the model for later use

model.save("lda.model")

In [2]:
# Load the model

model = gensim.models.ldamodel.LdaModel.load("lda.model")

In [46]:
# Rank 1 metric search about

# This score generally takes much longer to compute (very expensive on models trained on large sets)
coherence_model_lda = CoherenceModel(model=model, texts=data_words, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4165980211523473


In [47]:
# Coherence score using U Mass
coherence_model_lda = CoherenceModel(model=model, corpus=corpus, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -2.2953723826050436


In [48]:
# Generate topics of the model

ldatopics = model.print_topics(num_topics=-1, num_words=20)

In [49]:
# Make the topics readable

from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric

topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

for topic in ldatopics:
    topics.append(preprocess_string(topic[1], filters))

In [50]:
topics

[['day',
  'pandemic',
  'start',
  'time',
  'due',
  'state',
  'government',
  'month',
  'pay',
  'plan',
  'jul',
  'crisis',
  'country',
  'economy',
  'issue',
  'trend',
  'national',
  'open',
  'hit',
  'run'],
 ['good',
  'today',
  'link',
  'back',
  'free',
  'youtube',
  'life',
  'school',
  've',
  'offer',
  'click',
  'feel',
  'ca',
  'play',
  'book',
  'hope',
  'make',
  'daily',
  'guy',
  'amazing'],
 ['support',
  'check',
  'sign',
  'gt',
  'll',
  'deliver',
  'design',
  'project',
  'contact',
  'market',
  'base',
  'official',
  'event',
  'man',
  'high',
  'track',
  'group',
  'listen',
  'night',
  'real'],
 ['work',
  'home',
  'stay',
  'quarantine',
  'safe',
  'top',
  'covid',
  'social',
  'video',
  'lockdown',
  'post',
  'watch',
  'read',
  'people',
  'distancing',
  'today',
  'amid',
  'thing',
  'time',
  'life'],
 ['coronavirus',
  'covid',
  'news',
  'virus',
  'live',
  'world',
  'read',
  'follow',
  'outbreak',
  'rt',
  'late'

In [17]:
# Save the topics into topics.txt

with open("topics.txt", "w", encoding="utf8") as f:
    for topic in topics:
        f.write(" ".join(topic) + "\n")

In [None]:
# Visualize the model (very expensive on models trained on large sets)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model, corpus, id2word, mds="mmds", R=20) # 20 top relevant tokens
vis

In [None]:
# Generate topics for unseen data
# Will start working with the other 80 pc of data once we have a verified up and running LDA model
# This is sample result

unseen = ["what are you doing?", "What's up bro?"]


for doc in gen_words(unseen):
    vec = id2word.doc2bow(doc)
    topics = model[vec]
    for topic in topics:
        print(model.show_topic(topic[0]), topic[1])
    print("=================================================================================")