<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Topic-modelling" data-toc-modified-id="Topic-modelling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Topic modelling</a></span><ul class="toc-item"><li><span><a href="#Set-working-directory" data-toc-modified-id="Set-working-directory-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Set working directory</a></span></li><li><span><a href="#Alert-Function" data-toc-modified-id="Alert-Function-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Alert Function</a></span></li><li><span><a href="#Import-Data" data-toc-modified-id="Import-Data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Import Data</a></span></li><li><span><a href="#Stop-Words" data-toc-modified-id="Stop-Words-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Stop Words</a></span></li><li><span><a href="#Model-Preprocessing-Functions" data-toc-modified-id="Model-Preprocessing-Functions-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Model Preprocessing Functions</a></span></li></ul></li></ul></div>

# Topic modelling

**Frequency of dominant topics in documents**


    Date: 03/12/19
    Author: Avery
    Source Code: Simone and Toni
    Last Updated: 06/12/19


In [1]:
# Plotting tools
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import re
import numpy as np
import pandas as pd

#NLTK
import nltk
# nltk.download('stopwords')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel

# spacy for lemmatization
import spacy
if 'nlp' not in locals():
    # nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
    nlp = spacy.load('en', disable=['parser', 'ner'])

import gc
import os
from glob import glob
from IPython.display import Audio, display

# Text Analysis (uncomment if running for first time)
# ! wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# ! unzip mallet-2.0.8.zip
MALLET_PATH = 'mallet-2.0.8/bin/mallet'

## Set working directory

In [2]:
from pathlib import *
#current working directory
current_dir = Path.cwd()
#go up 1 level to the 1st parent directory
Par1_dir = current_dir.parents[0]

Par1_dir

PosixPath('/Users/averysoh/Google Drive (racass1234@gmail.com)')

## Alert Function

In [3]:
def allDone():
    '''this function outputs a short audio when called. 
    Typically this is used to signal a task completion'''
    
    display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))

In [4]:
allDone()

## Import Data

Whats happening here:
- Importing a test file to process the data and see the visualisation 
- slice the dataframe into yearly slices, then take a random sample of size=100k for each year and save the DF samples in a list.
- delete the DF imported and call for garbage collection to clear the memory

Reason for sampling will be mentioned below in another step

In [5]:
#set the file and data directories
df_filename = 'globe_Machine Learning_all_.csv'
df_data_dir = Par1_dir /'AI Project'/'Python Env' / 'DataFrames'/ 'AI_datasets' / 'globe_ai_13_18'/ df_filename

#import only the required columns from the DF
df = pd.read_csv(df_data_dir, usecols=['tweet', 'date/time', 'search'])

#find dataframe's search word, and remove quotations f present
search_word = df.head(1).search.item().replace('"', '').replace("'","")
#drop unused columns
df = df.loc[:, ['tweet', 'date/time']]

df_list = []
# #create a random Sample of 100k for each year & drop NAs
for x in range(2013,2019):
    globals()['df_%s' % x] = df.loc[(df['date/time']>=str(x)+'-01-01 00:00:00+00:00') & (df['date/time']<str(x+1)+'-01-01 00:00:00+00:00'),: ].copy()
    n = min(100000, len(globals()['df_%s' % x]))
    globals()['df_%s' % x] = globals()['df_%s' % x].sample(n=n, random_state=11).dropna(subset=['tweet'])
    df_list.append(globals()['df_%s' % x])
    del globals()['df_%s' % x]

#delete the big DF
del df
gc.collect()


  if __name__ == '__main__':


66

## Stop Words

Process developed by Toni, refer to file "Yearly Topic Models.ipynb" in "Pcode" folder

In [6]:
# NLTK Stop words
from nltk.corpus import stopwords

#NLTK english stopwords
stop_words = stopwords.words('english')
#extend the list with a peronal list of stopwords
stop_words.extend(['from', 'need','thank','thing','something', 'see', 'say', 'well','people', 'change', 'com',\
                   'go', 'put', 'give','twitter','pic', \
                   'subject', 're', 'edu', 'could', 'be', 'make', 'not', 'make','find','let','may','see', 'would',\
                   'come', 'sure', 'ever', 'tell', 'use', 'not', 'doing', 'be', 'get','want'])
#extend the search word
stop_words.extend(['artificial intelligence', '#ai', '#ml', '#nlp', 'analytics', 'data mining',
                  'deep mining', 'machine learning', 'natural language processing', 'neural network'
                  'pattern recognition'])

## Model Preprocessing Functions

Process developed by Toni, refer to file "Yearly Topic Models.ipynb" in "Pcode" folder

In [7]:
### Tokenize words and Clean-up text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
###Remove Stopwords, Make Bigrams and Lemmatize
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(bigram_mod, texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def lda_preprocessing(df):
    ### Remove emails, newline characters, and links
    # Convert to list
    data = df.tweet.values.tolist()

    # Remove Emails
    data = [re.sub('\S*@\S*\s?', '', str(tweet)) for tweet in data]

    # Remove new line characters
    data = [re.sub('\s+', ' ', tweet) for tweet in data]

    # Remove distracting single quotes
    data = [re.sub("\'", "", tweet) for tweet in data]

    #Remove links
    data = [re.sub(r"http\S+", "", tweet) for tweet in data]

    #make lower case
    data = [tweet.lower() for tweet in data]
    
    # Tokenize words and Clean-up text
    data_words = list(sent_to_words(data))

    ###Creating Bigram and Trigram Models
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(bigram_mod, data_words_nostops)

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN','ADJ','ADV', 'VERB'])
    data_lemmatized = remove_stopwords(data_lemmatized)

    ###Create the Dictionary and Corpus needed for Topic Modeling¶
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

        
    return data_lemmatized, corpus, id2word

In [7]:
from multiprocessing import cpu_count
# Testing with sample DF
# 2013 df testing

#df = df_list[0]
#data_lemmatized, CORPUS, id2word = lda_preprocessing(df)




In [8]:
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
from multiprocessing import cpu_count

#     df = df_list[1]
for df in df_list:
    data_lemmatized, CORPUS, id2word = lda_preprocessing(df)

    #get current working directory
    cwd = os.getcwd()

    #set the number of workers to the number of cores available
    workers = cpu_count() 

    # Set number of topics
    NUM_TOPICS = 20

    ldamallet = LdaMallet(MALLET_PATH, corpus=CORPUS, num_topics=NUM_TOPICS, id2word=id2word,
                                      random_seed =11,workers=workers)

    # Transform model weights from MALLET to GENSIM
    lda_model = malletmodel2ldamodel(ldamallet)

    ############################
    # Get doc-topic probability#
    ############################

    # get transformed corpus as per the LDA model
    TRANSF_CORPUS = lda_model.get_document_topics(CORPUS)

    # rearrange data on document-topic pairs probabilities
    DOC_TOPIC_M = []

    for id, doc in enumerate(TRANSF_CORPUS):
        for topic in np.arange(0, 20, 1):
            topic_n = doc[topic][0]
            topic_prob = doc[topic][1] 
            DOC_TOPIC_M.append([id, topic, topic_prob])

    DF = pd.DataFrame(DOC_TOPIC_M)

    # rename columns
    OLD_NAMES = [0, 1, 2]
    NEW_NAMES = ['doc_id', 'topic_n', 'prob']
    COLS = dict(zip(OLD_NAMES, NEW_NAMES))
    DF.rename(columns=COLS, inplace=True)

    # Keep only dominant topic for vis
    GR = DF.groupby('doc_id')
    DF.loc[:, 'max'] = GR['prob'].transform(np.max)
    DF.loc[:, 'first_topic'] = 0
    DF.loc[DF['prob'] == DF['max'], 'first_topic'] = 1
    FIRST_TOPIC = DF.loc[DF['first_topic'] == 1]

    # write data to file
    filename = search_word + 'dom_topic'+ min(df.loc[:,'date/time'])[:10]+"_" + max(df.loc[:,'date/time'])[:10] + '.csv'
    save_dir = Par1_dir /'AI Project'/'Avery_output' / filename
    FIRST_TOPIC.to_csv(save_dir, index=True)


    # Store a dataset for visualistion later
    year_name = min(df.loc[:,'date/time'])[:4]
    VIS_DF = pd.DataFrame(FIRST_TOPIC.topic_n.value_counts()).reset_index()
    # Add a rank level
    VIS_DF.index += 1 
    VIS_DF.reset_index(inplace=True)

    # rename columns
    OLD_NAMES = ['level_0', 'index', 'topic_n']
    NEW_NAMES = ['rank', 'topic_n', 'freq']
    COLS = dict(zip(OLD_NAMES, NEW_NAMES))
    VIS_DF.rename(columns=COLS, inplace=True)

    # Add the current year
    VIS_DF['year'] = int(year_name)

    vis_filename = year_name + '.csv'
    foldername = search_word + '_vis_df'
    save_dir2 = Par1_dir /'AI Project'/'Avery_output' / foldername / vis_filename
    VIS_DF.to_csv(save_dir2, index=True)

    del DF
    gc.collect()


allDone()

KeyboardInterrupt: 

In [10]:
# Find all the file names in the folder for the visualisation
df_files = sorted(glob(str(Par1_dir /'AI Project'/'Avery_output'/ 'Machine Learning_vis_df'/'*')))
df_files

['/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Machine Learning_vis_df/2013.csv',
 '/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Machine Learning_vis_df/2014.csv',
 '/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Machine Learning_vis_df/2015.csv',
 '/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Machine Learning_vis_df/2016.csv',
 '/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Machine Learning_vis_df/2017.csv',
 '/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Machine Learning_vis_df/2018.csv']

In [11]:
# Read All the files from df_files
vis_list=[]

for x in range(len(df_files)):
    globals()['df_%s' % df_files[x][-8:-4]] = pd.read_csv(df_files[x])
    vis_list.append(globals()['df_%s' % df_files[x][-8:-4]])
    del globals()['df_%s' % df_files[x][-8:-4]]
    gc.collect()
    
    
# Store as a single df
vis_df = pd.concat(vis_list)
vis_df.drop(columns=['Unnamed: 0'], inplace=True)
vis_df

Unnamed: 0,rank,topic_n,freq,year
0,1,13,5451,2013
1,2,18,5068,2013
2,3,8,4785,2013
3,4,10,4766,2013
4,5,4,4719,2013
...,...,...,...,...
15,16,19,5452,2018
16,17,15,5203,2018
17,18,10,5167,2018
18,19,7,5097,2018


In [80]:
# Plotting

from bokeh.io import output_notebook,output_file, show, save
output_notebook()

from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.plotting import figure
from bokeh.palettes import Blues



# x=df_2013['year']
# y= df_2013['topic_n']
size = vis_df['freq']/150

source = ColumnDataSource(vis_df)

t1 = vis_df[vis_df.topic_n == 1]
st1 = ColumnDataSource(t1)

t2 = vis_df[vis_df.topic_n == 2]
st2 = ColumnDataSource(t2)

t3 = vis_df[vis_df.topic_n == 3]
st3 = ColumnDataSource(t3)

cmap = LinearColorMapper(palette=Blues[256], 
                         low = min(vis_df["rank"]), 
                         high = max(vis_df["rank"]))

p = figure(plot_width=800, plot_height=1000, title="Dominant Topic Frequency", toolbar_location=None, tools="")

p.line(x='year', y='rank', source=st1, line_width = 4, line_color = 'grey', line_alpha=0.5)
p.line(x='year', y='rank', source=st2, line_width = 4, line_color = 'grey', line_alpha=0.5)
p.line(x='year', y='rank', source=st3, line_width = 4, line_color = 'grey', line_alpha=0.5)

p.circle(x='year', y='rank', size = 30,source=source, 
         fill_color={"field":"rank", "transform":cmap})

# for x in vis_df['topic_n'].unique():
#     vis_df
#     src = ColumnDataSource(vis_df[vis_df[topic_n] == x])


p.add_tools(HoverTool(tooltips=[('Topic Number', '@topic_n'),
                                ("Frequency as Dominant Topic", "@freq"),
                                ("Rank", '@rank')]))

from bokeh.models import ColorBar
bar = ColorBar(color_mapper=cmap, location=(0,0))
p.add_layout(bar, "right")



show(p)

# TO save the file as HTML
#output_file("/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/Dominant_topic.html")
#save(p)

In [46]:
Blues[256].reverse()

In [61]:
for x in vis_df['topic_n'].unique():
    print(x)

13
18
8
10
4
9
7
17
19
15
16
11
3
2
12
0
1
14
5
6


In [70]:
vis_df[vis_df[topic_n] == 19]

KeyError: 19

In [73]:
vis_df[vis_df.topic_n == 1]

Unnamed: 0,rank,topic_n,freq,year
16,17,1,4213,2013
6,7,1,6248,2014
0,1,1,7755,2015
18,19,1,5497,2016
4,5,1,6035,2017
10,11,1,5665,2018


In [None]:
CO

In [None]:
from gensim.test.utils import common_corpus
from gensim.models import LdaSeqModel

ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=[2, 4, 3], num_topics=2, chunksize=1)