<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-and-directory" data-toc-modified-id="Import-and-directory-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import and directory</a></span></li><li><span><a href="#Sequential-Topic-Modelling" data-toc-modified-id="Sequential-Topic-Modelling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Sequential Topic Modelling</a></span><ul class="toc-item"><li><span><a href="#Import-for-model" data-toc-modified-id="Import-for-model-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Import for model</a></span></li><li><span><a href="#Stop-Words" data-toc-modified-id="Stop-Words-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Stop Words</a></span></li><li><span><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Data Preprocessing</a></span></li><li><span><a href="#Sequential-modelling-part" data-toc-modified-id="Sequential-modelling-part-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Sequential modelling part</a></span></li><li><span><a href="#Get-document-topic-dominant-probability" data-toc-modified-id="Get-document-topic-dominant-probability-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Get document-topic dominant probability</a></span></li></ul></li></ul></div>

# Import and directory

In [3]:
import pandas as pd
import numpy as np
import os

In [11]:
from pathlib import *
# Project root (the "ai" folder).
# Defaults to the original machine-specific location; set the AI_PROJECT_DIR
# environment variable to run on another machine without editing this cell.
# (Alternative for running from inside the project folder: Path.cwd())
current_dir = Path(os.environ.get('AI_PROJECT_DIR',
                                  '/media/simone/data/ra/avery/google_drive/ai'))
# Data lives under <current_dir>/Avery_output/AI_sequential_topic/

# Go up one level to the first parent directory.
parent_dir = current_dir.parents[0]

parent_dir

PosixPath('/media/simone/data/ra/avery/google_drive')

In [12]:
# Full path of the input CSV:
#   <parent_dir>/ai/Avery_output/AI_sequential_topic/<data_filename>
# Note: despite its name, `data_dir` points at the file itself.
data_filename = 'AI_topic_company_10k.csv'
data_dir = parent_dir.joinpath('ai', 'Avery_output', 'AI_sequential_topic', data_filename)

data_dir

PosixPath('/media/simone/data/ra/avery/google_drive/ai/Avery_output/AI_sequential_topic/AI_topic_company_10k.csv')

In [4]:
from IPython.display import Audio, display

def allDone():
    """Play a short audio clip in the notebook output.

    Call this at the end of a long-running cell to get an audible
    signal that the task has finished. The clip is fetched from a
    remote URL and set to autoplay.
    """
    clip_url = 'https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'
    player = Audio(url=clip_url, autoplay=True)
    display(player)

# Sequential Topic Modelling

## Import for model

In [5]:
# Plotting tools
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import re
import numpy as np
import pandas as pd

#NLTK
import nltk
# nltk.download('stopwords')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel

# spacy for lemmatization
import spacy
# Load the spaCy pipeline only when the name `nlp` is not already defined,
# so re-running this cell does not reload the model. The 'parser' and 'ner'
# components are disabled in the load call.
if 'nlp' not in locals():
    nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
#     nlp = spacy.load('en', disable=['parser', 'ner'])

import gc
import os
from glob import glob
from IPython.display import Audio, display

# Text Analysis (uncomment if running for first time)
# ! wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
# ! unzip mallet-2.0.8.zip
# MALLET_PATH = 'mallet-2.0.8/bin/mallet'

## Stop Words

In [6]:
# NLTK Stop words
from nltk.corpus import stopwords

# NLTK English stopwords.
stop_words = stopwords.words('english')

# Extend the list with a personal list of stopwords.
stop_words.extend(['from', 'need', 'thank', 'thing', 'something', 'see', 'say', 'well', 'people', 'change', 'com',
                   'go', 'put', 'give', 'twitter', 'pic',
                   'subject', 're', 'edu', 'could', 'be', 'make', 'not', 'make', 'find', 'let', 'may', 'see', 'would',
                   'come', 'sure', 'ever', 'tell', 'use', 'not', 'doing', 'be', 'get', 'want'])

# Extend with the search terms used to collect the tweets.
# A comma was missing after 'neural network', so Python's implicit string
# concatenation produced the single entry 'neural networkpattern recognition'.
# NOTE(review): remove_stopwords() compares these entries against single
# tokens, so the multi-word and '#'-prefixed entries presumably never match
# as written -- confirm whether underscore-joined variants are wanted.
stop_words.extend(['artificial intelligence', '#ai', '#ml', '#nlp', 'analytics', 'data mining',
                   'deep mining', 'machine learning', 'natural language processing', 'neural network',
                   'pattern recognition'])

## Data Preprocessing

In [7]:
### Tokenize words and Clean-up text
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (which strips accent marks
    from tokens; punctuation is discarded by the tokenizer itself).

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )
        
###Remove Stopwords, Make Bigrams and Lemmatize
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Remove stopwords from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, so a document may be either a
    raw string or an already-tokenized list.

    Args:
        texts: iterable of documents.

    Returns:
        A list with one list of tokens per document, excluding every token
        found in the module-level ``stop_words``.
    """
    # Build a set once per call: membership tests against the stop_words
    # list are linear in its length and run for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(bigram_mod, texts):
    """Apply ``bigram_mod`` to each document.

    Args:
        bigram_mod: object indexed with a document (``bigram_mod[doc]``).
        texts: iterable of documents.

    Returns:
        A list holding ``bigram_mod[doc]`` for every document, in order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts, bigram_mod=None, trigram_mod=None):
    """Apply the bigram model and then the trigram model to each document.

    The original version read ``bigram_mod`` and ``trigram_mod`` as module
    globals, but no cell defines them at module level, so every call with a
    non-empty input raised ``NameError``. They can now be passed explicitly;
    when omitted, the module-level names are still used if they exist.

    Args:
        texts: iterable of documents.
        bigram_mod: object indexed with a document (``bigram_mod[doc]``).
        trigram_mod: object indexed with the bigram-transformed document.

    Returns:
        A list holding ``trigram_mod[bigram_mod[doc]]`` for every document.

    Raises:
        NameError: if a model is neither passed in nor defined globally
            (only when there is at least one document to transform).
    """
    docs = list(texts)
    if not docs:
        # Nothing to transform; the models are not needed.
        return []

    if bigram_mod is None or trigram_mod is None:
        module_scope = globals()
        try:
            if bigram_mod is None:
                bigram_mod = module_scope['bigram_mod']
            if trigram_mod is None:
                trigram_mod = module_scope['trigram_mod']
        except KeyError as missing:
            raise NameError(
                "name %r is not defined; pass bigram_mod and trigram_mod explicitly"
                % missing.args[0]
            ) from None

    return [trigram_mod[bigram_mod[doc]] for doc in docs]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through
    the module-level spaCy pipeline ``nlp``; the lemma of every resulting
    token whose ``pos_`` is in ``allowed_postags`` is kept.

    POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists.
        allowed_postags: POS tags to keep (read only, never modified).

    Returns:
        A list with one list of lemmas per input document.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]


def lda_preprocessing(df, text_col='tweets'):
    """Clean, tokenize and lemmatize a text column for topic modelling.

    Steps: strip e-mail-like tokens, collapse whitespace, drop single quotes,
    strip links, lower-case, tokenize, remove stopwords, apply a bigram
    phrase model, lemmatize (nouns and verbs only), remove stopwords again,
    then build the gensim dictionary and bag-of-words corpus.

    Args:
        df: DataFrame holding the documents.
        text_col: name of the column containing the text (default 'tweets').

    Returns:
        Tuple ``(data_lemmatized, corpus, id2word)``: the list of lemma
        lists, the bag-of-words corpus (one entry per row of ``df``) and the
        gensim ``Dictionary`` built from the lemmas.
    """
    ### Remove emails, newline characters, and links
    # Convert to list
    data = df[text_col].values.tolist()

    # Remove e-mails. The pattern removes every whitespace-delimited token
    # that contains '@', so Twitter @mentions are removed as well.
    # (Raw strings: '\S' / '\s' are invalid escapes in normal literals.)
    data = [re.sub(r'\S*@\S*\s?', '', str(tweet)) for tweet in data]

    # Collapse runs of whitespace (including new lines) into one space
    data = [re.sub(r'\s+', ' ', tweet) for tweet in data]

    # Remove distracting single quotes
    data = [re.sub("'", "", tweet) for tweet in data]

    # Remove links
    data = [re.sub(r"http\S+", "", tweet) for tweet in data]

    # Make lower case
    data = [tweet.lower() for tweet in data]

    # Tokenize words and clean up text
    data_words = list(sent_to_words(data))

    ### Create the bigram model
    # Higher threshold -> fewer phrases.
    # The earlier version also trained a trigram model here, but it was never
    # applied to the data, so that (expensive) step has been dropped.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

    # Faster way to get a sentence clubbed as a bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Remove stop words
    data_words_nostops = remove_stopwords(data_words)

    # Form bigrams
    data_words_bigrams = make_bigrams(bigram_mod, data_words_nostops)

    # Lemmatize, keeping only nouns and verbs
    # (adjectives and adverbs are left out just for this topic model).
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB'])
    # Lemmatization can produce stopwords again, so filter once more.
    data_lemmatized = remove_stopwords(data_lemmatized)

    ### Create the dictionary and corpus needed for topic modelling
    # Create dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Term-document frequency (bag of words) for every document
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    return data_lemmatized, corpus, id2word

## Sequential modelling part

In [8]:
#import Packages
from gensim.models import LdaSeqModel
from gensim.models import LdaMulticore

# Import df
# Reads the CSV whose path is stored in `data_dir` (set in an earlier cell).
df = pd.read_csv(data_dir)

# Run the preprocessing pipeline on the loaded frame. The three results
# (lemmatized documents, bag-of-words corpus, gensim dictionary) are used by
# the modelling cells below.
data_lemmatized, corpus, id2word = lda_preprocessing(df)
    

In [None]:

# Train the sequential (dynamic) topic model, extract each document's
# dominant topic, merge it into `df` and save the result.

# Number of topics (defined before use so the model and later cells agree).
num_topics = 20

# Number of documents in each consecutive time slice (one slice per year).
# Change the slice sizes based on how many documents each year has.
### For 100k dataset use this
# time_slice = [100000, 100000, 100000, 100000, 100000, 100000, 75179]
# For 10k dataset use this
time_slice = [10000, 10000, 10000]

# The slices are meant to account for every document; flag a mismatch
# before starting the (long) training run.
if sum(time_slice) != len(corpus):
    warnings.warn("time_slice sums to %d but the corpus has %d documents"
                  % (sum(time_slice), len(corpus)))

topic_model = LdaSeqModel(corpus=corpus, id2word=id2word,
                          time_slice=time_slice,
                          num_topics=num_topics)


# Get the document topic probability relationship:
# one row per document, one column per topic.
matrix = [topic_model.doc_topics(doc_idx) for doc_idx in range(len(corpus))]

df_ref = pd.DataFrame(matrix)

df_dom = pd.DataFrame()

# ref: https://thispointer.com/pandas-find-maximum-values-position-in-columns-or-rows-of-a-dataframe/
# Column label (topic number) of the maximum value in every row, and that maximum.
df_dom['topic_n'] = df_ref.idxmax(axis=1)
df_dom['prob'] = df_ref.max(axis=1)

# Merge the dominant topic into df and save.
# Drop any 'topic_n'/'prob' columns left by a previous run first, so that
# re-running does not create suffixed duplicates (topic_n_x / topic_n_y).
df = (df.drop(columns=['topic_n', 'prob'], errors='ignore')
        .merge(df_dom, left_index=True, right_index=True))

OUTPUT = parent_dir/'ai'/'Avery_output'/'AI_sequential_topic'
outname ='AI_topic_company_10k_domtopic.csv'


df.to_csv(os.path.join(str(OUTPUT),outname), index = False)



allDone()

  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  convergence = np.fabs((bound - old_bound) / old_bound)
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))


## Get document-topic dominant probability

In [None]:
# Build the document-topic matrix (one row per document, one column per
# topic) from the trained model, then keep each document's dominant topic.
num_topics = 20
matrix = [topic_model.doc_topics(doc_idx) for doc_idx in range(len(corpus))]

df_ref = pd.DataFrame(matrix)

# ref: https://thispointer.com/pandas-find-maximum-values-position-in-columns-or-rows-of-a-dataframe/
# 'topic_n' is the column label of each row's maximum; 'prob' is that maximum.
df_dom = pd.DataFrame({
    'topic_n': df_ref.idxmax(axis=1),
    'prob': df_ref.max(axis=1),
})



In [None]:
# Merge the dominant-topic columns into df and save.
# `df` may already carry 'topic_n'/'prob' from a previous merge (this cell
# re-run, or an earlier cell); drop them first so the merge does not create
# suffixed duplicates (topic_n_x / topic_n_y) in the saved file.
df = (df.drop(columns=['topic_n', 'prob'], errors='ignore')
        .merge(df_dom, left_index=True, right_index=True))

OUTPUT = parent_dir/'ai'/'Avery_output'/'AI_sequential_topic'
outname ='AI_topic_company_10k_domtopic.csv'


df.to_csv(os.path.join(str(OUTPUT),outname), index = False)

In [None]:
# Play the completion sound to signal that the cells above have finished.
allDone()

In [13]:
# import pandas as pd

# pd.read_csv('/Users/averysoh/Google Drive (racass1234@gmail.com)/AI Project/Avery_output/AI_sequential_topic/AI_topic_company_10k.csv')

Unnamed: 0,JP Morgan,Etoro,Google,Visa,Goldman Sachs,Unilever,Deloitte,Samsung,Wells Fargo,Allianz,Apple,Commerzbank,tweets,year
0,0,0,1,0,0,0,0,0,0,0,0,0,this looks like it might be a good google anal...,2013
1,0,0,1,0,0,0,0,0,0,0,0,0,like that most google apps support multiple ac...,2013
2,0,0,1,0,0,0,0,0,0,0,0,0,how to track unclicked video ad impressions wi...,2013
3,0,0,1,0,0,0,0,0,0,0,0,0,using google analytics to measure social media...,2013
4,1,0,0,0,0,0,0,0,0,0,0,0,business analytics &amp; reporting #manager (v...,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,1,0,@eric_analytics porsche is the perfect example...,2015
29996,0,0,1,0,0,0,0,0,0,0,0,0,it's lunch time during our @googleanalytics tr...,2015
29997,0,0,1,0,0,0,0,0,0,0,0,0,el usuario de google analytics #infografia #in...,2015
29998,0,0,1,0,0,0,0,0,0,0,0,0,google analytics is an incredibly important to...,2015
