## What are people talking about ?

In [3]:
# Keyword and topic extraction

import os
import re
import pyLDAvis
import pyLDAvis.gensim 
import collections
import math
import pandas as pd
import operator
import spacy
import nltk.tokenize
from gensim.models import CoherenceModel

nlp = spacy.load('en', disable=['parser', 'ner'])

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

from gensim import corpora,models,similarities
from gensim.parsing.preprocessing import (strip_punctuation,strip_numeric,stem_text,
                                          strip_multiple_whitespaces,strip_non_alphanum,
                                          remove_stopwords,strip_short)
from gensim.parsing.preprocessing import preprocess_string

CUSTOM_FILTERS = [lambda x: x.lower(),
                  strip_multiple_whitespaces,
                  strip_numeric,
                  strip_punctuation,
                  remove_stopwords,
                  strip_short]

from_sender_regex = re.compile('From\s([^\s]+\sat\s[^\.]+\.[a-z0-9]+)(.*)\nFrom\:\s[^\s]+\sat\s[^\.]+\.[a-z0-9]+',
                               re.MULTILINE|re.UNICODE)
from_forwarded_regex = re.compile('From\:')
message_id_regex = re.compile('Message\-ID\:\s+\<([^<>]+)\>')
reference_id_regex = re.compile('References\:\s+\<([^<>]+)\>')
url_regex = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
mobile_number_regex = re.compile('\+?\d{2}\-?\s*\d\s*\d\s*\d\s*\d\s*\d\s*\d\s*\d\s*\d\s*\d\s*\d|^Mobile\s*\:|^Phone\s*\:|^Email\s*\:')

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [4]:
def lemmatization(texts, allowed_postags=['NOUN']):
    """ Parse sentences, perform NLP and return only words matching given POS Tags """
    
    texts_out = []
   
    for sent in texts:
        # print("Sent",sent)
        doc = nlp(sent) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    
    return texts_out

In [5]:
def sent_to_words(sentences):
    """ Tokenize sentences to words """
    
    all_words = []
    
    sentences = lemmatization(sentences)
    # print('Sentences =>', sentences)
    for sentence in sentences:
        
        try:
            all_words += preprocess_string(str(sentence), CUSTOM_FILTERS)  # deacc=True removes punctuations
        except:
            pass

    return all_words

In [4]:
def clean_email_extract_keywords(email_data, full_path, sents=False):
    """ Clean an email text data string and return keywords/sentences """

    # Email content begins after Message-ID: line
    m = message_id_regex.search(email_data)
    if m == None:
        print('Fatal - Missing Message-ID in email!')
        sys.exit(1)

    data = email_data[m.end():]

    # Remove any forwarded part of the email!
    m = from_forwarded_regex.search(data)
    if m != None:
        data = data[:m.start()].strip()
        
    # Split in newlines and filter empty lines
    data_lines = filter(None, [item.strip() for item in data.strip().split('\n')])
    valid_lines = []
    
    # Skip the following lines
    for line in data_lines:
        line = line.strip()
        # Any quoted emails     
        if line.startswith('>'):continue
        # Similarly skip anything with -[A-Z]+
        if re.match('\-+[A-Z][a-zA-Z]+', line):
            continue
        # Skip -- ...
        # Anything starting with a -- is usually a name or signature of email
        if re.match('\-{2,}|\*{1,}|\={1,}|\?{1,}|\<{1,}|\_{4,}', line):
            continue
        # Skip 'Regards'
        if re.match('regards|with\s+regards|thanks|with\s+thanks|thank\s+you|hth|cheers|visit\:|sincerely|your\'?s|thanking',
                    line, re.IGNORECASE):
            continue
        # Skip ML signature lines
        if re.match('BangPypers\s+mailing\s+list|BangPypers\s+at\s+python\.org', line, re.IGNORECASE):
            continue
        if line.startswith('http://mail.python.org/mailman/listinfo/bangpypers'):
            continue
        # Skip HTML attachment lines
        if re.match('An?\s+[a-zA-Z\-]+\s+attachment\s+was\s+scrubbed|URL\:|Name\:|Type\:|Size\:|Desc:', line):
            continue        
        # Skip the "On so and so date, dude wrote" lines
        if re.search('On\s+.*\s+wrote\:|On\s+[A-Z][a-z]+\,\s*[A-Z][a-z]+\s+\d+\,\s+\d{4}|On\s+\d/\d+/\d+|On\s+[A-Z][a-z]+\s+\d+\,?\s+\d{2,4}|On\s+\d+\-?[A-Z][a-z]+\-\d{2,4}', line):
            continue

        # Skip "anyone wrote:" lines
        if re.match('[a-zA-Z0-9\s\-\._\>\<\(\)]+\s+wrote\:|wrote\:', line):
            continue
        
        # Skip Sent from ... lines
        if re.match('Sent\s+from', line, re.IGNORECASE):
            continue

        # Skip Hello lines
        if re.match(r'\b(Hello|Hi)\b', line, re.IGNORECASE):
            continue
        
        # Skip anything matching a URL or mobile
        if url_regex.search(line) or mobile_number_regex.search(line):
            continue

        valid_lines.append(line.strip())
    
    text = ' '.join(valid_lines)
    sentences = nltk.tokenize.sent_tokenize(text)
    
    if sents:
        return sentences
    
    data_words = sent_to_words(sentences)
        
    return data_words

  if re.match('\-+[A-Z][a-zA-Z]+', line):
  if re.match('\-{2,}|\*{1,}|\={1,}|\?{1,}|\<{1,}|\_{4,}', line):
  if re.match('regards|with\s+regards|thanks|with\s+thanks|thank\s+you|hth|cheers|visit\:|sincerely|your\'?s|thanking',
  if re.match('BangPypers\s+mailing\s+list|BangPypers\s+at\s+python\.org', line, re.IGNORECASE):
  if re.match('An?\s+[a-zA-Z\-]+\s+attachment\s+was\s+scrubbed|URL\:|Name\:|Type\:|Size\:|Desc:', line):
  if re.search('On\s+.*\s+wrote\:|On\s+[A-Z][a-z]+\,\s*[A-Z][a-z]+\s+\d+\,\s+\d{4}|On\s+\d/\d+/\d+|On\s+[A-Z][a-z]+\s+\d+\,?\s+\d{2,4}|On\s+\d+\-?[A-Z][a-z]+\-\d{2,4}', line):
  if re.match('[a-zA-Z0-9\s\-\._\>\<\(\)]+\s+wrote\:|wrote\:', line):
  if re.match('Sent\s+from', line, re.IGNORECASE):


In [5]:
def perform_lda(all_words):
    """ Create LDA topic model from word lists """
      
    # NOT USED
    dictionary = corpora.Dictionary(all_words)
    #print(dictionary)

    corpus = [dictionary.doc2bow(text) for text in all_words]
    #print([[(dictionary[id], freq) for id,freq in cp] for cp in corpus[:1]])

    tfidf = models.TfidfModel(corpus)

    trans_tfidf = tfidf[corpus]

    lda = models.LdaMulticore(trans_tfidf, num_topics=10, id2word=dictionary)
    print(lda.show_topics())
    
    return lda, corpus, dictionary

In [6]:
def perform_lda2(all_words):
    """ Create LDA topic model from words lists (customized) """
      
    id2word = corpora.Dictionary(all_words)
    
    corpus = [id2word.doc2bow(text) for text in all_words]

    # trans_tfidf = tfidf[corpus]
    lda_model =   models.LdaMulticore(corpus=corpus,
                                      id2word=id2word,
                                      num_topics=20, 
                                      random_state=100,
                                      chunksize=100,
                                      passes=10,
                                      # alpha='auto',
                                      workers=7,
                                      per_word_topics=True)
    
    print(lda_model.show_topics())
       
    return all_words, lda_model, corpus, id2word

In [6]:
def top_words(all_words, count=10):
    """ Find top words using Tf/idf """
    
    all_words = [' '.join(x) for x in all_words]

    cv=CountVectorizer()
    word_count_vector=cv.fit_transform(all_words)
    tft = TfidfTransformer(sublinear_tf=True)
    tft.fit_transform(word_count_vector)

    tf_idf = pd.DataFrame(tft.idf_, index=cv.get_feature_names(),columns=["idf_weights"])
 
    return tf_idf

In [8]:
def extract_email_sentences(root='archives', start=2007, end=2019, year=None, type='frequency'):
    """ Extract email sentences """
    
    for y in range(start, end+1):
        if year != None and y != year: continue
        
        year_root = os.path.join(root, str(y))
        # Parse .eml files
        all_lines = []
        all_sents = []
        
        for root_dir,dirs,files in os.walk(year_root):
            for f in files:
                if f.endswith('.eml'):
                    full_path = os.path.join(root_dir, f)
                    # print('Parsing',full_path)                  
                    data = open(full_path).read()
                    sents = clean_email_extract_keywords(data, full_path, sents=True)
                    all_sents.append(sents)
                    
        return all_sents

In [7]:
def analyze_emails(root='archives', start=2007, end=2019, year=None, type='frequency'):
    """ Extract email data and perform analysis """
    
    for y in range(start, end+1):
        if year != None and y != year: continue
            
        year_root = os.path.join(root, str(y))
        # Parse .eml files
        all_lines = []
        all_words = []
        
        for root_dir,dirs,files in os.walk(year_root):
            for f in files:
                if f.endswith('.eml'):
                    full_path = os.path.join(root_dir, f)
                    # print('Parsing',full_path)                  
                    data = open(full_path).read()
                    words = clean_email_extract_keywords(data, full_path)
                    all_words.append(words)
        
        if type == 'frequency':
            return top_words(all_words)
        elif type == 'lda':
            return perform_lda2(all_words)

### Frequency Analysis using tf/idf

In [1]:
year = 2009

In [8]:
result = %time analyze_emails(year=year)

NameError: name 'clean_email_extract_keywords' is not defined

In [9]:
display_tfidf = lambda x, y, z: x[x>=y].sort_values(by='idf_weights',ascending=z)[:50]

In [10]:
display_tfidf(result, 2.0, False)

TypeError: '>=' not supported between instances of 'NoneType' and 'float'

### Topic Analysis using LDA

In [28]:
import pickle
from IPython.core.display import HTML

def display_lda(lda, corpus, dictionary, year=None, key=0, type='lda'):
    """ Display LDA model using pyLDAvis """
        
    pyLDAvis.enable_notebook()
    
    def inner():
        cache_file = '_'.join(('lda_viz', '.', str(year), str(key), '_', type))
        if year and os.path.isfile(cache_file):
            print('Loading from cache')
            vis = pickle.load(open(cache_file, 'rb'))
        else:
            vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
            pickle.dump(vis, open(cache_file, 'wb'))
            
        viz = pyLDAvis.display(vis)
        return HTML(viz.data)
    
    return inner()

In [15]:
all_words, lda_model, corpus, id2word = %time analyze_emails(year=year, type='lda')

[(19, '0.040*"python" + 0.026*"performance" + 0.026*"language" + 0.023*"project" + 0.021*"page" + 0.019*"lot" + 0.018*"wiki" + 0.018*"array" + 0.016*"thread" + 0.015*"comment"'), (12, '0.136*"python" + 0.023*"file" + 0.017*"script" + 0.017*"window" + 0.016*"command" + 0.016*"version" + 0.012*"logo" + 0.011*"bit" + 0.011*"event" + 0.011*"library"'), (4, '0.071*"day" + 0.044*"way" + 0.040*"piece" + 0.033*"life" + 0.030*"attitude" + 0.029*"failure" + 0.029*"miracle" + 0.029*"coal" + 0.029*"pressure" + 0.028*"happiness"'), (16, '0.019*"list" + 0.017*"expression" + 0.016*"python" + 0.016*"syntax" + 0.015*"release" + 0.014*"argument" + 0.014*"bug" + 0.014*"reference" + 0.013*"api" + 0.012*"framework"'), (0, '0.079*"mail" + 0.041*"noufal" + 0.030*"attachment" + 0.028*"try" + 0.024*"length" + 0.021*"copying" + 0.014*"saha" + 0.014*"sender" + 0.014*"care" + 0.014*"processing"'), (6, '0.041*"value" + 0.033*"code" + 0.028*"query" + 0.026*"way" + 0.023*"print" + 0.016*"function" + 0.016*"argument"

In [16]:
# Find coherence of the model, larger the coherence the better
coh_model_lda = CoherenceModel(model=lda_model, texts=all_words, dictionary=id2word, coherence='c_v')
coh_model_lda.get_coherence()

0.4240676803689537

In [17]:
# Log perplexity of the moderl, smaller the better
lda_model.log_perplexity(corpus)

-7.9786437269318355

In [29]:
display_lda(lda_model, corpus, id2word, year)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Topic Analysis using LDA Mallet

In [19]:
import gensim
from pprint import pprint

# mallet: http://mallet.cs.umass.edu/index.php
mallet_path = 'mallet/bin/mallet' # Path to mallet binary
lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

pprint(lda_mallet.show_topics(formatted=False))

[(11,
  [('idea', 0.09799554565701558),
   ('problem', 0.08314773570898293),
   ('page', 0.06904231625835189),
   ('logo', 0.05270972531551596),
   ('number', 0.051224944320712694),
   ('wiki', 0.03414996288047513),
   ('kind', 0.0334075723830735),
   ('design', 0.032665181885671864),
   ('software', 0.032665181885671864),
   ('school', 0.025983667409057165)]),
 (17,
  [('question', 0.09235427706283118),
   ('post', 0.080999242997729),
   ('link', 0.05753217259651779),
   ('reason', 0.045420136260408785),
   ('blog', 0.03557910673732021),
   ('forum', 0.033308099924299776),
   ('context', 0.03255109765329296),
   ('image', 0.028009084027252083),
   ('posting', 0.028009084027252083),
   ('information', 0.021196063588190765)]),
 (19,
  [('code', 0.22781954887218045),
   ('app', 0.09323308270676692),
   ('exception', 0.027819548872180452),
   ('security', 0.027819548872180452),
   ('statement', 0.02706766917293233),
   ('thing', 0.02481203007518797),
   ('loop', 0.021052631578947368),
   

In [20]:
coh_model_lda = CoherenceModel(model=lda_mallet, texts=all_words, dictionary=id2word, coherence='c_v')
coh_model_lda.get_coherence()

0.47043825472392903

### Compute best Model using coherence 

In [21]:
def compute_best_model(dictionary, corpus, texts, limit, start=5, step=3):
    """
    Compute best model using C_V coherence values for a range of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    topic_range = range(start, limit, step)
        
    for num_topics in topic_range:
        print('Computing model for #topic - {}'.format(num_topics))
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word, workers=7)
        print('Model for #topic - {} computed'.format(num_topics))
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    
    for m, cv in zip(topic_range, coherence_values):
        print("#topic {} has Coherence Value of {}".format(m, round(cv, 4)))

    return model_list, coherence_values

In [22]:
model_list, coherence_values = %time compute_best_model(id2word, corpus, all_words, 30)

Computing model for #topic - 5
Model for #topic - 5 computed
Computing model for #topic - 8
Model for #topic - 8 computed
Computing model for #topic - 11
Model for #topic - 11 computed
Computing model for #topic - 14
Model for #topic - 14 computed
Computing model for #topic - 17
Model for #topic - 17 computed
Computing model for #topic - 20
Model for #topic - 20 computed
Computing model for #topic - 23
Model for #topic - 23 computed
Computing model for #topic - 26
Model for #topic - 26 computed
Computing model for #topic - 29
Model for #topic - 29 computed
#topic 5 has Coherence Value of 0.4889
#topic 8 has Coherence Value of 0.5135
#topic 11 has Coherence Value of 0.4812
#topic 14 has Coherence Value of 0.4879
#topic 17 has Coherence Value of 0.4424
#topic 20 has Coherence Value of 0.4546
#topic 23 has Coherence Value of 0.458
#topic 26 has Coherence Value of 0.4202
#topic 29 has Coherence Value of 0.4417
CPU times: user 8.02 s, sys: 961 ms, total: 8.98 s
Wall time: 5min 18s


In [31]:
# Model with 8 topics has highest coherence
# Convert to LDA model first
lda_mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model_list[1])

display_lda(lda_mallet_model, corpus, id2word, year, 1, 'mallet')

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Finding dominant topics in sentences

In [39]:
def format_topics_sentences(ldamodel, corpus, texts):
    """ Find the dominant topic(s) in each sentence """

    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            # print(j, topic_num, prop_topic)
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [40]:
all_sents = extract_email_sentences(year=year)
df_topic_sents_keywords = format_topics_sentences(ldamodel=model_list[1], corpus=corpus, texts=all_sents)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords (Topics)', 'Text']

# Show
df_dominant_topic.head(20)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords (Topics),Text
0,0,5.0,0.1767,"time, web, point, app, thread, lot, guy, job, ...","[This was quite an amazing read., I looked whe..."
1,1,3.0,0.1874,"list, class, function, code, method, object, p...",[For some vague reason the click() function is...
2,2,0.0,0.1828,"people, meeting, conference, event, idea, disc...",[Every one suddenly getting interested in conc...
3,3,1.0,0.14,"year, project, developer, work, experience, so...","[Yes, I need exactly the same functionality]"
4,4,7.0,0.221,"language, problem, programming, thing, program...",[I guess my point is this: If you've chosen py...
5,5,3.0,0.207,"list, class, function, code, method, object, p...","[This is what I would do: l = [(name, object, ..."
6,6,2.0,0.1394,"application, user, page, server, datum, featur...","[issue is created: comments welcome., Kenneth ..."
7,7,4.0,0.1422,"day, code, question, life, issue, text, sessio...",[top posting almost invariably causes confusio...
8,8,1.0,0.1422,"year, project, developer, work, experience, so...","[Well,it's like a game., Mapping the names wit..."
9,9,3.0,0.1334,"list, class, function, code, method, object, p...","[I am all for it., Perhaps Aug1 or Aug8., Lets..."


### Find the most representative text for each topic

In [35]:
def find_representative_topic(df_topic_sents_keywords):
    """ Finding most representative sentence(s) for each topic """
    
    # Group top 5 sentences under each topic
    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                                 grp.sort_values(['Perc_Contribution'], ascending=[0]).head(2)], 
                                                axis=0)
    # Reset Index    
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords (Topics)", "Text"]
    # Show
    return sent_topics_sorteddf_mallet.head(10)

In [36]:
find_representative_topic(df_topic_sents_keywords)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords (Topics),Text
0,0.0,0.6604,"people, meeting, conference, event, idea, disc...","[Let me chime in here., A couple of us in B'lo..."
1,0.0,0.6369,"people, meeting, conference, event, idea, disc...","[May 3rd at ThoughtWorks then., Shall we say 4..."
2,1.0,0.5165,"year, project, developer, work, experience, so...","[Few FOSS based python jobs in IIT Bombay., Pl..."
3,1.0,0.4437,"year, project, developer, work, experience, so...",[We at Essentia are looking for candidate at j...
4,2.0,0.6044,"application, user, page, server, datum, featur...","[Greetings all, We are proud to announce the r..."
5,2.0,0.5864,"application, user, page, server, datum, featur...","[Greetings all, We are proud to announce the r..."
6,3.0,0.7051,"list, class, function, code, method, object, p...",[I will try to explain this the other way arou...
7,3.0,0.6321,"list, class, function, code, method, object, p...","[Well Vijay, you have asked a sweeping questio..."
8,4.0,0.3646,"day, code, question, life, issue, text, sessio...",[Cool....While I was using preprocessor direct...
9,4.0,0.3567,"day, code, question, life, issue, text, sessio...",[trackers forum was that I was not using BOUML...


### References

1. https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#10removestopwordsmakebigramsandlemmatize
2. https://towardsdatascience.com/topic-modeling-with-gensim-a5609cefccc
2. https://radimrehurek.com/gensim/
3. https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/