## Importing Packages and Data

In [1]:
import boto3
import pandas as pd
import nltk

#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('punkt')
#nltk.download('universal_tagset')

In [2]:
!aws s3 ls s3://ml-personalization/data/

                           PRE comprehend_topic_data/
                           PRE comprehend_topic_results/
                           PRE ecommerce_marketbasket/
                           PRE ecommerce_recommender/
                           PRE movie_lens/
                           PRE netflix/
2019-02-10 00:27:12          0 
2019-02-22 05:21:06      56866 WorkdayClasses.csv


## 

In [3]:
class_data = pd.read_csv("s3://ml-personalization/data/WorkdayClasses.csv")
class_data.head()

Unnamed: 0,Class Name,Description Text,Source
0,Coaching Habit,Coaching can help our people to perform at the...,Workday
1,Meaningful Feedback,Feedback is a catalyst for career growth and c...,Workday
2,Client Introductions and Meetings,Even the most experienced consultant must prep...,Workday
3,Ladder of Inference & The Drama Triangle,What you say and how you say it is a part of w...,Workday
4,Listening with Purpose,"""Most people do not listen with the intent to ...",Workday


## Basic Text Preprocessing

In [4]:
import preprocessing

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

class Splitter(object):
    """
    split the document into sentences and tokenize each sentence
    """
    def __init__(self):
        self.splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self,text):
        """
        out : ['What', 'can', 'I', 'say', 'about', 'this', 'place', '.']
        """
        # split into single sentence
        sentences = self.splitter.tokenize(text)
        # tokenization in each sentences
        tokens = [self.tokenizer.tokenize(sent) for sent in sentences]
        return tokens


class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass
    def get_wordnet_pos(self,treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

    def pos_tag(self,tokens):
        # find the pos tagginf for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = [nltk.pos_tag(token) for token in tokens]

        # lemmatization using pos tagg   
        # convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
        pos_tokens = [ [(word, lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag)), [pos_tag]) for (word,pos_tag) in pos] for pos in pos_tokens]
        return pos_tokens

lemmatizer = WordNetLemmatizer()
splitter = Splitter()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

#step 1 split document into sentence followed by tokenization
tokens = class_data['Description Text'].apply(lambda x: splitter.split(x))

#step 2 lemmatization using pos tagger 
lemma_pos_token = tokens.apply(lambda x: lemmatization_using_pos_tagger.pos_tag(x))
print(lemma_pos_token[0])

[[('Coaching', 'Coaching', ['VBG']), ('can', 'can', ['MD']), ('help', 'help', ['VB']), ('our', 'our', ['PRP$']), ('people', 'people', ['NNS']), ('to', 'to', ['TO']), ('perform', 'perform', ['VB']), ('at', 'at', ['IN']), ('their', 'their', ['PRP$']), ('best', 'best', ['JJS']), (',', ',', [',']), ('learn', 'learn', ['VBP']), ('from', 'from', ['IN']), ('their', 'their', ['PRP$']), ('own', 'own', ['JJ']), ('experiences', 'experience', ['NNS']), (',', ',', [',']), ('and', 'and', ['CC']), ('drive', 'drive', ['JJ']), ('empowerment', 'empowerment', ['NN']), ('and', 'and', ['CC']), ('engagement', 'engagement', ['NN']), ('.', '.', ['.'])], [('Yet', 'Yet', ['RB']), (',', ',', [',']), ('we', 'we', ['PRP']), ('regularly', 'regularly', ['VBP']), ('fall', 'fall', ['DT']), ('back', 'back', ['RB']), ('on', 'on', ['IN']), ('old', 'old', ['JJ']), ('habits', 'habit', ['NNS']), ('of', 'of', ['IN']), ('telling', 'tell', ['VBG']), ('instead', 'instead', ['RB']), ('of', 'of', ['IN']), ('asking', 'ask', ['VBG'

In [6]:
#creating and testing lemmatizing function

def lemmatize_to_string(value):
    string_list = []
    for i in range(len(value)):
        string_list.extend([x[1] for x in value[i]])
    
    
    return " ".join(string_list)
        
lemmatize_to_string(lemma_pos_token[0])

'Coaching can help our people to perform at their best , learn from their own experience , and drive empowerment and engagement . Yet , we regularly fall back on old habit of tell instead of ask , mentor instead of coaching , and direct instead of lead . In this class , we ’ ll delineate the difference between mentor and coaching , introduce a simple framework for a 10 minute coach conversation , and establish a new habit to reinforce regular coaching conversation with your team . At the end of this course , participant will be able to : Determine when and how to use coach question to support others development and performance , Facilitate a coach conversation in 10 minute or less , Effectively establish a new habit'

In [7]:
# we can see the above lemmatizing approach isn't perfect, applying to all rows

lemmatized_descriptions = lemma_pos_token.apply(lambda x: lemmatize_to_string(x))

In [8]:
# basic text preparation for NLP

# converting to lowercase
class_data['Lemma_Processed'] = preprocessing.lowercase(lemmatized_descriptions)
class_data['Stem_Processed'] = preprocessing.lowercase(class_data['Description Text'])

# stripping punctuation
class_data['Lemma_Processed'] = preprocessing.strip_punctuation(class_data['Lemma_Processed'])
class_data['Stem_Processed'] = preprocessing.strip_punctuation(class_data['Stem_Processed'])

# removing stopwords
class_data['Lemma_Processed'] = preprocessing.remove_english_stopwords(class_data['Lemma_Processed'])
class_data['Stem_Processed'] = preprocessing.remove_english_stopwords(class_data['Stem_Processed'])

# stemming
class_data['Stem_Processed'] = preprocessing.stem(class_data['Stem_Processed'])

print()
print()
print(class_data['Description Text'][0])
print()
print(class_data['Lemma_Processed'][0])
print()
print(class_data['Stem_Processed'][0])

converting to lower case...
converting to lower case...
removing English stopwords...
removing English stopwords...
stemming words...


Coaching can help our people to perform at their best, learn from their own experiences, and drive empowerment and engagement. Yet, we regularly fall back on old habits of telling instead of asking, mentoring instead of coaching, and directing instead of leading. In this class, we’ll delineate the difference between mentoring and coaching, introduce a simple frameworks for a 10 minutes coaching conversation, and establish a new habit to reinforce regular coaching conversations with your team. At the end of this course, participants will be able to: Determine when and how to use coaching questions to support others development and performance, Facilitate a coaching conversation in 10 minutes or less, Effectively establish a new habit

coaching help people perform best learn experience drive empowerment engagement yet regularly fall back old habit tell i

In [9]:
# evaluating model ranges to determine best topic model

import time
from collections import defaultdict

grid = defaultdict(list)
parameter_list = range(2,8)
eval_columns = ['Number_Of_Topics', 'Lemma_Coherence', 'Stem_Coherence']

# getting kfold splits

from sklearn.model_selection import KFold # import KFold
kf = KFold(n_splits=5, shuffle=True)

In [10]:
# evaluating kfold splits

from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
import re
import warnings

eval_df = pd.DataFrame(columns=eval_columns)

for train_index, test_index in kf.split(class_data):
    
    # getting train/test split for the fold
    train_data = class_data.iloc[train_index]
    test_data = class_data.iloc[test_index]
    
    # creating text lists
    stemmed_train_text = []
    lemma_train_text = []
    stemmed_test_text = []
    lemma_test_text = []
    
    for row in train_data.iterrows():
        stem_val = str(row[1]['Stem_Processed'])
        lemma_val = str(row[1]['Lemma_Processed'])
        stemmed_train_text.append(re.split(r'\s+', stem_val))
        lemma_train_text.append(re.split(r'\s+', lemma_val))
        
    for row in test_data.iterrows():
        stem_val = str(row[1]['Stem_Processed'])
        lemma_val = str(row[1]['Lemma_Processed'])
        stemmed_test_text.append(re.split(r'\s+', stem_val))
        lemma_test_text.append(re.split(r'\s+', lemma_val))
        
    # creating dictionaries
    stem_dictionary = corpora.Dictionary(stemmed_train_text)
    lemma_dictionary = corpora.Dictionary(lemma_train_text)
    
    # creating corpuses
    stem_corpus = [stem_dictionary.doc2bow(stem) for stem in stemmed_train_text]
    lemma_corpus = [lemma_dictionary.doc2bow(lemma) for lemma in lemma_train_text]
    
    # building models for parameter values
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        
        for param in parameter_list:
            
            # building models
            print("Building topic models for {0} topics".format(str(param)))
            start_time = time.time()
            stem_model = models.ldamodel.LdaModel(corpus=stem_corpus, id2word=stem_dictionary, num_topics=param, passes=10, random_state = 1)
            lemma_model = models.ldamodel.LdaModel(corpus=lemma_corpus, id2word=lemma_dictionary, num_topics=param, passes=10, random_state = 1)
            elapsed = time.time() - start_time
            #print("Elapsed time: %s" % elapsed)
            
            # getting coherence values
            stem_cm = CoherenceModel(model=stem_model, texts=stemmed_train_text, dictionary=stem_dictionary, coherence='c_v')
            lemma_cm = CoherenceModel(model=lemma_model, texts=lemma_train_text, dictionary=lemma_dictionary, coherence='c_v')
            stem_coherence = stem_cm.get_coherence()
            lemma_coherence = lemma_cm.get_coherence()
        
            # adding to eval data frame
            eval_df.loc[len(eval_df)] = [param, lemma_coherence, stem_coherence]

print(eval_df)

Building topic models for 2 topics
Building topic models for 3 topics
Building topic models for 4 topics
Building topic models for 5 topics
Building topic models for 6 topics
Building topic models for 7 topics
Building topic models for 2 topics
Building topic models for 3 topics
Building topic models for 4 topics
Building topic models for 5 topics
Building topic models for 6 topics
Building topic models for 7 topics
Building topic models for 2 topics
Building topic models for 3 topics
Building topic models for 4 topics
Building topic models for 5 topics
Building topic models for 6 topics
Building topic models for 7 topics
Building topic models for 2 topics
Building topic models for 3 topics
Building topic models for 4 topics
Building topic models for 5 topics
Building topic models for 6 topics
Building topic models for 7 topics
Building topic models for 2 topics
Building topic models for 3 topics
Building topic models for 4 topics
Building topic models for 5 topics
Building topic model

In [11]:
eval_df.head()

Unnamed: 0,Number_Of_Topics,Lemma_Coherence,Stem_Coherence
0,2.0,0.257453,0.292505
1,3.0,0.30659,0.321425
2,4.0,0.315804,0.310355
3,5.0,0.355308,0.326782
4,6.0,0.36867,0.282336


In [12]:
avg_eval_df = eval_df \
.groupby('Number_Of_Topics')[['Lemma_Coherence', 'Stem_Coherence']] \
.agg('mean')

avg_eval_df['diff'] = avg_eval_df['Lemma_Coherence'] - avg_eval_df['Stem_Coherence']
avg_eval_df

Unnamed: 0_level_0,Lemma_Coherence,Stem_Coherence,diff
Number_Of_Topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,0.329393,0.341838,-0.012445
3.0,0.318266,0.33517,-0.016903
4.0,0.343974,0.350478,-0.006504
5.0,0.345092,0.375076,-0.029983
6.0,0.349637,0.384341,-0.034704
7.0,0.343062,0.365232,-0.02217


In [13]:
stemmed_text = []

for row in class_data.iterrows():
    stem_val = str(row[1]['Stem_Processed'])
    stemmed_text.append(re.split(r'\s+', stem_val))
        
# creating dictionaries
stem_dictionary = corpora.Dictionary(stemmed_text)

# creating corpuses
stem_corpus = [stem_dictionary.doc2bow(stem) for stem in stemmed_text]

In [14]:
# build model
model2 = models.ldamodel.LdaModel(corpus=stem_corpus, id2word=stem_dictionary, num_topics=2, passes=50, random_state = 1)


In [15]:
# build model Anny
model6 = models.ldamodel.LdaModel(corpus=stem_corpus, id2word=stem_dictionary, num_topics=6, passes=50, random_state = 1)


In [16]:
# build model Anny
model7 = models.ldamodel.LdaModel(corpus=stem_corpus, id2word=stem_dictionary, num_topics=7, passes=50, random_state = 1)


In [17]:
import pyLDAvis
import pyLDAvis.gensim as gensimvis

vis_data = gensimvis.prepare(model7, stem_corpus, stem_dictionary)
#pyLDAvis.display(vis_data)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
pyLDAvis.display(vis_data)

In [19]:
vis_data6 = gensimvis.prepare(model6, stem_corpus, stem_dictionary)
pyLDAvis.display(vis_data6)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [20]:
vis_data2 = gensimvis.prepare(model2, stem_corpus, stem_dictionary)
pyLDAvis.display(vis_data2)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
model7.print_topics(num_words=5)

[(0,
  '0.046*"data" + 0.020*"cours" + 0.016*"use" + 0.016*"learn" + 0.015*"big"'),
 (1,
  '0.044*"aw" + 0.037*"cours" + 0.017*"exam" + 0.015*"certif" + 0.012*"provid"'),
 (2,
  '0.024*"cours" + 0.021*"learn" + 0.021*"applic" + 0.019*"deep" + 0.015*"build"'),
 (3,
  '0.051*"learn" + 0.026*"machin" + 0.016*"understand" + 0.015*"cours" + 0.011*"practic"'),
 (4,
  '0.019*"use" + 0.015*"system" + 0.014*"perform" + 0.013*"feedback" + 0.011*"comput"'),
 (5,
  '0.037*"cloud" + 0.032*"cours" + 0.018*"servic" + 0.015*"comput" + 0.014*"aw"'),
 (6,
  '0.048*"googl" + 0.043*"data" + 0.041*"cloud" + 0.023*"platform" + 0.021*"cours"')]

In [None]:
# writing to file with single column for use with comprehend

from io import StringIO
s3_client = boto3.client('s3')

output_data = class_data['Processed']
output_data_buffer = StringIO(output_data.to_csv(header = False, index = False))
s3_client.put_object(Bucket='ml-personalization', Key='data/comprehend_topic_data/comprehend_data.csv', Body=output_data_buffer.read(), StorageClass='STANDARD')