In [18]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

In [2]:
sample1 = "Our board of directors boasts 11 seasoned technology and business leaders from Adobe, GSK, HGGC and more."
sample2 = "Our executives lead by example and guide us to accomplish great things every day."
sample3 = "Working at Pluralisght means being surrounded by smart, passionate people who inspire us to do our best work."
sample4 = "A leadership team with vision."
sample5 = "Courses on cloud, microservices, machine learning, security, Agile and more."
sample6 = "Interactive courses and projects."
sample7 = "Personalized course recommendations from Iris."
sample8 = "We’re excited to announce that Pluralsight has ranked #9 on the Great Place to Work 2018, Best Medium Workplaces list!"
sample9 = "Few of the job opportunities include Implementation Consultant - Analytics, Manager - assessment production, Chief Information Officer, Director of Communications."

# compile documents
compileddoc = [sample1, sample2, sample3, sample4, sample5, sample6, sample7, sample8, sample9] 

In [5]:
stopword_list = stopwords.words("english")
punc_list = punctuation
lemma = WordNetLemmatizer()

In [12]:
def clean(document):
    tokenized_doc = word_tokenize(document.lower())
    stopword_removed = [word for word in tokenized_doc if word not in stopword_list]
    punctuation_removed = [word for word in stopword_removed if word not in punc_list]
    normalised = [lemma.lemmatize(word) for word in punctuation_removed]
    return normalised

In [13]:
final_doc = [clean(document) for document in compileddoc]

In [14]:
final_doc

[['board',
  'director',
  'boast',
  '11',
  'seasoned',
  'technology',
  'business',
  'leader',
  'adobe',
  'gsk',
  'hggc'],
 ['executive',
  'lead',
  'example',
  'guide',
  'u',
  'accomplish',
  'great',
  'thing',
  'every',
  'day'],
 ['working',
  'pluralisght',
  'mean',
  'surrounded',
  'smart',
  'passionate',
  'people',
  'inspire',
  'u',
  'best',
  'work'],
 ['leadership', 'team', 'vision'],
 ['course',
  'cloud',
  'microservices',
  'machine',
  'learning',
  'security',
  'agile'],
 ['interactive', 'course', 'project'],
 ['personalized', 'course', 'recommendation', 'iris'],
 ['’',
  'excited',
  'announce',
  'pluralsight',
  'ranked',
  '9',
  'great',
  'place',
  'work',
  '2018',
  'best',
  'medium',
  'workplace',
  'list'],
 ['job',
  'opportunity',
  'include',
  'implementation',
  'consultant',
  'analytics',
  'manager',
  'assessment',
  'production',
  'chief',
  'information',
  'officer',
  'director',
  'communication']]

In [16]:
dictionary = corpora.Dictionary(final_doc)

In [20]:
DT_matrix = [dictionary.doc2bow(doc) for doc in final_doc]
lda_model = LdaModel

In [21]:
model_1 = lda_model(DT_matrix, num_topics=2, id2word = dictionary)
model_1.print_topics(num_topics=2, num_words=5)

[(0,
  '0.046*"course" + 0.022*"iris" + 0.022*"recommendation" + 0.022*"director" + 0.022*"personalized"'),
 (1,
  '0.029*"best" + 0.029*"work" + 0.026*"u" + 0.026*"great" + 0.019*"director"')]

In [22]:
model_2 = lda_model(DT_matrix, num_topics=2, id2word = dictionary)
model_2.print_topics(num_topics=2, num_words=2)

[(0, '0.033*"course" + 0.032*"work"'),
 (1, '0.032*"director" + 0.021*"course"')]