Read list of tags from file

In [1]:
# Load the raw tag list: each line of the file becomes one entry of `content`
# (line endings are kept exactly as the file iterator yields them).
with open("../moocs_tags", "r") as f:
    content = list(f)

Helper Function for cleaning text

In [2]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
# Module-level resources used by clean() below.
lemma = WordNetLemmatizer()              # lemmatizer applied to every token
exclude = set(string.punctuation)        # characters stripped from the text
stop = set(stopwords.words('english'))   # tokens dropped before lemmatizing
def clean(doc):
    """Normalise one raw document string.

    Steps, in order:
      1. lowercase the text and split it on whitespace, dropping every token
         that is a member of ``stop``;
      2. remove every character that is a member of ``exclude``;
      3. lemmatize each remaining whitespace-separated token with ``lemma``.

    Returns the surviving lemmas joined by single spaces.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    without_stopwords = " ".join(kept_tokens)

    # Punctuation is removed character by character, after stop-word filtering.
    without_punctuation = "".join(
        char for char in without_stopwords if char not in exclude
    )

    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)

In [3]:
# Cleaned tag strings, one per entry of `content`, in the same order.
doc_c = list(map(clean, content))
# The same documents as token lists: each cleaned string split on whitespace.
doc_clean = list(map(str.split, doc_c))

Create Document-Term Matrix

In [4]:
# Importing Gensim
import gensim
from gensim import corpora

# Build the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Convert each tokenised document into its bag-of-words form using the
# dictionary above; the resulting list is the document-term matrix.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))

Train LDA model using Document-Term Matrix

In [5]:
# Alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on the document-term matrix: 100 topics, 50 passes over
# the corpus. LDA training is stochastic, so random_state is fixed to make the
# topics (and the clusters derived from them) repeatable across kernel restarts.
ldamodel = Lda(doc_term_matrix, num_topics=100, id2word=dictionary, passes=50,
               random_state=42)

List of topics generated using LDA

In [6]:
# Show five words per topic for every topic. The topic count is read from the
# trained model instead of being repeated as a literal, so this cell cannot
# drift out of sync with the num_topics used at training time.
ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=5)

[(0,
  '0.108*"new" + 0.062*"security" + 0.046*"network" + 0.031*"tech" + 0.031*"indigenous"'),
 (1,
  '0.093*"professional" + 0.054*"dynamic" + 0.040*"self" + 0.040*"blended" + 0.027*"poetry"'),
 (2,
  '0.235*"teach" + 0.211*"english" + 0.196*"moral" + 0.016*"pi" + 0.016*"raspberry"'),
 (3,
  '0.090*"environment" + 0.064*"presentation" + 0.039*"word" + 0.039*"marketing" + 0.039*"informal"'),
 (4,
  '0.309*"nanotechnology" + 0.037*"question" + 0.013*"mathchat" + 0.013*"lm" + 0.013*"photosynthesis"'),
 (5,
  '0.538*"skill" + 0.025*"renewable" + 0.008*"generation" + 0.008*"next" + 0.008*"source"'),
 (6,
  '0.101*"french" + 0.076*"instruction" + 0.063*"maker" + 0.051*"junior" + 0.038*"test"'),
 (7,
  '0.315*"education" + 0.170*"physic" + 0.124*"administration" + 0.107*"operation" + 0.107*"civic"'),
 (8,
  '0.302*"general" + 0.108*"literature" + 0.097*"middle" + 0.022*"taxonomy" + 0.022*"open"'),
 (9,
  '0.080*"motion" + 0.064*"architecture" + 0.048*"philippine" + 0.048*"20" + 0.032*"quant

Inspecting the topics assigned to document 0, together with their probabilities

In [7]:
# Bag-of-words form of the first cleaned document, then the topics the model
# assigns to it.
first_doc_bow = dictionary.doc2bow(doc_clean[0])
ldamodel.get_document_topics(first_doc_bow)

[(93, 0.50499999999999978)]

Looping through all documents to get their list of topics. Each document is added to the cluster of its most likely topic, provided that topic's probability is greater than 0.5. The list of clusters is printed at the end.

In [8]:
# Group the cleaned tag strings by their single most probable topic.
#
# One empty bucket per topic of the trained model; a document lands in the
# bucket of its best topic only if that topic's probability exceeds 0.5.
# Documents for which the model returns no topics are skipped.
clusters = [[] for _ in range(ldamodel.num_topics)]
for idx, doc_i in enumerate(doc_clean):
    # Run inference exactly once per document. Calling get_document_topics
    # separately for the topic id and for its probability would both waste
    # work and risk pairing a topic from one inference run with a probability
    # from another.
    topic_scores = ldamodel.get_document_topics(dictionary.doc2bow(doc_i))
    if not topic_scores:
        continue
    bucket, score = max(topic_scores, key=lambda pair: pair[1])
    if score > 0.5:
        clusters[bucket].append(doc_c[idx])
clusters

[['network',
  'gas atmospheric chemistry',
  'new headway',
  'small active cute',
  'indigenous',
  'network security',
  'security',
  'organization',
  'politcs',
  'specie',
  'arduino',
  '46'],
 ['poetry',
  'steam',
  'accounting',
  'dynamic',
  'parent',
  '3d cad',
  'blended',
  'misconception',
  'beforeschool',
  'professional ethic',
  'adhd',
  'transdisciplinarity',
  'museology',
  'dynamic',
  'cad',
  'trigonometry',
  'gender'],
 ['human right responsibility',
  'teenager',
  'raspberry pi',
  'english camp',
  'moral',
  'fgv',
  'learn teach english',
  'raspberry pi platform'],
 ['marketing',
  'human impact environment',
  'word way',
  'transdisciplinary',
  'key stage 3',
  'key stage 4',
  'key stage 5',
  'environment',
  '512',
  'environment',
  'presentation',
  'informal',
  'militar',
  'lengua franca',
  'interesting presentation',
  'esol',
  'superlogo',
  'presentation'],
 ['photosynthesis',
  'discourse',
  'biophysics',
  'metalwork',
  'ibpyp',
