## Ali's code

It seems that the way this link computes the weights of each topic in each document is not the optimal way (as it will not give you a weight if it is very small, but in big models, these small weights can also become important). Therefore instead, you can follow the following simple approach to get the weights:
To get the weights of a topic in the ith document, you call the method get_document_topics of the topic model object and pass the ith element of M1 (or tfidf_M1 in case you use tfidf) to it in the following way (minimum_probability=0 should be set so that none of near to zero entries are not removed) : 
lda_model.get_document_topics(M1[i],minimum_probability=0)
So let us see it in a complete example we had before: 

which gives the output (topics and weights of words in topics):

In [1]:
corpus = ["cryptography can be used for preventing data leakage in computer security",
"supervised learning and unsupervised learning are the two main groups of methods in machine learning",
"while in supervised learning we have access to the target variable in unsupervised learning we do not have such a variable",
"there are some methods in security for reducing the risk of information leakage like authentication and cryptography",
"topic modeling in an unsupervised machine learning model and therefore we do not have target variables"
]
stop_words = ["can","be","for","two","the","for","we","in","not","do","are","to","an","there","some","have","a","and","of","like","while","therefore","such"]

def clean_doc(doc):
    return " ".join([word for word in doc.lower().split() if word not in stop_words])

corpus_clean = [clean_doc(doc) for doc in corpus]
corpus_clean = [doc.split() for doc in corpus_clean]
from gensim import corpora
dictionary = corpora.Dictionary(corpus_clean)
dictionary.doc2bow(corpus_clean[2])
M1 = [dictionary.doc2bow(doc) for doc in corpus_clean]
import gensim
Lda = gensim.models.ldamodel.LdaModel
lda_model = Lda(M1, num_topics=2, id2word = dictionary, passes=5,random_state =0)
topics = lda_model.print_topics(num_topics=5, num_words=10)
for topic in topics:
    print(topic)

(0, '0.084*"learning" + 0.071*"cryptography" + 0.071*"security" + 0.071*"leakage" + 0.070*"methods" + 0.043*"computer" + 0.043*"data" + 0.043*"used" + 0.042*"preventing" + 0.042*"authentication"')
(1, '0.136*"learning" + 0.093*"unsupervised" + 0.082*"target" + 0.080*"variable" + 0.058*"machine" + 0.057*"supervised" + 0.050*"topic" + 0.050*"variables" + 0.050*"modeling" + 0.050*"model"')


Now, the following call gives us the weight of the two topics in the first document:

In [2]:
lda_model.get_document_topics(M1[0],minimum_probability=0)

[(0, 0.9348568), (1, 0.06514323)]

And here is a complete working code with tfidf :
Note that as we expect, the weight should be different here as the model has been changed. 


In [6]:
corpus = ["cryptography can be used for preventing data leakage in computer security",
"supervised learning and unsupervised learning are the two main groups of methods in machine learning",
"while in supervised learning we have access to the target variable in unsupervised learning we do not have such a variable",
"there are some methods in security for reducing the risk of information leakage like authentication and cryptography",
"topic modeling in an unsupervised machine learning model and therefore we do not have target variables"
]
stop_words = ["can","be","for","two","the","for","we","in","not","do","are","to","an","there","some","have","a","and","of","like","while","therefore","such"]

def clean_doc(doc):
    return " ".join([word for word in doc.lower().split() if word not in stop_words])

corpus_clean = [clean_doc(doc) for doc in corpus]
corpus_clean = [doc.split() for doc in corpus_clean]
from gensim import corpora
dictionary = corpora.Dictionary(corpus_clean)
dictionary.doc2bow(corpus_clean[2])
M1 = [dictionary.doc2bow(doc) for doc in corpus_clean]
import gensim
tfidf_model = gensim.models.TfidfModel(M1)
tfidf_M1 = tfidf_model[M1]
Lda = gensim.models.ldamodel.LdaModel
lda_model = Lda(tfidf_M1, num_topics=2, id2word = dictionary, passes=5,random_state =0)
topics = lda_model.print_topics(num_topics=5, num_words=10)
for topic in topics:
    print(topic)
lda_model.get_document_topics(tfidf_M1[0],minimum_probability=0)

(0, '0.058*"variable" + 0.055*"learning" + 0.046*"methods" + 0.045*"main" + 0.045*"cryptography" + 0.045*"groups" + 0.045*"security" + 0.045*"leakage" + 0.045*"supervised" + 0.043*"computer"')
(1, '0.060*"topic" + 0.059*"variables" + 0.059*"modeling" + 0.059*"model" + 0.050*"target" + 0.049*"machine" + 0.043*"learning" + 0.042*"unsupervised" + 0.035*"variable" + 0.035*"supervised"')


[(0, 0.83430034), (1, 0.16569969)]

# Our code

In [51]:
import nltk
import gensim
import re
from gensim import corpora
from gensim.parsing.preprocessing import *
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
#nltk.download('stopwords')      # If this code block complains, comment out this line
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
from nltk.corpus import stopwords
import pyLDAvis.gensim

  and should_run_async(code)


In [16]:
def readFile(filename):
    fileData = ""
    file = open("abstracts/" + filename, 'r', encoding='utf8')
    fileData = file.read().splitlines()
    file.close
    return fileData

  and should_run_async(code)


In [25]:
corpus = []


for i in range (2012,2020):
    conference = readFile('usenix' + str(i) + 'Abstracts.txt')
    for abstract in conference:
        corpus.append(abstract)
        
print(corpus)



  and should_run_async(code)


### Preprocessing of the corpus

Preprocess the raw corpues into a list of words for each document

In [52]:
stop_words = stopwords.words('english')


lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None


def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            # lemmatized_sentence.append(word)
            pass# This part is modified so that we will just have ADJ VERB NOUN ADVERB remained 
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

CUSTOM_FILTERS = [
    strip_tags,                            # Remove HTML tags and such
    strip_punctuation,                     # Remove punctuation
    strip_multiple_whitespaces,            # Remove unecessary space and such
    strip_numeric,                         # Remove numbers
    lambda x: strip_short(x, 2),           # Remove words shorter than 2 chars
    lambda y: y.replace('“', ''),          # Removes quation marks
    lambda z: z.replace('”', ''),          # Removes quation marks
    lambda a: a.lower(),                   # Makes the corpus lowercase
    lambda z: remove_stopwords(z)          # Removes stopwords 
]

def clean_doc(doc):
    lematized_doc = lemmatize_sentence(doc)
    cleanDoc = preprocess_string(lematized_doc, CUSTOM_FILTERS)
    return cleanDoc

corpus_clean = [clean_doc(doc) for doc in corpus]

  and should_run_async(code)


Contiune the preprocessing, now with the list of words. Creating bigrams

In [53]:
bigram = gensim.models.Phrases(corpus_clean, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(corpus):
    return [bigram_mod[doc] for doc in corpus]

  and should_run_async(code)


In [54]:
make_bigrams(corpus_clean)

  and should_run_async(code)


[['online',
  'sale',
  'counterfeit',
  'unauthorized',
  'product',
  'drive',
  'robust',
  'underground',
  'advertising',
  'industry',
  'include',
  'email',
  'spam',
  'black',
  'hat',
  'search_engine',
  'optimization',
  'forum',
  'abuse',
  'virtually',
  'encounter',
  'enticement',
  'purchase',
  'drug',
  'prescription',
  'free',
  'online',
  'canadian',
  'pharmacy',
  'site',
  'clearly',
  'economically',
  'motivate',
  'shape',
  'underlie',
  'business',
  'enterprise',
  'understood',
  'precisely',
  'underground',
  'paper',
  'exploit',
  'rare',
  'opportunity',
  'view',
  'organizations—the',
  'glavmed',
  'spamit',
  'rx',
  'promotion',
  'pharmaceutical',
  'affiliate',
  'inside',
  'ground_truth',
  'data',
  'set',
  'include',
  'year',
  'raw',
  'transaction',
  'log',
  'cover',
  'sale',
  'provide',
  'depth',
  'empirical',
  'analysis',
  'worldwide',
  'consumer',
  'demand',
  'key',
  'role',
  'independent',
  'party',
  'advertiser'

Splitting the corpus and making it into a dictionary

In [26]:
#corpus_clean = [doc.split() for doc in corpus_clean]
dictionary = corpora.Dictionary(corpus_clean)
dictionary.doc2bow(corpus_clean[2])
M1 = [dictionary.doc2bow(doc) for doc in corpus_clean]

  and should_run_async(code)


Making the tfidf model

In [27]:
tfidf_model = gensim.models.TfidfModel(M1)
tfidf_M1 = tfidf_model[M1]

  and should_run_async(code)


Setting upp the lda model

In [37]:

Lda = gensim.models.ldamodel.LdaModel
lda_model = Lda(tfidf_M1, num_topics=5, id2word = dictionary, passes=5, random_state =0)

  and should_run_async(code)


### All topics

In [38]:
topics = lda_model.print_topics(num_topics=5, num_words=10)

for topic in topics:
    print(topic)


(0, '0.001*"data" + 0.001*"security" + 0.001*"attacks" + 0.001*"code" + 0.001*"privacy" + 0.001*"memory" + 0.001*"kernel" + 0.001*"user" + 0.001*"access" + 0.001*"web"')
(1, '0.001*"attacks" + 0.001*"software" + 0.001*"malware" + 0.001*"security" + 0.001*"vulnerabilities" + 0.001*"attack" + 0.001*"cache" + 0.001*"analysis" + 0.001*"code" + 0.001*"control"')
(2, '0.001*"privacy" + 0.001*"web" + 0.001*"malware" + 0.001*"devices" + 0.001*"security" + 0.001*"attack" + 0.001*"data" + 0.001*"attacks" + 0.001*"users" + 0.001*"network"')
(3, '0.001*"attack" + 0.001*"attacks" + 0.001*"privacy" + 0.001*"users" + 0.001*"security" + 0.001*"network" + 0.001*"data" + 0.001*"censorship" + 0.001*"user" + 0.001*"password"')
(4, '0.001*"attacks" + 0.001*"memory" + 0.001*"security" + 0.001*"malware" + 0.001*"apps" + 0.001*"data" + 0.001*"password" + 0.001*"user" + 0.001*"analysis" + 0.001*"network"')


  and should_run_async(code)


### Visualisation of topics

In [39]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, tfidf_M1, dictionary)
vis

  and should_run_async(code)


### Topics per document

In [40]:
lda_model.get_document_topics(tfidf_M1[0],minimum_probability=0)

  and should_run_async(code)


[(0, 0.02198147),
 (1, 0.021915898),
 (2, 0.022334285),
 (3, 0.021857508),
 (4, 0.9119108)]

In [141]:
s = "and I said \"Hello Bill\" "

  and should_run_async(code)


In [170]:
print(re.sub('"', '', s))
print(s)

and I said Hello Bill 
and I said "Hello Bill" 


  and should_run_async(code)
