# NSU Distributional Semantics 2019 Course. Seminar 3

Our course repo: https://github.com/disemantics/course2019

![](http://imgur.com/S8WgwBp.png)

In this seminar, we will learn how to implement and use topic modeling algorithms. We will consider **LSA** (latent semantic analysis), **PLSA** (probabilistic latent semantic analysis) and **LDA** (latent Dirichlet allocation).

## Reading data
First, we need to load a dataset and select the text columns from it.

In [None]:
import pandas as pd

In [None]:
# Read the emails CSV from the notebook's input directory into a pandas DataFrame.
data_path = "../input/Emails.csv"
data = pd.read_csv(data_path)

In [None]:
data.sample(5)

In [None]:
print(f"Number of Emails: {data.shape[0]}")

## Preprocessing data

We select only the main text column (ExtractedBodyText) without NaNs (empty emails).

In [None]:
# Keep only the rows whose ExtractedBodyText is not null, then show the
# body text of 5 randomly sampled rows as a sanity check.
data = data[pd.notnull(data['ExtractedBodyText'])]
print(data.sample(5)['ExtractedBodyText'])

In [None]:
print(f"Number of Emails: {data.shape[0]}")

In the next step, we need to remove punctuation and stopwords from our data.

In [None]:
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [None]:
# The pattern \w+ yields runs of word characters as tokens, so punctuation
# and whitespace are dropped. Emails are lower-cased before tokenisation.
tokenizer = RegexpTokenizer(r'\w+')
texts = [tokenizer.tokenize(email.lower()) for email in data['ExtractedBodyText']]

In [None]:
print(texts[5070])

In [None]:
def delete_stopwords(tokenized_sentence: list):
    """Return the tokens of ``tokenized_sentence`` that are not in ``stop_words``.

    The relative order of the remaining tokens is preserved and a new list
    is returned; the input is not modified.
    """
    return [token for token in tokenized_sentence if token not in stop_words]

In [None]:
# Strip stopwords from every email, then keep only the emails that still
# have more than 5 tokens left.
texts = list(filter(lambda x: len(x) > 5, [delete_stopwords(text) for text in texts]))

In [None]:
print(f"Number of Emails: {len(texts)}")

Now it's time to convert texts to bag-of-words format.

In [None]:
from gensim import corpora

In [None]:
corpora_dict = corpora.Dictionary(texts)

In [None]:
print(list(corpora_dict.token2id.items())[::500])

In [None]:
# Look up the token with id 500.
# NOTE(review): in gensim, Dictionary.id2token is built lazily by
# Dictionary.__getitem__, so this lookup is also what populates
# `corpora_dict.id2token` for the cells below that read it directly —
# confirm against the installed gensim version.
corpora_dict[500]

In [None]:
corpora_dict.id2token[500]

In [None]:
# Convert every tokenised email to its bag-of-words form via the dictionary.
corpus = [corpora_dict.doc2bow(text) for text in texts]

In [None]:
print(corpus[0])

## LSI (LSA)

In [None]:
from gensim.models import LsiModel

In [None]:
# Fit an LSI model with 10 topics on the bag-of-words corpus.
# NOTE(review): `id2token` is presumably empty until the dictionary has been
# indexed at least once (e.g. `corpora_dict[0]`) — confirm before running
# this cell in isolation.
model_lsi = LsiModel(corpus, id2word=corpora_dict.id2token, num_topics=10)

In [None]:
# print_topics() yields (topic_number, formatted_string) pairs; only the
# formatted string is needed here.
str_topics = [formatted for _, formatted in model_lsi.print_topics()]
# Each formatted topic is a "+"-separated list of weight*"word" terms:
# take the part after "*", trim whitespace and drop the surrounding quotes.
str_topics_split = [
    [term.split("*")[1].strip()[1:-1] for term in formatted.split("+")]
    for formatted in str_topics
]

In [None]:
for topic in str_topics_split:
    print(topic)

## LDA

In [None]:
from gensim import matutils
from gensim.models.ldamodel import LdaModel

In [None]:
# Fit an LDA model with 10 topics, making 20 passes over the corpus.
# NOTE(review): `id2token` is presumably empty until the dictionary has been
# indexed at least once (e.g. `corpora_dict[0]`) — confirm before running
# this cell in isolation.
model_lda = LdaModel(corpus, passes=20, num_topics=10, id2word=corpora_dict.id2token)

In [None]:
# print_topics() yields (topic_number, formatted_string) pairs; only the
# formatted string is needed here.
str_topics = [formatted for _, formatted in model_lda.print_topics()]
# Each formatted topic is a "+"-separated list of weight*"word" terms:
# take the part after "*", trim whitespace and drop the surrounding quotes.
str_topics_split = [
    [term.split("*")[1].strip()[1:-1] for term in formatted.split("+")]
    for formatted in str_topics
]

# Show the words of every topic, one topic per line.
for topic in str_topics_split:
    print(topic)

In [None]:
import pyLDAvis

In [None]:
import pyLDAvis.gensim

# Build the pyLDAvis visualisation data from the trained LDA model, the
# bag-of-words corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases appear to have renamed this module to
# `pyLDAvis.gensim_models` — confirm against the installed version.
data_lda = pyLDAvis.gensim.prepare(model_lda, corpus, corpora_dict)

In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.display(data_lda)

# Homework (10 points)
## PLSA
Implement a PLSA model with gensim-like interface.

In [None]:
from gensim.matutils import corpus2dense
import numpy as np
np.random.seed(42)

This pseudocode of the EM algorithm is taken from here (http://www.machinelearning.ru/wiki/images/8/88/Voron-iip9-talk.pdf) (p. 10):

1. initialize $\mathbf{\Phi}$ and $\mathbf{\Theta}$ so that $\forall z \in \mathcal{Z}\,\,\, \sum \limits_{w \in \mathcal{W}} \Phi_{w, z} = 1,\,\Phi_{w, z} \geq 0,\,\, \forall d \in \mathcal{D} 
    \sum \limits_{z \in \mathcal{Z}} \Theta_{z, d} = 1,\, \Theta_{z, d} \geq 0.$ 
2. $passes$ times repeat
3. $\quad \forall w \in \mathcal{W}, z \in \mathcal{Z}, d \in \mathcal{D}\,\,\, n_{wz} := 0,\,n_{zd} := 0,\,n_z := 0$.
4. $\quad \forall d \in \mathcal{D},\,w \in d$:
5. $\quad \quad Z_w = \sum\limits_{z \in \mathcal{Z}} \Phi_{w,z} \Theta_{z,d}$
6. $\quad \quad \forall z \in \mathcal{Z}$ such that  $\Phi_{w,z} \Theta_{z,d} > 0$
7. $\quad \quad \quad$ add $\frac{n_{wd}}{Z_w} \Phi_{w,z} \Theta_{z,d}$ to $\,n_{wz},\,n_{zd} ,\,n_z$
8. $\quad \forall w \in \mathcal{W}, z \in \mathcal{Z}\,\, \Phi_{w,z} := n_{wz} / n_{z} $
9. $\quad \forall d \in \mathcal{D}, z \in \mathcal{Z}\,\, \Theta_{z,d} := n_{zd} / n_{d} $

In [None]:
class PlsaModel:
    """PLSA topic model fitted with the EM algorithm (gensim-like interface).

    The model is trained inside the constructor by running ``passes``
    EM iterations over the whole corpus.

    Attributes:
        phi: array indexed ``[word][topic]``; every column is the word
            distribution of one topic and sums to 1.
        theta_t: array indexed ``[document][topic]`` (transposed Theta);
            every row is the topic distribution of one document and
            sums to 1.
        n_wd: dense count matrix indexed ``[word][document]``.
        n_d: total number of word occurrences per document.
        n: total number of word occurrences in the corpus.
    """

    def __init__(self, corpus=None, id2word=None, num_topics=10, passes=30):
        """Initialise Phi and Theta randomly and run the EM iterations.

        Args:
            corpus: sized sequence of bag-of-words documents, each a list
                of ``(word_id, count)`` pairs.
            id2word: sized mapping from word id to token; its length
                defines the vocabulary size.
            num_topics: number of topics to fit.
            passes: number of EM iterations to run over the corpus.

        Raises:
            TypeError: if ``corpus`` or ``id2word`` is not provided.
        """
        if corpus is None or id2word is None:
            # Fail with a clear message instead of an opaque len(None) error.
            raise TypeError("PlsaModel requires both `corpus` and `id2word`")

        self.passes = passes

        self.num_topics = num_topics
        self.num_documents = len(corpus)
        self.num_words = len(id2word)

        self.id2word = id2word

        self.n_wd = corpus2dense(corpus, num_terms=self.num_words)  # [word][document]
        self.n_d = np.sum(self.n_wd, axis=0)
        self.n = np.sum(self.n_d)

        # Random non-negative start, normalised so that every topic (column
        # of phi) and every document (row of theta_t) is a distribution.
        self.phi = np.random.random_sample(size=(self.num_words, self.num_topics))
        self.phi /= np.sum(self.phi, axis=0)
        self.theta_t = np.random.random_sample(size=(self.num_documents, self.num_topics))
        self.theta_t /= np.sum(self.theta_t, axis=1)[:, None]

        for _ in range(self.passes):
            self._fit()

    def _fit(self):
        """Run one EM pass over the corpus, updating ``phi`` and ``theta_t``.

        E-step: for every document ``d`` and every word ``w`` occurring in
        it, the count ``n_wd`` is distributed over topics proportionally to
        ``phi[w, z] * theta_t[d, z]``.
        M-step: ``phi`` and ``theta_t`` are re-estimated from the
        accumulated expected counts.

        Topics that received no mass and documents without words keep
        their previous parameters, which avoids division by zero.
        """
        n_wz = np.zeros_like(self.phi)      # expected counts, [word][topic]
        n_zd = np.zeros_like(self.theta_t)  # expected counts, [document][topic]

        for d in range(self.num_documents):
            doc_counts = self.n_wd[:, d]
            words = np.flatnonzero(doc_counts)
            if words.size == 0:
                continue

            # Unnormalised p(z | d, w) for the words present in the document.
            joint = self.phi[words] * self.theta_t[d]
            norm = joint.sum(axis=1, keepdims=True)
            # Rows with zero norm are all-zero already and are left as is.
            np.divide(joint, norm, out=joint, where=norm > 0)

            expected = joint * doc_counts[words][:, None]
            n_wz[words] += expected
            n_zd[d] = expected.sum(axis=0)

        n_z = n_wz.sum(axis=0)
        live_topics = n_z > 0
        self.phi[:, live_topics] = n_wz[:, live_topics] / n_z[live_topics]

        nonempty_docs = self.n_d > 0
        self.theta_t[nonempty_docs] = (
            n_zd[nonempty_docs] / self.n_d[nonempty_docs][:, None]
        )

    def print_topics(self, top_n=10):
        """Return, for every topic, its ``top_n`` most probable words.

        Returns:
            A list with one entry per topic; each entry is a list of tokens
            ordered from the most to the least probable.
        """
        res = []
        for t in range(self.num_topics):
            # argsort is ascending: take the last top_n ids and reverse them.
            top_inds = self.phi[:, t].argsort()[-top_n:][::-1]
            top_words = [self.id2word[x] for x in top_inds]
            res.append(top_words)
        return res

In [None]:
# Fit PLSA with 10 topics and 10 EM passes (training happens in the constructor).
# NOTE(review): `id2token` is presumably empty until the dictionary has been
# indexed at least once (e.g. `corpora_dict[0]`) — confirm before running
# this cell in isolation.
model_plsa = PlsaModel(corpus, passes=10, num_topics=10, id2word=corpora_dict.id2token)

In [None]:
for topic in model_plsa.print_topics():
    print(topic)