In [None]:
# Reload imported modules automatically before each cell runs, so that edits
# to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Optional: uncomment the magic below for interactive figures.
# %matplotlib notebook

In [None]:
# Make plots look nice
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")

# Import generic libraries
import numpy as np
import numpy.random as npr
import pickle as pkl # needed to save and load python objects as byte strings.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the private `IPython.core.display` path is deprecated for these names.
from IPython.display import display, HTML # needed to display HTML text
import time

# Import sklearn tools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Finding topics in machine learning papers

JMLR is one of the main journals in machine learning. In this practical session, we'll look at abstracts of papers from JMLR, and we'll use a topic model to automatically extract topics.

In [None]:
# Read the pickled corpus: a list of Python dictionaries.
# NOTE(review): pickle can execute arbitrary code on load; only use a trusted data file.
with open("../../data/jmlr.pkl", 'rb') as f:
    jmlr_papers = pkl.load(f)

number_of_papers = len(jmlr_papers)
print(f"Just loaded {number_of_papers} papers from JMLR.")

In [None]:
# Each entry of the list is a Python dictionary, and corresponds to an abstract.
# Show the first entry to see which fields a paper record carries.
jmlr_papers[0]

In [None]:
# Pick a paper at random and render its abstract as HTML.
# No seed is set, so a different abstract is shown on each run.
index = npr.randint(number_of_papers)
random_paper = jmlr_papers[index]
display(HTML(random_paper['abstract']))

## Looking at data
Let's try to isolate the papers on Bayesian ML and on neural networks, to see what proportion of JMLR each topic actually represents.

In [None]:
# Keyword lists used to tag papers; a paper matches a topic when any keyword
# occurs in its abstract (case-sensitive substring test).
bayesian_keywords = ["Bayesian", "variational Bayes", "Carlo", "MCMC"]
neural_network_keywords = ["neural net", "Neural net", "deep", "Deep"]

# The two selections are independent: a paper may appear in both lists.
bayesian_jmlr_papers = [
    paper for paper in jmlr_papers
    if any(kwd in paper["abstract"] for kwd in bayesian_keywords)
]
neural_network_jmlr_papers = [
    paper for paper in jmlr_papers
    if any(kwd in paper["abstract"] for kwd in neural_network_keywords)
]

# Report how many papers each keyword filter selected.
number_of_jmlr_papers = len(jmlr_papers)
number_of_Bayesian_jmlr_papers = len(bayesian_jmlr_papers)
number_of_neural_network_jmlr_papers = len(neural_network_jmlr_papers)
print("There are", str(number_of_neural_network_jmlr_papers)+" neural network papers out of", number_of_jmlr_papers)
print("There are", str(number_of_Bayesian_jmlr_papers)+" Bayesian papers out of", number_of_jmlr_papers)

In [None]:
# "Bayesian network" papers matched the "Bayesian" keyword, but they may not be
# about Bayesian statistics; count how many of the selected papers mention them.
mentions_bayesian_network = [
    "Bayesian network" in paper["abstract"] for paper in bayesian_jmlr_papers
]
print("Out of which the number of papers about Bayesian networks is")
print(np.sum(mentions_bayesian_network))

In [None]:
# Plot the number of Bayesian papers (red) on top of all papers (blue), per JMLR volume
all_volumes = [int(paper["volume"]) for paper in jmlr_papers]
bayesian_volumes = [int(paper["volume"]) for paper in bayesian_jmlr_papers]

# Bin edges sit half a unit below each distinct volume number; the extra 20
# supplies the right-hand edge of the last bin. Both histograms share the
# same edges so that their bars line up.
volumes = all_volumes + [20]
bins = np.unique(volumes) - .5

plt.hist(all_volumes, color="blue", bins=bins, alpha=.3, label="All papers")
plt.hist(bayesian_volumes, color="red", bins=bins, label="Bayesian papers")
plt.xticks([5*i for i in range(6)])
plt.xlabel("JMLR volume")
plt.ylabel("Number of papers")
plt.xlim([0,20])
plt.legend()
plt.show()

## Run sklearn's Variational Bayes

Use `sklearn` to preprocess the data. Take the 1000 most frequent words in the corpus, not counting English stop words, and encode each abstract by its raw counts of these words (a bag-of-words encoding). The resulting matrix will be very sparse. You should output an $N\times d$ sparse matrix `tf` of `int`egers, where $N$ is the number of JMLR abstracts, and $d=1000$. Check out `CountVectorizer` from `sklearn`; it does all these things for you.

In [None]:
# Exercise: Extract tf features (term frequencies, i.e., raw term counts)
# Keep the 1000 most frequent terms of the corpus, ignoring English stop words.
# The fitted vectorizer is kept in its own variable so that the vocabulary
# (column index -> word) stays available to later cells.
tf_vectorizer = CountVectorizer(max_features=1000, stop_words="english")
tf = tf_vectorizer.fit_transform([paper["abstract"] for paper in jmlr_papers])

In [None]:
# Displaying `tf` should print something close to the following:
#   <1898x1000 sparse matrix of type '<class 'numpy.int64'>'
#   with 85804 stored elements in Compressed Sparse Row format>
tf

Now fit the LDA to `tf`, say with $10$ topics for starters. 

In [None]:
# Exercise: Fit the LDA using online VB
# Check Section 27.3.7 of [Murphy, 2012] for the definition of online VB

In [None]:
# Exercise: Print the top words in each topic, comment

In [None]:
# Exercise: Pick an abstract and assign topics to its words. What is your loss function for that action?

## Running ADVI
Fit the count version of LDA using ADVI and PyMC3. This requires you to decide how to transform the variables in the model to unconstrained real variables. You can take inspiration from [this tutorial](https://docs.pymc.io/notebooks/lda-advi-aevb.html). Compare your results with sklearn's online VB: what do you observe? Why do you think this is the case?

## Bonus exercises
* *Bonus exercise*: play around with topic extraction. Can you recognize topics? Find topics within Bayesian or neural network papers.
* *Bonus exercise* if you like web scraping: get more data, say NeurIPS papers.
* *Bonus exercise* if you like variational Bayes: implement your own variational Bayes class, without `sklearn`.
* *Bonus exercise* if you enjoy slow MCMC algorithms like me: implement the collapsed Gibbs sampler, say for a small dataset of 5 abstracts.