In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import xlogy
import plotly.express as px

In [None]:
# Attach Google Drive to the Colab filesystem so later cells can read from it.
drive.mount(mountpoint='/content/gdrive')

In [None]:
# Load the Reddit posts from the mounted Drive. The absolute path starts at the
# '/content/gdrive' mount point, so the read does not depend on the kernel's
# current working directory.
df = pd.read_parquet('/content/gdrive/My Drive/Colab Notebooks/reddit_calculus.parquet')
# Alternative for running outside Colab (local copy of the same file):
# df = pd.read_parquet('/Users/paul/data/reddit/reddit_calculus.parquet')

In [None]:
# Derive the subreddit name from each post URL: the fifth '/'-separated segment
# (index 4), lower-cased. The vectorised .str accessor yields NaN for a missing
# URL or one with fewer than five segments, instead of raising part-way
# through the column as a per-row lambda would.
df['subreddit'] = df['url'].str.split('/').str[4].str.lower()

In [None]:
# Build a document-term count table: English stop words are dropped and the
# vocabulary is capped at 250 features.
cv = CountVectorizer(stop_words='english', max_features=250)
count_matrix = cv.fit_transform(df['text'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. toarray() gives a plain ndarray, whereas todense()
# returns the deprecated np.matrix type.
freq = pd.DataFrame(count_matrix.toarray(), columns=cv.get_feature_names_out())

In [None]:
# Show the document-term count table (one row per document, one column per vocabulary term).
freq

### Information theory and relevance scores

### Review

Let $D$ represent a randomly chosen document from a corpus, and let $T$ represent a randomly chosen term.
Then:

$$I(D,T) = H(D) - H(D \vert T)$$

("Mutual information is entropy minus conditional entropy")

### Notation
- $N$ = number of documents
- $N_t$ = number of documents containing the term $t$
- $\vert d \vert$ = number of terms in document $d$
- $f(t,d)$ = frequency of term $t$ in document $d$

The mutual information is:

$$I(D,T) = \sum_{d,t} \frac{f(t,d)}{\vert d \vert} \left( \log \frac{1}{N_t} - \log \frac{1}{N} \right) = \sum_{d,t} \frac{f(t,d)}{\vert d \vert} \log \frac{N}{N_t}$$

The pointwise mutual information is then:

$$\frac{f(t,d)}{\vert d \vert} \left( \log \frac{1}{N_t} - \log \frac{1}{N} \right)$$

### Assumptions
- All documents have equal weight
- All documents containing a given term have equal weight
- All terms in a given document are equally likely

In [None]:
# |d|: total count of vocabulary terms in each document (row sums of `freq`).
d = freq.sum(axis='columns')
d

In [None]:
# Relative term frequencies f(t,d) / |d|: each row of counts divided by its row total.
# A document containing none of the vocabulary terms has |d| = 0, and 0/0 would
# produce an all-NaN row that propagates into every later statistic. Such a
# document has no occurrences of any term, so its row is set to zeros.
normalized_freq = freq.div(d, axis=0).fillna(0)
normalized_freq

In [None]:
# N_t: number of documents containing each term. This is counted from the raw
# counts in `freq` rather than from `normalized_freq != 0`: any NaN in the
# normalised table (e.g. from a 0/0 division for a document with no vocabulary
# terms) compares unequal to 0 and would be miscounted as an occurrence.
N_t = (freq > 0).sum(axis=0)
N_t

In [None]:
# N: number of documents, i.e. the number of rows in the count table.
N = freq.shape[0]
N

In [None]:
# Per-term weight log2(1/N_t) - log2(1/N), kept in the same two-term form as the
# formula in the notes above.
log_inv_docs_with_term = np.log2(1 / N_t)
log_inv_corpus_size = np.log2(1 / N)
term_weight = log_inv_docs_with_term - log_inv_corpus_size
# Scale each term's column of relative frequencies by that term's weight.
PMI = normalized_freq.mul(term_weight, axis='columns')

In [None]:
# Show the per-document, per-term PMI scores.
PMI

### Exercises
1. Write a function that views documents and the term relevance scores side-by-side.
2. Pick a few subreddits to declare as "irrelevant" and compute the document PMI matrix that you would get if the posts from those subreddits had one tenth of their normal weight. How does this affect the document-level relevance scores?
3. Compute the PMI matrix for the top 20 subreddits, treating all posts in the subreddit as a single document.