# Development: PMI
<b>8/7/2018</b>
<hr>
Scratch space for developing a pointwise mutual information (PMI) implementation for the feature extraction example.

## Load Sample Data

In [1]:
import pandas as pd
from pymongo import MongoClient

  return f(*args, **kwds)


In [3]:
# open a MongoDB client with default connection settings and
# look up the two collections of the amazon_reviews database
client = MongoClient()

reviews = client['amazon_reviews']['reviews']
metadata = client['amazon_reviews']['metadata']

In [11]:
# sanity check: both collections are reachable and report a document count
print(reviews.estimated_document_count(),
      metadata.estimated_document_count(),
      sep='\n')

13272
30000


## Examine Reviews

In [12]:
# fetch a single review document and display it to inspect its fields
test = reviews.find_one()
test

In [None]:
# materialise the first 100 review documents returned by the cursor
sample_reviews = list(reviews.find(limit=100))

In [None]:
def print_reviews(reviews):
    """ Print the text of each review, followed by a separator line.

    Args:
        reviews: iterable of review documents (mappings) that each have a
            'reviewText' key.
    """
    for r in reviews:
        print(r['reviewText'])
        print('--------------------')

In [None]:
# print the text of every sampled review
print_reviews(sample_reviews)

## Create Occurrence Matrix, Vocabulary

In [139]:
import math
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [98]:
# Vectorizer configuration:
#   binary=True          -> record presence/absence of a term, not its count
#   min_df=0.05          -> drop terms that appear in fewer than 5% of documents
#   stop_words='english' -> remove common English stop words
#   decode_error='ignore'-> skip characters that cannot be decoded
count_vectorizer = CountVectorizer(
    binary=True,
    min_df=0.05,
    stop_words='english',
    decode_error='ignore',
)

In [159]:
# create occurrence matrix, corpus vocabulary
# NOTE: occurrence matrix is term-document (rows are terms, columns are docs)
occurrence_matrix = count_vectorizer.fit_transform((review['reviewText'] for review in sample_reviews)).transpose()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older releases.
# list(...) keeps vocab a plain list, as get_feature_names() returned.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    vocab = list(count_vectorizer.get_feature_names_out())
else:
    vocab = count_vectorizer.get_feature_names()

## Find PMI for Word

In [160]:
def get_vocab_index(word, vocab):
    """ Return the position of the first entry of vocab equal to word, or None if absent """
    matches = (position for position, term in enumerate(vocab) if term == word)
    return next(matches, None)

def count_occurrences(word, vocab, occurrence_matrix):
    """ Sum the occurrence-matrix row of word (its document count for a binary matrix).

    Returns 0 when word is not found in vocab.
    """
    row = get_vocab_index(word, vocab)
    if row is not None:
        return np.sum(occurrence_matrix[row])
    return 0

def count_co_occurrences(word1, word2, vocab, occurrence_matrix):
    """ Count number of documents that contain word1 and word2

    Args:
        word1, word2: terms to look up in vocab.
        vocab: sequence of terms; position i names row i of occurrence_matrix.
        occurrence_matrix: term-document matrix (rows are terms, columns are
            docs). The result is a document count only if the entries are
            0/1 indicators -- assumes a binary matrix.

    Returns:
        Sum over documents of the element-wise product of the two term rows,
        or 0 if either word is not in vocab.
    """

    # get vocab indices
    i_word1 = get_vocab_index(word1, vocab)
    i_word2 = get_vocab_index(word2, vocab)

    # Compare against None explicitly: index 0 is a valid row but is falsy,
    # so a plain truthiness test would drop the first vocabulary term.
    if i_word1 is None or i_word2 is None:
        return 0

    def row_as_array(i):
        # Normalise a row to a dense 1-D array so the product below is
        # element-wise for dense arrays and for sparse containers alike.
        row = occurrence_matrix[i]
        if hasattr(row, 'toarray'):
            row = row.toarray()
        return np.asarray(row).ravel()

    # get co_occurrence_vector, return sum
    co_occurrence_vector = row_as_array(i_word1) * row_as_array(i_word2)
    return np.sum(co_occurrence_vector)

In [166]:
def pmi(word1, word2, vocab, occurance_matrix):
    """ Get PMI (pointwise mutual information) of word1 and word2

    Computes log(P(word1, word2) / (P(word1) * P(word2))), where each
    probability is a document count divided by the number of documents.

    Args:
        word1, word2: terms to compare.
        vocab: sequence of terms; position i names row i of the matrix.
        occurance_matrix: term-document matrix (rows are terms, columns are
            docs). The parameter keeps its original spelling so existing
            callers are unaffected.

    Returns:
        Natural-log PMI as a float.

    NOTE: both words are expected to occur in at least one document. A word
    that is missing from vocab has probability 0, which makes the denominator
    zero; the division then fails or yields an infinite value depending on
    the numeric types involved.
    """
    # Use the matrix supplied by the caller. The body reads
    # `occurrence_matrix`, which without this alias would resolve to the
    # notebook-level global instead of the argument.
    occurrence_matrix = occurance_matrix
    num_docs = occurrence_matrix.shape[1]

    # find P(word1), P(word2)
    p_word1 = count_occurrences(word1, vocab, occurrence_matrix) / num_docs
    p_word2 = count_occurrences(word2, vocab, occurrence_matrix) / num_docs

    # find P(word1 and word2)
    p_joint = count_co_occurrences(word1, word2, vocab, occurrence_matrix) / num_docs
    if p_joint == 0:
        # NOTE: This assumes a base occurrence frequency of 1 in order to not break the PMI equation
        p_joint = 1 / num_docs

    return math.log(p_joint / (p_word1 * p_word2))

In [169]:
# example: PMI of 'spool' and 'best' over the sampled reviews
pmi('spool', 'best', vocab, occurrence_matrix)

0.010050335853501506

## Conclusions
We have a successful PMI prototype! Let's port it over to a `Polarizer` class to prep for production.