In [None]:
# Initialize Otter
import otter
# `grader` is the handle used by the `grader.check(...)` and `grader.export(...)` cells below.
grader = otter.Notebook("hw4.ipynb")

---

<h1><center>SDSE Homework 4 <br><br> Text Classification with Naive Bayes </center></h1>

---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pickle

The dataset that we will work with is a selection of posts from scikit-learn's ['20 newsgroups' dataset](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset). We will be working with just two of the 20 newsgroup categories:  'comp.graphics' (computer graphics) and 'rec.motorcycles' (recreation motorcycles). 

The cell below loads the data from a pickle file. The variables are:

+ `Xtrain`: A list of documents used for training
+ `ytrain`: The category of each training document
+ `Xtest`: A list of documents used for testing
+ `ytest`: The category of each testing document
+ `categories`: The set of all categories
+ `vocabulary`: The feature set, i.e., the words used in the model

You should inspect the data to get a better understanding of its structure. For example, use `type` to see the data types of the variables and their components.

In [None]:
# Load the train/test documents, their labels, the category list and the vocabulary.
# NOTE(review): `pickle.load` can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
with open('hw4_text.pickle','rb') as file:
    Xtrain, ytrain, Xtest, ytest, categories, vocabulary = pickle.load(file)

# `len` is used (rather than `.shape[0]`) so the count works whether `Xtrain`
# is a plain list or an array, matching how the rest of the notebook sizes it.
N = len(Xtrain)       # number of documents in the training corpus
K = len(categories)   # number of document categories (classes)
D = len(vocabulary)   # number of words in the vocabulary (features)

In [None]:
type(Xtrain)

# 1. Find the number of training documents for each category

In [None]:
# Map each category to the number of training labels equal to it.
# The element-wise comparison `ytrain == category` yields a boolean mask whose
# sum is the number of matching documents.
docs_per_category = {}
for category in categories:
    matches = (ytrain == category)
    docs_per_category[category] = matches.sum()

# 2. Create a bag-of-words representation for each document

Our Naive Bayes algorithm will operate on a bag-of-words representation of each document. The first step is to write the `to_bow` method. 

The argument for this method is `doc`, which is a single document taken from `Xtrain` or `Xtest` (i.e., a string). It should return a `set` with the unique words that appear in both the document and the vocabulary. The comments in the method provide steps to follow. 

In [None]:
def to_bow(doc):
    """Return the bag-of-words representation of one document.

    Args:
        doc: The document text as a single string.

    Returns:
        A `set` holding every distinct token of `doc` (tokens are obtained by
        splitting on single space characters) that is also a member of the
        module-level `vocabulary`.
    """
    # Split on single spaces; building a set drops repeated tokens, and the
    # filter discards anything that is not a vocabulary word.
    tokens = doc.split(' ')
    return {token for token in tokens if token in vocabulary}

In [None]:
# Run `to_bow` on every document in `Xtrain`.
# The result holds one bag-of-words `set` per training document, in the same order as `Xtrain`.
Xtrain_bow = np.array([to_bow(doc) for doc in Xtrain])

In [None]:
grader.check("q2")

# 3. Compute the document count for each word in each category

To estimate probabilities for Naive Bayes, we will need to know, for each category and each word, the number of documents of the category that contain the word. Implement the `find_doc_counts_per_word_category` function following the steps provided in the code. 

In [None]:
def find_doc_counts_per_word_category(categories,vocabulary,ytrain,Xtrain_bow):
    """Count, per category and per word, the documents that contain the word.

    Args:
        categories: Iterable of category labels.
        vocabulary: Iterable of vocabulary words (the counter keys).
        ytrain: Category label of each training document.
        Xtrain_bow: Bag-of-words (iterable of words) of each training
            document, aligned element-by-element with `ytrain`.

    Returns:
        A nested dictionary `counts[category][word]` giving the number of
        documents labelled `category` whose bag-of-words contains `word`.
        Every vocabulary word is present as a key, starting from 0.

    Raises:
        KeyError: If a bag-of-words contains a word outside `vocabulary`.
    """
    # One zero-initialised counter table per category.
    doc_counters = {label: dict.fromkeys(vocabulary, 0) for label in categories}

    for label in categories:
        counters = doc_counters[label]

        # Keep only the bags whose training label matches this category.
        members = [bag for bag, assigned in zip(Xtrain_bow, ytrain) if label == assigned]

        # Each bag holds unique words, so every word adds one document.
        for bag in members:
            for token in bag:
                counters[token] += 1

    return doc_counters

In [None]:
# Run `find_doc_counts_per_word_category`
# The result is indexed as `doccount_per_cat_and_word[category][word]`.
doccount_per_cat_and_word = find_doc_counts_per_word_category(categories,vocabulary,ytrain,Xtrain_bow)

In [None]:
grader.check("q3")

# 4. Find word frequencies per category

Write the `compute_word_freq` method. 

The argument for this method is the Laplace smoothing factor `alpha`. It also uses global variables, including `doccount_per_cat_and_word`.

For each category and word, compute the Laplace-smoothed ratio of the number of documents containing the word, to the total number of documents in the category. 

Steps:

1. For each category and word in the vocabulary, compute $\rho_{d,k}$  as

$$\rho_{d,k} = \frac{(\text{# documents of category $k$ that contain word $d$}) + \alpha}{(\text{# documents of category $k$})+\alpha K}$$

Store it in `wordfreq[category][word]`.

2. For each category, compute $\rho_k$  as
    
$$\rho_k = \frac{\text{# documents of category $k$}}{\text{Total # documents}}$$

Store it in `catfreq[category]`.

Notice that we are not applying Laplace smoothing to the category frequencies. 

In [None]:
def compute_word_freq(alpha):
    """Compute Laplace-smoothed word frequencies and category frequencies.

    Reads the module-level `categories`, `vocabulary`,
    `doccount_per_cat_and_word`, `docs_per_category` and `Xtrain`.

    Args:
        alpha: Laplace smoothing factor.

    Returns:
        A tuple `(wordfreq, catfreq)` where
        `wordfreq[category][word]` is
        (docs of the category containing the word + alpha)
        / (docs of the category + alpha * number of categories), and
        `catfreq[category]` is the unsmoothed fraction of training documents
        that belong to the category.
    """
    # The smoothing term in the denominator is alpha * K, with K the number
    # of categories (not the vocabulary size).
    num_categories = len(categories)

    # Initialize `wordfreq` and `catfreq`
    wordfreq = dict.fromkeys(categories)
    catfreq = dict.fromkeys(categories)

    # Step 1, compute wordfreq
    for category in categories:
        denominator = docs_per_category[category] + alpha * num_categories
        smoothed = dict.fromkeys(vocabulary)
        for word, doc_count in doccount_per_cat_and_word[category].items():
            smoothed[word] = (doc_count + alpha) / denominator
        wordfreq[category] = smoothed

    # Step 2, compute catfreq (no smoothing applied here)
    total_docs = len(Xtrain)
    for category in categories:
        catfreq[category] = docs_per_category[category] / total_docs

    return wordfreq, catfreq

In [None]:
# Run `compute_word_freq` with alpha = 0.01.
wordfreq, catfreq = compute_word_freq(0.01)

In [None]:
grader.check("q4")

# 5. Write the Naive Bayes prediction function.

Compute the Naive Bayes prediction of the category for the given test document `xtest`. 

The arguments for this method are 
+ xtest: a single test document as a string.
+ wordfreq, catfreq: the ratios computed in the previous step (with $\alpha=0.1$)

The steps are:
1. Find the BOW representation of `xtest`.

2. Use the dictionary `score_cat` to store the score for each of the categories.

3. Loop through categories, for each one compute its score with

$$\log\rho_k+ \sum_{d:\:x_d=1} \log\rho_{d,k} + \sum_{d:\:x_d=0} \log(1-\rho_{d,k})$$

Here $x_d=1$ if the $d$'th vocabulary word appears in `xtest`, and $x_d=0$ otherwise.

4. Return the category with the highest score. 

In [None]:
def predict(Xtest, wordfreq, catfreq):
    """Predict the category of a single document with Naive Bayes.

    Uses the module-level `categories` and the `to_bow` helper.

    Args:
        Xtest: A single test document as a string.
        wordfreq: `wordfreq[category][word]`, the smoothed per-category word
            frequencies produced by `compute_word_freq`.
        catfreq: `catfreq[category]`, the per-category document frequencies
            produced by `compute_word_freq`.

    Returns:
        The category with the highest score
        log(catfreq) + sum over present words of log(rho)
        + sum over absent words of log(1 - rho).
    """
    # 1. Find the BOW representation of Xtest.
    Xtest_bow = to_bow(Xtest)

    # 2. Use a dictionary to store the score for each of the categories.
    score_cat = dict.fromkeys(categories,0)

    # 3. Loop through categories, for each one compute its score, and save it in score_cat.
    #    The frequencies passed in by the caller are used as-is; they must not
    #    be recomputed here, otherwise the caller's choice of alpha is ignored.
    for category in categories:
        word_sum = 0
        for word, rho_dk in wordfreq[category].items():
            # Words present in the document contribute log(rho), absent ones log(1 - rho).
            word_sum += np.log(rho_dk) if word in Xtest_bow else np.log(1 - rho_dk)
        score_cat[category] = np.log(catfreq[category]) + word_sum

    # 4. Return the category with the highest score.
    return max(score_cat, key=score_cat.get)

In [None]:
grader.check("q5")

# 6. Compute accuracy

Accuracy is defined as the number of correct predictions, divided by the total number of predictions. 

In [None]:
def compute_accuracy(Xin, yin, wordfreq, catfreq):
    """Return the fraction of documents whose predicted category is correct.

    Args:
        Xin: Sequence of documents (strings) to classify.
        yin: True category of each document, aligned with `Xin`.
        wordfreq: Per-category word frequencies, forwarded to `predict`.
        catfreq: Per-category document frequencies, forwarded to `predict`.

    Returns:
        Number of correct predictions divided by `len(Xin)`.

    Raises:
        ZeroDivisionError: If `Xin` is empty.
    """
    correct = 0

    # count the number of correct predictions
    for doc, true_category in zip(Xin, yin):
        if predict(doc, wordfreq, catfreq) == true_category:
            correct += 1

    return correct/len(Xin)

In [None]:
grader.check("q6")

# 7. Compute the training and testing errors for a range of $\alpha$





### 7.1. Train the model and compute its test accuracy for logarithmically spaced values of $\alpha$ ranging from $10^{-5}$ to $10^1$

Here 'training the model' means computing the Laplace-smoothed document frequencies with `compute_word_freq`. Do this for a range of $\alpha$'s and store their corresponding accuracies. 

In [None]:
alphas = np.logspace(-5,1,20)   # 20 logarithmically spaced values from 1e-5 to 1e1
acc = np.empty(len(alphas))     # np.empty does not initialise entries; each one must be assigned before use
# TODO: the loop that trains the model for each alpha and stores its test accuracy in `acc` is still missing.
...

In [None]:
grader.check("q7p1")

### 7.2. Plot the accuracies as a function of $\alpha$ using `plt.semilogx`

In [None]:
plt.figure()
# TODO: the plotting call is still missing, so this cell only opens an empty figure.
...

### 7.3. What is the optimal $\alpha$ and its corresponding accuracy?  [Hint](https://numpy.org/doc/stable/reference/generated/numpy.argmax.html)

In [None]:
# TODO: both names are still bound to the `Ellipsis` placeholder and need real values.
best_acc = ...
best_alpha = ...

## Submission

Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. **Please save before exporting!**

In [None]:
# Save your notebook first, then run this cell to export your submission.
grader.export(pdf=False)