## How to create a confusion matrix  

When building a classification model, there are many situations where one will want to take a closer look at the predictions the model made: specifically, the true positives, false positives, true negatives and false negatives (i.e., when the true class of an example was $X$, how often did the model predict $X$ vs. $Y,Z,...,K$?). This is very useful for seeing which classes your model can easily predict and which classes are more difficult. Below is an initial example of what a confusion matrix looks like, followed by a lengthier example using real Yelp data.


In [None]:
# NOTE: libraries that are commented out are imported via preprocess.py file

#import preprocess

# import helper libraries (standard library)
import collections

# import nltk library
#import nltk; nltk.download('punkt')
#from nltk import sent_tokenize
# sent_tokenize is called directly by tokenize_text in this notebook, so it
# must be imported here (the preprocess import above is commented out)
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

# import other libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import *

# import project-local helper libraries
from common import utils, vocabulary

In [4]:
# load the review sample (restaurants only) into a DataFrame
sample_df = pd.read_csv("sample_review_restaurant_only.csv")

## Preprocess (based on CountVectorizer)

In [5]:
# tokenize

# sample review used as the running example for the preprocessing helpers
# (the line breaks and indentation inside the literal are part of the string)
example_text="""Very pleased with the service. Friendly, attentive, and fast. I had vegetable egg rolls and Pad Thai. 
              Pad Thai was exquisite. Not too oily or too dry, just perfect. Just the right amount of food on the plate, 
              the tofu was baked to perfection and made the flavor stand out. The egg rolls were crispy but not over 
              fried and not oily either. Definitely coming back and recommending it to friends."""

def tokenize_text(input_text):
    """
    Split one review into word-level tokens.

    The text is first segmented into sentences with ``sent_tokenize`` and
    every sentence is then tokenized with NLTK's ``TreebankWordTokenizer``.

    Args:
    input_text: a string representing an
    individual review

    Returns:
    input_tokens: a list containing *raw*
    tokens for an individual review, in the
    order they occur in the text

    """
    # build the word tokenizer once instead of once per sentence
    word_tokenizer = TreebankWordTokenizer()

    input_tokens = []

    # split into sentences, then split each sentence into words
    for sent in sent_tokenize(input_text):
        input_tokens.extend(word_tokenizer.tokenize(sent))

    return input_tokens

# example data: tokenize the sample review and show the raw tokens
input_tokens = tokenize_text(example_text)
print(input_tokens)

['Very', 'pleased', 'with', 'the', 'service', '.', 'Friendly', ',', 'attentive', ',', 'and', 'fast', '.', 'I', 'had', 'vegetable', 'egg', 'rolls', 'and', 'Pad', 'Thai', '.', 'Pad', 'Thai', 'was', 'exquisite', '.', 'Not', 'too', 'oily', 'or', 'too', 'dry', ',', 'just', 'perfect', '.', 'Just', 'the', 'right', 'amount', 'of', 'food', 'on', 'the', 'plate', ',', 'the', 'tofu', 'was', 'baked', 'to', 'perfection', 'and', 'made', 'the', 'flavor', 'stand', 'out', '.', 'The', 'egg', 'rolls', 'were', 'crispy', 'but', 'not', 'over', 'fried', 'and', 'not', 'oily', 'either', '.', 'Definitely', 'coming', 'back', 'and', 'recommending', 'it', 'to', 'friends', '.']


In [8]:
# canonicalize

def canonicalize_tokens(input_tokens):
    """
    Canonicalize the raw tokens of a single review.

    Thin wrapper: the tokens are handed to ``utils.canonicalize_words``
    (from the project's ``common`` package) and its result is returned as is.

    Args:
    input_tokens: a list containing *raw*
    tokens for an individual review

    Returns:
    the value produced by ``utils.canonicalize_words`` for those tokens
    (presumably a list of *canonicalized* tokens -- the exact rules live
    in common/utils and are not visible here)

    """
    return utils.canonicalize_words(input_tokens)

# example data: canonicalize the tokens of the sample review and show them
canonical_tokens = canonicalize_tokens(input_tokens)
print(canonical_tokens)

['very', 'pleased', 'with', 'the', 'service', '.', 'friendly', ',', 'attentive', ',', 'and', 'fast', '.', 'i', 'had', 'vegetable', 'egg', 'rolls', 'and', 'pad', 'thai', '.', 'pad', 'thai', 'was', 'exquisite', '.', 'not', 'too', 'oily', 'or', 'too', 'dry', ',', 'just', 'perfect', '.', 'just', 'the', 'right', 'amount', 'of', 'food', 'on', 'the', 'plate', ',', 'the', 'tofu', 'was', 'baked', 'to', 'perfection', 'and', 'made', 'the', 'flavor', 'stand', 'out', '.', 'the', 'egg', 'rolls', 'were', 'crispy', 'but', 'not', 'over', 'fried', 'and', 'not', 'oily', 'either', '.', 'definitely', 'coming', 'back', 'and', 'recommending', 'it', 'to', 'friends', '.']


In [9]:
# preprocessor (combines tokenization, canonicalization)

def preprocessor(raw_text):
    """
    Run the full preprocessing pipeline on one review.

    Chains ``tokenize_text`` and ``canonicalize_tokens`` and glues the
    resulting tokens back together with single spaces, so the output can be
    consumed by anything expecting one string per document.

    Args:
    raw_text: a string representing an
    individual review

    Returns:
    a string representing the preprocessed
    individual review (tokens separated by " ")

    """
    # tokenize -> canonicalize -> rejoin into one space-separated string
    return " ".join(canonicalize_tokens(tokenize_text(raw_text)))

# example data: run the whole pipeline on the sample review
# (punctuation marks stay in the string as separate tokens)
preprocessed_text = preprocessor(example_text)
print(preprocessed_text)

very pleased with the service . friendly , attentive , and fast . i had vegetable egg rolls and pad thai . pad thai was exquisite . not too oily or too dry , just perfect . just the right amount of food on the plate , the tofu was baked to perfection and made the flavor stand out . the egg rolls were crispy but not over fried and not oily either . definitely coming back and recommending it to friends .


## Multinomial NB

Train a Multinomial Naive Bayes classifier on the vectorized reviews, then inspect the words with the largest weights for each class.

In [33]:
# pull the review text and the star ratings out of the sample frame
text = sample_df["text"].tolist()      # list of strings
labels = sample_df["stars"].tolist()   # list of integers

# recode labels into two classes: ratings 1,2,3 -> 0 and everything else -> 1
def recode(x):
    """
    Map a star rating onto a binary label.

    Args:
    x: a star rating

    Returns:
    0 when x equals 1, 2 or 3; 1 for any other value
    (i.e. ratings 4 and 5)

    """
    return 0 if x in (1, 2, 3) else 1

# apply recode() to every star rating
recoded_labels = [recode(label) for label in labels]

# 80/20 train/test split using the recoded labels; random_state pins the
# shuffle so that a restarted kernel reproduces the same split (and therefore
# the same vocabulary, scores and confusion matrix further down)
train_data, test_data, train_labels, test_labels = train_test_split(
    text, recoded_labels, test_size=.2, random_state=42)

# examine train, test shapes
print("train, test set size: %d, %d" %(len(train_data), len(test_data)))
print("train, test label size: %d, %d" %(len(train_labels), len(test_labels)))
print("")

# examine train set examples
print("example:")
print("text: %s" %(train_data[1]))
print("label: %d" %(train_labels[1]))

train, test set size: 129023, 32256
train, test label size: 129023, 32256

example:
text: Wings are fine, better from most of the places. Other locations-way better then this one. Just look like a fast food- not even a bar. Nice people working there, but service just not right - fun, but not right. One of the girls had a shot with customers on the next table. Not professional at all.
label: 0


In [34]:
# class balance: number of examples per recoded label (over the full data set,
# before the train/test split)
collections.Counter(recoded_labels)

Counter({0: 56256, 1: 105023})

In [11]:
# countvectorizer
# bag-of-words counts built with the custom preprocessor defined above and a
# small hand-picked stop-word list
stop_words = ["that", "in", "we", "for", "of", "it", "was", "to", "and", "the", "but", "is", "with", "i", "you"]
vec = CountVectorizer(preprocessor=preprocessor, stop_words=stop_words)
vec_train_data = vec.fit_transform(train_data)  # vocabulary is learned from the training split only
vec_test_data = vec.transform(test_data)        # test split reuses that vocabulary
print("vocabulary size: %d" %(vec_train_data.shape[1]))

# train model (no hyperparameter tuning)
mnb = MultinomialNB()
mnb.fit(vec_train_data, train_labels)
pred_labels = mnb.predict(vec_test_data)

# assess model on the held-out split
f1 = f1_score(test_labels, pred_labels, average="weighted")
accuracy = accuracy_score(test_labels, pred_labels)
print("multinomial nb f1 score: %.3f" %(f1))
print("multinomial nb accuracy score: %.3f" %(accuracy))

vocabulary size: 92672
multinomial nb f1 score: 0.794
multinomical nb accuracy score: 0.799


In [31]:
# Create confusion matrix (rows: actual label, columns: predicted label).
# crosstab's index/columns arguments also accept a *list of arrays* (one per
# factor), so pass 1-D arrays rather than the plain Python list test_labels
# to keep the single-factor meaning unambiguous.
pd.crosstab(np.asarray(test_labels), np.asarray(pred_labels),
            rownames=['Actual Rating'], colnames=['Predicted Rating'])

Predicted Rating,0,1,2
Actual Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5009,1160,597
1,958,1661,1880
2,827,1058,19106


In [12]:
# examine features w/ greatest weights

def top_features(coefs, num_feats, feature_names=None):
    """
    Print the highest-weighted features for every row of a weight matrix.

    Args:
    coefs: array of shape (num_labels, vocab_size)
    num_feats: number of top features to show per row; values <= 0 show
        none, values larger than vocab_size show every feature
    feature_names: optional sequence of length vocab_size mapping a column
        index to its feature string. When omitted, the names are taken
        from the notebook-level CountVectorizer ``vec``.

    Prints:
    for each row of coefs, a "rating category <row index + 1>" heading
    followed by the top num_feats features and their weights (rounded to
    2 decimals), listed from smallest to largest weight. Note the heading
    is derived from the row position, not from the classifier's class
    labels.

    """
    if feature_names is None:
        # Look the vocabulary up once, not once per printed feature.
        # get_feature_names_out is the current scikit-learn API; fall back to
        # get_feature_names on releases that predate it.
        name_getter = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
        feature_names = name_getter()

    coefs = np.asarray(coefs)
    vocab_size = coefs.shape[1]

    # clamp so that num_feats=0 yields an empty selection (a bare [-0:] slice
    # would select every column)
    keep = max(0, min(int(num_feats), vocab_size))

    # identify top coefs per rating category
    ascending = np.argsort(coefs, axis=1)
    top_indices = ascending[:, vocab_size - keep:]  # (num_labels, keep)

    # display feature, weight
    for row, indices in enumerate(top_indices):
        print("rating category %d" %(row + 1))
        for idx in indices:
            print(feature_names[idx], round(coefs[row, idx], 2))
        print("")

# feature_log_prob_ has one row of per-feature log probabilities for every
# class; MultinomialNB.coef_ is deprecated in newer scikit-learn releases and
# only exposes a single row when there are two classes
top_features(mnb.feature_log_prob_, num_feats=10)

rating category 1
at -4.81
DG -4.76
had -4.75
on -4.69
were -4.66
food -4.62
they -4.55
this -4.48
my -4.47
not -4.44

rating category 2
had -4.84
DG -4.74
were -4.72
this -4.71
food -4.7
they -4.67
good -4.66
on -4.65
my -4.63
not -4.57

rating category 3
great -4.82
place -4.81
have -4.78
good -4.73
had -4.73
on -4.66
they -4.65
food -4.64
my -4.51
this -4.47

