### Description of dataset:

The data is the IMDB movie review dataset, available from http://ai.stanford.edu/~amaas/data/sentiment/.

Each review is labeled as either positive or negative, so this is a binary classification task. The dataset provides 25,000 reviews for training and 25,000 reviews for testing. The classes are balanced: both the training set and the testing set contain 12,500 positive and 12,500 negative reviews.

In this assignment, we will use `CountVectorizer` with `binary=True`, so each token feature is binary: 1 if the token occurs in the review and 0 otherwise.

In [1]:
import glob

import numpy as np

from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

### Load the data 

In [2]:
def load_imdb(path):
    """Load the IMDB review texts and their sentiment labels from disk.

    The directory ``path`` is expected to contain ``train/neg``, ``train/pos``,
    ``test/neg`` and ``test/pos`` sub-directories holding one ``*.txt`` file
    per review.

    Parameters
    ----------
    path : str
        Root directory of the dataset.

    Returns
    -------
    train_corpus : list of str
        Training review texts, negative reviews first, then positive ones.
    test_corpus : list of str
        Test review texts, in the same order.
    y_train : numpy.ndarray
        Labels aligned with ``train_corpus`` (0 = negative, 1 = positive).
    y_test : numpy.ndarray
        Labels aligned with ``test_corpus``.

    Notes
    -----
    File names are sorted inside every sub-directory so that the row order
    (and therefore any row index reported later) is the same on every run
    and every machine; ``glob`` alone returns files in arbitrary order.
    """

    def _read_split(split):
        # Read one split ("train" or "test"): label 0 for neg/, label 1 for pos/.
        corpus = []
        labels = []
        for label, sentiment in enumerate(("neg", "pos")):
            pattern = path + "/" + split + "/" + sentiment + "/*.txt"
            for file_name in sorted(glob.glob(pattern)):
                # The context manager closes the file even if read() fails.
                with open(file_name, 'r', encoding="utf8") as review_file:
                    corpus.append(review_file.read())
                labels.append(label)
        return corpus, np.array(labels)

    print("Loading the imdb reviews data")

    train_corpus, y_train = _read_split("train")

    print("Training Data loaded.")
    print()

    test_corpus, y_test = _read_split("test")

    print("Testing Data loaded.")
    print()

    return train_corpus, test_corpus, y_train, y_test

In [3]:
# Dataset root handed to load_imdb; it must contain train/ and test/ folders, each with neg/ and pos/ review files.
imdb_path = "../aclImdb"

In [4]:
# Read the train and test review texts together with their 0 (negative) / 1 (positive) label arrays.
train_corpus, test_corpus, y_train, y_test = load_imdb(imdb_path)

Loading the imdb reviews data
Training Data loaded.

Testing Data loaded.



### Binarize and vectorize the data into matrix

In [5]:
# Binary bag-of-words encoding:
#   min_df=5    -> drop tokens that occur in fewer than 5 training reviews
#   max_df=1.0  -> no upper document-frequency cut-off (this is the default)
#   binary=True -> store token presence (1) / absence (0) instead of counts
vectorizer = CountVectorizer(min_df=5, max_df=1.0, binary=True)

# The vocabulary is learned from the training reviews only.
X_train = vectorizer.fit_transform(train_corpus)

# The test reviews are encoded with that same vocabulary.
X_test = vectorizer.transform(test_corpus)

In [6]:
# (number of training reviews, number of vocabulary features)
X_train.shape

(25000, 27272)

### Train the model

In [7]:
# Bernoulli naive Bayes with default settings, fitted on the binary training matrix.
clf = BernoulliNB()
clf.fit(X_train, y_train)

# Predicted labels and per-class probabilities for every test review.
y_predict = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_test, y_predict)

0.82904

### Analysis of model

In [8]:
def get_bias(clf):
    """Return the prior log-odds of class 1 over class 0.

    The value is ``class_log_prior_[1] - class_log_prior_[0]`` of the fitted
    classifier ``clf``.
    """
    log_prior_first, log_prior_second = clf.class_log_prior_[0], clf.class_log_prior_[1]
    return log_prior_second - log_prior_first

In [9]:
# The bias should be zero: the training data is balanced (50% / 50%), so both class log-priors are equal.

get_bias(clf)

0.0

In [10]:
# In this case, we only need to calculate the evidence (negative and positive) for all the features 

def evidences(clf, X):
    """Split the naive Bayes log-evidence of one sample into four parts.

    Every feature contributes the log-likelihood ratio (class 1 over class 0)
    of its observed value. Contributions are grouped by whether the feature is
    present or absent and by the sign of the ratio.

    Parameters
    ----------
    clf : fitted classifier
        Must expose ``feature_log_prob_`` with row 0 for class 0 and row 1 for
        class 1 (log probability that each feature is present).
    X : sparse matrix or array-like
        Binary feature vector. If it holds several rows, only the first row
        is used, which is what the ``.item(0)`` of the earlier version did.

    Returns
    -------
    tuple of float
        ``(presence_negative, presence_positive, absence_negative,
        absence_positive)`` evidence totals; the negative parts are <= 0 and
        the positive parts are >= 0 for binary input.
    """
    # Use a plain ndarray: todense() returns np.matrix, which NumPy discourages.
    dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    x_presence = np.atleast_2d(dense)[0]
    x_absence = 1 - x_presence

    # ln p(x=absent | y) = ln(1 - p(x=present | y)); log1p keeps precision
    # when p(x=present | y) is tiny.
    absence_log_prob_ = np.log1p(-np.exp(clf.feature_log_prob_))

    # ln[ p(x=present | y=1) / p(x=present | y=0) ]
    presence_log_ratios = clf.feature_log_prob_[1] - clf.feature_log_prob_[0]
    # ln[ p(x=absent | y=1) / p(x=absent | y=0) ]
    absence_log_ratios = absence_log_prob_[1] - absence_log_prob_[0]

    # Zero out the ratios of the opposite sign so each dot product only sums
    # evidence in one direction.
    presence_neg_log_ratios = presence_log_ratios * (presence_log_ratios < 0)
    presence_pos_log_ratios = presence_log_ratios * (presence_log_ratios > 0)
    absence_neg_log_ratios = absence_log_ratios * (absence_log_ratios < 0)
    absence_pos_log_ratios = absence_log_ratios * (absence_log_ratios > 0)

    p_neg_evi = float(np.dot(x_presence, presence_neg_log_ratios))
    p_pos_evi = float(np.dot(x_presence, presence_pos_log_ratios))
    a_neg_evi = float(np.dot(x_absence, absence_neg_log_ratios))
    a_pos_evi = float(np.dot(x_absence, absence_pos_log_ratios))

    return p_neg_evi, p_pos_evi, a_neg_evi, a_pos_evi

In [11]:
def pos_neg_evidence(clf, x_vector):
    """Return the total negative and total positive evidence of one sample.

    The four parts produced by ``evidences`` are merged by sign: the result is
    ``(negative_total, positive_total)``, each summing the presence and the
    absence contribution.
    """
    presence_neg, presence_pos, absence_neg, absence_pos = evidences(clf, x_vector)

    negative_total = presence_neg + absence_neg
    positive_total = presence_pos + absence_pos

    return negative_total, positive_total

In [12]:
# One (negative total, positive total) pair per test review, in row order.
evidence = [pos_neg_evidence(clf, row) for row in X_test]

In [13]:
# Stack the per-review (negative, positive) pairs into a single 2-D array.
evidence = np.asarray(evidence)

# One row per test review, two columns.
evidence.shape

(25000, 2)

In [14]:
# (total negative evidence, total positive evidence) of the first test review
evidence[0]

array([-36.61238513,  30.5457996 ])

In [15]:
# Column 0 holds the negative totals, column 1 the positive totals.
neg_evidence, pos_evidence = evidence[:, 0], evidence[:, 1]

In [16]:
# a) the total positive log-evidence 
# b) the total negative log-evidence
# c) probability distribution
# d) top 3 features values that contribute most to the positive evidence
# e) top 3 feature values that contribute the most to the negative evidence. 
# Print this information on the following object types


def print_infor(X_vector, index, evidence, proba, clf, vec):
    """Print the evidence summary of one test sample.

    Reports the total positive and total negative log-evidence stored in row
    ``index`` of ``evidence`` (column 1 and column 0 respectively), the
    predicted probability distribution ``proba[index]``, and then delegates to
    ``top_features`` for the strongest contributing feature values of
    ``X_vector``.
    """
    print("The total positive log-evidence: %s" % str(evidence[index, 1]))
    print("The total negative log-evidence: %s" % str(evidence[index, 0]))

    print("The probability distribution: %s" % str(proba[index]))

    top_features(X_vector, clf, vec)

In [54]:
def top_features(X_vector, clf, vec):
    """Print the 3 feature values that contribute most to each side of the evidence.

    For every feature, the contribution is the log-likelihood ratio (class 1
    over class 0) of its observed value: the presence ratio when the feature
    is present, the absence ratio when it is absent. Positive contributions
    support class 1, negative contributions support class 0.

    Parameters
    ----------
    X_vector : sparse matrix or array-like
        Binary feature vector of a single sample (only the first row is used).
    clf : fitted classifier
        Must expose ``feature_log_prob_`` with row 0 for class 0 and row 1 for
        class 1.
    vec : fitted vectorizer
        Used to map feature indices back to token names. Both the current
        ``get_feature_names_out`` and the legacy ``get_feature_names`` API are
        supported.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # Use a plain 1-D ndarray: todense() returns np.matrix, which NumPy discourages.
    dense = X_vector.toarray() if hasattr(X_vector, "toarray") else np.asarray(X_vector)
    x_presence = np.atleast_2d(dense)[0]
    x_absence = 1 - x_presence

    # p(x=absent | y) = 1 - p(x=present | y)
    # ln p(x=absent | y) = ln(1 - e^{ln p(x=present | y)}); log1p keeps
    # precision when p(x=present | y) is tiny.
    absence_log_prob_ = np.log1p(-np.exp(clf.feature_log_prob_))

    # ln[ p(x=present | y=1) / p(x=present | y=0) ]
    presence_log_ratios = clf.feature_log_prob_[1] - clf.feature_log_prob_[0]

    # ln[ p(x=absent | y=1) / p(x=absent | y=0) ]
    absence_log_ratios = absence_log_prob_[1] - absence_log_prob_[0]

    # Keep only the ratios of one sign in each array.
    presence_neg_log_ratios = presence_log_ratios * (presence_log_ratios < 0)
    presence_pos_log_ratios = presence_log_ratios * (presence_log_ratios > 0)
    absence_neg_log_ratios = absence_log_ratios * (absence_log_ratios < 0)
    absence_pos_log_ratios = absence_log_ratios * (absence_log_ratios > 0)

    # Per-feature evidence given the value observed in this sample.
    positive = x_presence * presence_pos_log_ratios + x_absence * absence_pos_log_ratios
    negative = x_presence * presence_neg_log_ratios + x_absence * absence_neg_log_ratios

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vec, "get_feature_names_out"):
        features_name = vec.get_feature_names_out()
    else:
        features_name = vec.get_feature_names()

    # Largest positive contributions first.
    top_positive_idx = np.argsort(positive)[::-1][:3]
    print("")
    print("Top 3 features values that contribute the most to the positive evidence")

    for idx in top_positive_idx:
        print("Feature name: %s \t\t value: %s \t\t Evidence: %s" %(features_name[idx], str(x_presence[idx]), str(positive[idx])))

    # Most negative contributions first.
    top_negative_idx = np.argsort(negative)[:3]
    print("")
    print("Top 3 features values that contribute the most to the negative evidence")

    for idx in top_negative_idx:
        print("Feature name: %s \t\t value: %s \t\t Evidence: %s" %(features_name[idx], str(x_presence[idx]), str(negative[idx])))

    return None

#### 1. The most positive object with respect to the probabilities.

In [55]:
# Row of the test review with the highest predicted probability for class 1 (positive).
idx = np.argmax(y_proba[:,1])

idx 

870

In [56]:
# Evidence totals, predicted probabilities and top contributing features of the most positive review.
print_infor(X_test[idx], idx, evidence, y_proba, clf, vectorizer)

The total positive log-evidence: 126.092040458
The total negative log-evidence: -64.9802025856
The probability distribution: [  2.88048775e-27   1.00000000e+00]

Top 3 features values that contribute the most to the positive evidence
Feature name: pinjar 		 value: 1 		 Evidence: 2.19722457734
Feature name: maintained 		 value: 1 		 Evidence: 1.68639895357
Feature name: sadness 		 value: 1 		 Evidence: 1.6635051337

Top 3 features values that contribute the most to the negative evidence
Feature name: existent 		 value: 1 		 Evidence: -1.99720344344
Feature name: adage 		 value: 1 		 Evidence: -1.94591014906
Feature name: bearings 		 value: 1 		 Evidence: -1.79175946923


#### 2. The most negative object with respect to the probabilities.

In [57]:
# Row of the test review with the lowest predicted probability for class 1, i.e. the most confidently negative one.
idx = np.argmin(y_proba[:,1])

idx 

10505

In [58]:
# Evidence totals, predicted probabilities and top contributing features of the most negative review.
print_infor(X_test[idx], idx, evidence, y_proba, clf, vectorizer)

The total positive log-evidence: 48.0671904571
The total negative log-evidence: -137.652105367
The probability distribution: [  1.00000000e+00   1.24098289e-39]

Top 3 features values that contribute the most to the positive evidence
Feature name: sloth 		 value: 1 		 Evidence: 1.38629436112
Feature name: ferocious 		 value: 1 		 Evidence: 1.20397280433
Feature name: joy 		 value: 1 		 Evidence: 0.962571484438

Top 3 features values that contribute the most to the negative evidence
Feature name: manos 		 value: 1 		 Evidence: -3.33220451018
Feature name: unwatchable 		 value: 1 		 Evidence: -2.9856819377
Feature name: waster 		 value: 1 		 Evidence: -2.63905732962


#### 3. The object that has the largest positive evidence.

In [59]:
# Row of the test review with the largest total positive evidence.
idx = np.argmax(pos_evidence)

# Show its (negative, positive) evidence pair.
print(evidence[idx])

idx 

[-163.3492882   194.84244513]


18112

In [60]:
# Evidence totals, predicted probabilities and top contributing features of the review with the largest positive evidence.
print_infor(X_test[idx], idx, evidence, y_proba, clf, vectorizer)

The total positive log-evidence: 194.842445129
The total negative log-evidence: -163.349288197
The probability distribution: [  2.10230502e-14   1.00000000e+00]

Top 3 features values that contribute the most to the positive evidence
Feature name: quibble 		 value: 1 		 Evidence: 3.04452243772
Feature name: genesis 		 value: 1 		 Evidence: 2.07944154168
Feature name: beckett 		 value: 1 		 Evidence: 2.07944154168

Top 3 features values that contribute the most to the negative evidence
Feature name: clowns 		 value: 1 		 Evidence: -2.63905732962
Feature name: lamest 		 value: 1 		 Evidence: -2.25129179861
Feature name: worst 		 value: 1 		 Evidence: -2.19282896586


#### 4. The object that has the largest (in magnitude) negative evidence.

In [61]:
# Negative evidence is stored as values <= 0, so the minimum is the largest in magnitude.
idx = np.argmin(neg_evidence)

# Show its (negative, positive) evidence pair.
print(evidence[idx])

idx 

[-163.3492882   194.84244513]


18112

In [62]:
# Evidence totals, predicted probabilities and top contributing features of the review with the largest negative evidence.
print_infor(X_test[idx], idx, evidence, y_proba, clf, vectorizer)

The total positive log-evidence: 194.842445129
The total negative log-evidence: -163.349288197
The probability distribution: [  2.10230502e-14   1.00000000e+00]

Top 3 features values that contribute the most to the positive evidence
Feature name: quibble 		 value: 1 		 Evidence: 3.04452243772
Feature name: genesis 		 value: 1 		 Evidence: 2.07944154168
Feature name: beckett 		 value: 1 		 Evidence: 2.07944154168

Top 3 features values that contribute the most to the negative evidence
Feature name: clowns 		 value: 1 		 Evidence: -2.63905732962
Feature name: lamest 		 value: 1 		 Evidence: -2.25129179861
Feature name: worst 		 value: 1 		 Evidence: -2.19282896586


#### 5. The most uncertain object (the probabilities are closest to 0.5)

In [63]:
# Smaller of the two class probabilities for every test review; it is at most 0.5.
uncerts = np.min(y_proba, axis=1)

# The review where that smaller probability is largest has probabilities closest to 0.5.
idx = np.argmax(uncerts)

# Show its (negative, positive) evidence pair.
print(evidence[idx])

idx

[-33.01112016  33.01051477]


15608

In [64]:
# Evidence totals, predicted probabilities and top contributing features of the most uncertain review.
print_infor(X_test[idx], idx, evidence, y_proba, clf, vectorizer)

The total positive log-evidence: 33.0105147676
The total negative log-evidence: -33.0111201556
The probability distribution: [ 0.50015135  0.49984865]

Top 3 features values that contribute the most to the positive evidence
Feature name: donovan 		 value: 1 		 Evidence: 1.29928298413
Feature name: joy 		 value: 1 		 Evidence: 0.962571484438
Feature name: carol 		 value: 1 		 Evidence: 0.826678573184

Top 3 features values that contribute the most to the negative evidence
Feature name: poor 		 value: 1 		 Evidence: -1.20207912055
Feature name: cow 		 value: 1 		 Evidence: -0.944461608841
Feature name: slammer 		 value: 1 		 Evidence: -0.916290731874
