## Bayesian Logistic Regression for IMDB 50K Dataset

### Data Pre-Processing

Removing rare/stop words, applying bag-of-words vector representation to words in review using tokenizers (tf-idf)

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

imdb = pd.read_csv("IMDB Dataset.csv")
imdb.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [2]:
imdb.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


#### Remove Stop Words and other Rare Words

In [3]:
#list of stopwords from NLTK
# stopword_list=nltk.corpus.stopwords.words('english')

#remove HTML tags
def strip_html(html_text: str):
    return BeautifulSoup(html_text).get_text()

#make sure to add more cleaning (remove punctuation, )
def clean_review(review: str):
    return strip_html(review)

#clean each review in dataset
imdb.review.apply(clean_review)

#tags should be removed from the 2nd review
imdb.head(3)



Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


#### Split training dataset

80% training (40K), 20% test split (10K)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(imdb.review, imdb.sentiment, test_size=0.2, random_state=42)
print("{} {}\n{} {}".format(
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape))

(40000,) (40000,)
(10000,) (10000,)


#### Term Frequency-Inverse Document Frequency (TF-IDF)

Provides additional information on how important the word is for statistical analysis

TF-IDF = TF*IDF
- TF = (# occurrences of term `t` in review)/total # words in review). Measures how frequently `t` appears in the review 
- IDF = log_((# documents)/(# documents with term `t`)). Measures how important `t` is (if more frequent among documents, less important)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

#initialize TF-IDF Vectorizer (no l1/l2 normalization)
tf_idf = TfidfVectorizer(norm=None, ngram_range=(1,1))

#fit vocabulary on training dataset and transform
#NOTE: fit_transform equivalent to using fit() then transform()
X_train = tf_idf.fit_transform(X_train)

#use vocabulary to transform test dataset
X_test = tf_idf.transform(X_test)

print(X_train.shape)
print(X_test.shape)

(40000, 93003)
(10000, 93003)


In [6]:
from sklearn.preprocessing import LabelBinarizer

#vectorize dependent binary variable y
lb=LabelBinarizer()

#if positive, output = 1
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

### Stochastic MAP Estimation

Performs stochastic gradient ascent by taking a random sample from the training dataset per iteration

In [102]:
from numpy.random import randint, sample
from scipy.sparse import csr, random
from scipy.special import expit
from sklearn.utils.fixes import scipy
from scipy.sparse import csr_matrix

def stochastic_regression_map(lamb: float, X: csr_matrix, y: np.ndarray, l_rate: float, batch_size: int, max_iters: int):
    
    converged = False

    #extract dimensions of X 
    num_samples, num_features = X.shape #[N x P]

    #initialize beta as 0 coefficients for features
    beta = np.zeros((num_features, 1))

    #use max_iters to control number of gradient steps
    num_iters = 0

    #array to keep track and plot losses during SGD training
    sgd_losses = []

    while (not converged) and (num_iters < max_iters):
        #fetch one random sample of data -> turn to minibatch
        mini_grads = []
        mini_obj = []

        # for batch in range(batch_size):
        sample_idx = randint(0, num_samples)
        y_sample = y[sample_idx].item() #[1 x 1]
        X_sample = X[sample_idx].toarray() #[1 x P]

        #calculate predicted expectation -> logistic_fn(beta * x_i)
        linear = np.matmul(X_sample, beta).item()
        y_pred = expit(linear)

        #calculate objective fn to store in sgd_losses
        obj = y_sample*np.log(y_pred) + (1 - y_sample)*np.log(1 - y_pred)

        #calculate gradient (regularization + grad scaled up by num_samples)
        grad = np.transpose(lamb*beta) + num_samples*(float(y_sample) - y_pred)*X_sample

        #perform gradient step
        beta = beta - l_rate*np.transpose(grad)
        num_iters += 1
        sgd_losses.append(-obj)
        
    
    #gradient ascent finished, return coefficient
    return (beta, sgd_losses)

In [103]:
#perform stochastic gradient ascent

beta, sgd_losses = stochastic_regression_map(
    lamb=0.1, 
    X=X_train, 
    y=y_train, 
    l_rate=0.0001, 
    batch_size=1,
    max_iters=6)

print(sgd_losses)

[0.6931471805599453, nan, nan, nan, inf, 560.8236093987434]


  obj = y_sample*np.log(y_pred) + (1 - y_sample)*np.log(1 - y_pred)
  obj = y_sample*np.log(y_pred) + (1 - y_sample)*np.log(1 - y_pred)


In [9]:
np.zeros(5) + 0.01*np.ones(5)

array([0.01, 0.01, 0.01, 0.01, 0.01])