In [11]:
import json
import string
import numpy as np
import math
import sys
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from functools import lru_cache

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# utility
table = str.maketrans('', '', string.punctuation) # puctuation table

def _stem(doc, p_stemmer, en_stop, return_tokens):
    tokens = word_tokenize(doc.lower())
    stopped_tokens = filter(lambda token: token not in en_stop, tokens)
    stemmed_tokens = map(lambda token: p_stemmer(token), stopped_tokens)
    if not return_tokens:
        return ' '.join(stemmed_tokens)
    return list(stemmed_tokens)


def getStemmedDocuments(docs, return_tokens=True):
    """
        Args:
            docs: str/list(str): document or list of documents that need to be processed
            return_tokens: bool: return a re-joined string or tokens
        Returns:
            str/list(str): processed document or list of processed documents
        Example: 
            new_text = "It is important to by very pythonly while you are pythoning with python.
                All pythoners have pythoned poorly at least once."
            print(getStemmedDocuments(new_text))
        Reference: https://pythonprogramming.net/stemming-nltk-tutorial/
    """
    en_stop = set(stopwords.words('english'))
    ps = PorterStemmer()
    p_stemmer = lru_cache(maxsize=None)(ps.stem) # use this function to stem
    if isinstance(docs, list):
        output_docs = []
        for item in docs:
            output_docs.append(_stem(item, p_stemmer, en_stop, return_tokens))
        return output_docs
    else:
        return _stem(docs, p_stemmer, en_stop, return_tokens)

def indicator(exp):
    if exp:
        return 1
    else:
        return 0

In [6]:
vocabulary = {}

# load training data
with open("C:/IITD/sem5/col774-ml/col774_yelp_data/col774_yelp_data/train.json") as train_file:
    m = sum(1 for line in train_file)
    y = np.zeros(m, np.int)
    x = []

with open("C:/IITD/sem5/col774-ml/col774_yelp_data/col774_yelp_data/train.json") as train_file:
    i = -1
    for line in train_file:
        i += 1
        if i % 50000 == 0:
            print("train", i, len(vocabulary))
        review = json.loads(line)
        y[i] = int(review["stars"])
        
        xi = []
        for word in review["text"].split():
            word = word.translate(table).lower()
            if word:
                if word not in vocabulary:
                    vocabulary[word] = len(vocabulary) + 1
                xi.append(vocabulary[word])
        x.append(xi)

# load test data
with open("C:/IITD/sem5/col774-ml/col774_yelp_data/col774_yelp_data/test.json") as test_file:
    m_test = sum(1 for line in test_file)
    y_test = np.zeros(m_test, np.int)
    x_test = []

with open("C:/IITD/sem5/col774-ml/col774_yelp_data/col774_yelp_data/test.json") as test_file:
    i = -1
    for line in test_file:
        i += 1
        if i % 50000 == 0:
            print("test", i, len(vocabulary))
        review = json.loads(line)
        y_test[i] = int(review["stars"])
        
        xi = []
        for word in review["text"].split():
            word = word.translate(table).lower()
            if word:
                if word not in vocabulary:
                    vocabulary[word] = len(vocabulary) + 1
                xi.append(vocabulary[word])
        x_test.append(xi)

train 0 0
train 50000 79624
train 100000 119077
train 150000 152151
train 200000 181127
train 250000 208287
train 300000 233418
train 350000 257386
train 400000 280159
train 450000 302198
train 500000 323631
test 0 337998
test 50000 357942
test 100000 377337


In [7]:
r = 5 # number of classes
V = len(vocabulary)

# y takes values in {1, 2, ..., r}
# x takes values in {1, 2, ..., V}

In [8]:
# evaluate phi
phi = np.zeros(r)
for i in range(m):
    k = y[i] - 1
    phi[k] += 1
phi = phi / m

# evaluate theta
theta_numerator = np.zeros((V, r))
theta_denominator = np.zeros((r))
for i in range(m):
    if i % 50000 == 0:
        print(i)
    ni = len(x[i])
    k = y[i] - 1
    for j in range(ni):
        l = x[i][j] - 1
        theta_numerator[l][k] += 1
    theta_denominator[k] += ni
    

theta = np.zeros((V, r))
for j in range(V):
    for k in range(r):
        theta[j][k] = (theta_numerator[j][k] + 1) / (theta_denominator[k] + V)

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000


In [9]:
def predict(x, phi, theta):
    """
        Returns: Most expected class label
    """
    max_prob = -math.inf
    argmax_k = -1
    for k in range(r):
        n = len(x)
        summation = 0
        for i in range(n):
            l = x[i] - 1
            summation += np.log(theta[l][k])
        prob_k = summation + np.log(phi[k])
        if prob_k > max_prob:
            max_prob = prob_k
            argmax_k = k
    return argmax_k + 1

In [45]:
# train set accuracy

train_count = 0
for i in range(m):
    if i % 50000 == 0:
        print("train", i)
    train_count += indicator(predict(x[i], phi, theta) == y[i])

print("Train set accuracy", (train_count / m) * 100)

train 0
train 50000
train 100000
train 150000
train 200000
train 250000
train 300000
train 350000
train 400000
train 450000
train 500000
Train set accuracy 0.6463939035881482


In [10]:
# test set accuracy

test_predictions = np.zeros(m_test, np.int)
test_count = 0
for i in range(m_test):
    if i % 50000 == 0:
        print("test", i)
    test_predictions[i] = predict(x_test[i], phi, theta)
    test_count += indicator(test_predictions[i] == y_test[i])

print("Test set accuracy", (test_count / m_test) * 100)

test 0
test 50000
test 100000
Test set accuracy 60.525134985566645


In [34]:
# random prediction and majority prediction

majority_k = np.argmax(np.bincount(y)) # max occuring class

random_count = 0
majority_count = 0
for i in range(len(y_test)):
    random_count += indicator(y_test[i] == np.random.randint(r))
    majority_count += indicator(y_test[i] == majority_k)

print("Random prediction accuracy", (random_count / m_test) * 100)
print("Majority prediction accuracy", (majority_count / m_test) * 100)

5
Random prediction accuracy 0.11215393589494309
Majority prediction accuracy 0.439895900327555


In [16]:
# confusion matrix
confusion_matrix = np.zeros((r, r), np.int)

for i in range(m_test):
    confusion_matrix[test_predictions[i]-1][y_test[i]-1] += 1

print(confusion_matrix)

[[14751  3018  1461  1115  3046]
 [ 3173  2758  1316   490   215]
 [ 1204  3154  4468  1820   387]
 [  626  1470  6276 18523 14741]
 [  415   438  1010  7410 40433]]
