In [1]:
import json
import string
import numpy as np
import math
import sys
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from functools import lru_cache

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [74]:
# utility
# Translation table for str.translate: every ASCII digit and punctuation
# character maps to None, i.e. is deleted from the translated string.
digits_and_punctuation = str.maketrans(dict.fromkeys(string.digits + string.punctuation))

def json_writer(data, fname):
    """
        Write records to one file in JSON Lines form (one JSON document per line)
        Args:
            data: list(dict): records to serialise, one per output line
            fname: str: output file name (overwritten if it exists)
    """
    with open(fname, mode="w") as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in data)


def json_reader(fname):
    """
        Lazily read a JSON Lines file (one JSON document per line)
        Args:
            fname: str: input file
        Returns:
            generator: iterator over the decoded documents, in file order
    """
    # The context manager guarantees the handle is released when the generator
    # is exhausted or closed, instead of lingering until garbage collection.
    with open(fname, mode="r") as fp:
        for line in fp:
            # Whitespace-only lines (e.g. a trailing blank line) carry no
            # document and would make json.loads raise, so skip them.
            if line.strip():
                yield json.loads(line)


def _stem(doc, p_stemmer, en_stop, return_tokens):
    """
        Lower-case and tokenise a document, drop stop tokens, stem the rest
        Args:
            doc: str: raw document text
            p_stemmer: callable: maps one token to its stem
            en_stop: container of tokens to discard before stemming
            return_tokens: bool: truthy for a list of stems, falsy for one
                space-joined string
        Returns:
            list(str)/str: the stems, in document order
    """
    stems = [
        p_stemmer(token)
        for token in word_tokenize(doc.lower())
        if token not in en_stop
    ]
    return stems if return_tokens else ' '.join(stems)

def _stem3(doc, p_stemmer, en_stop, return_tokens):
    """
        Variant of _stem that also strips digits and punctuation from tokens
        Args:
            doc: str: raw document text
            p_stemmer: callable: maps one token to its stem
            en_stop: container of tokens to discard (checked before stripping)
            return_tokens: bool: truthy for a list of stems, falsy for one
                space-joined string
        Returns:
            list(str)/str: the stems, in document order
    """
    # Stop-word filtering happens on the raw token; characters listed in
    # digits_and_punctuation are removed afterwards.
    stripped = (
        token.translate(digits_and_punctuation)
        for token in word_tokenize(doc.lower())
        if token not in en_stop
    )
    # Tokens that became empty after stripping are dropped before stemming.
    stems = [p_stemmer(token) for token in stripped if token]
    return stems if return_tokens else ' '.join(stems)


def getStemmedDocuments(docs, return_tokens=True):
    """
        Drop stopwords and punctuation tokens, then Porter-stem what remains
        Args:
            docs: str/list(str): one document, or a list of documents
            return_tokens: bool: True for a list of stems per document,
                False for the stems re-joined with single spaces
        Returns:
            the processed document (list(str) or str), or a list with one such
            entry per input document when docs is a list
        Example:
            new_text = ("It is important to by very pythonly while you are pythoning with python. "
                        "All pythoners have pythoned poorly at least once.")
            print(getStemmedDocuments(new_text))
        Reference: https://pythonprogramming.net/stemming-nltk-tutorial/
    """
    # NLTK English stopwords, every single punctuation character, and two
    # contraction fragments produced by the tokenizer.
    en_stop = set(stopwords.words('english')) | set(string.punctuation) | {"n't", "'s"}

    # Memoise the stemmer so each distinct token is stemmed only once per call.
    p_stemmer = lru_cache(maxsize=None)(PorterStemmer().stem)

    if not isinstance(docs, list):
        return _stem(docs, p_stemmer, en_stop, return_tokens)

    output_docs = []
    for doc_count, item in enumerate(docs):
        # Progress marker every 50000 documents.
        if doc_count % 50000 == 0:
            print("doc", doc_count)
        output_docs.append(_stem(item, p_stemmer, en_stop, return_tokens))
    return output_docs

def indicator(exp):
    """Indicator function: 1 when `exp` is truthy, otherwise 0."""
    return 1 if exp else 0

In [36]:
#idf
train_json = "C:/IITD/sem5/col774-ml/datasets/col774_yelp_data/col774_yelp_data/train.json"
test_json = "C:/IITD/sem5/col774-ml/datasets/col774_yelp_data/col774_yelp_data/test.json"

vocabulary = {}  # stemmed word -> 1-based integer id, in order of first appearance
idf = {}         # word id -> number of training documents containing that word

x = []     # per document: list of word ids, one per token
y = []     # per document: star rating as int
docs = []  # raw review texts, replaced by token lists after stemming

for review in json_reader(train_json):
    y.append(int(review["stars"]))
    docs.append(review["text"])

docs = getStemmedDocuments(docs)
for i, doc in enumerate(docs):
    if i % 50000 == 0:
        print("train", i, len(vocabulary))
    xi = []
    unique_words_in_doc = set()
    for word in doc:
        # An unseen word receives the next free id; a known word keeps its id.
        word_id = vocabulary.setdefault(word, len(vocabulary) + 1)
        xi.append(word_id)
        # Count each word at most once per document.
        if word not in unique_words_in_doc:
            unique_words_in_doc.add(word)
            idf[word_id] = idf.get(word_id, 0) + 1
    x.append(xi)

doc 0
doc 50000
doc 100000
doc 150000
doc 200000
doc 250000
doc 300000
doc 350000
doc 400000
doc 450000
doc 500000
train 0 0
train 50000 49698
train 100000 73107
train 150000 92562
train 200000 109577
train 250000 125339
train 300000 140057
train 350000 153935
train 400000 167009
train 450000 179645
train 500000 192098


In [75]:
# bigram
train_json = "C:/IITD/sem5/col774-ml/datasets/col774_yelp_data/col774_yelp_data/train.json"
test_json = "C:/IITD/sem5/col774-ml/datasets/col774_yelp_data/col774_yelp_data/test.json"

vocabulary = {}  # (token, next token) -> 1-based integer id
idf = {}         # reset here and left empty by this cell

x = []     # per document: list of bigram ids
y = []     # per document: star rating as int
docs = []  # raw review texts, replaced by token lists after stemming

for review in json_reader(train_json):
    y.append(int(review["stars"]))
    docs.append(review["text"])

docs = getStemmedDocuments(docs)
for i, doc in enumerate(docs):
    if i % 50000 == 0:
        print("train", i, len(vocabulary))
    xi = []

    # Pair every token with its successor; documents with fewer than two
    # tokens yield no bigrams.
    for bg in zip(doc, doc[1:]):
        xi.append(vocabulary.setdefault(bg, len(vocabulary) + 1))

    x.append(xi)

doc 0
doc 50000
doc 100000
doc 150000
doc 200000
doc 250000
doc 300000
doc 350000
doc 400000
doc 450000
doc 500000
train 0 0
train 50000 1208884
train 100000 2004812
train 150000 2672199
train 200000 3264531
train 250000 3809075
train 300000 4319510
train 350000 4794161
train 400000 5245203
train 450000 5681575
train 500000 6100473


In [37]:
x_test = []     # per test document: list of known word ids
y_test = []     # per test document: star rating as int
docs_test = []  # raw review texts, replaced by token lists after stemming

for review in json_reader(test_json):
    y_test.append(int(review["stars"]))
    docs_test.append(review["text"])

docs_test = getStemmedDocuments(docs_test)
for i, doc in enumerate(docs_test):
    if i % 50000 == 0:
        print("test", i)
    # Words missing from the training vocabulary are dropped.
    xi = [vocabulary[word] for word in doc if word in vocabulary]

    x_test.append(xi)

doc 0
doc 50000
doc 100000
test 0
test 50000
test 100000


In [76]:
x_test = []     # per test document: list of known bigram ids
y_test = []     # per test document: star rating as int
docs_test = []  # raw review texts, replaced by token lists after stemming

for review in json_reader(test_json):
    y_test.append(int(review["stars"]))
    docs_test.append(review["text"])

docs_test = getStemmedDocuments(docs_test)
for i, doc in enumerate(docs_test):
    if i % 50000 == 0:
        print("test", i)
    # Bigrams missing from the training vocabulary are dropped.
    xi = [vocabulary[bg] for bg in zip(doc, doc[1:]) if bg in vocabulary]

    x_test.append(xi)

doc 0
doc 50000
doc 100000
test 0
test 50000
test 100000


In [55]:
# tri-grams
# Re-encodes the already stemmed `docs` / `docs_test` token lists as ids of
# three consecutive tokens.
vocabulary = {}
x = []

for i, doc in enumerate(docs):
    if i % 50000 == 0:
        print("train", i, len(vocabulary))
    xi = []
    for word in zip(doc, doc[1:], doc[2:]):
        xi.append(vocabulary.setdefault(word, len(vocabulary) + 1))
    x.append(xi)

x_test = []

# NOTE(review): trigrams first seen in the test documents are ADDED to
# `vocabulary` here, whereas the unigram and bigram test cells skip unseen
# entries. This enlarges V with test-only items - confirm it is intended.
for i, doc in enumerate(docs_test):
    if i % 50000 == 0:
        print("test", i, len(vocabulary))
    xi = []
    for word in zip(doc, doc[1:], doc[2:]):
        xi.append(vocabulary.setdefault(word, len(vocabulary) + 1))

    x_test.append(xi)

train 0 0
train 50000 2512663
train 100000 4835822
train 150000 7046756
train 200000 9187084
train 250000 11277370
train 300000 13323260
train 350000 15317962
train 400000 17277846
train 450000 19225713
train 500000 21142332
test 0 22453011
test 50000 24313219
test 100000 26157342


In [77]:
m = len(y)            # number of training documents
m_test = len(y_test)  # number of test documents

def _idf(word):
    """
        Smoothed inverse document frequency of one vocabulary entry
        Args:
            word: int: vocabulary id; must be a key of the `idf` dict
        Returns:
            float: log(m / (1 + number of training documents containing it))
    """
    # `idf` is the per-id document-count dict filled by the unigram training
    # cell. Reading it directly avoids relying on a separate `df` alias that
    # is never assigned in the notebook.
    return np.log(m/(1 + idf[word]))

# print(_idf(1))

# _idfs = np.array([_idf(word) for word in df.keys()])
# print(len(vocabulary), len(df))
# list(df.keys())
# _idfs = _idfs / np.linalg.norm(_idfs)
# print(np.min(_idfs), np.max(_idfs))

In [40]:
# Inverse document frequency of every word id, computed here so the cell does
# not depend on a variable left over from an earlier kernel session.
# `idf` gains a key the first time a word is seen, which is also when
# `vocabulary` hands out the next id, so position p belongs to word id p + 1.
# Requires the unigram "#idf" cell to be the one that populated `vocabulary`
# and `idf`.
_idfs = np.array([np.log(m / (1 + count)) for count in idf.values()])

mini = np.argmin(_idfs) + 1  # id of the word with the lowest idf
maxi = np.argmax(_idfs) + 1  # id of the word with the highest idf

# Reverse lookup: print the word whose id has the highest idf.
for key, val in vocabulary.items():
    if val == maxi:
        print(key)

sambusk


In [78]:
r = 5 # number of classes
V = len(vocabulary) # number of distinct ids in whichever vocabulary was built last

# y takes values in {1, 2, ..., r}; parameterized by phi
# x takes values in {1, 2, ..., V}; parameterized by theta
# Bare expression so the notebook displays the vocabulary size.
V

6379340

In [79]:
# evaluate phi
# phi[k] = fraction of training documents whose label is k + 1.
phi = np.bincount(np.asarray(y, dtype=np.intp) - 1, minlength=r) / m

# evaluate theta
# theta_numerator[l][k]: occurrences of feature id l + 1 in documents of class k + 1.
# theta_denominator[k]:  total number of features in documents of class k + 1.
theta_numerator = np.zeros((V, r))
theta_denominator = np.zeros((r))
for i in range(m):
    if i % 50000 == 0:
        print(i)
    ni = len(x[i])
    k = y[i] - 1
    for j in range(ni):
        l = x[i][j] - 1
        theta_numerator[l, k] += 1
    theta_denominator[k] += ni

# Laplace (add-one) smoothing, applied to the whole table at once:
#   theta[l][k] = (theta_numerator[l][k] + 1) / (theta_denominator[k] + V)
# The (r,) denominator broadcasts across the V rows. The division is done in
# place so only one extra (V, r) array is allocated.
theta = theta_numerator + 1
theta /= theta_denominator + V

0
50000
100000
150000
200000
250000
300000
350000
400000
450000
500000
theta 0
theta 500000
theta 1000000
theta 1500000
theta 2000000
theta 2500000
theta 3000000
theta 3500000
theta 4000000
theta 4500000
theta 5000000
theta 5500000
theta 6000000


In [71]:
# Sanity check: the smoothed probabilities of one class column should sum to 1.
theta[:, 3].sum()

0.999999999999999

In [72]:
def predict(x, phi, theta):
    """
        Classify one document with the multinomial Naive Bayes parameters
        Args:
            x: list(int): 1-based feature ids of the document's tokens
            phi: array of r class priors; phi[k] belongs to label k + 1
            theta: array of shape (V, r); theta[l][k] is the probability of
                feature id l + 1 under label k + 1
        Returns:
            int: most probable class label in 1..r (the lowest label wins ties)
    """
    # The number of classes is taken from the parameters themselves, so the
    # function does not rely on any notebook-level variable.
    token_rows = np.asarray(x, dtype=np.intp) - 1
    # log P(y = k) + sum_i log P(x_i | y = k) for all classes at once; an
    # empty document leaves just the log priors.
    log_posterior = np.log(phi) + np.log(theta[token_rows]).sum(axis=0)
    return int(np.argmax(log_posterior)) + 1

In [80]:
# test set accuracy

# The builtin int is used as the dtype: the np.int alias no longer exists in
# current NumPy releases.
test_predictions = np.zeros(m_test, int)
test_count = 0
for i in range(m_test):
    if i % 50000 == 0:
        print("test", i)
    test_predictions[i] = predict(x_test[i], phi, theta)
    test_count += indicator(test_predictions[i] == y_test[i])

print("Test set accuracy", (test_count / m_test) * 100)
# Test set accuracy 63.10294799503433
# Test set accuracy 63.10294799503433

test 0
test 50000
test 100000
Test set accuracy 63.297387038394234


In [None]:
# random prediction and majority prediction

# np.bincount is indexed by the label value itself, so the argmax is already
# the most frequent training label (no +1 shift needed).
majority_k = np.argmax(np.bincount(y)) # max occuring class

random_count = 0
majority_count = 0
for i in range(len(y_test)):
    # Labels are 1..r; randint's upper bound is exclusive, so draw from
    # [1, r + 1) to make every class (and only valid classes) guessable.
    random_count += indicator(y_test[i] == np.random.randint(1, r + 1))
    majority_count += indicator(y_test[i] == majority_k)

print("Random prediction accuracy", (random_count / m_test) * 100)
print("Majority prediction accuracy", (majority_count / m_test) * 100)

In [45]:
# confusion matrix
# Rows are predicted labels, columns are actual labels (both shifted to 0-based).
# The builtin int is used as the dtype: the np.int alias no longer exists in
# current NumPy releases.
confusion_matrix = np.zeros((r, r), int)

for i in range(m_test):
    confusion_matrix[test_predictions[i]-1][y_test[i]-1] += 1

print(confusion_matrix)

[[13494  2489  1215  1027  2954]
 [ 4860  4161  2352  1149   660]
 [  984  2843  5369  3507  1035]
 [  362   903  4490 16134 14421]
 [  469   442  1105  7541 39752]]
