In [45]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import sys
import re
from operator import itemgetter

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams

# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
# from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score



In [2]:

def _stem(doc, p_stemmer, en_stop, return_tokens):
    tokens = word_tokenize(doc.lower())
    stopped_tokens = filter(lambda token: token not in en_stop, tokens)
    stemmed_tokens = map(lambda token: p_stemmer.stem(token), stopped_tokens)
    if not return_tokens:
        return ' '.join(stemmed_tokens)
    return list(stemmed_tokens)

def getStemmedDocuments(docs, return_tokens=False):
    """
        Args:
            docs: str/list(str): document or list of documents that need to be processed
            return_tokens: bool: return a re-joined string or tokens
        Returns:
            str/list(str): processed document or list of processed documents
        Example: 
            new_text = "It is important to by very pythonly while you are pythoning with python. \
                All pythoners have pythoned poorly at least once."
            print(getStemmedDocuments(new_text))
        Reference: https://pythonprogramming.net/stemming-nltk-tutorial/
    """
    en_stop = set(stopwords.words('english'))
    ps = PorterStemmer()
    if isinstance(docs, list):
        output_docs = []
        for item in docs:
            output_docs.append(_stem(item, ps, en_stop, return_tokens))
        return output_docs
    else:
        return _stem(docs, ps, en_stop, return_tokens)

In [6]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
LaplaceSmoothing = 1
BASE_DIR = "../"
train_path = os.path.join(BASE_DIR, 'data', 'reviews_Digital_Music_5.json', 'Music_Review_train.json')
test_path  = os.path.join(BASE_DIR, 'data', 'reviews_Digital_Music_5.json', 'Music_Review_test.json')

In [4]:
def load_data(filename):
    return pd.read_json(filename, lines=True)

In [5]:
def clean_data(line):
    line = line.strip().lower()
    line = re.sub(r'[^\w\s]','',line)
    line = re.sub('\r?\n',' ',line)
    return line

def preprocessing(df, stemming):
    ColumnsToDrop = ['reviewerID', 'asin', 'reviewerName', 'unixReviewTime', 'reviewTime']
    df = df.drop(ColumnsToDrop, axis=1)
    
    df['reviewText'] = df['reviewText'].apply(lambda x: clean_data(x))
    df['summary']    = df['summary'].apply(lambda x: clean_data(x))
    
    if stemming:
        df['reviewText'] = df['reviewText'].apply(lambda x: getStemmedDocuments(x))
        df['summary']    = df['summary'].apply(lambda x: getStemmedDocuments(x))
    return df

In [6]:
def Vocab_generation(df1, df2):
    VocabSize = 0
    Vocab = dict()
    
    min_val = min(df2)
    max_val = max(df2)
    
    VocabClassSize = [0]*(max_val - min_val + 1)
    ExampleSize    = [0]*(max_val - min_val + 1)    
        
    for i,review in enumerate(df1):
        for word in review.split():
            if len(word) > 0 and word not in Vocab:
                Vocab[word] = 1
                VocabSize  += 1

    VocabClass = []
    for i in range(max_val - min_val + 1):
        d = Vocab.copy()
        VocabClass.append(d)
    
    for i,review in enumerate(df1):
        ExampleSize[df2[i]-min_val] += 1
        for word in review.split():
            if len(word) > 0:
                VocabClassSize[df2[i]-min_val] += 1
                VocabClass[df2[i]-min_val][word] += 1
    
    return Vocab, VocabClass, VocabSize, VocabClassSize, ExampleSize

In [72]:
part_d = False

train_data = preprocessing(load_data(train_path), part_d)
test_data  = preprocessing(load_data(test_path), part_d)

X_train = train_data['reviewText'].copy()
Y_train = train_data['overall'].copy()

X_test = test_data['reviewText'].copy()
Y_test = test_data['overall'].copy()

In [41]:
Vocab, VocabClass, VocabSize, VocabClassSize, phi = Vocab_generation(X_train, Y_train)

In [42]:
# v1, v2, v3, v4, v5 = Vocab, VocabClass, VocabSize, VocabClassSize, phi
# print(phi)
# print(VocabClassSize)
# # print(VocabClass)

[2529, 2638, 5634, 13267, 25932]
[327247, 468906, 1178141, 2886449, 4931491]


In [11]:
# [2529, 2638, 5634, 13267, 25932]
# [328626, 470556, 1183227, 2902715, 4954673]
# [328626, 470556, 1183227, 2902715, 4954673]

# for i in range(len(VocabClass)):
#     for x in VocabClass[i]:
#         if(VocabClass[i][x] <= 0):
#             print("ERROR",i,x)
#         else:
#             np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))
# print("DONE")

DONE


In [43]:
for i in range(len(VocabClass)):
    for x in VocabClass[i]:
        VocabClass[i][x] = np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))
    
for i in range(len(VocabClass)):
    phi[i] = np.log(float(phi[i])/float(len(X_train)))

In [7]:
def predict(X,Y,phi,VocabClass,VocabClassSize):
    Y_pred = [0]*(Y.shape[0])
    count = 0
    minval = min(Y)
    
    for i,review in enumerate(X):
        class_prob = [0.0]*len(VocabClass)

        for word in review.split():
            for j in range(len(VocabClass)):
                if word in VocabClass[j]:
                    class_prob[j] += VocabClass[j][word]
                else:
                    class_prob[j] += (float(LaplaceSmoothing)/float(VocabSize))
                
        for j in range(len(VocabClass)):
            class_prob[j] += phi[j]
            
        # class_label = max(enumerate(class_prob, key=lambda x: x[1]))[0] + min(Y)
        class_label, element = max(enumerate(class_prob), key=itemgetter(1))
        class_label += minval
        
        Y_pred[i]   = class_label
        
        if class_label == Y[i]:
            count+=1
        
        # print(class_prob, element, "Predicted:", class_label, "Correct: ", Y[i])
        # print(i)
        
    return count, Y_pred

In [58]:
def f1_score(A,B):
    conf_mat = np.zeros((5,5));
    for i in range(B.shape[0]):
        conf_mat[int(B[i]-1)][int(A[i]-1)] += 1;
    
    precision = np.zeros(5);
    recall = np.zeros(5);
    f1 = np.zeros(5);

    for i in range(5):
        precision[i] = conf_mat[i,i]/(np.sum(conf_mat, axis = 0)[i])
        recall[i] = conf_mat[i,i]/(np.sum(conf_mat, axis = 1)[i])
        f1[i] = (2*precision[i]*recall[i])/(precision[i]+recall[i]);

    return f1

In [None]:

correct, Y_pred_train = predict(X_train, Y_train, phi, VocabClass, VocabClassSize)
print("Train Set Accuracy = {}%".format(round(100*float(correct)/float(len(X_train)),2)))
# print("Macro F1 score =", f1_score(Y_pred_train, Y_train))

correct, Y_pred_test = predict(X_test, Y_test, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy  = {}%".format(round(100*float(correct)/float(len(X_test)),2)))
# print("Macro F1 score =", f1_score(Y_pred_test, Y_test))

In [46]:
# Part B

# Random Guessing
def random_guessing(Y_test):
    Y_pred = np.zeros(len(Y_test))
    count = 0
    for i in range (Y_test.shape[0]):
        class_label = np.random.randint(1,6)
        Y_pred[i]   = class_label
        if(class_label == Y_test[i]):
            count+=1
    return count, Y_pred

correct, Y_pred = random_guessing(Y_test)
print("Random Prediction Accuracy (Test Set) = {}%".format(round(100*float(correct)/float(len(X_test)),2)))

# Majority Prediction
def majority_prediction(Y_test):
    Y_pred = np.zeros(len(Y_test))
    count = 0
    class_label, element = max(enumerate(phi), key=itemgetter(1))
    class_label += 1
    
    for i in range (Y_test.shape[0]):        
        Y_pred[i]   = class_label
        if(class_label == Y_test[i]):
            count+=1
    return count, Y_pred

correct, Y_pred = majority_prediction(Y_test)
print("Majority Prediction Accuracy (Test Set) = {}%".format(round(100*float(correct)/float(len(X_test)),2)))

Random Prediction Accuracy (Test Set) = 20.79%
Majority Prediction Accuracy (Test Set) = 66.09%


In [47]:
# Part C
correct, Y_pred_test = predict(X_test, Y_test, phi, VocabClass, VocabClassSize)

In [48]:
confusion_matrix = np.zeros((5,5))
for i in range(len(Y_test)):
    confusion_matrix[Y_test[i]-1][Y_pred_test[i]-1] += 1
print(confusion_matrix.astype(int))

[[   2    0    7   36  183]
 [   0    0    7  102  217]
 [   1    0    4  349  732]
 [   2    0    1  462 2643]
 [   7    1    7  397 8840]]


In [14]:
# Part D
part_d = True
train_data = preprocessing(load_data(train_path), part_d)
test_data  = preprocessing(load_data(test_path), part_d)

X_train = train_data['reviewText'].copy()
Y_train = train_data['overall'].copy()

X_test = test_data['reviewText'].copy()
Y_test = test_data['overall'].copy()

In [13]:
Vocab, VocabClass, VocabSize, VocabClassSize, phi = Vocab_generation(X_train, Y_train)

In [None]:
for i in range(len(VocabClass)):
    for x in VocabClass[i]:
        VocabClass[i][x] = np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))

for i in range(len(VocabClass)):
    phi[i] = np.log(float(phi[i])/float(len(X_train)))

In [16]:
correct, Y_pred_train = predict(X_train, Y_train, phi, VocabClass, VocabClassSize)
print("Train Set Accuracy (stemmed data) = {}%".format(round(100*float(correct)/float(len(X_train)),5)))

correct, Y_pred_test = predict(X_test, Y_test, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy (stemmed data)  = {}%".format(round(100*float(correct)/float(len(X_test)),5)))

Train Set Accuracy (stemmed data) = 69.704%
Test Set Accuracy (stemmed data)  = 66.2%


In [37]:
# Part E
# Bigrams

def bigrams(X):
    for i in range(X.shape[0]):
        review = X[i]
        token  = nltk.word_tokenize(review)
        bigrams = list(ngrams(token,2))
        bigrams = list(map(lambda x: "_".join(x), bigrams))
        X[i] = " ".join(bigrams)
    return X

X_train_b = bigrams(X_train.copy())
X_test_b  = bigrams(X_test.copy())

In [38]:
Vocab, VocabClass, VocabSize, VocabClassSize, phi = Vocab_generation(X_train_b, Y_train)

In [39]:
for i in range(len(VocabClass)):
    for x in VocabClass[i]:
        VocabClass[i][x] = np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))

for i in range(len(VocabClass)):
    phi[i] = np.log(float(phi[i])/float(len(X_train)))

In [40]:
correct, Y_pred_train = predict(X_train_b, Y_train, phi, VocabClass, VocabClassSize)
print("Train Set Accuracy (bigram data) = {}%".format(round(100*float(correct)/float(len(X_train_b)),5)))

correct, Y_pred_test = predict(X_test_b, Y_test, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy (bigram data)  = {}%".format(round(100*float(correct)/float(len(X_test_b)),5)))

Train Set Accuracy (bigram data) = 96.236%
Test Set Accuracy (bigram data)  = 66.61429%


In [56]:
# print(Y_test.shape, len(Y_pred_test))

In [67]:
# print(Y_pred_test.dtype)
from sklearn.metrics import f1_score
print(f1_score(np.array(Y_pred_test),Y_test,average=None))
print(f1_score(np.array(Y_pred_test),Y_test,average='macro'))

[0.00873362 0.         0.00910747 0.12780749 0.80337948]
0.18980561268791482


In [32]:
# Part E
# Skip grams
from nltk.util import skipgrams
def gen_skipgrams(X,skip_dist=1):
    for i in range(X.shape[0]):
        review = X[i].split()
        sg = list(skipgrams(review,2,skip_dist))
        sg = list(map(lambda x: "_".join(x), sg))
        X[i] = " ".join(sg)
    return X

X_train_s = gen_skipgrams(X_train.copy())
X_test_s  = gen_skipgrams(X_test.copy())

In [33]:
Vocab, VocabClass, VocabSize, VocabClassSize, phi = Vocab_generation(X_train_s, Y_train)

In [34]:
for i in range(len(VocabClass)):
    for x in VocabClass[i]:
        VocabClass[i][x] = np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))

for i in range(len(VocabClass)):
    phi[i] = np.log(float(phi[i])/float(len(X_train)))

In [35]:
print(VocabClassSize,VocabSize)

[347214, 496841, 1260629, 3132210, 5319120] 4475388


In [36]:
correct1, Y_pred_train = predict(X_train_s, Y_train, phi, VocabClass, VocabClassSize)
print("Train Set Accuracy (skip-gram data) = {}%".format(round(100*float(correct1)/float(len(X_train_s)),5)))

correct2, Y_pred_test = predict(X_test_s, Y_test, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy (skip-gram data)  = {}%".format(round(100*float(correct2)/float(len(X_test_s)),5)))

Train Set Accuracy (bigram data) = 96.232%
Test Set Accuracy (bigram data)  = 66.45714%


In [73]:
X_train_summary = train_data['summary'].copy()
Y_train_summary = train_data['overall'].copy()

X_test_summary = test_data['summary'].copy()
Y_test_summary = test_data['overall'].copy()

In [74]:
Vocab, VocabClass, VocabSize, VocabClassSize, phi = Vocab_generation(X_train_summary, Y_train_summary)

In [75]:
for i in range(len(VocabClass)):
    for x in VocabClass[i]:
        VocabClass[i][x] = np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))

for i in range(len(VocabClass)):
    phi[i] = np.log(float(phi[i])/float(len(X_train_summary)))

In [76]:
correct, Y_pred_train = predict(X_train_summary, Y_train_summary, phi, VocabClass, VocabClassSize)
print("Train Set Accuracy (stemmed data) = {}%".format(round(100*float(correct)/float(len(X_train_summary)),5)))

correct, Y_pred_test = predict(X_test_summary, Y_test_summary, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy (stemmed data)  = {}%".format(round(100*float(correct)/float(len(X_test_summary)),5)))

Train Set Accuracy (stemmed data) = 66.81%
Test Set Accuracy (stemmed data)  = 67.27143%


In [78]:
X_train_bi = gen_skipgrams(X_train_summary.copy())
X_test_bi  = gen_skipgrams(X_test_summary.copy())

In [79]:
Vocab, VocabClass, VocabSize, VocabClassSize, phi = Vocab_generation(X_train_bi, Y_train_summary)

In [80]:
for i in range(len(VocabClass)):
    for x in VocabClass[i]:
        VocabClass[i][x] = np.log(float(VocabClass[i][x])/float(VocabClassSize[i] + VocabSize))

for i in range(len(VocabClass)):
    phi[i] = np.log(float(phi[i])/float(len(X_train_bi)))

In [81]:
correct, Y_pred_train_bi = predict(X_train_bi, Y_train_summary, phi, VocabClass, VocabClassSize)
print("Train Set Accuracy (bigram data) = {}%".format(round(100*float(correct)/float(len(X_train_bi)),5)))

correct, Y_pred_test_bi = predict(X_test_bi, Y_test_summary, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy (bigram data)  = {}%".format(round(100*float(correct)/float(len(X_test_bi)),5)))

Train Set Accuracy (bigram data) = 84.664%
Test Set Accuracy (bigram data)  = 66.80714%


In [60]:
# Train Set Accuracy (bigram data) = 95.592%
# Test Set Accuracy (bigram data)  = 66.32857%
# [517029, 741306, 1882494, 4678420, 7939810] 6369181
print(VocabClassSize,VocabSize)
# duffelzaar
# # Part E
# vectorizer = TfidfVectorizer(preprocessor=None,
#                             tokenizer = word_tokenize,
#                             analyzer='word',
#                             stop_words=None,
#                             strip_accents=None, 
#                             lowercase=True,
#                             ngram_range=(1,3), 
#                             min_df=0.0001, 
#                             max_df=0.9,
#                             binary=False,
#                             norm='l2',
#                             use_idf=1,
#                             smooth_idf=1, 
#                             sublinear_tf=1)

# X_train = vectorizer.fit_transform(X_train)
# X_test  = vectorizer.transform(X_test)

# mnb = MultinomialNB()
# mnb.fit(X_train,Y_train)

# pred_mnb = mnb.predict(X_test)
# print("Feature Engineering Score:",round(accuracy_score(Y_test,pred_mnb),3));

Feature Engineering Score: 0.661


In [70]:
# Part F
# correct, Y_pred_test = predict(X_test, Y_test, phi, VocabClass, VocabClassSize)
print("Test Set Accuracy = {}%".format(round(100*float(correct)/14000.0),2))
print("Macro F1 score = ".format(f1_score(Y_pred_test, Y_test)))

Test Set Accuracy = 67%
Macro F1 score = 


  p1[i] = conf_mat[i,i]/(np.sum(conf_mat, axis = 0)[i])


In [31]:
def add(x):
    return x+2
L = [1,2,3]
df_temp = pd.DataFrame(L)

In [33]:
df_temp.apply(lambda x: add(x))

Unnamed: 0,0
0,3
1,4
2,5


In [34]:
df_temp

Unnamed: 0,0
0,1
1,2
2,3
