In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.dummy import DummyClassifier




# This function opens a file and retrieves all lines in this file.
# It then removes all whitespace from this line and then creates a list with
# where the first item is genre, the second item is the sentiment, and the
# third is the id number of the review. Everything after this are the words
# of the review. To retrieve sentiment the variable use_sentiment must be True.
# To use genre's the variable use_sentiment must be False. One of these
# variables will be used as labels. It then returns the documents and labels.
def read_corpus(corpus_file, use_sentiment):
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()

            documents.append(tokens[3:])

            if use_sentiment:
                # 2-class problem: positive vs negative
                labels.append( tokens[1] )
            else:
                # 6-class problem: books, camera, dvd, health, music, software
                labels.append( tokens[0] )

    return documents, labels
    
# a dummy function that just returns its input
def identity(x):
    return x



In [19]:
# The program reads a textfile and retrieves the data
# and the labels linked to the data. After this it
# splits the data in training data and test data.
# The same goes for the labels.
X, Y = read_corpus('trainset.txt', use_sentiment=True)
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]
total_instances = len(Xtest)


# Create dummy classifer baseline
dummy = DummyClassifier(strategy='stratified')
# "Train" model
dummyscore= dummy.fit(Xtrain, Ytrain)

baselineguess = dummyscore.predict(Xtest)



#Get score
print("accuracy score: ", accuracy_score(Ytest, baselineguess))
print(classification_report(Ytest, baselineguess))




# A TF_IDF vectorizer creates a score scale based on frequency of input
# within different documents. Every word will have a different score for
# a different document. This score can be used as feature for the classifier.
# The classifier learns from these features in order to make calculated
# predictions.
# let's use the TF-IDF vectorizer
tfidf = False




# we use a dummy function as tokenizer and preprocessor,
# since the texts are already preprocessed and tokenized.
if tfidf:
    vec = TfidfVectorizer(preprocessor = identity,
                          tokenizer = identity)
#The CountVectorizer creates a score scale based on frequency of input only
# This can be used to create a baseline to see how other machine learning
# techniques compare.    
else:
    vec = CountVectorizer(preprocessor = identity,
                          tokenizer = identity)

# combine the vectorizer with a Naive Bayes classifier
classifier = Pipeline( [('vec', vec),
                        ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))] )


# Here the classifier learns which feautures are linked to what label.
classifier.fit(Xtrain, Ytrain)



# Here the classifier predicts the label of features based on the
# learned process in the step before.  
Yguess = classifier.predict(Xtest)



# Here the classifier compares the gold standard labels with the
# predict labels retrieved from the step before.
print("accuracy score: ", accuracy_score(Ytest, Yguess))

# Here the system tries to predict the labels of the test data, where
# Yguess are the predicted labels and Xtest is the data

print(classification_report(Ytest, Yguess))

#prints confusion matrix

labels=['pos','neg']
cm=confusion_matrix(Ytest, Yguess,)
c = 0
print("{0}".format("|"), *labels, sep="{0:20}".format("|"))
print("_"*50)
for h in labels:
    print("{0:<20}".format(h), *cm[c], sep="{0:<10}".format("|"))

    c += 1

# print prior probabilities per class

print("\n")
c = 0    
for h in labels:
    print("Prior probabilty {0}: {1} \n".format(h, (sum(cm[c]) / total_instances)))
    c += 1
    

    

# print posterior probabilities per class

print("posterior probability: ",classifier. predict_proba(X))





  k in range(self.n_outputs_)).T


accuracy score:  0.506
              precision    recall  f1-score   support

         neg       0.49      0.53      0.51       731
         pos       0.52      0.48      0.50       769

   micro avg       0.51      0.51      0.51      1500
   macro avg       0.51      0.51      0.51      1500
weighted avg       0.51      0.51      0.51      1500

accuracy score:  0.798
              precision    recall  f1-score   support

         neg       0.76      0.85      0.80       731
         pos       0.84      0.75      0.79       769

   micro avg       0.80      0.80      0.80      1500
   macro avg       0.80      0.80      0.80      1500
weighted avg       0.80      0.80      0.80      1500

||                   pos|                   neg
__________________________________________________
pos                 |         624|         107
neg                 |         196|         573


Prior probabilty pos: 0.48733333333333334 

Prior probabilty neg: 0.5126666666666667 

posterior probabil

In [28]:
# Comments are the same as previous cells
X, Y = read_corpus('trainset.txt', use_sentiment=False)
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]
total_instances = len(Xtest)

# Create dummy classifer
dummy = DummyClassifier(strategy='stratified')
dummyscore= dummy.fit(Xtrain, Ytrain)
baselineguess = dummyscore.predict(Xtest)
print("accuracy score: ", accuracy_score(Ytest, baselineguess))
print(classification_report(Ytest, baselineguess))

tfidf = False

if tfidf:
    vec = TfidfVectorizer(preprocessor = identity,
                          tokenizer = identity)
else:
    vec = CountVectorizer(preprocessor = identity,
                          tokenizer = identity)

# combine the vectorizer with a Naive Bayes classifier
classifier = Pipeline( [('vec', vec),
                        ('cls', MultinomialNB())] )


classifier.fit(Xtrain, Ytrain)

Yguess = classifier.predict(Xtest)

print("accuracy score: ", accuracy_score(Ytest, Yguess))


print(classification_report(Ytest, Yguess))

labels=['books', 'camera', 'dvd', 'health', 'music', 'software']
cm=confusion_matrix(Ytest, Yguess, labels=labels)
c = 0
print("{0}".format(""), *labels, sep="{0:10}".format("|"))
print("_"*50)
for h in labels:
    print("{0:<20}".format(h), *cm[c], sep="{0:<8}".format("|"))
    c += 1

print("\n")
c = 0    
for h in labels:
    print("Prior probabilty {0}: {1} \n".format(h, (sum(cm[c]) / total_instances)))
    c += 1


print("posterior probability: ",classifier. predict_proba(X))
        


    

  k in range(self.n_outputs_)).T


accuracy score:  0.17066666666666666
              precision    recall  f1-score   support

       books       0.19      0.21      0.20       233
      camera       0.14      0.14      0.14       258
         dvd       0.18      0.18      0.18       242
      health       0.17      0.15      0.16       243
       music       0.16      0.17      0.16       260
    software       0.18      0.17      0.18       264

   micro avg       0.17      0.17      0.17      1500
   macro avg       0.17      0.17      0.17      1500
weighted avg       0.17      0.17      0.17      1500

accuracy score:  0.894
              precision    recall  f1-score   support

       books       0.91      0.91      0.91       233
      camera       0.82      0.94      0.87       258
         dvd       0.88      0.87      0.87       242
      health       0.98      0.79      0.88       243
       music       0.95      0.92      0.93       260
    software       0.86      0.93      0.89       264

   micro avg     