In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# COMMENT THIS
def read_corpus(corpus_file, use_sentiment):
    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()

            documents.append(tokens[3:])

            if use_sentiment:
                # 2-class problem: positive vs negative
                labels.append( tokens[1] )
            else:
                # 6-class problem: books, camera, dvd, health, music, software
                labels.append( tokens[0] )

    return documents, labels
    
# a dummy function that just returns its input
def identity(x):
    return x



In [24]:
# COMMENT THIS
X, Y = read_corpus('trainset.txt', use_sentiment=True)
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]

# let's use the TF-IDF vectorizer
tfidf = True

# we use a dummy function as tokenizer and preprocessor,
# since the texts are already preprocessed and tokenized.
if tfidf:
    vec = TfidfVectorizer(preprocessor = identity,
                          tokenizer = identity)
else:
    vec = CountVectorizer(preprocessor = identity,
                          tokenizer = identity)

# combine the vectorizer with a Naive Bayes classifier
classifier = Pipeline( [('vec', vec),
                        ('cls', MultinomialNB())] )


# COMMENT THIS
classifier.fit(Xtrain, Ytrain)

# COMMENT THIS  
Yguess = classifier.predict(Xtest)

# COMMENT THIS
print(accuracy_score(Ytest, Yguess))

#print(precision_recall_fscore_support(Ytest, Yguess, average='macro'))
#print(precision_recall_fscore_support(Ytest, Yguess, average='micro'))
#print(precision_recall_fscore_support(Ytest, Yguess, average='weighted'))


print(classification_report(Ytest, Yguess))


labels=['pos','neg']
cm=confusion_matrix(Ytest, Yguess,)
c = 0
print("{0}".format("|"), *labels, sep="{0:20}".format("|"))
print("_"*50)
for h in labels:
    print("{0:<20}".format(h), *cm[c], sep="{0:<10}".format("|"))
    c += 1
    
classifier.fit(X, Y)
print(classifier.predict(X[2:100]))
        

0.782
              precision    recall  f1-score   support

         neg       0.71      0.93      0.81       731
         pos       0.91      0.64      0.75       769

   micro avg       0.78      0.78      0.78      1500
   macro avg       0.81      0.79      0.78      1500
weighted avg       0.81      0.78      0.78      1500

||                   pos|                   neg
__________________________________________________
pos                 |         683|         48
neg                 |         279|         490
['neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg'
 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'pos'
 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'neg'
 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'pos' 'pos'
 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'pos' 'pos'
 'neg' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg'
 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 

In [6]:
# COMMENT THIS
X, Y = read_corpus('trainset.txt', use_sentiment=False)
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]

# let's use the TF-IDF vectorizer
tfidf = True

# we use a dummy function as tokenizer and preprocessor,
# since the texts are already preprocessed and tokenized.
if tfidf:
    vec = TfidfVectorizer(preprocessor = identity,
                          tokenizer = identity)
else:
    vec = CountVectorizer(preprocessor = identity,
                          tokenizer = identity)

# combine the vectorizer with a Naive Bayes classifier
classifier = Pipeline( [('vec', vec),
                        ('cls', MultinomialNB())] )


# COMMENT THIS
classifier.fit(Xtrain, Ytrain)

# COMMENT THIS  
Yguess = classifier.predict(Xtest)

# COMMENT THIS
print(accuracy_score(Ytest, Yguess))

#print(precision_recall_fscore_support(Ytest, Yguess, average='macro'))
#print(precision_recall_fscore_support(Ytest, Yguess, average='micro'))
#print(precision_recall_fscore_support(Ytest, Yguess, average='weighted'))



print(classification_report(Ytest, Yguess))

labels=['books', 'camera', 'dvd', 'health', 'music', 'software']
cm=confusion_matrix(Ytest, Yguess, labels=labels)
c = 0
print("{0}".format(""), *labels, sep="{0:10}".format("|"))
print("_"*50)
for h in labels:
    print("{0:<20}".format(h), *cm[c], sep="{0:<8}".format("|"))
    c += 1

        

0.9066666666666666
              precision    recall  f1-score   support

       books       0.94      0.91      0.93       233
      camera       0.83      0.94      0.88       258
         dvd       0.88      0.91      0.89       242
      health       0.97      0.79      0.87       243
       music       0.96      0.95      0.95       260
    software       0.89      0.93      0.91       264

   micro avg       0.91      0.91      0.91      1500
   macro avg       0.91      0.91      0.91      1500
weighted avg       0.91      0.91      0.91      1500

|         books|         camera|         dvd|         health|         music|         software
__________________________________________________
books               |       213|       2|       12|       1|       0|       5
camera              |       1|       242|       2|       3|       1|       9
dvd                 |       5|       6|       220|       0|       9|       2
health              |       2|       33|       4|       192| 