# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [1]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package punkt to /home/flo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/flo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [3]:
# FOR TESTING : only select first 20000 samples
# X, y = X[:20000], y[:20000]

In [4]:
tokenized_documents = tokenizer(X)

In [5]:
tokenized_documents

[['bought',
  'several',
  'vitality',
  'canned',
  'dog',
  'food',
  'products',
  'found',
  'good',
  'quality',
  'product',
  'looks',
  'like',
  'stew',
  'processed',
  'meat',
  'smells',
  'better',
  'labrador',
  'finicky',
  'appreciates',
  'product',
  'better'],
 ['product',
  'arrived',
  'labeled',
  'jumbo',
  'salted',
  'peanuts',
  'peanuts',
  'actually',
  'small',
  'sized',
  'unsalted',
  'sure',
  'error',
  'vendor',
  'intended',
  'represent',
  'product',
  'jumbo'],
 ['confection',
  'around',
  'centuries',
  'light',
  'pillowy',
  'citrus',
  'gelatin',
  'nuts',
  'case',
  'filberts',
  'cut',
  'tiny',
  'squares',
  'liberally',
  'coated',
  'powdered',
  'sugar',
  'tiny',
  'mouthful',
  'heaven',
  'chewy',
  'flavorful',
  'highly',
  'recommend',
  'yummy',
  'treat',
  'familiar',
  'story',
  'c',
  'lewis',
  'lion',
  'witch',
  'wardrobe',
  'treat',
  'seduces',
  'edmund',
  'selling',
  'brother',
  'sisters',
  'witch'],
 ['looki

In [6]:
X, vect = vectorizer(tokenized_documents)



In [7]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '000' '0000' '000001' '00001' '000013' '0000soo' '0001'
 '000111052']
Top 10 least frequent words in the dataset
['¾' 'â' 'çay' 'çaykur' 'çelem' 'être' 'île' 'ît' 'ø' 'þ']


In [8]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(454763, 120144) (113691, 120144) (454763,) (113691,)
  (0, 94340)	0.26983932044526726
  (0, 107461)	0.356573063664095
  (0, 60888)	0.6837599652000029
  (0, 31216)	0.37896669799521565
  (0, 25542)	0.19195720719400375
  (0, 79539)	0.28524356089653274
  (0, 69704)	0.22960104461763894
  (0, 66863)	0.13405538070127823


# Naive Bayes Classifier
### Model starts here

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes
nb = MultinomialNB()
naive_bayes = nb.fit(X_train, y_train)
predicted = naive_bayes.predict(X_test)

# Metrics
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[ 1296     0     3    14  9013]
 [  132    11     1    14  5697]
 [   59     1    25    32  8368]
 [   38     4     3   201 15877]
 [   44    14    13    39 72792]]
              precision    recall  f1-score   support

           1       0.83      0.13      0.22     10326
           2       0.37      0.00      0.00      5855
           3       0.56      0.00      0.01      8485
           4       0.67      0.01      0.02     16123
           5       0.65      1.00      0.79     72902

    accuracy                           0.65    113691
   macro avg       0.61      0.23      0.21    113691
weighted avg       0.65      0.65      0.53    113691



In [11]:
example = '''
awful
'''

# tokenize and vectorize it, then try to predict
test = tokenizer([example])
test_tfidf = vect.transform(test)
print(naive_bayes.predict(test_tfidf))

[1]
