# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [1]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package punkt to /home/assil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [3]:
# FOR TESTING : only select first 20000 samples
# X, y = X[:20000], y[:20000]

In [5]:
tokenized_documents = tokenizer(X)

In [10]:
X, vect = vectorizer(tokenized_documents)



In [11]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '000' '0000' '000001' '00001' '000013' '0000soo' '0001'
 '000111052']
Top 10 least frequent words in the dataset
['¾' 'â' 'çay' 'çaykur' 'çelem' 'être' 'île' 'ît' 'ø' 'þ']


In [13]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(454763, 120297) (113691, 120297) (454763,) (113691,)
  (0, 94440)	0.2476460880247669
  (0, 107570)	0.3272463189045511
  (0, 60950)	0.6275233729847669
  (0, 31243)	0.34779816409007885
  (0, 25567)	0.17616947504652833
  (0, 79631)	0.26178338973622206
  (0, 25422)	0.10376272503403408
  (0, 106024)	0.11820467973703584
  (0, 40683)	0.17006292327236913
  (0, 78761)	0.16403902113711577
  (0, 69776)	0.21071725355716414
  (0, 107849)	0.10560844196570854
  (0, 76937)	0.11164550726217257
  (0, 108271)	0.08235972398533828
  (0, 61826)	0.1684136904878422
  (0, 61973)	0.08010732780470854
  (0, 66932)	0.12302984811317055
  (0, 55845)	0.10944036920045735


# Naive Bayes Classifier
### Model starts here

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes
nb = MultinomialNB()
naive_bayes = nb.fit(X_train, y_train)
predicted = naive_bayes.predict(X_test)

# Metrics
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[  880     0     1    12  9433]
 [   87     5     1    11  5751]
 [   45     1     8    25  8406]
 [   25     4     4   135 15955]
 [   36    14    10    33 72809]]
              precision    recall  f1-score   support

           1       0.82      0.09      0.15     10326
           2       0.21      0.00      0.00      5855
           3       0.33      0.00      0.00      8485
           4       0.62      0.01      0.02     16123
           5       0.65      1.00      0.79     72902

    accuracy                           0.65    113691
   macro avg       0.53      0.22      0.19    113691
weighted avg       0.61      0.65      0.52    113691



In [40]:
example = '''
awful
'''

# tokenize and vectorize it, then try to predict
test = tokenizer([example])
test_tfidf = vect.transform(test)
print(naive_bayes.predict(test_tfidf))

[1]
