# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [1]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package punkt to /home/assil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [3]:
# FOR TESTING : only select first 20000 samples
X, y = X[:20000], y[:20000]

In [4]:
tokenized_documents = tokenizer(X)

In [5]:
X, vect = vectorizer(tokenized_documents)



In [6]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '000' '0003' '000kwh' '002' '008' '01' '0100' '0174']
Top 10 least frequent words in the dataset
['zuke' 'zukes' 'zupas' 'zuppa' 'zwieback' 'º' '¼' '½' 'çaykur' 'ît']


In [7]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(16000, 26658) (4000, 26658) (16000,) (4000,)
  (0, 20554)	0.3845362652831886
  (0, 6390)	0.3436631403149654
  (0, 14429)	0.3724722072082089
  (0, 4235)	0.20728370948487648
  (0, 17569)	0.2629960113653433
  (0, 23859)	0.15524178896215457
  (0, 11671)	0.2679384409265626
  (0, 8352)	0.23213222149909707
  (0, 21911)	0.29415770046654777
  (0, 16681)	0.19747406219111086
  (0, 1912)	0.14002887633461686
  (0, 26519)	0.14117053473926772
  (0, 26200)	0.13006169326147435
  (0, 23803)	0.12657510519567464
  (0, 841)	0.08667788302140157
  (0, 10901)	0.15433885114321025
  (0, 23821)	0.16630262302915325
  (0, 1653)	0.164344730408501
  (0, 23811)	0.15903521582936178
  (0, 11595)	0.13181371070406692
  (0, 12252)	0.08195440863309632


# Feed Forward Neural Network
### Model starts here

In [9]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

2024-04-18 16:24:40.237799: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Convert sparse matrices to dense matrices
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Convert labels to categorical format
num_classes = np.max(y) + 1
y_train_cat = to_categorical(y_train, num_classes)
y_test_cat = to_categorical(y_test, num_classes)

# Neural Network Model
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=X_train_dense.shape[1]))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model
model.fit(X_train_dense, y_train_cat, epochs=10, batch_size=32)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-04-18 16:24:49.882720: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-18 16:24:49.919045: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-18 16:24:49.919218: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.co

Epoch 1/10


I0000 00:00:1713450293.674961    8201 service.cc:145] XLA service 0x7bb7d00036d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1713450293.675007    8201 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce GTX 1080 Ti, Compute Capability 6.1
2024-04-18 16:24:53.700568: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-18 16:24:53.815687: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m 40/500[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 4ms/step - accuracy: 0.5302 - loss: 1.4909

I0000 00:00:1713450294.396002    8201 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6424 - loss: 1.0310
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8149 - loss: 0.4989
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9292 - loss: 0.2115
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9865 - loss: 0.0482
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9988 - loss: 0.0072
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9992 - loss: 0.0036
Epoch 7/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9992 - loss: 0.0027
Epoch 8/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9995 - loss: 0.0022
Epoch 9/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bb8aedb87c0>

In [11]:
# Evaluate the model
predictions_prob = model.predict(X_test_dense)
predictions = np.argmax(predictions_prob, axis=1)

# Metrics
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[[ 206   65   23   10   49]
 [  56   88   44   22   36]
 [  35   71  103   55   78]
 [  25   34   83  181  244]
 [  85   56  108  213 2030]]
              precision    recall  f1-score   support

           1       0.51      0.58      0.54       353
           2       0.28      0.36      0.31       246
           3       0.29      0.30      0.29       342
           4       0.38      0.32      0.35       567
           5       0.83      0.81      0.82      2492

    accuracy                           0.65      4000
   macro avg       0.46      0.48      0.46      4000
weighted avg       0.66      0.65      0.65      4000



In [14]:
# Save the model with current date and time in model folder
import datetime

# Create a folder named _models in the current directory
import os
if not os.path.exists('_models'):
    os.makedirs('_models')

model.save(f'_models/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.keras')

In [13]:
# test the model
test_sentences = [
    'This is a good product',
    'This is a bad product',
    'This is a product',
    'This is a very good product',
    'This is a very bad product',
    'That was bad'
]

for sentence in test_sentences:
    test_tokenized = tokenizer([sentence])
    test_vec = vect.transform(test_tokenized)
    test_vec_dense = test_vec.toarray()  # Convert sparse tensor to dense tensor
    result = model.predict(test_vec_dense)
    predicted_score = result.argmax() + 1
    print(f"Test sentence: {sentence}")
    print(f"Predicted score: {predicted_score}")
    print()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
Test sentence: This is a good product
Predicted score: 5

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Test sentence: This is a bad product
Predicted score: 2

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Test sentence: This is a product
Predicted score: 6

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Test sentence: This is a very good product
Predicted score: 5

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Test sentence: This is a very bad product
Predicted score: 2

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Test sentence: That was bad
Predicted score: 2

