# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [1]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package punkt to /home/assil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/assil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [3]:
# FOR TESTING : only select first 20000 samples
# X, y = X[:20000], y[:20000]

In [4]:
tokenized_documents = tokenizer(X)

In [5]:
X, vect = vectorizer(tokenized_documents)



In [6]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '000' '0000' '000001' '00001' '000013' '0000soo' '0001'
 '000111052']
Top 10 least frequent words in the dataset
['¾' 'â' 'çay' 'çaykur' 'çelem' 'être' 'île' 'ît' 'ø' 'þ']


In [7]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(454763, 120144) (113691, 120144) (454763,) (113691,)
  (0, 94340)	0.26983932044526726
  (0, 107461)	0.356573063664095
  (0, 60888)	0.6837599652000029
  (0, 31216)	0.37896669799521565
  (0, 25542)	0.19195720719400375
  (0, 79539)	0.28524356089653274
  (0, 69704)	0.22960104461763894
  (0, 66863)	0.13405538070127823


# Feed Forward Neural Network
### Model starts here

In [9]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2024-04-27 16:48:53.703076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import to_categorical

def batch_generator(X, y, batch_size=32, num_classes=None, timesteps=1):
    num_samples = X.shape[0]
    features_per_timestep = X.shape[1] // timesteps
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X[start:end].toarray()  # Convert only this slice to dense
            X_batch = X_batch.reshape(-1, timesteps, features_per_timestep)  # Reshape for LSTM
            y_batch = to_categorical(y[start:end], num_classes=num_classes)
            yield (X_batch, y_batch)


# Setup the model
model = Sequential([
    LSTM(512, input_shape=(None, X_train.shape[1]), activation='relu', return_sequences=True),
    LSTM(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(np.max(y_train) + 1, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model using a generator
# Assuming `timesteps` is set to 1, you might want to adjust this based on how you wish to frame your sequences
train_generator = batch_generator(X_train, y_train, batch_size=32, num_classes=np.max(y_train) + 1, timesteps=1)
model.fit(train_generator, epochs=5, steps_per_epoch=int(np.ceil(X_train.shape[0] / 32)))

Epoch 1/5


I0000 00:00:1714229481.522712   13327 service.cc:145] XLA service 0x71455de19850 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1714229481.522750   13327 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce GTX 1080 Ti, Compute Capability 6.1
2024-04-27 16:51:21.597787: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-27 16:51:21.926132: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m    5/14212[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8:02[0m 34ms/step - accuracy: 0.4571 - loss: 1.7873 

I0000 00:00:1714229483.278716   13327 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 34ms/step - accuracy: 0.7171 - loss: 0.7816
Epoch 2/5
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 33ms/step - accuracy: 0.8264 - loss: 0.4832
Epoch 3/5
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 33ms/step - accuracy: 0.9152 - loss: 0.2445
Epoch 4/5
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 33ms/step - accuracy: 0.9653 - loss: 0.1050
Epoch 5/5
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 33ms/step - accuracy: 0.9851 - loss: 0.0474


<keras.src.callbacks.history.History at 0x7145500d2580>

In [16]:
test_generator = batch_generator(X_test, y_test, batch_size=32, num_classes=np.max(y_train) + 1, timesteps=1)

In [17]:
# Evaluate the model
predictions_prob = model.predict(test_generator, steps=int(np.ceil(X_test.shape[0] / 32)))
predictions = np.argmax(predictions_prob, axis=1)

# Metrics
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 10ms/step
[[ 7274  1525   475   329   723]
 [  863  3366   608   414   604]
 [  559  1067  4331  1180  1348]
 [  420   622   967  8907  5207]
 [  990  1079  1270  4433 65130]]
              precision    recall  f1-score   support

           1       0.72      0.70      0.71     10326
           2       0.44      0.57      0.50      5855
           3       0.57      0.51      0.54      8485
           4       0.58      0.55      0.57     16123
           5       0.89      0.89      0.89     72902

    accuracy                           0.78    113691
   macro avg       0.64      0.65      0.64    113691
weighted avg       0.79      0.78      0.78    113691



In [None]:
# Save the model with current date and time in model folder
import datetime

# Create a folder named _models in the current directory
import os
if not os.path.exists('_models'):
    os.makedirs('_models')
    
model.save(f'_models/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.keras')

In [None]:
# test the model
test_sentences = [
    'This is a good product',
    'This is a bad product',
    'This is a product',
    'This is a very good product',
    'This is a very bad product',
    'That was bad'
]

for sentence in test_sentences:
    test_tokenized = tokenizer([sentence])
    test_vec = vect.transform(test_tokenized)
    test_vec_dense = test_vec.toarray()  # Convert sparse tensor to dense tensor
    test_vec_lstm = test_vec_dense.reshape(-1, timesteps, features_per_timestep)
    result = model.predict(test_vec_lstm)
    predicted_score = result.argmax() + 1
    print(f"Test sentence: {sentence}")
    print(f"Predicted score: {predicted_score}")
    print()