In [1]:
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install tiktoken

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m505.5/505.5 kB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytz, tzdata, pandas
Successfully ins

In [2]:
import pandas as pd
import tiktoken
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def tokenizer(X):
    enc = tiktoken.encoding_for_model("gpt-4")
    stop_words = set(stopwords.words('english'))
    num_tokens = 0
    num_sentences = 0
    tokenized_documents = []
    for text in X:
        tokenized_document = enc.encode(text)

        filtered_tokens = [token for token in tokenized_document if token not in stop_words]

        num_tokens += len(filtered_tokens)
        num_sentences += text.count('.') + text.count('!') + text.count('?')
        
        tokenized_documents.append(filtered_tokens)

    print("Number of tokens: ", num_tokens)
    print("Number of sentences: ", num_sentences)
    
    return tokenized_documents


import sklearn.feature_extraction.text as sk_text

def vectorizer(tokenized_documents):
    # Conversion des bytes en chaînes de caractères si nécessaire
    tokenized_documents_str = []
    for document in tokenized_documents:
        if isinstance(document, list):
            # Convertir les bytes en chaînes de caractères
            tokenized_document_str = [token.decode('utf-8') if isinstance(token, bytes) else str(token) for token in document]
            tokenized_documents_str.append(tokenized_document_str)
        else:
            # Si le document n'est pas une liste, le convertir en liste de chaîne de caractères
            tokenized_documents_str.append([str(document)])

    # Initialiser le vectoriseur
    vectorizer = sk_text.CountVectorizer(lowercase=False, preprocessor=None, tokenizer=lambda x: x)
    
    # Adapter et transformer les données
    X = vectorizer.fit_transform(tokenized_documents_str)
    
    return X, vectorizer

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [3]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [5]:
# FOR TESTING : only select first 20000 samples
# X, y = X[:20000], y[:20000]

In [6]:
tokenized_documents = tokenizer(X)

Number of tokens:  58325048
Number of sentences:  3661772


In [7]:
X, vect = vectorizer(tokenized_documents)



In [8]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '1' '10' '100004' '100005' '10001' '100011' '100012' '100014'
 '100016']
Top 10 least frequent words in the dataset
['99975' '9998' '99981' '99984' '99985' '99986' '9999' '99990' '99992'
 '99994']


In [9]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(454763, 53351) (113691, 53351) (454763,) (113691,)
  (0, 34404)	1
  (0, 2531)	1
  (0, 23716)	1
  (0, 19965)	2
  (0, 22878)	1
  (0, 23523)	1
  (0, 899)	2
  (0, 5502)	1
  (0, 1672)	1
  (0, 30112)	1
  (0, 0)	1
  (0, 51388)	1
  (0, 39869)	1
  (0, 20900)	1
  (0, 932)	2
  (0, 52850)	1
  (0, 36993)	1
  (0, 14313)	1
  (0, 6293)	1
  (0, 24280)	1
  (0, 2731)	1
  (0, 656)	1
  (0, 1347)	1


# Feed Forward Neural Network
### Model starts here

In [11]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Add
from tensorflow.keras.utils import to_categorical
from sklearn import metrics
import datetime
import os

2024-05-05 14:49:17.283367: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-05 14:49:17.319262: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
def batch_generator(X, y, batch_size=32, num_classes=None):
    num_samples = X.shape[0]
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X[start:end].toarray()  # Convert only this slice to dense
            y_batch = to_categorical(y[start:end], num_classes=num_classes)
            yield (X_batch, y_batch)

# Setup the model
inputs = Input(shape=(X_train.shape[1],))
x = Dense(512, activation='relu')(inputs)
x = Dropout(0.2)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.2)(x)
residual = Dense(64, activation='relu')(x)  # Adjust the residual shape

x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Add()([x, residual])  # Add residual connection
x = Dense(32, activation='relu')(x)
outputs = Dense(np.max(y_train) + 1, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model using a generator
train_generator = batch_generator(X_train, y_train, batch_size=32, num_classes=np.max(y_train) + 1)
model.fit(train_generator, epochs=20, steps_per_epoch=int(np.ceil(X_train.shape[0] / 32)))

Epoch 1/20


2024-05-05 14:49:19.088055: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 78746 MB memory:  -> device: 0, name: NVIDIA H100 PCIe, pci bus id: 0000:b5:00.0, compute capability: 9.0
I0000 00:00:1714920560.756364    8681 service.cc:145] XLA service 0x7f0390005010 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1714920560.756413    8681 service.cc:153]   StreamExecutor device (0): NVIDIA H100 PCIe, Compute Capability 9.0
2024-05-05 14:49:20.805273: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-05 14:49:21.056887: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8900


[1m   27/14212[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:24[0m 6ms/step - accuracy: 0.5019 - loss: 1.6054

I0000 00:00:1714920565.401725    8681 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 9ms/step - accuracy: 0.7192 - loss: 0.7683
Epoch 2/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 9ms/step - accuracy: 0.8047 - loss: 0.5345
Epoch 3/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 9ms/step - accuracy: 0.8570 - loss: 0.3983
Epoch 4/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 9ms/step - accuracy: 0.8923 - loss: 0.3020
Epoch 5/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 9ms/step - accuracy: 0.9173 - loss: 0.2346
Epoch 6/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 9ms/step - accuracy: 0.9359 - loss: 0.1850
Epoch 7/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 9ms/step - accuracy: 0.9484 - loss: 0.1518
Epoch 8/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 9ms/step - accuracy: 0.9579 - loss: 0.1253
Epo

<keras.src.callbacks.history.History at 0x7f0bc74f2600>

In [13]:
test_generator = batch_generator(X_test, y_test, batch_size=32, num_classes=np.max(y_train) + 1)

In [14]:
# Evaluate the model
predictions_prob = model.predict(test_generator, steps=int(np.ceil(X_test.shape[0] / 32)))
predictions = np.argmax(predictions_prob, axis=1)

# Metrics
print("Confusion Matrix:")
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions))

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8ms/step
Confusion Matrix:
[[ 7723   859   444   231  1069]
 [  915  3115   751   312   762]
 [  550   691  4685  1053  1506]
 [  246   311  1038  9166  5362]
 [  562   357   757  4000 67226]]

Classification Report:
              precision    recall  f1-score   support

           1       0.77      0.75      0.76     10326
           2       0.58      0.53      0.56      5855
           3       0.61      0.55      0.58      8485
           4       0.62      0.57      0.59     16123
           5       0.89      0.92      0.90     72902

    accuracy                           0.81    113691
   macro avg       0.69      0.66      0.68    113691
weighted avg       0.80      0.81      0.80    113691



In [17]:
# # Save the model with current date and time in model folder

# Create a folder named _models in the current directory
if not os.path.exists('_models'):
    os.makedirs('_models')
    
model.save(f'_models/feedforward_2_feedforward_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.keras')

In [16]:
# test the model
test_sentences = [
    'This is a good product',
    'This is a bad product',
    'This is a product',
    'This is a very good product',
    'This is a very bad product',
    'That was bad'
]

for sentence in test_sentences:
    test_tokenized = tokenizer([sentence])
    test_vec = vect.transform(test_tokenized)
    test_vec_dense = test_vec.toarray()  # Convert sparse tensor to dense tensor
    test_vec_lstm = test_vec_dense.reshape(1, -1)
    result = model.predict(test_vec_lstm)
    predicted_class = result.argmax()
    predicted_score = predicted_class
    print(f"Test sentence: {sentence}")
    print(f"Predicted score: {predicted_score}")
    print()

Number of tokens:  5
Number of sentences:  0


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497ms/step
Test sentence: This is a good product
Predicted score: 4

Number of tokens:  5
Number of sentences:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Test sentence: This is a bad product
Predicted score: 4

Number of tokens:  4
Number of sentences:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Test sentence: This is a product
Predicted score: 4

Number of tokens:  6
Number of sentences:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Test sentence: This is a very good product
Predicted score: 4

Number of tokens:  6
Number of sentences:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Test sentence: This is a very bad product
Predicted score: 4

Number of tokens:  3
Number of sentences:  0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Test sentence: That was bad
Predicted score: 4

