In [1]:
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install tiktoken



In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import sklearn as sk
import sklearn.model_selection

import tiktoken

nltk.download('punkt')
nltk.download('stopwords')

def tokenizer(X):
    stop_words = set(stopwords.words('english'))
    num_tokens = 0
    num_sentences = 0
    tokenized_documents = []
    for text in X:
        sentences = sent_tokenize(text)
        num_sentences += len(sentences)
        tokenized_sentences = [nltk.RegexpTokenizer(r"\w+").tokenize(s) for s in sentences]
        tokenized_document = [word.lower() for sentence in tokenized_sentences for word in sentence if word.lower() not in stop_words]
        num_tokens += len(tokenized_document)
        tokenized_documents.append(tokenized_document)
    print("Number of tokens: ", num_tokens)
    print("Number of sentences: ", num_sentences)
    return tokenized_documents

import sklearn as sk

# tf-idf vectorizer
def vectorizer(tokenized_documents):
    ## tokenized_documents is a list of lists, where each inner list contains tokens
    vectorizer = sk.feature_extraction.text.TfidfVectorizer(lowercase=False, preprocessor=None, tokenizer=lambda x: x)
    X = vectorizer.fit_transform(tokenized_documents)
    return X, vectorizer

[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [3]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [5]:
# FOR TESTING : only select first 20000 samples
# X, y = X[:20000], y[:20000]

In [6]:
tokenized_documents = tokenizer(X)

Number of tokens:  23767229
Number of sentences:  2832806


In [7]:
X, vect = vectorizer(tokenized_documents)



In [8]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '000' '0000' '000001' '00001' '000013' '0000soo' '0001'
 '000111052']
Top 10 least frequent words in the dataset
['¾' 'â' 'çay' 'çaykur' 'çelem' 'être' 'île' 'ît' 'ø' 'þ']


In [9]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(454763, 120144) (113691, 120144) (454763,) (113691,)
  (0, 94340)	0.2698393204452672
  (0, 107461)	0.356573063664095
  (0, 60888)	0.6837599652000029
  (0, 31216)	0.37896669799521565
  (0, 25542)	0.19195720719400375
  (0, 79539)	0.28524356089653274
  (0, 69704)	0.22960104461763894
  (0, 66863)	0.13405538070127823


# Feed Forward Neural Network
### Model starts here

In [18]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Add, Input


In [21]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Add
from tensorflow.keras.utils import to_categorical

def batch_generator(X, y, batch_size=32, num_classes=None):
    num_samples = X.shape[0]
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X[start:end].toarray()  # Convert only this slice to dense
            y_batch = to_categorical(y[start:end], num_classes=num_classes)
            yield (X_batch, y_batch)

# Setup the model
inputs = Input(shape=(X_train.shape[1],))
x = Dense(512, activation='relu')(inputs)
x = Dropout(0.2)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.2)(x)
residual = Dense(64, activation='relu')(x)  # Adjust the residual shape

x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Add()([x, residual])  # Add residual connection
x = Dense(32, activation='relu')(x)
outputs = Dense(np.max(y_train) + 1, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model using a generator
train_generator = batch_generator(X_train, y_train, batch_size=32, num_classes=np.max(y_train) + 1)
model.fit(train_generator, epochs=20, steps_per_epoch=int(np.ceil(X_train.shape[0] / 32)))

Epoch 1/20


[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 13ms/step - accuracy: 0.7210 - loss: 0.7699
Epoch 2/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 13ms/step - accuracy: 0.8226 - loss: 0.4919
Epoch 3/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 12ms/step - accuracy: 0.8914 - loss: 0.3090
Epoch 4/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 12ms/step - accuracy: 0.9313 - loss: 0.1980
Epoch 5/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 12ms/step - accuracy: 0.9549 - loss: 0.1328
Epoch 6/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 12ms/step - accuracy: 0.9694 - loss: 0.0907
Epoch 7/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 12ms/step - accuracy: 0.9781 - loss: 0.0666
Epoch 8/20
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 12ms/step - accuracy: 0.9834 - loss: 0.

<keras.src.callbacks.history.History at 0x7f901813b530>

In [29]:
test_generator = batch_generator(X_test, y_test, batch_size=32, num_classes=np.max(y_train) + 1)

In [30]:
# Evaluate the model
predictions_prob = model.predict(test_generator, steps=int(np.ceil(X_test.shape[0] / 32)))
predictions = np.argmax(predictions_prob, axis=1)

# Metrics
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 11ms/step
[[ 7490   852   579   229  1176]
 [  880  2955   725   320   975]
 [  534   559  4566   976  1850]
 [  324   245   852  8656  6046]
 [  709   347   777  2856 68213]]
              precision    recall  f1-score   support

           1       0.75      0.73      0.74     10326
           2       0.60      0.50      0.55      5855
           3       0.61      0.54      0.57      8485
           4       0.66      0.54      0.59     16123
           5       0.87      0.94      0.90     72902

    accuracy                           0.81    113691
   macro avg       0.70      0.65      0.67    113691
weighted avg       0.80      0.81      0.80    113691



In [31]:
# Save the model with current date and time in model folder
import datetime

# Create a folder named _models in the current directory
import os
if not os.path.exists('_models'):
    os.makedirs('_models')
    
model.save(f'_models/config1_feedforward_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.keras')

In [35]:
# test the model
test_sentences = [
    'This is a good product',
    'This is a bad product',
    'This is a product',
    'This is a very good product',
    'This is a very bad product',
    'That was bad'
]

for sentence in test_sentences:
    test_tokenized = tokenizer([sentence])
    test_vec = vect.transform(test_tokenized)
    test_vec_dense = test_vec.toarray()  # Convert sparse tensor to dense tensor
    result = model.predict(test_vec_dense)
    predicted_score = result.argmax() + 1
    print(f"Test sentence: {sentence}")
    print(f"Predicted score: {predicted_score}")
    print()

Number of tokens:  2
Number of sentences:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Test sentence: This is a good product
Predicted score: 6

Number of tokens:  2
Number of sentences:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


Test sentence: This is a bad product
Predicted score: 5

Number of tokens:  1
Number of sentences:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Test sentence: This is a product
Predicted score: 5

Number of tokens:  2
Number of sentences:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Test sentence: This is a very good product
Predicted score: 6

Number of tokens:  2
Number of sentences:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Test sentence: This is a very bad product
Predicted score: 5

Number of tokens:  1
Number of sentences:  1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Test sentence: That was bad
Predicted score: 2

