In [1]:
!pip install scipy==1.12
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install tiktoken
!pip install nltk



In [8]:
import pandas as pd
import tiktoken
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def tokenizer(X):
    enc = tiktoken.encoding_for_model("gpt-4")
    stop_words = set(stopwords.words('english'))
    num_tokens = 0
    num_sentences = 0
    tokenized_documents = []
    for text in X:
        words = text.split()  # Split the text into words
        filtered_words = [word.lower() for word in words if word.lower() not in stop_words]  # Convert words to lowercase and filter stop words
        tokenized_document = enc.encode(' '.join(filtered_words))  # Join the filtered words back into a string and encode using tiktoken
        num_tokens += len(tokenized_document)
        num_sentences += text.count('.') + text.count('!') + text.count('?')
        tokenized_documents.append(tokenized_document)

    print("Number of tokens: ", num_tokens)
    print("Number of sentences: ", num_sentences)
    
    return tokenized_documents


import gensim.downloader as api
from numpy import zeros
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def vectorizer(tokenized_documents, enc):
    model = api.load("word2vec-google-news-300")
    
    def avg_word2vec(tokens):
        token_strings = [enc.decode([token]) for token in tokens]  # Decode tokens back to strings
        missing_words = [token for token in token_strings if token not in model]
        word_vectors = [model[token] for token in token_strings if token in model]

        if word_vectors:
            num_dimensions = len(word_vectors[0])
            if missing_words:
                zero_vector = zeros(num_dimensions)
                word_vectors.extend([zero_vector] * len(missing_words))
            average_vector = sum(word_vectors) / len(word_vectors)
            return average_vector
        else:
            num_dimensions = model.vector_size
            return zeros(num_dimensions)
    
    document_vectors = [avg_word2vec(doc) for doc in tokenized_documents]
    X = np.array(document_vectors)

    scaler = MinMaxScaler()
    scaler.fit(X)
    X_train_scaled = scaler.transform(X)

    vocabulary = list(set(token for doc in tokenized_documents for token in enc.decode(doc)))

    return X_train_scaled, vocabulary

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [9]:
import sys
sys.path.append('../')
# from tokenizer import tokenizer
# from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

In [19]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [20]:
# FOR TESTING : only select first 20000 samples
X, y = X[:200000], y[:200000]

In [21]:
tokenized_documents = tokenizer(X)

Number of tokens:  12773575
Number of sentences:  1302317


In [22]:
enc = tiktoken.encoding_for_model("gpt-4")
X, vect = vectorizer(tokenized_documents, enc)

In [23]:
# # most frequent words
# print("Top 10 most frequent words in the dataset")
# print(vect.get_feature_names_out()[:10])

# # least frequent words
# print("Top 10 least frequent words in the dataset")
# print(vect.get_feature_names_out()[-10:])

In [24]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(160000, 300) (40000, 300) (160000,) (40000,)
[0.63630278 0.46479931 0.43123729 0.37989813 0.60258515 0.30517694
 0.41166594 0.54868687 0.43423209 0.33750527 0.50238564 0.79075246
 0.6842003  0.44283142 0.68830531 0.22426465 0.38972015 0.20454589
 0.52977253 0.68738067 0.55681391 0.76875116 0.22240424 0.70632729
 0.46330502 0.76051096 0.84100513 0.22424888 0.63869493 0.53754322
 0.69663735 0.50475268 0.76935531 0.74744753 0.67809982 0.3625221
 0.62996209 0.43798118 0.57280615 0.49809471 0.59097812 0.79662837
 0.51119905 0.49107841 0.38324905 0.71669829 0.6724564  0.67691804
 0.58044379 0.36729628 0.55860431 0.16584562 0.50315903 0.40578889
 0.59203016 0.33915914 0.74831582 0.6105731  0.47957498 0.60404713
 0.61928413 0.43721145 0.70007524 0.37689899 0.54552025 0.75336552
 0.63602663 0.33722209 0.38485085 0.43493873 0.40610902 0.46498717
 0.51215474 0.49104474 0.63359436 0.54892731 0.5059294  0.695404
 0.45264965 0.55433466 0.78148639 0.59408344 0.62307023 0.54244139
 0.23445355 0.59460

# Feed Forward Neural Network
### Model starts here

In [26]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Add
from tensorflow.keras.utils import to_categorical
from sklearn import metrics
import datetime
import os

In [27]:
def batch_generator(X, y, batch_size=32, num_classes=None):
    num_samples = X.shape[0]
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X[start:end]#.toarray()  # Convert only this slice to dense
            y_batch = to_categorical(y[start:end], num_classes=num_classes)
            yield (X_batch, y_batch)

# Setup the model
inputs = Input(shape=(X_train.shape[1],))
x = Dense(512, activation='relu')(inputs)
x = Dropout(0.2)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.2)(x)
residual = Dense(64, activation='relu')(x)  # Adjust the residual shape

x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Add()([x, residual])  # Add residual connection
x = Dense(32, activation='relu')(x)
outputs = Dense(np.max(y_train) + 1, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model using a generator
train_generator = batch_generator(X_train, y_train, batch_size=32, num_classes=np.max(y_train) + 1)
model.fit(train_generator, epochs=20, steps_per_epoch=int(np.ceil(X_train.shape[0] / 32)))

Epoch 1/20


[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 0.6270 - loss: 1.1714
Epoch 2/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1495
Epoch 3/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1493
Epoch 4/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1502
Epoch 5/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1501
Epoch 6/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1500
Epoch 7/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1500
Epoch 8/20
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.6292 - loss: 1.1499
Epoch 9/20
[1m1331/5000[0

KeyboardInterrupt: 

In [29]:
test_generator = batch_generator(X_test, y_test, batch_size=32, num_classes=np.max(y_train) + 1)

In [30]:
# Evaluate the model
predictions_prob = model.predict(test_generator, steps=int(np.ceil(X_test.shape[0] / 32)))
predictions = np.argmax(predictions_prob, axis=1)

# Metrics
print("Confusion Matrix:")
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions))

[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step
Confusion Matrix:
[[    0     0     0     0  3660]
 [    0     0     0     0  2045]
 [    0     0     0     0  3168]
 [    0     0     0     0  5838]
 [    0     0     0     0 25289]]

Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00      3660
           2       0.00      0.00      0.00      2045
           3       0.00      0.00      0.00      3168
           4       0.00      0.00      0.00      5838
           5       0.63      1.00      0.77     25289

    accuracy                           0.63     40000
   macro avg       0.13      0.20      0.15     40000
weighted avg       0.40      0.63      0.49     40000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# # Save the model with current date and time in model folder

# Create a folder named _models in the current directory
if not os.path.exists('_models'):
    os.makedirs('_models')
    
model.save(f'_models/config3_feedforward_{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.keras')

In [33]:
# Test the model
test_sentences = [
    'This is a good product',
    'This is a bad product',
    'This is a product',
    'This is a very good product',
    'This is a very bad product',
    'That was bad'
]

for sentence in test_sentences:
    test_tokenized = tokenizer([sentence])
    test_vec, _ = vectorizer(test_tokenized, enc)
    test_vec_lstm = test_vec.reshape(1, -1)
    result = model.predict(test_vec_lstm)
    predicted_class = result.argmax()
    predicted_score = predicted_class
    print(f"Test sentence: {sentence}")
    print(f"Predicted score: {predicted_score}")
    print()

Number of tokens:  2
Number of sentences:  0
