In [1]:
!pip install scipy==1.12
!pip install pandas
!pip install scikit-learn
!pip install nltk
!pip install tiktoken
!pip install gensim
!pip install spacy



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Cache for the spaCy pipeline and stop-word set; both are loaded once and reused.
_TOKENIZER_RESOURCES = {}


def _get_tokenizer_resources():
    """Return ``(nlp, stop_words)``, loading them only on the first call."""
    if not _TOKENIZER_RESOURCES:
        _TOKENIZER_RESOURCES["nlp"] = spacy.load("en_core_web_sm")
        _TOKENIZER_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
    return _TOKENIZER_RESOURCES["nlp"], _TOKENIZER_RESOURCES["stop_words"]


def tokenizer(X, batch_size=5):
    """Split each text into spaCy tokens and drop English stop words.

    Args:
        X: Iterable of strings (e.g. a list or a pandas Series of texts).
        batch_size: Number of texts spaCy buffers per batch in ``nlp.pipe``.
            It does not affect the returned tokens.

    Returns:
        A list with one entry per input text; each entry is the list of token
        strings (original casing preserved) that are not stop words.
    """
    nlp, stop_words = _get_tokenizer_resources()

    tokenized_data = []
    # The parser and NER components are not needed for plain tokenization.
    for doc in nlp.pipe(X, batch_size=batch_size, disable=["parser", "ner"]):
        # The stop-word list is lowercase, so compare case-insensitively;
        # otherwise capitalised stop words such as "This" are kept.
        tokenized_data.append(
            [token.text for token in doc if token.text.lower() not in stop_words]
        )

    return tokenized_data

import gensim.downloader as api
from numpy import zeros
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Cache of loaded gensim models keyed by name, so the large download/load runs once.
_WORD2VEC_CACHE = {}


def _load_word2vec(name="word2vec-google-news-300"):
    """Return the gensim vectors called ``name``, loading them at most once."""
    if name not in _WORD2VEC_CACHE:
        _WORD2VEC_CACHE[name] = api.load(name)
    return _WORD2VEC_CACHE[name]


def vectorizer(tokenized_documents, model=None, scaler=None):
    """Turn tokenized documents into min-max scaled averaged word vectors.

    Each document becomes the mean of its token vectors. Tokens missing from
    the embedding model count as zero vectors, so they still contribute to the
    denominator. A document with no tokens, or with no known tokens, maps to
    the zero vector.

    Args:
        tokenized_documents: Iterable of token lists, one per document.
        model: Optional embedding lookup supporting ``token in model``,
            ``model[token]`` and ``model.vector_size``. Defaults to the
            ``word2vec-google-news-300`` gensim vectors.
        scaler: Optional already-fitted scaler exposing ``transform``. Pass the
            scaler returned from the training call so new documents are scaled
            with the training statistics. When omitted, a new ``MinMaxScaler``
            is fitted on this call's documents.

    Returns:
        ``(X_scaled, scaler)``: the scaled document matrix and the scaler that
        produced it.
    """
    if model is None:
        model = _load_word2vec()
    num_dimensions = model.vector_size

    def avg_word2vec(tokens):
        total = zeros(num_dimensions)
        if not tokens:
            return total
        for token in tokens:
            if token in model:
                total += model[token]
        # Divide by the full token count: unknown tokens act as zero vectors.
        return total / len(tokens)

    X = np.array([avg_word2vec(doc) for doc in tokenized_documents])

    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(X)
    X_scaled = scaler.transform(X)

    return X_scaled, scaler

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# spaCy tokenizer and Word2Vec vectorizer

In [4]:
import sys
# Add the parent directory to the import path so the project-local modules below resolve.
sys.path.append('../')
# NOTE(review): these imports rebind `tokenizer` and `vectorizer`, replacing the
# functions of the same name defined in the previous cell — confirm which
# implementation the rest of the notebook is meant to use.
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Load the reviews dataset, then take the `Text` column as model input
# and the `Score` column as the target.
data = pd.read_csv('../../_data/Reviews.csv')
X = data['Text']
y = data['Score']

In [6]:
# FOR TESTING: only select the first 2000 samples
# X, y = X[:2000], y[:2000]

In [7]:
# Run the tokenizer over the review texts loaded above.
tokenized_documents = tokenizer(X)

In [None]:
# NOTE(review): this rebinds X from the raw text column to the feature matrix, so
# re-running the tokenizer cell afterwards would pass vectors instead of text.
# NOTE(review): if `vectorizer` matches the definition in this notebook, its scaler
# is fitted on every row here, i.e. before the train/test split that follows —
# confirm that using test rows for scaling is intended.
X, vect = vectorizer(tokenized_documents)

In [None]:
# Import the function explicitly: `import sklearn as sk` alone does not guarantee
# that the `sk.model_selection` submodule has been loaded on a fresh kernel.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check: shapes of the four arrays produced by the split.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Peek at the first training feature vector.
print(X_train[0])

# Feed Forward Neural Network
### Model starts here

In [None]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Add
from tensorflow.keras.utils import to_categorical
from sklearn import metrics
import datetime
import os

In [None]:
def batch_generator(X, y, batch_size=32, num_classes=None):
    """Endlessly yield ``(features, one_hot_labels)`` mini-batches.

    The generator walks over the samples in order, yields a final shorter
    batch when the sample count is not a multiple of ``batch_size``, and then
    starts again from the first sample.

    Args:
        X: Feature container supporting ``X.shape[0]`` and row slicing.
        y: Integer class labels aligned with the rows of ``X`` (array-like or
            pandas Series).
        batch_size: Maximum number of samples per yielded batch.
        num_classes: Width of the one-hot encoding passed to ``to_categorical``.

    Yields:
        Tuples ``(X_batch, y_batch)`` where ``y_batch`` is one-hot encoded.

    Raises:
        ValueError: If there are no samples or ``batch_size`` is not positive.
            Without this check the loop below would spin forever without
            yielding anything.
    """
    num_samples = X.shape[0]
    if num_samples == 0:
        raise ValueError("batch_generator needs at least one sample")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Convert once so label slicing is always positional, regardless of any
    # pandas index carried by `y`.
    labels = np.asarray(y)

    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X_batch = X[start:end]
            y_batch = to_categorical(labels[start:end], num_classes=num_classes)
            yield (X_batch, y_batch)

# Sizes derived from the training split.
feature_count = X_train.shape[1]
class_count = np.max(y_train) + 1  # one softmax unit per label value 0..max(y_train)
train_batch_size = 32

# Network definition
inputs = Input(shape=(feature_count,))

# Two wide ReLU layers, each followed by 20% dropout.
x = inputs
for width in (512, 256):
    x = Dense(width, activation='relu')(x)
    x = Dropout(0.2)(x)

# Side branch: project the 256-unit activations to 64 units so they can be
# added to the main path further down.
residual = Dense(64, activation='relu')(x)

# Main path: narrow to 64 units, then merge with the side branch.
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Add()([x, residual])
x = Dense(32, activation='relu')(x)
outputs = Dense(class_count, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# One-hot targets from the generator pair with categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training: feed mini-batches from the generator, one full pass per epoch.
train_generator = batch_generator(
    X_train, y_train, batch_size=train_batch_size, num_classes=class_count
)
steps_per_epoch = int(np.ceil(X_train.shape[0] / train_batch_size))
model.fit(train_generator, epochs=20, steps_per_epoch=steps_per_epoch)

In [None]:
# Batches over the held-out split. num_classes is taken from y_train so the
# one-hot width matches the width of the model's output layer.
test_generator = batch_generator(X_test, y_test, batch_size=32, num_classes=np.max(y_train) + 1)

In [None]:
# Evaluate the model
# Build a fresh generator on every run. A generator keeps its position between
# calls, so reusing one across re-runs of this cell could pair predictions with
# the wrong rows of y_test.
eval_batch_size = 32
eval_generator = batch_generator(
    X_test, y_test, batch_size=eval_batch_size, num_classes=np.max(y_train) + 1
)
predictions_prob = model.predict(
    eval_generator, steps=int(np.ceil(X_test.shape[0] / eval_batch_size))
)
# Predicted class = index of the highest softmax probability in each row.
predictions = np.argmax(predictions_prob, axis=1)

# Metrics
print("Confusion Matrix:")
cm = metrics.confusion_matrix(y_test, predictions)
print(cm)

print("\nClassification Report:")
print(metrics.classification_report(y_test, predictions))

In [None]:
# Persist the trained model in ./_models, with the current date and time in the file name.
models_dir = '_models'

# Create the output folder in the current directory if it is not there yet.
os.makedirs(models_dir, exist_ok=True)

timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model.save(f'{models_dir}/config3_feedforward_{timestamp}.keras')

In [None]:
# test the model
# Smoke test: run a few hand-written sentences through the full
# tokenize -> vectorize -> predict path and print the predicted class.
test_sentences = [
    'This is a good product',
    'This is a bad product',
    'This is a product',
    'This is a very good product',
    'This is a very bad product',
    'That was bad'
]

for sentence in test_sentences:
    # NOTE(review): tokenizer and vectorizer are called once per sentence; if they
    # match the definitions in this notebook, every call reloads the spaCy pipeline
    # and the word2vec model.
    test_tokenized = tokenizer([sentence])
    # NOTE(review): if `vectorizer` fits a fresh MinMaxScaler on each call (as the
    # definition in this notebook does), a single-row input is scaled against
    # itself rather than against the training data — TODO confirm these features
    # are comparable to what the model was trained on.
    test_vec = vectorizer(test_tokenized)[0]
    # Make sure the input is a single-row 2-D batch for predict.
    test_vec_lstm = test_vec.reshape(1, -1)
    result = model.predict(test_vec_lstm)
    # Index of the highest-probability class.
    predicted_class = result.argmax()
    predicted_score = predicted_class
    print(f"Test sentence: {sentence}")
    print(f"Predicted score: {predicted_score}")
    print()