In [15]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
import nltk
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [16]:
data = pd.read_csv('../_data/Reviews.csv')

data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [17]:
# preprocessing

data['Text'] = data['Text'].str.lower()
data['Text'] = data['Text'].str.replace(r'\W', ' ')
data['Text'] = data['Text'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
data['Text'] = data['Text'].str.replace(r'\^[a-zA-Z]\s+', ' ')
data['Text'] = data['Text'].str.replace(r'\s+', ' ')
data['Text'] = data['Text'].str.replace(r'\s+', ' ')
data['Text'] = data['Text'].str.strip()

# split data

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(data['Text'], data['Score'], test_size=0.2)

In [18]:
# Convert scores to categorical format
num_classes = 5
y_train_cat = to_categorical(y_train - 1, num_classes)
y_test_cat = to_categorical(y_test - 1, num_classes)

In [19]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=1000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_test_vec = vectorizer.transform(X_test).toarray()

# Neural Network Model
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=X_train_vec.shape[1]))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # Change to softmax for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy',  # Changed to categorical_crossentropy
              optimizer='adam',
              metrics=['accuracy'])

# Train the model
model.fit(X_train_vec, y_train_cat, epochs=10, batch_size=32)

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_vec, y_test_cat)
print(f'Loss: {loss}, Accuracy: {accuracy}')

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 878us/step - accuracy: 0.7788 - loss: 1.2337
Loss: 1.2400438785552979, Accuracy: 0.7780914902687073


In [None]:
# Save the model with current date and time in model folder
import datetime
model.save(f'_models/{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}.keras')

In [None]:
# test the model

test = ['This is a good product']
test_vec = vectorizer.transform(test)
result = model.predict(test_vec)
print(result)
print(f'Predicted score: {result.argmax() + 1}')

test = ['This is a bad product']
test_vec = vectorizer.transform(test)
result = model.predict(test_vec)
print(result)
print(f'Predicted score: {result.argmax() + 1}')

test = ['This is a product']
test_vec = vectorizer.transform(test)
result = model.predict(test_vec)
print(result)
print(f'Predicted score: {result.argmax() + 1}')

test = ['This is a very good product']
test_vec = vectorizer.transform(test)
result = model.predict(test_vec)
print(result)
print(f'Predicted score: {result.argmax() + 1}')

test = ['This is a very bad product']
test_vec = vectorizer.transform(test)
result = model.predict(test_vec)
print(result)
print(f'Predicted score: {result.argmax() + 1}')

test = ['That was bad']




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[[0.12998785 0.16304216 0.17141286 0.17161053 0.3639466 ]]
Predicted score: 5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[[0.58646053 0.05305136 0.04656776 0.08802016 0.22590026]]
Predicted score: 1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[[0.28769112 0.01987454 0.03460139 0.3229956  0.33483735]]
Predicted score: 5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[[0.12998785 0.16304216 0.17141286 0.17161053 0.3639466 ]]
Predicted score: 5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[[0.58646053 0.05305136 0.04656776 0.08802016 0.22590026]]
Predicted score: 1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[[1.8285264e-04 7.8903645e-07 4.6153221e-07 4.9896254e-05 9.9976605e-01]]
Predicted score: 5
