In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TextVectorization, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [None]:
# --- Text representation ---
max_features = 12_000      # vocabulary cap: TextVectorization max_tokens and Embedding input_dim
sequence_length = 600      # every review is padded/truncated to this many token ids
embedding_dim = 32         # width of each learned token vector (Embedding output_dim)

# --- Training loop (all forwarded to model.fit) ---
batch_size = 16
epochs = 4
validation_split = 0.1     # fraction of the training split held out for validation
verbose = 1

In [None]:
!wget https://stuyai.org/download/reviews.csv

--2024-10-21 22:41:25--  https://stuyai.org/download/reviews.csv
Resolving stuyai.org (stuyai.org)... 167.99.227.188
Connecting to stuyai.org (stuyai.org)|167.99.227.188|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14966021 (14M) [text/csv]
Saving to: ‘reviews.csv’


2024-10-21 22:41:26 (29.8 MB/s) - ‘reviews.csv’ saved [14966021/14966021]



In [None]:
# Read the downloaded CSV and discard every row that has a missing value
df = pd.read_csv("reviews.csv").dropna()

# Preview the first ten rows
df.head(10)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5
5,love monaco staff husband stayed hotel crazy w...,5
6,"cozy stay rainy city, husband spent 7 nights m...",5
7,"excellent staff, housekeeping quality hotel ch...",4
8,"hotel stayed hotel monaco cruise, rooms genero...",5
9,excellent stayed hotel monaco past w/e delight...,5


In [None]:
# Drop neutral (3-star) reviews. Take an explicit copy so the column assignment
# below writes to an independent frame instead of a slice of the original,
# which is what triggers pandas' SettingWithCopyWarning.
df = df[df['Rating'] != 3].copy()
def combine_ratings(rating):
    """Collapse a star rating into a binary sentiment label.

    Args:
        rating: Star rating of a review; expected to be 1, 2, 4 or 5.

    Returns:
        1 for a positive review (4 or 5 stars), 0 for a negative one (1 or 2 stars).

    Raises:
        ValueError: If ``rating`` is anything else. Failing loudly matters here:
            applying this function a second time to already-binarised labels
            would otherwise turn every 0 into 1 and every 1 into 0 without
            any warning.
    """
    if rating in (4, 5):
        return 1
    if rating in (1, 2):
        return 0
    raise ValueError(
        f"Unexpected rating {rating!r}: expected 1, 2, 4 or 5 "
        "(has this column already been converted to binary labels?)"
    )

# Replace each star rating with its binary sentiment label
df['Rating'] = df['Rating'].map(combine_ratings)

# Pull the review strings (features) and the labels out as NumPy arrays
texts, labels = df['Review'].values, df['Rating'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rating'] = df['Rating'].apply(combine_ratings)


In [None]:
# Tokenizer: lower-cases, strips punctuation, splits on whitespace and maps each
# word to an integer id, producing a fixed-length row of `sequence_length` ids.
text_vectorizer = TextVectorization(
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    pad_to_max_tokens=True,
    ragged=False,
)

# Learn the vocabulary from the review corpus
text_vectorizer.adapt(texts)

# Encode every review as a row of token ids; the labels pass through unchanged
y = labels
X = text_vectorizer(texts).numpy()


In [None]:
# Assemble the classifier one layer at a time
model = Sequential()

# Token ids -> dense vectors of size `embedding_dim`
model.add(Embedding(input_dim=max_features, output_dim=embedding_dim))

# First convolutional feature-extraction stage
model.add(Conv1D(128, 2, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

# Second convolutional feature-extraction stage
model.add(Conv1D(128, 2, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.5))

# Collapse the feature maps into one vector per review
model.add(Flatten())

# Dense classification head
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))  # Output layer for 2 classes (negative / positive)

# Integer labels (0/1) pair with the sparse categorical cross-entropy loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Hold out 20% of the encoded reviews for the final test evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Fit on the training split; `validation_split` of it is held out for per-epoch validation
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    verbose=verbose,
)

# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 127ms/step - accuracy: 0.8288 - loss: 0.4143 - val_accuracy: 0.9505 - val_loss: 0.1490
Epoch 2/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 148ms/step - accuracy: 0.9573 - loss: 0.1249 - val_accuracy: 0.9597 - val_loss: 0.1231
Epoch 3/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 123ms/step - accuracy: 0.9779 - loss: 0.0697 - val_accuracy: 0.9543 - val_loss: 0.1434
Epoch 4/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 124ms/step - accuracy: 0.9880 - loss: 0.0366 - val_accuracy: 0.9467 - val_loss: 0.1549
Epoch 5/50
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 123ms/step - accuracy: 0.9926 - loss: 0.0236 - val_accuracy: 0.9471 - val_loss: 0.2101
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.9555 - loss: 0.1165
Test Accuracy: 0.9563


In [None]:
def predict(new_reviews):
  """Classify raw review strings with the trained model and print the scores.

  Each review is vectorized with ``text_vectorizer`` and passed through
  ``model``. For every review the text and its softmax output are printed;
  position 0 of that output is the negative class (1-2 stars) and position 1
  the positive class (4-5 stars).

  Args:
    new_reviews: Sequence of review strings.

  Returns:
    List with the predicted class index (0 or 1) for each review, in input
    order. An empty input yields an empty list without calling the model.
  """
  # Nothing to vectorize or score
  if len(new_reviews) == 0:
    return []

  X_new = text_vectorizer(new_reviews)

  # Make predictions
  predictions = model.predict(X_new)

  # The training labels are already the integers 0/1, so the argmax index is
  # the label itself; no label encoder needs to be fitted to decode it.
  predicted_classes = tf.argmax(predictions, axis=1).numpy()

  # Output results
  for review, pred in zip(new_reviews, predictions):
      print(f"Review: {review}")
      print(pred)

  return predicted_classes.tolist()


In [None]:
# Classify reviews typed by the user, one per line, until "quit" is entered
while True:
  inp = input()
  if inp == "quit":
    break
  predict([inp])

My recent stay at Absalon Hotel was, unfortunately, a series of disappointments from the moment we arrived.  The first issue we encountered was the state of the reception area. It was dirty and looked nothing like the photos on the hotel’s website, which already gave us a bad first impression.  After checking in we got a small 11-13m2 room. The cleanliness of the room was unacceptable. The floor was dirty, the toilet had fecal stains, there was hair from previous guests on the floor, and worst of all, the bed was full of body hair from other people. For a hotel that claims to be four stars, the hygiene standards were shockingly low.  We did make a complaint about the state of the room, and to their credit, the hotel staff cleaned the room and gave us a voucher for a cappuccino at the bar.  The breakfast was another major letdown. The selection was extremely limited, with only two types of cheese, a couple of varieties of ham and sausage, boiled eggs, bread, toast, croissants, and some 

KeyboardInterrupt: Interrupted by user

In [None]:
import numpy as np

# Extract the embedding weights
embedding_layer = model.layers[0]  # The Embedding layer is the first layer added to the model
embeddings = embedding_layer.get_weights()[0]  # Shape: (max_features, embedding_dim)

# Get the vocabulary; entry i is the token whose vector is row i of `embeddings`
vocab = text_vectorizer.get_vocabulary()

# Index 0 of the vocabulary is the padding token, whose label is the empty
# string and would become a blank line in the metadata file. Following the
# TensorFlow word-embeddings tutorial, leave it out of both files. Stopping at
# len(vocab) keeps the vector count equal to the label count even if the corpus
# produced fewer tokens than the embedding table has rows.
np.savetxt('embeddings.tsv', embeddings[1:len(vocab)], delimiter='\t')

# Save the metadata (words) to a file, one label per exported vector.
# Explicit UTF-8 so non-ASCII tokens do not depend on the platform default encoding.
with open('metadata.tsv', 'w', encoding='utf-8') as f:
    for word in vocab[1:]:
        f.write(f"{word}\n")


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk import pos_tag, pos_tag_sents, word_tokenize

ADJECTIVE_TAGS = {'JJ', 'JJR', 'JJS'}  # Penn Treebank: adjective, comparative, superlative

# Get vocabulary from TextVectorization
vocab = text_vectorizer.get_vocabulary()

# Tokenize every vocabulary entry, then tag them all in a single batch.
# pos_tag_sents tags each token list independently, exactly like calling
# pos_tag once per word, but resolves the tagger once for the whole batch
# instead of once per vocabulary entry.
tokenized_vocab = [word_tokenize(word) for word in vocab]
tagged_vocab = pos_tag_sents(tokenized_vocab)

# Keep a word when any of its tokens is tagged as an adjective
adjectives = [
    word
    for word, pos_tags in zip(vocab, tagged_vocab)
    if any(tag in ADJECTIVE_TAGS for _token, tag in pos_tags)
]

print(f"Number of adjectives: {len(adjectives)}")
# Get embeddings from the model
embeddings = model.layers[0].get_weights()[0]  # Shape: (max_features, embedding_dim)

# Create an index map of the vocabulary (word -> row in `embeddings`)
vocab_index = {word: idx for idx, word in enumerate(vocab)}

# Filter embeddings to only include adjectives
adjective_embeddings = np.array([embeddings[vocab_index[word]] for word in adjectives])

# Save the filtered embeddings
np.savetxt('adjective_embeddings.tsv', adjective_embeddings, delimiter='\t')

# Save the adjectives as metadata, one per line in the same order as the vectors.
# Explicit UTF-8 so non-ASCII tokens do not depend on the platform default encoding.
with open('adjective_metadata.tsv', 'w', encoding='utf-8') as f:
    for word in adjectives:
        f.write(f"{word}\n")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Number of adjectives: 627


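To explore the exported embeddings, load the `.tsv` vector and metadata files into the TensorFlow Embedding Projector: https://projector.tensorflow.org/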