<a href="https://colab.research.google.com/github/samwondim/nlp-recommendation-image-classification/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras


In [None]:
# Fetch the IMDB reviews table (Feather format) from the GitHub release URL.
filename = 'https://github.com/lmassaron/datasets/releases/download/1.0/imdb_50k.feather'
reviews = pd.read_feather(filename)

# Print one randomly drawn review as a quick sanity check of the text column.
random_review = reviews["review"].sample(n=1)
print(random_review.iloc[0])

In [None]:
# Draw 3,000 reviews for training and 1,000 different ones for validation;
# every remaining review becomes the test set.
train = reviews.sample(3000, random_state=42)
valid = reviews[~reviews.index.isin(train.index)].sample(1000, random_state=42)

# Index.append returns a NEW Index rather than modifying in place, so the
# result must be kept; discarding it would leave the validation rows in `test`.
sampled_idx = train.index.append(valid.index)
test = reviews[~reviews.index.isin(sampled_idx)]

# Guard against leakage: no test row may share a label with train or valid.
assert not test.index.isin(sampled_idx).any(), "test overlaps train/valid"


In [None]:
# Build the word <-> index vocabulary from the training reviews only.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train["review"])

In [None]:
def tokenize_and_pad(tokenizer, df, maxlen):
  """Convert the reviews in ``df`` to fixed-length integer sequences.

  Args:
    tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
    df: Object with a ``review`` attribute (the texts) and a ``sentiment``
      attribute (the labels), e.g. a DataFrame with those two columns.
    maxlen: Length every sequence is padded/truncated to.

  Returns:
    A ``(padded_sequences, labels)`` tuple, where ``labels`` is
    ``df.sentiment`` returned unchanged.
  """
  token_ids = tokenizer.texts_to_sequences(df.review)
  # Delegate padding/truncation to Keras so all rows share the same length.
  fixed_length = keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=maxlen)
  return fixed_length, df.sentiment

In [None]:
# Encode the three splits with the same tokenizer and sequence length:
# (X, y) = train, (Xv, yv) = validation, (Xt, yt) = test.
(X, y), (Xv, yv), (Xt, yt) = (
    tokenize_and_pad(tokenizer, split, maxlen=256)
    for split in (train, valid, test)
)

In [None]:
# Preview only the ten lowest-index vocabulary entries; displaying the full
# mapping would dump one line per distinct word in the training reviews.
list(tokenizer.index_word.items())[:10]

In [None]:
# Hyper-parameters of the embedding classifier.
voc = len(tokenizer.index_word) + 1  # +1: word indices start at 1, 0 is the padding value
feats = 8  # size of each learned word vector
seq_len = 256  # must equal the maxlen used when padding the sequences

# Embedding -> flatten -> dropout -> single sigmoid unit for binary sentiment.
model = keras.models.Sequential([
    keras.layers.Embedding(voc, feats, input_length=seq_len),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

model.summary()

In [None]:
# Train for 20 epochs in mini-batches of 16, evaluating on the validation
# split after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=X,
    y=y,
    batch_size=16,
    epochs=20,
    validation_data=(Xv, yv),
)

In [None]:
from sklearn.metrics import accuracy_score

# Turn the predicted probabilities into hard 0/1 labels with a 0.5 cut-off.
probabilities = model.predict(Xt)
predictions = (probabilities >= 0.5).astype(int)

# Score the hard labels against the true test sentiments.
test_accuracy = accuracy_score(yt, predictions)
print(f"Accuracy on test set: {test_accuracy}")